-
Notifications
You must be signed in to change notification settings - Fork 10
/
feature_LSDF.R
executable file
·162 lines (158 loc) · 5.92 KB
/
feature_LSDF.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
#' Locality Sensitive Discriminant Feature
#'
#' Locality Sensitive Discriminant Feature (LSDF) is a semi-supervised feature selection method.
#' It utilizes both labeled and unlabeled data points in that labeled points are used to maximize
#' the margin between data opints from different classes, while labeled ones are used to discover
#' the geometrical structure of the data space.
#'
#' @param X an \eqn{(n\times p)} matrix or data frame whose rows are observations
#' and columns represent independent variables.
#' @param label a length-\eqn{n} vector of data class labels. It should contain \code{NA} elements for missing label.
#' @param ndim an integer-valued target dimension.
#' @param type a vector of neighborhood graph construction. Following types are supported;
#' \code{c("knn",k)}, \code{c("enn",radius)}, and \code{c("proportion",ratio)}.
#' Default is \code{c("proportion",0.1)}, connecting about 1/10 of nearest data points
#' among all data points. See also \code{\link{aux.graphnbd}} for more details.
#' @param preprocess an additional option for preprocessing the data.
#' Default is "null". See also \code{\link{aux.preprocess}} for more details.
#' @param gamma within-class weight parameter for same-class data.
#'
#' @return a named list containing
#' \describe{
#' \item{Y}{an \eqn{(n\times ndim)} matrix whose rows are embedded observations.}
#' \item{featidx}{a length-\eqn{ndim} vector of indices with highest scores.}
#' \item{trfinfo}{a list containing information for out-of-sample prediction.}
#' \item{projection}{a \eqn{(p\times ndim)} whose columns are basis for projection.}
#' }
#'
#' @examples
#' ## generate data of 3 types with clear difference
#' set.seed(100)
#' dt1 = aux.gensamples(n=20)-50
#' dt2 = aux.gensamples(n=20)
#' dt3 = aux.gensamples(n=20)+50
#'
#' ## merge the data and create a label correspondingly
#' X = rbind(dt1,dt2,dt3)
#' label = rep(1:3, each=20)
#'
#' ## copy a label and let 20% of elements be missing
#' nlabel = length(label)
#' nmissing = round(nlabel*0.20)
#' label_missing = label
#' label_missing[sample(1:nlabel, nmissing)]=NA
#'
#' ## try different neighborhood sizes
#' out1 = do.lsdf(X, label_missing, type=c("proportion",0.10))
#' out2 = do.lsdf(X, label_missing, type=c("proportion",0.25))
#' out3 = do.lsdf(X, label_missing, type=c("proportion",0.50))
#'
#' ## visualize
#' opar <- par(no.readonly=TRUE)
#' par(mfrow=c(1,3))
#' plot(out1$Y, pch=19, col=label, main="10% connectivity")
#' plot(out2$Y, pch=19, col=label, main="25% connectivity")
#' plot(out3$Y, pch=19, col=label, main="50% connectivity")
#' par(opar)
#'
#' @references
#' \insertRef{cai_locality_2007}{Rdimtools}
#'
#' @rdname feature_LSDF
#' @author Kisung You
#' @concept feature_methods
#' @export
do.lsdf <- function(X, label, ndim=2, type=c("proportion",0.1),
preprocess=c("null","center","scale","cscale","whiten","decorrelate"), gamma=100){
#------------------------------------------------------------------------
## PREPROCESSING
# 1. data matrix
aux.typecheck(X)
n = nrow(X)
p = ncol(X)
# 2. label : check and return a de-factored vector
# For this example, there should be no degenerate class of size 1.
if (missing(label)){
stop("* Semi-Supervised Learning : 'label' is required. For it not provided, consider using Unsupervised methods.")
}
label = check_label(label, n)
ulabel = unique(label)
if (all(!is.na(ulabel))){
message("* Semi-Supervised Learning : there is no missing labels. Consider using Supervised methods.")
}
if (any(is.infinite(ulabel))){
stop("* Semi-Supervised Learning : Inf is not allowed in label.")
}
# 3. ndim
ndim = as.integer(ndim)
if (!check_ndim(ndim,p)){
stop("* do.lsdf : 'ndim' is a positive integer in [1,#(covariates)].")
}
# 4. type
nbdtype = type
nbdsymmetric = "union"
# 5. preprocess
if (missing(preprocess)){
algpreprocess = "null"
} else {
algpreprocess = match.arg(preprocess)
}
# 6. gamma
gamma = as.double(gamma)
if (!check_NumMM(gamma,1,1e+10)){stop("* do.lsdf : 'gamma' is a large positive real number.")}
#------------------------------------------------------------------------
## COMPUTATION : PRELIMINARY
# 1. preprocessing of data : note that output pX still has (n-by-p) format
tmplist = aux.preprocess.hidden(X,type=algpreprocess,algtype="linear")
trfinfo = tmplist$info
pX = tmplist$pX
# 2. build neighborhood information
nbdstruct = aux.graphnbd(pX,method="euclidean",
type=nbdtype,symmetric=nbdsymmetric)
nbdmask = nbdstruct$mask
#------------------------------------------------------------------------
## COMPUTATION : MAIN COMPUTATION FOR LSDF
# 1. build Within- and between-class weights
Sb = array(0,c(n,n))
Sw = array(0,c(n,n))
for (i in 1:(n-1)){
class1 = label[i]
for (j in (i+1):n){
class2 = label[j]
if (((!is.na(class1))&&(!is.na(class2)))&&(class1==class2)){
Sw[i,j] = gamma
Sw[j,i] = gamma
} else if ((isTRUE(nbdmask[i,j])||isTRUE(nbdmask[j,i]))&&(is.na(class1)||is.na(class2))){
Sw[i,j] = 1.0
Sw[j,i] = 1.0
}
if (((!is.na(class1))&&(!is.na(class2)))&&(class1!=class2)){
Sb[i,j] = 1.0
Sb[j,i] = 1.0
}
}
}
# 2. laplacian graphs
Lw = diag(rowSums(Sw))-Sw
Lb = diag(rowSums(Sb))-Sb
# 3. compute feature scores
fscore = rep(0,p)
for (j in 1:p){
fr = as.vector(pX[,j])
term1 = sum(as.vector(Lb%*%matrix(fr))*fr)
term2 = sum(as.vector(Lw%*%matrix(fr))*fr)
fscore[j] = term1/term2
}
# 4. find the largest ones
idxvec = base::order(fscore, decreasing=TRUE)[1:ndim]
# 5. find the projection matrix
projection = aux.featureindicator(p,ndim,idxvec)
#------------------------------------------------------------------------
## RETURN
result = list()
result$Y = pX%*%projection
result$featidx = idxvec
result$trfinfo = trfinfo
result$projection = projection
return(result)
}