/
nonlinear_CGE.R
executable file
·184 lines (169 loc) · 6 KB
/
nonlinear_CGE.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#' Constrained Graph Embedding
#'
#' Constrained Graph Embedding (CGE) is a semi-supervised embedding method that incorporates
#' partially available label information into the graph structure that find embeddings
#' consistent with the labels.
#'
#' @param X an \eqn{(n\times p)} matrix or data frame whose rows are observations
#' @param label a length-\eqn{n} vector of data class labels. It should contain \code{NA} elements for missing label.
#' @param ndim an integer-valued target dimension.
#' @param type a vector of neighborhood graph construction. Following types are supported;
#' \code{c("knn",k)}, \code{c("enn",radius)}, and \code{c("proportion",ratio)}.
#' Default is \code{c("proportion",0.1)}, connecting about 1/10 of nearest data points
#' among all data points. See also \code{\link{aux.graphnbd}} for more details.
#' @param preprocess an additional option for preprocessing the data.
#' Default is \code{"null"}. See also \code{\link{aux.preprocess}} for more details.
#'
#' @return a named list containing
#' \describe{
#' \item{Y}{an \eqn{(n\times ndim)} matrix whose rows are embedded observations.}
#' \item{trfinfo}{a list containing information for out-of-sample prediction.}
#' }
#'
#' @examples
#' ## use iris data
#' data(iris)
#' X = as.matrix(iris[,2:4])
#' label = as.integer(iris[,5])
#' lcols = as.factor(label)
#'
#' ## copy a label and let 10% of elements be missing
#' nlabel = length(label)
#' nmissing = round(nlabel*0.10)
#' label_missing = label
#' label_missing[sample(1:nlabel, nmissing)]=NA
#'
#' ## try different neighborhood sizes
#' out1 = do.cge(X, label_missing, type=c("proportion",0.10))
#' out2 = do.cge(X, label_missing, type=c("proportion",0.25))
#' out3 = do.cge(X, label_missing, type=c("proportion",0.50))
#'
#' ## visualize
#' opar = par(no.readonly=TRUE)
#' par(mfrow=c(1,3))
#' plot(out1$Y, main="10% connected", pch=19, col=lcols)
#' plot(out2$Y, main="25% connected", pch=19, col=lcols)
#' plot(out3$Y, main="50% connected", pch=19, col=lcols)
#' par(opar)
#'
#' @references
#' \insertRef{he_graph_2009}{Rdimtools}
#'
#' @rdname nonlinear_CGE
#' @author Kisung You
#' @concept nonlinear_methods
#' @export
do.cge <- function(X, label, ndim=2, type=c("proportion",0.1),
preprocess=c("null","center","scale","cscale","whiten","decorrelate")){
#------------------------------------------------------------------------
## PREPROCESSING
# 1. data matrix
aux.typecheck(X)
n = nrow(X)
p = ncol(X)
# 2. label : check and return a de-factored vector
# For this example, there should be no degenerate class of size 1.
if (missing(label)){
stop("* Semi-Supervised Learning : 'label' is required. For it not provided, consider using Unsupervised methods.")
}
label = check_label(label, n)
ulabel = unique(label)
if (all(!is.na(ulabel))){
message("* Semi-Supervised Learning : there is no missing labels. Consider using Supervised methods.")
}
if (any(is.infinite(ulabel))){
stop("* Semi-Supervised Learning : Inf is not allowed in label.")
}
# 3. ndim
ndim = as.integer(ndim)
if (!check_ndim(ndim,p)){
stop("* do.cge : 'ndim' is a positive integer in [1,#(covariates)].")
}
# 4. type
nbdtype = type
nbdsymmetric = "union"
# 5. preprocess
if (missing(preprocess)){
algpreprocess = "null"
} else {
algpreprocess = match.arg(preprocess)
}
#------------------------------------------------------------------------
## COMPUTATION : PRELIMINARY
# 1. preprocessing of data : note that output pX still has (n-by-p) format
tmplist = aux.preprocess.hidden(X,type=algpreprocess,algtype="linear")
trfinfo = tmplist$info
pX = tmplist$pX
# 2. re-arrange with label information
id.labeled = which(!is.na(label))
id.notlabd = which(is.na(label))
p = length(label[id.labeled])
pXnew = rbind(pX[id.labeled,], pX[id.notlabd,]) # first row section is for labeled data
label = c(label[id.labeled], label[id.notlabd]) #
ulabe = unique(label[1:p])
n = ncol(pXnew)
m = nrow(pXnew)
c = length(ulabe)
# 3. build neighborhood information among labeled
nbdstruct = aux.graphnbd(pXnew,method="euclidean",
type=nbdtype,symmetric=nbdsymmetric)
# 4. construct U matrix
U1 = array(0,c(m,c))
for (i in 1:c){
idU1 = which(label==ulabel[i])
U1[idU1,i] = rep(1,length(idU1))
}
U2top = array(0,c(p,m-p))
U2bot = diag(m-p)
U = cbind(U1,rbind(U2top, U2bot))
# 5. construct L and D
W = nbdstruct$mask*1 # should be (m x m) matrix
D = diag(base::rowSums(W))
L = D-W
#------------------------------------------------------------------------
## COMPUTATION : MAIN PART
# 1. generalized eigenvalue problem; numerical error serious..
llterm = t(U)%*%L%*%U; alpha1 = cge.minimaladd(llterm)$alpha
rrterm = t(U)%*%D%*%U; alpha2 = cge.minimaladd(rrterm)$alpha
glterm = llterm + max(alpha1, alpha2)*diag(nrow(llterm))
grterm = rrterm + max(alpha1, alpha2)*diag(nrow(rrterm))
gfun = getFromNamespace("hidden_geigen","maotai")
geigs = gfun(glterm, grterm, normalize=TRUE)
# geigs = geigen::geigen(glterm, grterm) # increasing order
idmin = max(which.min(geigs$values > 10*.Machine$double.eps), 2)
Z = geigs$vectors[,idmin:(idmin+ndim-1)]
# 2. reconstruct
pY = U%*%Z
########################################################################
## 5. return output
result = list()
result$Y = pY
trfinfo$algtype = "nonlinear"
result$trfinfo = trfinfo
return(result)
}
# minimal addition --------------------------------------------------------
#' @keywords internal
#' @noRd
cge.minimaladd <- function(D0){
ntgt = nrow(D0)
rD0 = round(aux_rank(D0))
if (rD0 >= ntgt){
output = list()
output$D0 = D0
output$alpha = 0.0
return(output)
} else {
alpha = 0.1
hello = TRUE
while (hello){
nice = D0 + alpha*diag(ntgt)
alpha = alpha + 0.1
hello = (round(aux_rank(nice)) < ntgt)
}
output = list()
output$D0 = D0+alpha*diag(ntgt)
output$alpha = alpha
return(output)
}
}