-
Notifications
You must be signed in to change notification settings - Fork 10
/
nonlinear_KSDA.R
executable file
·145 lines (138 loc) · 5.26 KB
/
nonlinear_KSDA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#' Kernel Semi-Supervised Discriminant Analysis
#'
#' Kernel Semi-Supervised Discriminant Analysis (KSDA) is a nonlinear variant of
#' SDA (\code{\link{do.sda}}). For simplicity, we enabled heat/gaussian kernel only.
#' Note that this method is \emph{quite} sensitive to choices of
#' parameters, \code{alpha}, \code{beta}, and \code{t}. Especially when data
#' are well separated in the original space, it may lead to unsatisfactory results.
#'
#' @param X an \eqn{(n\times p)} matrix or data frame whose rows are observations
#' and columns represent independent variables.
#' @param label a length-\eqn{n} vector of data class labels.
#' @param ndim an integer-valued target dimension.
#' @param type a vector of neighborhood graph construction. Following types are supported;
#' \code{c("knn",k)}, \code{c("enn",radius)}, and \code{c("proportion",ratio)}.
#' Default is \code{c("proportion",0.1)}, connecting about 1/10 of nearest data points
#' among all data points. See also \code{\link{aux.graphnbd}} for more details.
#' @param alpha balancing parameter between model complexity and empirical loss.
#' @param beta Tikhonov regularization parameter.
#' @param t bandwidth parameter for heat kernel.
#'
#' @return a named list containing
#' \describe{
#' \item{Y}{an \eqn{(n\times ndim)} matrix whose rows are embedded observations.}
#' \item{trfinfo}{a list containing information for out-of-sample prediction.}
#' }
#'
#' @seealso \code{\link{do.sda}}
#' @examples
#' ## generate data of 3 types with clear difference
#' set.seed(100)
#' dt1 = aux.gensamples(n=20)-100
#' dt2 = aux.gensamples(n=20)
#' dt3 = aux.gensamples(n=20)+100
#'
#' ## merge the data and create a label correspondingly
#' X = rbind(dt1,dt2,dt3)
#' label = rep(1:3, each=20)
#'
#' ## copy a label and let 10% of elements be missing
#' nlabel = length(label)
#' nmissing = round(nlabel*0.10)
#' label_missing = label
#' label_missing[sample(1:nlabel, nmissing)]=NA
#'
#' ## compare true case with missing-label case
#' out1 = do.ksda(X, label, beta=0, t=0.1)
#' out2 = do.ksda(X, label_missing, beta=0, t=0.1)
#'
#' ## visualize
#' opar = par(no.readonly=TRUE)
#' par(mfrow=c(1,2))
#' plot(out1$Y, col=label, main="true projection")
#' plot(out2$Y, col=label, main="20% missing labels")
#' par(opar)
#'
#' @references
#' \insertRef{cai_semisupervised_2007}{Rdimtools}
#'
#' @rdname nonlinear_KSDA
#' @author Kisung You
#' @concept nonlinear_methods
#' @export
do.ksda <- function(X, label, ndim=2, type=c("proportion",0.1), alpha=1.0, beta=1.0, t=1.0){
#------------------------------------------------------------------------
## PREPROCESSING
# 1. data matrix
aux.typecheck(X)
n = nrow(X)
p = ncol(X)
# 2. label : check and return a de-factored vector
# For this example, there should be no degenerate class of size 1.
label = check_label(label, n)
ulabel = unique(label)
if (all(!is.na(ulabel))){
message("* Semi-Supervised Learning : there is no missing labels. Consider using Supervised methods.")
}
labelorder = order(label)
labelrank = rank(label)
# 3. ndim
if (!check_ndim(ndim,p)){
stop("* do.ksda : 'ndim' is a positive integer in [1,#(covariates)].")
}
ndim = as.integer(ndim)
# 4. alpha : balancing
alpha = as.double(alpha)
if (!check_NumMM(alpha,0,Inf,compact=FALSE)){stop("* do.ksda : 'alpha' needs to be a positive real number.")}
# 5. beta : regularization
beta = as.double(beta)
if (!check_NumMM(beta,0,Inf,compact=TRUE)){stop("* do.ksda : 'beta; needs to be a nonnegative real number.")}
# 6. neighborhood type
nbdtype = type
# 7. t : kernel bandwidth
t = as.double(t)
if (!check_NumMM(t, 0, 1e+10, compact=FALSE)){stop("* do.ksda : 't' is a bandwidth parameter for gaussian kernel.")}
# (Implicit) preprocessing
algpreprocess = "center"
# (Implicit) neighborhood symmetric
nbdsymmetric = "union"
#------------------------------------------------------------------------
## COMPUTATION : PRELIMINARY
# 1. preprocessing with re-labeling of data
tmplist = aux.preprocess.hidden(X,type=algpreprocess,algtype="nonlinear")
trfinfo = tmplist$info
pX = tmplist$pX[labelorder,]
label = label[labelorder]
# 2. neighborhood graph
nbdstruct = aux.graphnbd(pX,method="euclidean",
type=nbdtype,symmetric=nbdsymmetric)
nbdmask = nbdstruct$mask
# 3. S : binary adjacency
S = nbdmask*1.0
L = diag(rowSums(S))-S
# 4. W : Weight Matrix
idxmaxlabeled = sum(!is.na(label))
Wl = sda_build_Wl(label[1:idxmaxlabeled])
W = array(0,c(n,n))
W[1:idxmaxlabeled, 1:idxmaxlabeled] = Wl
# 5. Itilde
Itilde = array(0,c(n,n))
Itilde[1:idxmaxlabeled, 1:idxmaxlabeled] = diag(idxmaxlabeled)
# 6. computation
K = exp(-(as.matrix(dist(pX))^2)/(2*(t^2)))
#------------------------------------------------------------------------
## COMPUTATION : PRELIMINARY
# 1. setup
LHS = K%*%W%*%K
RHS = K%*%( Itilde + (alpha*L) + (beta*diag(n)) )%*%K
# 2. top eigenvectors
pseudoproj = aux.geigen(LHS, RHS, ndim, maximal=TRUE)
# 3. find projected ones : recovering its order will be performed later.
Y = K%*%pseudoproj
#------------------------------------------------------------------------
## RETURN
result = list()
result$Y = Y[labelrank,]
result$trfinfo = trfinfo
return(result)
}