/
linear_ANMM.R
executable file
·218 lines (207 loc) · 7.19 KB
/
linear_ANMM.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#' Average Neighborhood Margin Maximization
#'
#' Average Neighborhood Margin Maximization (ANMM) is a supervised method
#' for feature extraction. It aims to find a projection mapping in the following manner;
#' for each data point, the algorithm tries to pull the neighboring points in the
#' same class while pushing neighboring points of different classes far away. It is known
#' that ANMM does suffer less from small sample size problem, which is bottleneck for LDA.
#'
#' @param X an \eqn{(n\times p)} matrix or data frame whose rows are observations
#' and columns represent independent variables.
#' @param label a length-\eqn{n} vector of data class labels.
#' @param ndim an integer-valued target dimension.
#' @param preprocess an additional option for preprocessing the data.
#' Default is "null". See also \code{\link{aux.preprocess}} for more details.
#' @param No neighborhood size for same-class data points; either a constant number or
#' a vector of length-\eqn{n} can be provided, as long as the values reside in \eqn{[2,n]}.
#' @param Ne neighborhood size for different-class data points; either a constant number or
#' a vector of length-\eqn{n} can be provided, as long as the values reside in \eqn{[2,n]}.
#'
#' @return a named list containing
#' \describe{
#' \item{Y}{an \eqn{(n\times ndim)} matrix whose rows are embedded observations.}
#' \item{trfinfo}{a list containing information for out-of-sample prediction.}
#' \item{projection}{a \eqn{(p\times ndim)} whose columns are basis for projection.}
#' }
#'
#' @examples
#' ## load iris data
#' data(iris)
#' set.seed(100)
#' subid = sample(1:150,50)
#' X = as.matrix(iris[subid,1:4])
#' label = as.factor(iris[subid,5])
#'
#' ## perform ANMM on different choices of neighborhood size
#' out1 = do.anmm(X, label, No=6, Ne=6)
#' out2 = do.anmm(X, label, No=2, Ne=10)
#' out3 = do.anmm(X, label, No=10,Ne=2)
#'
#' ## visualize
#' opar <- par(no.readonly=TRUE)
#' par(mfrow=c(1,3))
#' plot(out1$Y, main="(No,Ne)=(6,6)", pch=19, cex=0.5, col=label)
#' plot(out2$Y, main="(No,Ne)=(2,10)", pch=19, cex=0.5, col=label)
#' plot(out3$Y, main="(No,Ne)=(10,2)", pch=19, cex=0.5, col=label)
#' par(opar)
#'
#' @references
#' \insertRef{wang_feature_2007}{Rdimtools}
#'
#' @author Kisung You
#' @rdname linear_ANMM
#' @concept linear_methods
#' @export
do.anmm <- function(X, label, ndim=2, preprocess=c("null","center","scale","cscale","decorrelate","whiten"),
No=ceiling(nrow(X)/10), Ne=ceiling(nrow(X)/10)){
#------------------------------------------------------------------------
## PREPROCESSING
# 1. data matrix
aux.typecheck(X)
n = nrow(X)
p = ncol(X)
# 2. label : check and return a de-factored vector
# For this example, there should be no degenerate class of size 1.
label = check_label(label, n)
ulabel = unique(label)
for (i in 1:length(ulabel)){
if (sum(label==ulabel[i])==1){
stop("* do.anmm : no degerate class of size 1 is allowed.")
}
}
if (any(is.na(label))||(any(is.infinite(label)))){
stop("* Supervised Learning : any element of 'label' as NA or Inf will simply be considered as a class, not missing entries.")
}
# 3. ndim
ndim = as.integer(ndim)
if (!check_ndim(ndim,p)){
stop("* do.anmm : 'ndim' is a positive integer in [1,#(covariates)].")
}
# 4. preprocess
algpreprocess = match.arg(preprocess)
# 5. No : size for Homogeneous Neighborhood
# Ne : Heterogenoeus Neighborhood
vecNo = anmm_nbdstructure(No, n, 1)
vecNe = anmm_nbdstructure(Ne, n, 2)
#------------------------------------------------------------------------
## COMPUTATION
# 1. preprocessing of data : note that output pX still has (n-by-p) format
tmplist = aux.preprocess.hidden(X,type=algpreprocess,algtype="linear")
trfinfo = tmplist$info
pX = tmplist$pX
# 2. extract 2 types of neighborhood information
D = as.matrix(dist(pX, method="euclidean"))
listNo = anmm_find_No(D, label, vecNo)
listNe = anmm_find_Ne(D, label, vecNe)
# 3. compute scatter and compactness
S = anmm_computeSC(pX, listNe)
C = anmm_computeSC(pX, listNo)
# 4. extract eigenvector
eigSC = RSpectra::eigs_sym(S-C, ndim, which="LA")
projection = matrix(eigSC$vectors, nrow=p)
# 5. adjust eigenvectors
projection = aux.adjprojection(projection)
#------------------------------------------------------------------------
## RETURN
result = list()
result$Y = pX%*%projection
result$trfinfo = trfinfo
result$projection = projection
return(result)
}
# auxiliary for ANMM ------------------------------------------------------
# 1. nbdstructure input argument
#' @keywords internal
#' @noRd
anmm_nbdstructure <- function(sizevec, n, Ntype){
tmp = as.vector(round(sizevec))
if (length(tmp)==1){
nbdvec = rep(tmp, n)
} else if (length(tmp)==n){
nbdvec = tmp
} else {
if (Ntype==1){
stop("* do.anmm : homogeneous neighborhood input is invalid.")
} else {
stop("* do.anmm : heterogeneous neighborhood input is invalid.")
}
}
if ((any(nbdvec<1))||(any(nbdvec>n))){
if (Ntype==1){
stop("* do.anmm : range of values from No is invalid.")
} else {
stop("* do.anmm : range of values from Ne is invalid.")
}
}
return(nbdvec)
}
# HERE comes the real problem. Say we needed to choose 3-Ho neighborhood.
# However, it is still possible that we only have 1 element in each class.
# For this case, I already cleared it out in the preprocessin step,
# by not allowing the degenerate class of size 1. In both cases, it returns
# a list containing membership structure at each node.
#
# 2. find homogeneous neighborhood
#' @keywords internal
#' @noRd
anmm_find_No <- function(matD, label, vecNo){
n = length(label)
if (nrow(matD)!=n){stop("* do.anmm : I don't know why it stopped 1.")}
output = list()
numNo = rep(0,n)
for (i in 1:n){
# compute possible values by taking minimization
clabel = which(label==label[i])
nclabel = sum(label==label[i])
nselect = round(min(nclabel, vecNo[i]))
numNo[i]= nselect
# find the distance
tgtdist = matD[i,clabel]
smindex = which(order(tgtdist)<=(nselect+1))
tgtlabel = setdiff(clabel[smindex], round(i))
output[[i]] = tgtlabel
}
return(output)
}
# 3. find heterogeneous neighborhood
#' @keywords internal
#' @noRd
anmm_find_Ne <- function(matD, label, vecNe){
n = length(label)
if (nrow(matD)!=n){stop("* do.anmm : I don't know why it stopped 2.")}
output = list()
numNe = rep(0,n)
for (i in 1:n){
clabel = which(label!=label[i])
nclabel = length(clabel)
nselect = round(min(nclabel, vecNe[i]))
numNe[i]= nselect
tgtdist = matD[i,clabel]
smindex = which(order(tgtdist)<=(nselect+1))
tgtlabel= setdiff(clabel[smindex], round(i))
output[[i]] = tgtlabel
}
return(output)
}
# 4. compute Scatterness or Compactness
#' @keywords internal
#' @noRd
anmm_computeSC <- function(X, memlist){
n = nrow(X)
p = ncol(X)
S = array(0, c(p,p))
for (i in 1:n){
xi = as.vector(X[i,])
tgtvecs = memlist[[i]]
tgtsize = length(tgtvecs)
Stmp = array(0,c(p,p))
for (k in 1:tgtsize){
xk = as.vector(X[tgtvecs[k],])
xdiff= xi-xk
Stmp = Stmp + outer(xdiff,xdiff)
}
Stmp = Stmp/tgtsize
S = S + Stmp
}
return(S)
}