/
nonlinear_SPMDS.R
executable file
·195 lines (182 loc) · 6.75 KB
/
nonlinear_SPMDS.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#' Spectral Multidimensional Scaling
#'
#' \code{do.spmds} transfers the classical multidimensional scaling problem into
#' the data spectral domain using Laplace-Beltrami operator. Its flexibility
#' to use subsamples and spectral interpolation of non-reference data enables relatively
#' efficient computation for large-scale data.
#'
#' @param X an \eqn{(n\times p)} matrix or data frame whose rows are observations
#' and columns represent independent variables.
#' @param ndim an integer-valued target dimension.
#' @param neigs number of eigenvectors to be used as \emph{spectral dimension}.
#' @param ratio percentage of subsamples as reference points.
#' @param preprocess an additional option for preprocessing the data.
#' Default is \code{"null"}. See also \code{\link{aux.preprocess}} for more details.
#' @param type a vector of neighborhood graph construction. Following types are supported;
#' \code{c("knn",k)}, \code{c("enn",radius)}, and \code{c("proportion",ratio)}.
#' Default is \code{c("proportion",0.1)}, connecting about 1/10 of nearest data points
#' among all data points. See also \code{\link{aux.graphnbd}} for more details.
#' @param symmetric one of \code{"intersect"}, \code{"union"} or \code{"asymmetric"} is supported. Default is \code{"union"}. See also \code{\link{aux.graphnbd}} for more details.
#'
#' @return a named list containing
#' \describe{
#' \item{Y}{an \eqn{(n\times ndim)} matrix whose rows are embedded observations.}
#' \item{trfinfo}{a list containing information for out-of-sample prediction.}
#' }
#'
#' @examples
#' \dontrun{
#' ## Replicate the numerical example from the paper
#' # Data Preparation
#' set.seed(100)
#' dim.true = 3 # true dimension
#' dim.embed = 100 # embedding space (high-d)
#' npoints = 1000 # number of samples to be generated
#'
#' v = matrix(runif(dim.embed*dim.true),ncol=dim.embed)
#' coeff = matrix(runif(dim.true*npoints), ncol=dim.true)
#' X = coeff%*%v
#'
#' # see the effect of neighborhood size
#' out1 = do.spmds(X, neigs=100, type=c("proportion",0.10))
#' out2 = do.spmds(X, neigs=100, type=c("proportion",0.25))
#' out3 = do.spmds(X, neigs=100, type=c("proportion",0.50))
#'
#' # visualize the results
#' opar <- par(no.readonly=TRUE)
#' par(mfrow=c(1,3))
#' plot(out1$Y, main="10% neighborhood")
#' plot(out2$Y, main="25% neighborhood")
#' plot(out3$Y, main="50% neighborhood")
#' par(opar)
#' }
#'
#' @references
#' \insertRef{aflalo_spectral_2013}{Rdimtools}
#'
#' @author Kisung You
#' @rdname nonlinear_SPMDS
#' @concept nonlinear_methods
#' @export
do.spmds <- function(X, ndim=2, neigs=max(2,nrow(X)/10), ratio=0.1,
preprocess=c("null","center","scale","cscale","decorrelate","whiten"),
type=c("proportion",0.1), symmetric=c("union","intersect","asymmetric")){
#------------------------------------------------------------------------
## PREPROCESSING : PARAMETERS
# 1. data matrix
aux.typecheck(X)
n = nrow(X)
p = ncol(X)
# 2. ndim and neigs
if ((!is.numeric(ndim))||(ndim<1)||(ndim>ncol(X))||is.infinite(ndim)||is.na(ndim)){
stop("* do.spmds : 'ndim' is a positive integer in [1,#(covariates)].")
}
ndim = as.integer(ndim)
# 3. sample ratio
if ((length(ratio)>1)|| (!((ratio<1)&&(ratio>0)))){
stop("* do.spmds : 'ratio' is invalid.")
}
m_e = as.integer(neigs)
if ((!is.numeric(m_e))||(m_e<2)||(m_e>=n)||(length(m_e)>1)||(is.infinite(m_e))||is.na(m_e)){
stop("* do.spmds : 'neigs' is not valid.")
}
# 3. preprocessing of data
if (missing(preprocess)){
algpreprocess = "null"
} else {
algpreprocess = match.arg(preprocess)
}
# 4. neighborhood information
nbdtype = type
if (missing(symmetric)){
nbdsymmetric = "union"
} else {
nbdsymmetric = match.arg(symmetric)
}
# 5. bandwidth
# bandwidth = sum(bandwidth)
# if (!is.numeric(bandwidth)|(bandwidth<0)|is.infinite(bandwidth)){
# stop("* do.spmds : 'bandwidth' should be a real number >= 0.")
# }
# 6. other parameters
m_s = as.integer(n*ratio)
#------------------------------------------------------------------------
## PREPROCESSING : DATA
# 1. data preprocessing
tmplist = aux.preprocess.hidden(X,type=algpreprocess,algtype="nonlinear")
trfinfo = tmplist$info
pX = tmplist$pX
# 2. neighborhood information
nbdstruct = aux.graphnbd(pX,method="euclidean",
type=nbdtype,symmetric=nbdsymmetric)
# 3. construct graph laplacian : no need for scaling
W = 1.0*nbdstruct$mask
if (!isSymmetric(W)){
W = (W+t(W))/2
}
L = diag(rowSums(W))-W
#
# W = exp(-(sD^2)/bandwidth) # weighted graph [0,1]
# diag(W) = 0.0
# if (!isSymmetric(W)){
# W = (W + t(W))/2
# }
# L = diag(rowSums(W))-W # graph laplacian for entire dataset
#------------------------------------------------------------------------
## MAIN COMPUTATION
# 1. eigendecomposition of L
eigL = RSpectra::eigs_sym(L, m_e, which="SM") # decreasing order, top ones
E = eigL$vectors[,m_e:1]
V = diag(eigL$values[m_e:1])
# 2. index of random samples
indice = sample(1:n, m_s, replace=FALSE)
# 3. pairwise squared geodesic distances for selected points
X_ind = t(pX[indice,])
x2 = matrix(colSums(X_ind^2),nrow=1)
D = array(1,c(m_s,1))%*%x2 -2*t(X_ind)%*%X_ind + t(x2)%*%array(1,c(1,m_s))
# 4. compute
Ylow = do.spmds.approxMDS(D, indice, E, V, ndim)
#------------------------------------------------------------------------
## RETURN
result = list()
result$Y = Ylow
result$trfinfo = trfinfo
return(result)
}
# -----------------------------------------------------------------------
# D : (m.s-by-m.s) squared pairwise distances
# sample : vector of m.s indices
# Phi : (M-by-K) eigenvector matrix of the set of points
# lambda : (K-by-K) eigenvalues diagonal matrix
# dim_o : dimension of the embedding
#' @keywords internal
#' @noRd
do.spmds.approxMDS <- function(D, sample, Phi, lambda, dim_o){
D = (D+t(D))/2
sigma = 1
n = nrow(Phi)
N = ncol(Phi)
Aeq = array(0,c(length(sample), n))
for (i in 1:length(sample)){
Aeq[i,sample[i]] = 1
}
Aeq_tilde = Aeq%*%Phi
B = sigma*(t(Aeq_tilde)%*%Aeq_tilde)
LHS = lambda[1:N,1:N]+B
RHS = t(Aeq_tilde)
Mat_interp = sigma*(base::solve(LHS,RHS))
Alpha = Mat_interp%*%D%*%t(Mat_interp)
J_Phi = Phi-(1/n)*matrix(rep(colSums(Phi),times=n),nrow=n,byrow=TRUE) ## risky
svdJ_Phi = base::svd(J_Phi)
S = svdJ_Phi$u
U = diag(svdJ_Phi$d)
V = svdJ_Phi$v
Q1 = -U%*%t(V)%*%Alpha%*%V%*%U # coinciding with the definition ?
eigQ1 = eigen(Q1)
S2 = eigQ1$vectors
eigvals = eigQ1$values # adjust for negative ones
eigvals[(eigvals<=sqrt(.Machine$double.eps))] = sqrt(.Machine$double.eps)
# Q = (1/sqrt(2))*S%*%S2%*%diag(sqrt(eigvals))
Q = S%*%S2%*%diag(sqrt(eigvals))
return(Q[,1:dim_o])
}