-
Notifications
You must be signed in to change notification settings - Fork 1
/
pca.R
372 lines (301 loc) · 10.7 KB
/
pca.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
# glaplacian: this normalizes and regularizes the data matrix.
#' glaplacian
#'
#' Normalizes and regularizes a sparse adjacency Matrix with deg_row and deg_col as the number of non-zero elements in each row/column
#'
#' @param A
#' @param regularize
#'
#' @return
#' @export
#'
#' @examples
glaplacian <- function(A, regularize = TRUE) {
# deg_row <- Matrix::rowSums(abs(A))
# deg_col <- Matrix::colSums(abs(A))
deg_row <- Matrix::rowSums(A!=0)
deg_col <- Matrix::colSums(A!=0)
tau_row <- mean(deg_row)
tau_col <- mean(deg_col)
D_row = Matrix::Diagonal(nrow(A), 1 / sqrt(deg_row + tau_row))
D_col = Matrix::Diagonal(ncol(A), 1 / sqrt(deg_col + tau_col))
L = D_row %*% A %*% D_col
rownames(L) = rownames(A)
colnames(L) = colnames(A)
return(L)
}
#' get_Matrix
#'
#' @param interaction_model
#'
#' @return
#' @export
#' @importFrom dplyr rowwise transmute c_across all_of ungroup pull
#'
#' @examples
get_Matrix = function(interaction_model, import_names = FALSE){
A = Matrix::sparseMatrix(
i = interaction_model$interaction_tibble$row_num,
j = interaction_model$interaction_tibble$col_num,
x = interaction_model$interaction_tibble$outcome,
dims = c(nrow(interaction_model$row_universe), nrow(interaction_model$column_universe)))
if(import_names){
these_row_names = interaction_model$row_universe |>
rowwise() |>
transmute(combined = paste(c_across(all_of(rev(interaction_model$settings$row_variables))), collapse = "/")) |>
ungroup() |>
pull(combined)
these_col_names = interaction_model$column_universe |>
rowwise() |>
transmute(combined = paste(c_across(all_of(rev(interaction_model$settings$column_variables))), collapse = "/")) |>
ungroup() |>
pull(combined)
rownames(A) = these_row_names
colnames(A) = these_col_names
}
return(A)
}
get_Incomplete_Matrix = function(interaction_model){
softImpute::Incomplete(i = interaction_model$interaction_tibble$row_num,
j = interaction_model$interaction_tibble$col_num,
x = interaction_model$interaction_tibble$outcome)
}
#' update_lhs_to_1 (internal to pca_count)
#'
#' @param formula
#'
#' @return
#'
#' @examples
update_lhs_to_1 <- function(formula, quiet = FALSE) {
if (length(formula) == 3) {
# If the formula has an LHS, replace it with "outcome"
if(formula[[2]]!=1 ){
formula[[2]] <- 1
if(quiet) warning("left hand side of formula has been updated to 1 because this is a call to pca_count which requires 1. The formula is now:\n\n", deparse(formula))
# print()
}
} else if (length(formula) == 2) {
# If the formula does not have an LHS, add "outcome"
formula <- stats::as.formula(paste("1", deparse(formula)))
}
return(formula)
}
#' pca_count
#'
#' this is the first user function to generate pcs.
#'
#' @param fo a formula, like ~ (row_ids & context) * measurement_type. The left hand side should be left empty. If it isn't, it will be converted to 1. Either way, the elements of the sparse matrix will just be counts.
#' @param tib a tibble that contains the variables in the formula. The only exception is that the left-hand-side can be 1 and this does not need to be in tib.
#' @param k number of dimensions for the pca
#'
#' @return the output is a list of: (1) row_features for whichever term is first on the right hand side of the formula, (2) column_features for whichever term is second on the right hand side of the formula, (3) middle_B which is a tibble corresponding to a diagonal matrix in sparse triplet form, (4) settings which is a list of details.
#' @export
#'
#' @examples
pca_count = function(fo, tib, k){
# first, ensure the outcome is 1:
fo = update_lhs_to_1(fo)
pcs = pca_sum(fo, tib, k)
pcs$settings$fit_method = "pca_count"
pcs
}
#' pca_sum
#'
#' This performs pca on a sparse Matrix specified by the formula and the tibble.
#'
#' @param fo a formula, like outcome ~ (row_ids & context) * measurement_type.
#' @param tib a tibble that contains the variables in the formula. The only exception is that the left-hand-side can be 1 and this does not need to be in tib.
#' @param k number of dimensions for the pca
#'
#' @return the output is a list of: (1) row_features for whichever term is first on the right hand side of the formula, (2) column_features for whichever term is second on the right hand side of the formula, (3) middle_B which is a tibble corresponding to a diagonal matrix in sparse triplet form, (4) settings which is a list of details.
#' @export
#'
#' @examples
pca_sum = function(fo, tib, k){
im = make_interaction_model(tib, fo)
pca(im, k)
}
#' pca
#'
#' @param im
#' @param k
#' @param method_prefix
#'
#' @return
#' @export
#'
#' @examples
pca = function(im,k, method_prefix = "pc", regularize = TRUE, sqrt_counts = TRUE){
A = get_Matrix(im)
# A = sp_A_dat$A
if(sqrt_counts) A@x = sqrt(A@x)
if(regularize) A = glaplacian(A)
# s_svd = irlba::irlba(A,nu = k, nv = k)
s_svd = RSpectra::svds(A,k = k)
# dimension_prefix = paste0(im$settings$data_prefix, method_prefix)
# print(length(dimension_prefix))
# print(str(im$settings$data_prefix))
dimension_prefix = method_prefix
pcs = s_2_pc(interaction_model = im, s = s_svd, dimension_prefix=dimension_prefix)
# parsed_model = parse_variables(fo, tib)
settings = list(fit_method = "pca_sum",
prefix_for_dimensions = stringr::str_glue(dimension_prefix, "_"),
prefix_for_data = im$settings$data_prefix,
prefix_for_method = method_prefix,
k = k,
normalized = TRUE,
reguarlized = TRUE,
outcome_variables = im$settings$outcome_variables,
row_variables = im$settings$row_variables,
column_variables = im$settings$column_variables)
pcs[[4]] = settings
names(pcs)[4] = "settings"
class(pcs) = "pc"
pcs
}
#' pca_text
#'
#' given formula ~ row_ids * text, where text is a character string, construct a matrix where row's are indexed by row_ids and columns are (default) bag-of-words. Perform PCA on this matrix.
#'
#' Extended arguments ... are passed to tidytext::unnest_tokens. for example ... as token = "ngrams", n=2 would construct matrix of bigrams.
#'
#' TODO: make diagnose_text
#'
#' @param fo
#' @param tib
#' @param k
#' @param ...
#'
#' @return
#' @export
#'
#' @examples
pca_text = function(fo, tib, k, ...){
im = make_interaction_model(tib, fo, parse_text = TRUE, ...)
pca(im, k)
# A = sp_A_dat$A
# A@x = sqrt(A@x)
# L = glaplacian(A)
# s_svd = irlba::irlba(L,nu = k, nv = k)
#
# pcs = s_2_pc(sparse_matrix_data = sp_A_dat, s = s_svd, dimension_prefix=dimension_prefix)
#
#
# settings = list(fit_method = "pca_sum",
# prefix_for_dimensions = stringr::str_glue(dimension_prefix, "_"),
# k = k,
# normalized = TRUE,
# reguarlized = TRUE,
# outcome_variables = parsed_model[[1]],
# row_variables = parsed_model[[2]],
# column_variables = parsed_model[[3]])
#
# pcs[[4]] = settings
# names(pcs)[4] = "settings"
# class(pcs) = "pc"
# pcs
}
#' remove_L_normalization (internal to pca_mean/pca_average)
#'
#' @param s_svd
#' @param A
#' @param orthogonalize
#'
#' @return
#'
#' @examples
remove_L_normalization = function(s_svd, A, orthogonalize= FALSE){
# if using L in s_svd(L) ,
# then we might want to remove that normalization in the output.
# this function does that.
# deg_row <- Matrix::rowSums(abs(A))
# deg_col <- Matrix::colSums(abs(A))
deg_row <- Matrix::rowSums(A!=0)
deg_col <- Matrix::colSums(A!=0)
tau_row <- mean(deg_row)
tau_col <- mean(deg_col)
D_row = Matrix::Diagonal(nrow(A), sqrt(deg_row + tau_row))
D_col = Matrix::Diagonal(ncol(A), sqrt(deg_col + tau_col))
s_svd$u = D_row %*% s_svd$u
s_svd$v = D_col %*% s_svd$v
s_svd$u = as.matrix(s_svd$u)
s_svd$v = as.matrix(s_svd$v)
if(!orthogonalize) return(s_svd)
# if we want to re-orthogonalize s$u and s$v....
# then you need to do some algebra.
# I think this is right, but has not been tested:
su= svd(s_svd$u)
sv= svd(s_svd$v)
b= diag(su$d) %*% t(su$v) %*% diag(s_svd$d) %*% sv$v %*% diag(sv$d)
sb = svd(b)
u = su$u%*% sb$u
d = sb$d
v = sv$u %*% sb$v
return(list(u = as.matrix(u), d = d, v = as.matrix(v)))
}
# pca_average does low-rank matrix completion
#' pca_na
#'
#' This performs pca on an interaction_model, where if a (row_id,col_id) are not present in the data, then it is presumed NA and imputed. This contrasts to the other `pca` function which imputes the missing values to zero.
#'
#' @param fo
#' @param tib
#' @param k
#'
#' @return
#' @export
#'
#' @examples
pca_na = function(im, k){
# the output is a pc object... a list of:
# row_features (for whichever term is first on the right hand side of the formula)
# column_features (for whichever term is second on the right hand side of the formula)
# middle_B (this is a diagonal matrix, but in a triplet form in a tibble)
# settings (this is a list of details)
# im = make_interaction_model(tib, fo, duplicates= "average")
# sp_A_dat = make_incomplete_matrix_raw(fo, tib)
cat("Taking", k, "core. Starting with:\n", nrow(im$row_universe), "rows\n",
nrow(im$column_universe), "columns\n",
nrow(im$interaction_tibble), "observed values")
im = core(im,core_threshold = k)
cat("After taking", k, "core. There remain:\n", nrow(im$row_universe), "rows\n",
nrow(im$column_universe), "columns\n",
nrow(im$interaction_tibble), "observed values")
A = get_Incomplete_Matrix(im)
# A = sp_A_dat$A
# L = glaplacian(A)
# s_svd = softImpute::softImpute(L,rank.max = k)
# s_svd = remove_L_normalization(s_svd,A)
s_svd = softImpute::softImpute(A, rank.max = k)
dimension_prefix = "na_pc"
pcs = s_2_pc(interaction_model = im, s = s_svd, dimension_prefix=dimension_prefix)
settings = list(fit_method = "pca_average",
prefix_for_dimensions = stringr::str_glue(dimension_prefix, "_"),
prefix_for_data = im$settings$data_prefix,
prefix_for_method = dimension_prefix,
k = k,
normalized = FALSE,
reguarlized = FALSE,
outcome_variables = im$settings$outcome_variables,
row_variables = im$settings$row_variables,
column_variables = im$settings$column_variables)
pcs[[4]] = settings
names(pcs)[4] = "settings"
class(pcs) = "pc"
pcs
}
pca_average= pca_na
pca_mean = pca_average
#' print.pc
#'
#' @param pcs
#'
#' @return
#' @export
#'
#' @examples
print.pc = function(pcs){
print(pcs$row_features)
print(pcs$column_features)
}