/
rainette2.R
446 lines (395 loc) · 14.6 KB
/
rainette2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
## Compute signed Khi2 statistic from different sizes
compute_chi2 <- function(n_both, n1, n2, n_tot) {
tab <- matrix(
c(n_both, n1 - n_both, n2 - n_both, n_tot - n1 - n2 + n_both),
nrow = 2
)
suppressWarnings(cs <- stats::chisq.test(tab))
res <- cs$statistic
if (cs$expected[1, 1] > n_both) {
res <- -res
}
return(res)
}
## Compute data frame of group membership at each level for a clustering
get_groups <- function(res) {
groups <- purrr::imap(
res$uce_groups,
function(group, level) {
paste(level, group, sep = ".")
}
)
names(groups) <- seq_along(groups)
dplyr::bind_cols(groups)
}
## Compute size and chi2 for all combinations of two groups from two
## clustering results
groups_crosstab <- function(groups1, groups2, min_members, min_chi2) {
# Frequencies of each group in first clustering
g1_count <- groups1 %>%
tidyr::pivot_longer(
everything(),
names_to = "level1", values_to = "g1"
) %>%
dplyr::count(.data$level1, .data$g1, name = "n1")
# Frequencies of each group in second clustering
g2_count <- groups2 %>%
tidyr::pivot_longer(
everything(),
names_to = "level2", values_to = "g2"
) %>%
dplyr::count(.data$level2, .data$g2, name = "n2")
# Compute frequencies of all groups combinations
res <- purrr::map_dfr(names(groups1), function(level1) {
purrr::map_dfr(names(groups2), function(level2) {
df <- dplyr::tibble(
g1 = groups1[[level1]],
g2 = groups2[[level2]]
)
df %>%
dplyr::count(.data$g1, .data$g2) %>%
tidyr::complete(.data$g1, .data$g2, fill = list(n = 0)) %>%
dplyr::rename(n_both = n) %>%
dplyr::mutate(level1 = .env$level1, level2 = .env$level2)
})
})
# Add group count, chi-squared statistic and members
res %>%
# Filter on members
dplyr::filter(.data$n_both > .env$min_members) %>%
dplyr::left_join(g1_count, by = c("level1", "g1")) %>%
dplyr::left_join(g2_count, by = c("level2", "g2")) %>%
# Compute chi-squared
dplyr::rowwise() %>%
dplyr::mutate(
chi2 = compute_chi2(.data$n_both, .data$n1, .data$n2, nrow(groups1)) %>% unname()
) %>%
dplyr::ungroup() %>%
# Filter chi-squared
dplyr::filter(.data$chi2 > .env$min_chi2) %>%
dplyr::mutate(interclass = paste(.data$g1, .data$g2, sep = "x"))
}
# Add members list to each crossing group
crosstab_add_members <- function(tab, groups1, groups2) {
tab %>%
rowwise() %>%
dplyr::mutate(
members = list(
which(groups1[[.data$level1]] == .data$g1 & groups2[[.data$level2]] == .data$g2)
)
) %>%
dplyr::ungroup() %>%
# Filter groups with same members
dplyr::distinct(.data$members, .keep_all = TRUE) %>%
dplyr::mutate(id = 1:n())
}
# Only keep most associated pairs of crossing group
crosstab_keep_max <- function(tab) {
# We only keep crossing group where the second member is the group with
# the highest chi2 association with the first, and the first member is the
# group with the highest chi2 association with the second
res <- tab %>%
dplyr::group_by(.data$g1) %>%
dplyr::mutate(max_g2 = .data$g2[which.max(.data$chi2)]) %>%
dplyr::group_by(.data$g2) %>%
dplyr::mutate(max_g1 = .data$g1[which.max(.data$chi2)]) %>%
dplyr::filter(.data$max_g1 == .data$g1 & .data$max_g2 == .data$g2) %>%
dplyr::ungroup()
res
}
## Compute size of each pairs of intersection classes. Lower triangle
## and diagonal at 1 not to be selected as a partition afterward.
cross_sizes <- function(crosstab) {
sizes <- matrix(1, nrow = nrow(crosstab), ncol = nrow(crosstab))
for (i in 1:(nrow(crosstab) - 1)) {
for (j in (i + 1):nrow(crosstab)) {
sizes[i, j] <- length(
intersect(crosstab$members[[i]], crosstab$members[[j]])
)
}
}
sizes
}
## Compute next level partitions with for loop and progress bar
next_partitions_for <- function(partitions, sizes) {
res <- list()
progressr::with_progress({
p <- progressr::progressor(steps = length(partitions) / 100)
## for each previous partition
for (i in seq_along(partitions)) {
partition <- partitions[[i]]
size_inter <- colSums(sizes[partition, ])
classes_ok <- which(size_inter == 0)
res_current <- vector(mode = "list", length = length(classes_ok))
for (j in seq_along(classes_ok)) {
res_current[[j]] <- c(partition, classes_ok[j])
}
res[[i]] <- res_current
if (i %% 100 == 0) p()
}
})
res <- unlist(res, recursive = FALSE)
if (length(res) == 0) {
return(NULL)
}
res
}
## Compute next level partitions with mclapply (doesn't work on Windows)
next_partitions_parallel <- function(partitions, sizes) {
## for each previous partition
res <- parallel::mclapply(partitions, function(partition) {
size_inter <- colSums(sizes[partition, ])
classes_ok <- which(size_inter == 0)
res_current <- vector(mode = "list", length = length(classes_ok))
for (j in seq_along(classes_ok)) {
res_current[[j]] <- c(partition, classes_ok[j])
}
res_current
})
res <- unlist(res, recursive = FALSE)
if (length(res) == 0) {
return(NULL)
}
res
}
## From computed partitions, select the optimal ones and add group
## membership
get_optimal_partitions <- function(partitions, cross_groups, n_tot, full) {
## Compute group memberships from a vector of groups
compute_members <- function(cluster) {
cluster <- unlist(cluster)
groups <- rep(NA, n_tot)
for (i in seq_along(cluster)) {
members <- unlist(cross_groups$members[cluster[i]])
groups[members] <- i
}
groups
}
progressr::with_progress({
p <- progressr::progressor(steps = length(partitions) + 1)
## Compute data frame of results
res <- purrr::imap_dfr(partitions, function(partition, k) {
if (is.null(partition)) {
return(NULL)
}
out <- dplyr::tibble(clusters = partition, k = k + 1) %>%
dplyr::rowwise() %>%
## Compute size and sum of Khi2 for each partition
dplyr::mutate(
chi2 = sum(cross_groups$chi2[.data$clusters]),
n = sum(cross_groups$n_both[.data$clusters])
)
## If full computation, keep both max chi2 and max size
if (full) {
out <- out %>%
## Filter partitions with max size or max chi2 for each k
dplyr::group_by(.data$k) %>%
dplyr::filter(.data$n == max(.data$n) | .data$chi2 == max(.data$chi2)) %>%
## If several partitions with same n, keep max chi2
dplyr::group_by(.data$k, .data$n) %>%
dplyr::slice_max(.data$chi2) %>%
## If several partitions with same chi2, keep max n
dplyr::group_by(.data$k, .data$chi2) %>%
dplyr::slice_max(.data$n) %>%
dplyr::ungroup()
}
## If not full computation, only keep max chi2
else {
out <- out %>%
## Filter partitions with max chi2 for each k
dplyr::group_by(.data$k) %>%
dplyr::slice_max(.data$chi2) %>%
## If several partitions with same chi2, keep max n
dplyr::group_by(.data$k, .data$chi2) %>%
dplyr::slice_max(.data$n) %>%
dplyr::ungroup()
}
p()
out
})
res <- res %>%
dplyr::rowwise() %>%
# Add group membership for each clustering
dplyr::mutate(groups = list(compute_members(.data$clusters))) %>%
# Replace cross groups ids by their name
dplyr::mutate(clusters = list(cross_groups$interclass[.data$clusters])) %>%
dplyr::ungroup()
})
res
}
#' Corpus clustering based on the Reinert method - Double clustering
#'
#' @param x either a quanteda dfm object or the result of [rainette()]
#' @param y if `x` is a [rainette()] result, this must be another [rainette()]
#' result from same dfm but with different uc size.
#' @param max_k maximum number of clusters to compute
#' @param min_segment_size1 if `x` is a dfm, minimum uc size for first clustering
#' @param min_segment_size2 if `x` is a dfm, minimum uc size for second clustering
#' @param full if TRUE, all crossed groups are kept to compute optimal partitions, otherwise
#' only the most mutually associated groups are kept.
#' @param doc_id character name of a dtm docvar which identifies source documents.
#' @param min_members minimum members of each cluster
#' @param min_chi2 minimum chi2 for each cluster
#' @param parallel if TRUE, use `parallel::mclapply` to compute partitions
#' (won't work on Windows, uses more RAM)
#' @param uc_size1 deprecated, use min_segment_size1 instead
#' @param uc_size2 deprecated, use min_segment_size2 instead
#' @param ... if `x` is a dfm object, parameters passed to [rainette()] for both
#' simple clusterings
#'
#' @details
#' You can pass a quanteda dfm as `x` object, the function then performs two simple
#' clustering with varying minimum uc size, and then proceed to find optimal partitions
#' based on the results of both clusterings.
#'
#' If both clusterings have already been computed, you can pass them as `x` and `y` arguments
#' and the function will only look for optimal partitions.
#'
#' `doc_id` must be provided unless the corpus comes from `split_segments`,
#' in this case `segment_source` is used by default.
#'
#' If `full = FALSE`, computation may be much faster, but the chi2 criterion will be the only
#' one available for best partition detection, and the result may not be optimal.
#'
#' For more details on optimal partitions search algorithm, please see package vignettes.
#'
#' @return
#' A tibble with optimal partitions found for each available value of `k` as rows, and the following
#' columns :
#'
#' - `clusters` list of the crossed original clusters used in the partition
#' - `k` the number of clusters
#' - `chi2` sum of the chi2 value of each cluster
#' - `n` sum of the size of each cluster
#' - `groups` group membership of each document for this partition (`NA` if not assigned)
#'
#' @seealso [rainette()], [cutree_rainette2()], [rainette2_plot()], [rainette2_explor()]
#'
#' @references
#'
#' - Reinert M, Une méthode de classification descendante hiérarchique : application à l'analyse lexicale par contexte, Cahiers de l'analyse des données, Volume 8, Numéro 2, 1983. <http://www.numdam.org/item/?id=CAD_1983__8_2_187_0>
#' - Reinert M., Alceste une méthodologie d'analyse des données textuelles et une application: Aurelia De Gerard De Nerval, Bulletin de Méthodologie Sociologique, Volume 26, Numéro 1, 1990. \doi{10.1177/075910639002600103}
#'
#' @export
#'
#' @examples
#' \donttest{
#' require(quanteda)
#' corpus <- data_corpus_inaugural
#' corpus <- head(corpus, n = 10)
#' corpus <- split_segments(corpus)
#' tok <- tokens(corpus, remove_punct = TRUE)
#' tok <- tokens_remove(tok, stopwords("en"))
#' dtm <- dfm(tok, tolower = TRUE)
#' dtm <- dfm_trim(dtm, min_docfreq = 3)
#'
#' res1 <- rainette(dtm, k = 5, min_segment_size = 10)
#' res2 <- rainette(dtm, k = 5, min_segment_size = 15)
#'
#' res <- rainette2(res1, res2, max_k = 4)
#' }
#'
rainette2 <- function(x, y = NULL, max_k = 5,
min_segment_size1 = 10, min_segment_size2 = 15,
doc_id = NULL, min_members = 10, min_chi2 = 3.84,
parallel = FALSE, full = TRUE,
uc_size1, uc_size2, ...) {
## Check for deprecated uc_size1 argument
if (!missing(uc_size1)) {
warning("! uc_size1 is deprecated. Use min_segment_size1 instead.")
if (missing(min_segment_size1)) {
min_segment_size1 <- uc_size1
}
}
## Check for deprecated uc_size2 argument
if (!missing(uc_size2)) {
warning("! uc_size2 is deprecated. Use min_segment_size2 instead.")
if (missing(min_segment_size2)) {
min_segment_size2 <- uc_size2
}
}
## If passed a dfm, compute both clustering
if (inherits(x, "dfm")) {
dtm <- x
message(" Computing first clustering with min_segment_size1 = ", min_segment_size1)
x <- rainette::rainette(
dtm,
k = max_k, min_segment_size = min_segment_size1, doc_id = doc_id,
min_split_members = min_members, ...
)
message(" Computing second clustering with min_segment_size2 = ", min_segment_size2)
y <- rainette::rainette(
dtm,
k = max_k, min_segment_size = min_segment_size2, doc_id = doc_id,
min_split_members = min_members, ...
)
}
## max_k should not be higher than the smallest k in both clustering
max_k_res <- min(max(x$group, na.rm = TRUE), max(y$group, na.rm = TRUE))
if (max_k_res < max_k) {
message("! Lowering max_k from ", max_k, " to ", max_k_res)
max_k <- max_k_res
}
message(" Searching for best partitions...")
message(" Computing size 2 partitions...")
progressr::with_progress({
p <- progressr::progressor(steps = 5)
## Compute data frame of groups at each k for both clusterings
groups1 <- get_groups(x)
groups2 <- get_groups(y)
p()
## Check if both clusterings have same ndoc
if (nrow(groups1) != nrow(groups2)) {
stop("! Number of documents in both clustering results must be the same")
}
## Total number of documents
n_tot <- nrow(groups1)
## Compute sizes and chi2 of every crossing between classes
## of both clusterings (intersection classes)
cross_groups <- groups_crosstab(groups1, groups2, min_members, min_chi2)
p()
## Add members list to each crossing group
cross_groups <- crosstab_add_members(cross_groups, groups1, groups2)
p()
if (nrow(cross_groups) < 2) {
stop("! Not enough valid classes to continue. You may try a lower min_members value.")
}
if (!full) {
cross_groups <- crosstab_keep_max(cross_groups)
}
## Matrix of number of common elements between crossing groups
sizes <- cross_sizes(cross_groups)
p()
## Compute size 2 partitions
partitions <- list()
## Size 2 partitions : pairs of cross groups with no common elements
partitions[[1]] <- which(sizes == 0, arr.ind = TRUE, useNames = FALSE) %>%
asplit(1)
p()
})
## Partitions computing method
next_partitions <- ifelse(
parallel,
next_partitions_parallel,
next_partitions_for
)
## Compute partitions of size > 2
if (max_k > 2) {
for (k in 1:(max_k - 2)) {
message(" Computing size ", k + 2, " partitions...")
next_part <- next_partitions(partitions[[k]], sizes)
if (is.null(next_part)) {
message("! No more partitions found, stopping at k=", k + 1)
break
}
partitions[[k + 1]] <- next_part
}
}
message(" Selecting best partitions...")
## Select optimal partitions and add group membership for each one
res <- get_optimal_partitions(partitions, cross_groups, n_tot, full)
message(" Done.")
class(res) <- c("rainette2", class(res))
attr(res, "full") <- full
res
}