In [5]:
library(dplyr)
library(ggplot2)
library(readr)
library(broom)
library(cluster)   # silhouette()
# If broom.mixed not installed, broom is enough for tidy(prcomp)

# Fix the RNG seed so the stochastic steps that follow are reproducible.
set.seed(20250909)

# Make sure the Phase 4 output folder exists (a silent no-op if it already does).
dir.create("outputs/Phase4", showWarnings = FALSE, recursive = TRUE)

# 1) Read the Phase 4 inputs exported from Phase 3.
#    Expected columns: id, gender, academic_year, residence, Dep, Anx, Str, BRS.
phase4_df <- readr::read_csv(
  "outputs/Phase4/phase4_latent_inputs.csv",
  show_col_types = FALSE
)

# 2) Pick the latent features and z-score them. The id column stays in
#    phase4_df and is reused for merges/exports further down.
features <- c("Dep", "Anx", "Str", "BRS")
stopifnot(all(features %in% names(phase4_df)))
X <- phase4_df[, features] %>%
  as.matrix() %>%
  scale()  # numeric matrix, one z-scored column per feature
X_df <- as.data.frame(X)


In [7]:
# 3) PCA on the z-scored latent features.
#    X was standardised above, so prcomp() is told not to centre/scale again.
pca_fit <- prcomp(X, center = FALSE, scale. = FALSE)

# Eigenvalues (squared component sdevs) with proportion and cumulative share.
eig <- pca_fit$sdev^2
var_expl <- prop.table(eig)
cum_expl <- cumsum(var_expl)
pca_tbl <- data.frame(
  PC = sprintf("PC%d", seq_along(eig)),
  eigenvalue = eig,
  prop_var = var_expl,
  cum_var = cum_expl
)
readr::write_csv(pca_tbl, "outputs/Phase4/pca_variance.csv")

# 4) Scree plot (eigenvalue per component).
#    group = 1 joins the discrete PC labels with a single line.
p_scree <- ggplot(data = pca_tbl, mapping = aes(PC, eigenvalue, group = 1)) +
  geom_line(color = "#2563eb") +
  geom_point(color = "#2563eb") +
  labs(title = "PCA Scree", x = "", y = "Eigenvalue") +
  theme_minimal(base_size = 11)
ggsave(
  filename = "outputs/Phase4/fig_pca_scree.png",
  plot = p_scree,
  width = 6, height = 3.6, dpi = 200, bg = "white"
)

#    Cumulative variance plot with a dashed reference line at 85%.
p_cum <- ggplot(data = pca_tbl, mapping = aes(PC, cum_var, group = 1)) +
  geom_line(color = "#059669") +
  geom_point(color = "#059669") +
  geom_hline(yintercept = 0.85, linetype = 2, color = "grey60") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(title = "PCA cumulative variance", x = "", y = "Cumulative variance") +
  theme_minimal(base_size = 11)
ggsave(
  filename = "outputs/Phase4/fig_pca_cumvar.png",
  plot = p_cum,
  width = 6, height = 3.6, dpi = 200, bg = "white"
)

# 5) Loadings (rotation matrix) exported with the feature name as first column.
loadings <- as.data.frame(pca_fit$rotation)
loadings[["Feature"]] <- rownames(loadings)
readr::write_csv(
  select(loadings, Feature, all_of(colnames(pca_fit$rotation))),
  "outputs/Phase4/pca_loadings.csv"
)

# 6) Per-person PC scores (used for plotting and for clustering in PC space).
scores <- as.data.frame(pca_fit$x)
scores[["id"]] <- phase4_df$id
readr::write_csv(scores, "outputs/Phase4/pca_scores.csv")


In [9]:
# 7) Dimensionality used for clustering: the first two PCs by default.
#    Widen to 1:3 if the cumulative-variance curve is still rising
#    meaningfully at PC3.
use_pcs <- 1:2
Z <- scores %>%
  select(all_of(paste0("PC", use_pcs))) %>%
  as.matrix()

# 8) Helper: scan a range of k with multi-start k-means.
#' Scan candidate cluster counts with k-means
#'
#' For every k in `k_values`, fits Lloyd k-means on `Z` with `nstart` random
#' starts and records the total within-cluster sum of squares (for the elbow
#' plot) and the average silhouette width (computed on Euclidean distances in
#' the same space as the clustering).
#'
#' @param Z Numeric matrix, one row per observation.
#' @param k_values Vector of cluster counts to try.
#' @param nstart Number of random starts passed to `kmeans()`.
#' @return A data frame with one row per k and columns `k`, `wss`, `sil_avg`.
#'   An empty `k_values` yields a zero-row data frame with the same columns.
km_scan <- function(Z, k_values = 2:8, nstart = 50) {
  # The distance matrix does not depend on k: compute it once, not per k.
  d_mat <- dist(Z)

  scan_one <- function(k) {
    km <- kmeans(Z, centers = k, nstart = nstart, algorithm = "Lloyd")
    # cluster:: is referenced explicitly so the helper does not attach a
    # package as a side effect of being called.
    sil <- cluster::silhouette(km$cluster, d_mat)
    c(wss = km$tot.withinss, sil_avg = mean(sil[, "sil_width"]))
  }

  # vapply with a named template gives a 2 x length(k_values) matrix whose
  # row names are fixed even when k_values is empty.
  stats <- vapply(k_values, scan_one, c(wss = 0, sil_avg = 0))

  data.frame(
    k = k_values,
    wss = stats["wss", ],
    sil_avg = stats["sil_avg", ]
  )
}

# Run the scan over k = 2..8 and persist the table.
k_values <- 2:8
scan_tbl <- km_scan(Z, k_values = k_values, nstart = 50)
readr::write_csv(scan_tbl, "outputs/Phase4/k_scan.csv")

# 9) Diagnostic plots: elbow (WSS) and average silhouette, one tick per k.
p_wss <- ggplot(data = scan_tbl, mapping = aes(k, wss)) +
  geom_line(color = "#0ea5e9") +
  geom_point(color = "#0ea5e9") +
  scale_x_continuous(breaks = k_values) +
  labs(
    title = "Elbow: total within-cluster sum of squares",
    x = "k",
    y = "WSS"
  ) +
  theme_minimal(base_size = 11)
ggsave(
  filename = "outputs/Phase4/fig_wss_elbow.png",
  plot = p_wss,
  width = 6, height = 3.6, dpi = 200, bg = "white"
)

p_sil <- ggplot(data = scan_tbl, mapping = aes(k, sil_avg)) +
  geom_line(color = "#10b981") +
  geom_point(color = "#10b981") +
  scale_x_continuous(breaks = k_values) +
  labs(
    title = "Average silhouette by k",
    x = "k",
    y = "Average silhouette"
  ) +
  theme_minimal(base_size = 11)
ggsave(
  filename = "outputs/Phase4/fig_silhouette_avg.png",
  plot = p_sil,
  width = 6, height = 3.6, dpi = 200, bg = "white"
)




In [10]:
# 10) Shortlist candidate k: the two best average silhouettes plus the elbow
#     of the WSS curve.
top_sil <- scan_tbl %>%
  arrange(desc(sil_avg)) %>%
  head(2) %>%
  pull(k)

# Elbow = point of largest positive curvature of the (decreasing) WSS curve.
# The i-th second difference, wss[i] - 2 * wss[i + 1] + wss[i + 2], is centred
# on k[i + 1], hence the + 1L offset. The previous expression added Inf to the
# curvature, which made which.min() always return 1 and so always reported the
# smallest k as the "elbow".
curvature <- diff(scan_tbl$wss, differences = 2)
# With fewer than three scanned k there is no curvature; elbow_k is then empty
# and the shortlist falls back to the silhouette picks alone.
elbow_k <- scan_tbl$k[which.max(curvature) + 1L]
shortlist <- sort(unique(c(top_sil, elbow_k)))

# Save the shortlist and the chosen dimensionality.
meta <- list(
  pcs_used = paste(use_pcs, collapse = ","),
  shortlist_k = paste(shortlist, collapse = ",")
)
writeLines(
  c(
    paste("pcs_used:", meta$pcs_used),
    paste("shortlist_k:", meta$shortlist_k)
  ),
  "outputs/Phase4/shortlist_meta.txt"
)

# 11) Save cluster labels for every shortlisted k. Each k gets its own
#     deterministic seed so a single k can be re-run in isolation.
for (k in shortlist) {
  set.seed(20250909 + k)
  km <- kmeans(Z, centers = k, nstart = 50, algorithm = "Lloyd")
  # Named label_tbl (not `labs`) to avoid shadowing ggplot2::labs().
  label_tbl <- data.frame(
    id = phase4_df$id,
    cluster_k = k,
    cluster = paste0("K", k, "_C", km$cluster)
  )
  readr::write_csv(label_tbl, sprintf("outputs/Phase4/labels_k%d.csv", k))
}




In [15]:
#' Pick a usable number of clusters, with fallbacks
#'
#' Resolution order:
#'   1. `k_override`, when supplied;
#'   2. otherwise the `idx`-th entry of `shortlist`, when it exists;
#'   3. if the result is NA or below 2, the k with the highest `sil_avg`
#'      in `scan_tbl`;
#'   4. if `n_max` is given and the result is not strictly below it, the
#'      largest scanned k below `n_max`, or 2 when none qualifies.
#'
#' @param shortlist Vector of candidate k values.
#' @param scan_tbl Data frame with columns `k` and `sil_avg`.
#' @param idx Position in `shortlist` to take.
#' @param k_override Optional k that bypasses the shortlist.
#' @param n_max Optional strict upper bound on k (number of observations).
#' @return The selected k.
choose_k <- function(shortlist, scan_tbl, idx = 1, k_override = NULL, n_max = NULL){
  candidate <- NA_integer_
  if (!is.null(k_override)) {
    candidate <- as.integer(k_override)
  } else if (length(shortlist) >= idx) {
    candidate <- as.integer(shortlist[idx])
  }

  # Missing or degenerate candidate: fall back to the best-silhouette k.
  if (is.na(candidate) || candidate < 2) {
    candidate <- scan_tbl$k[which.max(scan_tbl$sil_avg)]
  }

  # No cap requested, or the candidate already respects it: done.
  if (is.null(n_max) || candidate < n_max) {
    return(candidate)
  }

  # Cap violated: largest scanned k under the cap, else the minimum of 2.
  below_cap <- scan_tbl$k[scan_tbl$k < n_max]
  if (length(below_cap)) max(below_cap) else 2L
}

# Example: pick a valid k even if idx = 23 is out of range. With an
# out-of-range idx, choose_k() falls back to the best-silhouette k.
idx <- 23
n_obs <- nrow(Z)
k <- choose_k(shortlist, scan_tbl, idx = idx, k_override = NULL, n_max = n_obs)

# Fit k-means for the chosen k (seed tied to k) and plot the PC1–PC2 plane
# coloured by cluster.
set.seed(20250909 + k)
km <- kmeans(Z, centers = k, nstart = 50, algorithm = "Lloyd")
plot_df <- data.frame(
  PC1 = Z[, 1],
  PC2 = Z[, 2],
  cluster = factor(km$cluster)
)

# theme_bw() is a complete theme: it replaced the theme_minimal() that used to
# precede it, so the dead theme_minimal() layer has been dropped.
p_scatter <- ggplot(plot_df, aes(PC1, PC2, color = cluster)) +
  geom_point(alpha = 0.8, size = 1.8) +
  labs(
    title = paste0("PC1–PC2 scatter by k-means clusters (k=", k, ")"),
    x = "PC1",
    y = "PC2"
  ) +
  theme_bw(base_size = 11)

# bg = "white" for consistency with the other exported figures.
ggsave(
  sprintf("outputs/Phase4/fig_pc_scatter_k%d.png", k),
  p_scatter,
  width = 6.4, height = 4.5, dpi = 200, bg = "white"
)




In [16]:
library(cluster)
# Average silhouette width of the k-means fit `km`, using Euclidean distances
# in the same PC space that was clustered.
sil <- silhouette(x = km$cluster, dist = dist(Z))
sil_avg <- mean(sil[, "sil_width"])
cat("Average silhouette for k =", k, ":", round(sil_avg, 3), "\n")


Average silhouette for k = 2 : 0.451 


In [17]:
library(dplyr); library(ggplot2); library(readr); library(cluster)

dir.create("outputs/Phase4", showWarnings = FALSE, recursive = TRUE)

# Final solution: k = 2 (the best-silhouette k), fixed explicitly here.
# The seed is tied to k and more random starts are used than in the scan.
k_final <- 2L
set.seed(20250909 + k_final)
km_final <- kmeans(
  Z,
  centers = k_final,
  nstart = 100,
  algorithm = "Lloyd"
)

# Per-person assignment table: id plus a "C<cluster>" label.
assignments <- data.frame(
  id = phase4_df$id,
  cluster = sprintf("C%d", km_final$cluster)
)
readr::write_csv(assignments, "outputs/Phase4/labels_k2_final.csv")




In [20]:
library(flexclust)  # for Rand index; if not available, use fossil or mclustcomp

# Bootstrap stability of the final k-means partition.
# Each iteration: resample rows with replacement, refit k-means on the
# resample, assign ALL original points to the nearest bootstrap centroid, and
# score agreement with the final partition using the Adjusted Rand Index.
set.seed(20250910)
B <- 200  # bootstrap iterations
n <- nrow(Z)
ari_vec <- numeric(B)

# Nearest-centroid prediction: for each row of Zall, the index of the closest
# row of `centers` (squared Euclidean distance; first index wins ties).
# Defined once outside the loop; it only computes the n x k point-to-centroid
# distances rather than a full (n + k)^2 distance matrix.
predict_kmeans <- function(Zall, centers) {
  sq_dist <- vapply(
    seq_len(nrow(centers)),
    function(j) rowSums(sweep(Zall, 2, centers[j, ], "-")^2),
    numeric(nrow(Zall))
  )
  # vapply drops to a plain vector when Zall has a single row.
  sq_dist <- matrix(sq_dist, nrow = nrow(Zall))
  max.col(-sq_dist, ties.method = "first")
}

for (b in seq_len(B)) {
  # Bootstrap sample indices
  boot_idx <- sample.int(n, replace = TRUE)
  Z_b <- Z[boot_idx, , drop = FALSE]

  # Fit k-means on the bootstrap sample
  km_b <- kmeans(Z_b, centers = k_final, nstart = 50)

  # Labels for all original points under the bootstrap centroids
  lab_all <- predict_kmeans(Z, km_b$centers)

  # ARI is invariant to how clusters are numbered, so no label alignment is
  # needed. The earlier per-column which.max "alignment" was not a one-to-one
  # mapping: it could send two bootstrap clusters to the same final label,
  # merging them and distorting the ARI.
  ari_vec[b] <- flexclust::randIndex(km_final$cluster, lab_all, correct = TRUE)
}

stability <- data.frame(iter = seq_len(B), ARI = ari_vec)
readr::write_csv(stability, "outputs/Phase4/stability_k2_bootstrap_ari.csv")

# Histogram of bootstrap ARI values with the mean marked by a dashed line.
p_ari <- ggplot(stability, aes(ARI)) +
  geom_histogram(fill = "#10b981", color = "white", bins = 20) +
  geom_vline(
    xintercept = mean(ari_vec, na.rm = TRUE),
    color = "#065f46",
    linetype = 2
  ) +
  labs(
    title = "Bootstrap stability (ARI), k = 2",
    x = "Adjusted Rand Index",
    y = "Count"
  ) +
  theme_minimal(base_size = 11)
ggsave(
  "outputs/Phase4/fig_stability_ari_k2.png",
  p_ari,
  width = 6.4, height = 3.8, dpi = 200, bg = "white"
)


In [21]:
# Sensitivity check: Ward (ward.D2) hierarchical clustering on the same PC
# space, cut at k_final clusters and compared with k-means via the ARI.
D <- dist(Z)
hc <- hclust(d = D, method = "ward.D2")
hc_labels <- cutree(tree = hc, k = k_final)

sens_ari <- flexclust::randIndex(km_final$cluster, hc_labels, correct = TRUE)
writeLines(
  text = sprintf("Ward vs k-means ARI (k=2): %.3f", sens_ari),
  con = "outputs/Phase4/sensitivity_ward_vs_kmeans.txt"
)


In [23]:
# Merge descriptors, latent z-scores (X_df) and final cluster labels.
# X_df and km_final$cluster are both in phase4_df row order, so a column bind
# is sufficient. Levels are given explicitly so label Cj always means cluster j.
profile_df <- cbind(
  phase4_df[, c("id", "gender", "academic_year", "residence")],
  X_df,
  cluster = factor(
    km_final$cluster,
    levels = seq_len(k_final),
    labels = paste0("C", seq_len(k_final))
  )
)

# Mean with a normal-approximation 95% CI (mean +/- 1.96 * SE), ignoring NAs.
# Returns a named vector c(mean, lo, hi).
summarize_ci <- function(x) {
  m <- mean(x, na.rm = TRUE)
  se <- sd(x, na.rm = TRUE) / sqrt(sum(!is.na(x)))
  c(mean = m, lo = m - 1.96 * se, hi = m + 1.96 * se)
}

# Means (z) with 95% CI per cluster and feature.
prof <- profile_df %>%
  tidyr::pivot_longer(
    cols = all_of(features),
    names_to = "Feature",
    values_to = "z"
  ) %>%
  group_by(cluster, Feature) %>%
  summarize(stat = list(summarize_ci(z)), .groups = "drop") %>%
  tidyr::unnest_wider(stat)
readr::write_csv(prof, "outputs/Phase4/cluster_profiles_means.csv")

# Bar plot: mean z by feature per cluster, with CI error bars.
# theme_bw() is a complete theme and replaced the theme_minimal() that used to
# precede it, so the dead theme_minimal() layer has been dropped.
p_profiles <- ggplot(prof, aes(x = Feature, y = mean, fill = cluster)) +
  geom_col(position = position_dodge(width = 0.7), width = 0.6) +
  geom_errorbar(
    aes(ymin = lo, ymax = hi),
    position = position_dodge(width = 0.7),
    width = .15
  ) +
  geom_hline(yintercept = 0, color = "grey60") +
  labs(title = "Cluster profiles (z-scores of latents)", y = "Mean (z)", x = "") +
  theme_bw(base_size = 11)
ggsave(
  "outputs/Phase4/fig_cluster_profiles_k2.png",
  p_profiles,
  width = 7.2, height = 4.2, dpi = 200, bg = "white"
)

# Descriptor distributions (post-hoc, not used for clustering).
# The three descriptors are stacked into one Level column; they are cast to
# character first because pivot_longer() refuses to combine columns of
# different types (e.g. a numeric academic_year with a character gender).
# pct is the share within each cluster x Descriptor (grouping left by
# .groups = "drop_last").
desc <- profile_df %>%
  tidyr::pivot_longer(
    cols = c("gender", "academic_year", "residence"),
    names_to = "Descriptor",
    values_to = "Level",
    values_transform = list(Level = as.character)
  ) %>%
  group_by(cluster, Descriptor, Level) %>%
  summarize(n = n(), .groups = "drop_last") %>%
  mutate(pct = n / sum(n)) %>%
  ungroup()
readr::write_csv(desc, "outputs/Phase4/cluster_descriptors.csv")


In [24]:
# Final per-person export: id, first two PC scores, final cluster label and
# the three descriptors, all matched on id.
final_out <- left_join(
  left_join(scores[, c("id", "PC1", "PC2")], assignments, by = "id"),
  phase4_df[, c("id", "gender", "academic_year", "residence")],
  by = "id"
)
readr::write_csv(final_out, "outputs/Phase4/person_level_PCs_clusters.csv")


In [25]:
library(dplyr); library(readr); library(ggplot2); library(tidyr)

dir.create("outputs/Phase4/final", recursive = TRUE, showWarnings = FALSE)

# Inputs carried from earlier steps:
# phase4_df: id, gender, academic_year, residence, Dep, Anx, Str, BRS
# X_df: z-scored Dep/Anx/Str/BRS (same order as phase4_df)
# km_final: k-means model with k=2 on Z (PC1–PC2)
# assignments: id + C1/C2 labels
# prof (cluster mean z + CI), desc (descriptor percentages)
# meta (pcs_used, shortlist_k), sens_ari (Ward vs k-means ARI)
# p_profiles, p_scatter (PC scatter), p_ari (bootstrap stability), p_wss, p_sil, p_cum, p_scree

# 1) Cluster sizes
sizes <- assignments %>%
  count(cluster) %>%
  mutate(pct = n / sum(n))
readr::write_csv(sizes, "outputs/Phase4/final/cluster_sizes.csv")

# 2) Profile means with CIs (already computed in 'prof')
readr::write_csv(prof, "outputs/Phase4/final/cluster_profile_means_CI.csv")

# 3) Descriptors table (already computed in 'desc')
readr::write_csv(desc, "outputs/Phase4/final/cluster_descriptors.csv")

# 4) Save core figures to final folder
# NOTE(review): p_scatter was built from the exploratory fit `km` (k chosen by
# choose_k), not from km_final; confirm that k equals k_final before relying
# on the "_k2" file name below.
ggsave("outputs/Phase4/final/fig_cluster_profiles_k2.png", p_profiles, width = 7.2, height = 4.2, dpi = 200, bg = "white")
ggsave("outputs/Phase4/final/fig_pc_scatter_k2.png", p_scatter, width = 6.4, height = 4.5, dpi = 200, bg = "white")
ggsave("outputs/Phase4/final/fig_stability_ari_k2.png", p_ari, width = 6.4, height = 3.8, dpi = 200, bg = "white")
ggsave("outputs/Phase4/final/fig_wss_elbow.png", p_wss, width = 6, height = 3.6, dpi = 200, bg = "white")
ggsave("outputs/Phase4/final/fig_silhouette_avg.png", p_sil, width = 6, height = 3.6, dpi = 200, bg = "white")
ggsave("outputs/Phase4/final/fig_pca_scree.png", p_scree, width = 6, height = 3.6, dpi = 200, bg = "white")
ggsave("outputs/Phase4/final/fig_pca_cumvar.png", p_cum, width = 6, height = 3.6, dpi = 200, bg = "white")

# 5) One-page textual summary
# Identify which cluster is higher distress / lower resilience
prof_wide <- prof %>%
  select(cluster, Feature, mean) %>%
  tidyr::pivot_wider(names_from = Feature, values_from = mean)
# Heuristic labels: higher Dep+Anx+Str and lower BRS => "High-distress/Low-resilience";
# every other cluster gets the "Low-distress / High-resilience" label.
prof_wide$profile_label <- with(
  prof_wide,
  ifelse(
    (Dep + Anx + Str) > 0 & BRS < 0,
    "High-distress / Low-resilience",
    "Low-distress / High-resilience"
  )
)

lab_map <- prof_wide %>% select(cluster, profile_label)
sizes_labeled <- sizes %>% left_join(lab_map, by = "cluster")

# Average silhouette of the FINAL model, computed here so the summary reports
# a real number (the line previously formatted NA_real_ and printed "NA").
sil_final <- mean(
  cluster::silhouette(km_final$cluster, dist(Z))[, "sil_width"]
)

summ_lines <- c(
  # PCs used and shortlist come from `meta` rather than hard-coded strings, so
  # the summary cannot drift from what was actually run.
  sprintf("Phase 4: PCA PCs used = %s; shortlist k = %s", meta$pcs_used, meta$shortlist_k),
  sprintf("Chosen solution: k-means k = %d; Average silhouette ≈ %.3f",
          k_final, sil_final),
  sprintf("Ward vs k-means ARI (k=%d): %.3f", k_final, sens_ari),
  "",
  "Cluster sizes:",
  paste0("  ", sizes_labeled$cluster, " (", sizes_labeled$profile_label,
         "): n=", sizes_labeled$n, " (", scales::percent(sizes_labeled$pct, accuracy = 0.1), ")"),
  "",
  "Profile means (z): see cluster_profile_means_CI.csv",
  "Descriptor percentages: see cluster_descriptors.csv",
  "Figures: profiles, PC scatter, ARI stability, elbow, silhouette, PCA scree/cumvar saved in outputs/Phase4/final/"
)
writeLines(summ_lines, "outputs/Phase4/final/summary.txt")



Attaching package: ‘tidyr’

The following objects are masked from ‘package:Matrix’:

    expand, pack, unpack



In [27]:
# UMAP visualization (descriptive only)
# Requires the uwot package; if it isn't installed, run install.packages("uwot") first.
library(uwot)
library(ggplot2)
set.seed(20250911)

# The clustering itself was done on Z (PC1–PC2). For a richer geometry the
# embedding is computed on the full 4D z-scored latent matrix (X_df) and the
# points are then coloured by the final k=2 labels. Descriptive only.
U <- uwot::umap(
  as.matrix(X_df),
  n_neighbors = 15,
  min_dist = 0.15,
  metric = "euclidean",
  n_components = 2
)
umap_df <- data.frame(
  UMAP1 = U[, 1],
  UMAP2 = U[, 2],
  cluster = factor(km_final$cluster)
)

p_umap <- ggplot(data = umap_df, mapping = aes(UMAP1, UMAP2, color = cluster)) +
  geom_point(alpha = 0.8, size = 1.8) +
  labs(
    title = "UMAP (descriptive): colored by k-means k=2",
    x = "UMAP1",
    y = "UMAP2"
  ) +
  theme_minimal(base_size = 11)

dir.create("outputs/Phase4/final", showWarnings = FALSE, recursive = TRUE)
ggsave(
  filename = "outputs/Phase4/final/fig_umap_descriptive_k2.png",
  plot = p_umap,
  width = 6.4, height = 4.5, dpi = 200, bg = "white"
)


In [28]:
library(ggplot2)
library(dplyr)
library(scales)

# Reuse the Ward tree when it already exists in the session; otherwise build
# it on the same PC space that k-means used.
if (!exists("hc")) {
  D <- dist(Z)
  hc <- hclust(D, method = "ward.D2")
}
hc_labels <- cutree(hc, k = 2)
km_labels <- km_final$cluster

# Cross-tabulate the two partitions in long form (one row per cell, count in
# Freq), then add the within-k-means-cluster (row) proportion for readability.
tab <- as.data.frame(
  table(
    KMeans = paste0("C", km_labels),
    Ward = paste0("C", hc_labels)
  )
)
tab <- tab %>%
  group_by(KMeans) %>%
  mutate(prop = Freq / sum(Freq)) %>%
  ungroup()

# Heatmap: tiles shaded by row proportion, annotated with count and percent.
p_conf <- ggplot(data = tab, mapping = aes(Ward, KMeans, fill = prop)) +
  geom_tile(color = "white") +
  geom_text(
    aes(label = paste0(Freq, "\n", percent(prop, accuracy = 0.1))),
    size = 3.5
  ) +
  scale_fill_gradient(low = "#e0f2f1", high = "#00695c") +
  labs(
    title = "Ward vs k-means (k=2): confusion heatmap",
    x = "Ward cluster",
    y = "k-means cluster",
    fill = "Row prop"
  ) +
  theme_minimal(base_size = 11)

ggsave(
  filename = "outputs/Phase4/final/fig_confusion_ward_vs_kmeans_k2.png",
  plot = p_conf,
  width = 5.8, height = 4.5, dpi = 200, bg = "white"
)



Attaching package: ‘scales’

The following object is masked from ‘package:readr’:

    col_factor

