In [2]:
library(dplyr); library(haven); library(forcats); library(lavaan); library(semTools)
library(mice); library(stringr)

sanitize_names <- function(x){
  x <- gsub("[^A-Za-z0-9_]", "_", x)
  x <- gsub("_+", "_", x); x <- gsub("^_+","",x); x <- gsub("_+$","",x)
  ifelse(grepl("^[A-Za-z]", x), x, paste0("X_", x)) |> make.unique(sep = "_")
}

# NA‑preserving K−1 dummies (no “Missing” category)
mk_dummies_na <- function(df, var, prefix = var) {
  # set a stable baseline = most frequent non-missing level
  base <- names(sort(table(df[[var]]), decreasing = TRUE))[6]
  v <- forcats::fct_relevel(df[[var]], base)
  mf <- model.frame(reformulate("v"), data = data.frame(v), na.action = na.pass)
  mm <- model.matrix(~ v, data = mf)
  mm <- mm[, colnames(mm) != "(Intercept)", drop = FALSE]
  colnames(mm) <- sub("^v", prefix, colnames(mm))
  as.data.frame(mm)
}

safe_write_csv <- function(x, path) {
  if (!inherits(x, "data.frame")) x <- as.data.frame(x, stringsAsFactors = FALSE)
  bad <- sapply(x, function(col) is.list(col) || is.matrix(col) || is.array(col))
  if (any(bad)) {
    x[bad] <- lapply(x[bad], function(col) {
      if (is.list(col)) {
        if (all(vapply(col, length, 1L) <= 1L)) unlist(col) else NULL
      } else if (is.matrix(col) && ncol(col) == 1L) as.vector(col) else NULL
    })
    x <- x[!vapply(x, is.null, logical(1))]
  }
  if (ncol(x) == 0) x <- data.frame(note = "no data", stringsAsFactors = FALSE)
  if (nrow(x) == 0) x[1,] <- NA
  colnames(x) <- make.names(colnames(x), unique = TRUE)
  utils::write.csv(x, path, row.names = FALSE)
}



Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

This is lavaan 0.6-19
lavaan is FREE software! Please report any bugs.
 
###############################################################################
This is semTools 0.5-7
All users of R (or SEM) are invited to submit functions or ideas for functions.
###############################################################################

Attaching package: ‘mice’

The following object is masked from ‘package:stats’:

    filter

The following objects are masked from ‘package:base’:

    cbind, rbind



In [3]:
# Load Phase 2 analysis file
dat0 <- readRDS("data/processed/analysis_phase1_ordered.rds") %>%
  mutate(
    gender           = haven::as_factor(gender, levels = "labels"),
    academic_year    = haven::as_factor(academic_year, levels = "labels"),
    dept_group       = haven::as_factor(dept_group, levels = "labels"),
    residence        = haven::as_factor(residence, levels = "labels"),
    living_situation = haven::as_factor(living_situation, levels = "labels")
  )

# Dummies (no Missing category)
dum_gender <- mk_dummies_na(dat0, "gender",           "G_")
dum_year   <- mk_dummies_na(dat0, "academic_year",    "AY_")
dum_dept   <- mk_dummies_na(dat0, "dept_group",       "DG_")
dum_res    <- mk_dummies_na(dat0, "residence",        "RES_")
dum_live   <- mk_dummies_na(dat0, "living_situation", "LIV_")

stopifnot(nrow(dum_gender)==nrow(dat0), nrow(dum_year)==nrow(dat0),
          nrow(dum_dept)==nrow(dat0),   nrow(dum_res)==nrow(dat0),
          nrow(dum_live)==nrow(dat0))

dat <- dplyr::bind_cols(dat0, dum_gender, dum_year, dum_dept, dum_res, dum_live)

# Sanitize all column names for lavaan
names(dat) <- sanitize_names(names(dat))

# Item sets after sanitization
dep_items  <- sanitize_names(c("dQ3D","dQ5D","dQ10D","dQ13D","dQ16D","dQ17D","dQ21D"))
anx_items  <- sanitize_names(c("dQ2A","dQ4A","dQ7A","dQ9A","dQ15A","dQ19A","dQ20A"))
str_items  <- sanitize_names(c("dQ1S","dQ6S","dQ8S","dQ11S","dQ12S","dQ14S","dQ18S"))
brs_items  <- sanitize_names(c("rQ1","rQ3","rQ5","rQ2_r","rQ4_r","rQ6_r"))
dass_items <- c(dep_items, anx_items, str_items)

# Ensure ordered types
dat[dass_items] <- lapply(dat[dass_items], function(x) factor(as.integer(as.character(x)), levels = 0:3, ordered = TRUE))
dat[brs_items]  <- lapply(dat[brs_items],  function(x) factor(as.integer(as.character(x)), levels = 1:5, ordered = TRUE))

# Exogenous dummy candidates
safe_exo <- grep("^(G_|AY_|DG_|RES_|LIV_)", names(dat), value = TRUE)


[1m[22m1 unknown level in `f`: NA 
[1m[22m1 unknown level in `f`: NA 
[1m[22m1 unknown level in `f`: NA 
[1m[22m1 unknown level in `f`: NA 


In [4]:
# Remove ultra-sparse dummies to reduce WLSMV singularity risk
dummy_ones <- sapply(dat[, safe_exo, drop = FALSE], function(v) sum(v == 1, na.rm = TRUE))
safe_exo_core <- names(dummy_ones)[dummy_ones >= 10]

rhs_core <- paste(c("BRS", safe_exo_core), collapse = " + ")

model_core <- paste0('
  Dep =~ ', paste(dep_items, collapse = " + "), '
  Anx =~ ', paste(anx_items, collapse = " + "), '
  Str =~ ', paste(str_items, collapse = " + "), '
  BRS =~ ', paste(brs_items, collapse = " + "), '

  Dep ~ ', rhs_core, '
  Anx ~ ', rhs_core, '
  Str ~ ', rhs_core, '

  Dep ~~ Anx
  Dep ~~ Str
  Anx ~~ Str
')

fit_core <- sem(model_core, data = dat,
                ordered = c(dass_items, brs_items),
                estimator = "WLSMV",
                parameterization = "theta",
                std.lv = TRUE,
                control = list(iter.max = 20000, rel.tol = 1e-6))

summary(fit_core, fit.measures = TRUE, standardized = TRUE, rsquare = TRUE)

# Save core outputs
dir.create("outputs/SEM", showWarnings = FALSE, recursive = TRUE)
safe_write_csv(as.data.frame(t(fitMeasures(fit_core, c("cfi","tli","rmsea","srmr","chisq","df")))),
               "outputs/SEM/core_sem_fit.csv")
std_core <- standardizedSolution(fit_core)
safe_write_csv(std_core %>% filter(op=="~", lhs %in% c("Dep","Anx","Str")),
               "outputs/SEM/core_sem_paths.csv")
safe_write_csv(as.data.frame(lavInspect(fit_core, "r2")), "outputs/SEM/core_sem_r2.csv")


lavaan->muthen1984():  
   trouble constructing W matrix; used generalized inverse for A11 submatrix 


In [5]:
# Prepare trimmed exogenous sets per grouping
exo_no_gender <- setdiff(safe_exo_core, grep("^G_", safe_exo_core, value = TRUE))
exo_no_year   <- setdiff(safe_exo_core, grep("^AY_", safe_exo_core, value = TRUE))

drop_group_constant <- function(data, group_var, exo_names){
  keep <- sapply(exo_names, function(v){
    byv <- split(data[[v]], data[[group_var]])
    all(sapply(byv, function(x) {
      x <- x[!is.na(x)]
      if (length(x) == 0L) FALSE else (length(unique(x)) > 1L)
    }))
  })
  exo_names[keep]
}

# Gender multi-group
dat_g <- dat %>% filter(!is.na(gender))
exo_gender_final <- drop_group_constant(dat_g, "gender", exo_no_gender)
rhs_gender <- paste(c("BRS", exo_gender_final), collapse = " + ")

model_mg_gender <- paste0('
  Dep =~ ', paste(dep_items, collapse = " + "), '
  Anx =~ ', paste(anx_items, collapse = " + "), '
  Str =~ ', paste(str_items, collapse = " + "), '
  BRS =~ ', paste(brs_items, collapse = " + "), '

  Dep ~ ', rhs_gender, '
  Anx ~ ', rhs_gender, '
  Str ~ ', rhs_gender, '

  Dep ~~ Anx
  Dep ~~ Str
  Anx ~~ Str
')

fit_mg_gender <- sem(model_mg_gender, data = dat_g,
                     group = "gender",
                     ordered = c(dass_items, brs_items),
                     estimator = "WLSMV",
                     parameterization = "theta",
                     std.lv = TRUE,
                     group.equal = c("thresholds","loadings"))
summary(fit_mg_gender, fit.measures = TRUE, standardized = TRUE, rsquare = TRUE)

# Extract paths and latent means by gender
std_g <- standardizedSolution(fit_mg_gender)
safe_write_csv(std_g %>% filter(op=="~", rhs=="BRS", lhs %in% c("Dep","Anx","Str")),
               "outputs/SEM/mg_gender_BRS_paths_by_group.csv")
pe_g <- parameterEstimates(fit_mg_gender, standardized = TRUE, ci = TRUE)
safe_write_csv(subset(pe_g, op=="~1" & lhs %in% c("Dep","Anx","Str")),
               "outputs/SEM/mg_gender_latent_means.csv")
safe_write_csv(as.data.frame(t(fitMeasures(fit_mg_gender, c("cfi","tli","rmsea","srmr","chisq","df")))),
               "outputs/SEM/mg_gender_fit.csv")

# Academic year multi-group
dat_y <- dat %>% filter(!is.na(academic_year))
exo_year_final <- drop_group_constant(dat_y, "academic_year", exo_no_year)
rhs_year <- paste(c("BRS", exo_year_final), collapse = " + ")

model_mg_year <- paste0('
  Dep =~ ', paste(dep_items, collapse = " + "), '
  Anx =~ ', paste(anx_items, collapse = " + "), '
  Str =~ ', paste(str_items, collapse = " + "), '
  BRS =~ ', paste(brs_items, collapse = " + "), '

  Dep ~ ', rhs_year, '
  Anx ~ ', rhs_year, '
  Str ~ ', rhs_year, '

  Dep ~~ Anx
  Dep ~~ Str
  Anx ~~ Str
')

fit_mg_year <- sem(model_mg_year, data = dat_y,
                   group = "academic_year",
                   ordered = c(dass_items, brs_items),
                   estimator = "WLSMV",
                   parameterization = "theta",
                   std.lv = TRUE,
                   group.equal = c("thresholds","loadings"))
summary(fit_mg_year, fit.measures = TRUE, standardized = TRUE, rsquare = TRUE)

std_y <- standardizedSolution(fit_mg_year)
safe_write_csv(std_y %>% filter(op=="~", rhs=="BRS", lhs %in% c("Dep","Anx","Str")),
               "outputs/SEM/mg_year_BRS_paths_by_group.csv")
pe_y <- parameterEstimates(fit_mg_year, standardized = TRUE, ci = TRUE)
safe_write_csv(subset(pe_y, op=="~1" & lhs %in% c("Dep","Anx","Str")),
               "outputs/SEM/mg_year_latent_means.csv")
safe_write_csv(as.data.frame(t(fitMeasures(fit_mg_year, c("cfi","tli","rmsea","srmr","chisq","df")))),
               "outputs/SEM/mg_year_fit.csv")


1: lavaan->lav_object_post_check():  
   covariance matrix of latent variables is not positive definite in group 2; use lavInspect(fit, "cov.lv") 
   to investigate. 
2: lavaan->lav_object_post_check():  
   covariance matrix of latent variables is not positive definite in group 3; use lavInspect(fit, "cov.lv") 
   to investigate. 


In [6]:
library(mice)

mi_vars  <- c("cgpa","family_income")
aux_preds<- c("gender","academic_year","dept_group","residence","living_situation")

mi_dat <- dat0[, c(mi_vars, aux_preds), drop = FALSE]

meth <- make.method(mi_dat)
meth["cgpa"]          <- "pmm"
meth["family_income"] <- "pmm"

pred <- make.predictorMatrix(mi_dat); pred[,] <- 0
pred["cgpa", aux_preds]          <- 1
pred["family_income", aux_preds] <- 1

set.seed(20250908)
imp <- mice(mi_dat, m = 20, maxit = 20, method = meth, predictorMatrix = pred, printFlag = FALSE)
saveRDS(imp, "outputs/SEM/mi_covariates_cgpa_income_m20.rds")


Number of logged events: 800 


In [7]:
library(forcats)

# Safe dummy builder that preserves N, avoids NA as level, and no “unknown level” warnings
mk_dummies_preserve <- function(df, var, prefix = var){
  v <- df[[var]]
  base <- names(sort(table(v), decreasing = TRUE))[7]
  # Non-NA rows
  keep <- !is.na(v)
  mm <- matrix(NA_real_, nrow = length(v), ncol = 0)
  if (any(keep)) {
    v_nonNA <- forcats::fct_relevel(v[keep], base)
    mm_non  <- model.matrix(~ v_nonNA)[, -1, drop = FALSE]   # K-1 dummies
    colnames(mm_non) <- sub("^v_nonNA", prefix, colnames(mm_non))
    # place back into full-N matrix
    mm <- matrix(NA_real_, nrow = length(v), ncol = ncol(mm_non))
    mm[keep, ] <- mm_non
    colnames(mm) <- colnames(mm_non)
  }
  as.data.frame(mm)
}

prep_sem_dataset <- function(comp_covs, full_data){
  stopifnot(nrow(comp_covs) == nrow(full_data))
  dat_ <- full_data
  dat_[, c("cgpa","family_income")] <- comp_covs[, c("cgpa","family_income")]

  # Ensure factors
  dat_ <- dat_ %>%
    mutate(
      gender           = haven::as_factor(gender, levels = "labels"),
      academic_year    = haven::as_factor(academic_year, levels = "labels"),
      dept_group       = haven::as_factor(dept_group, levels = "labels"),
      residence        = haven::as_factor(residence, levels = "labels"),
      living_situation = haven::as_factor(living_situation, levels = "labels")
    )

  # Dummies
  dum_gender <- mk_dummies_preserve(dat_, "gender",           "G_")
  dum_year   <- mk_dummies_preserve(dat_, "academic_year",    "AY_")
  dum_dept   <- mk_dummies_preserve(dat_, "dept_group",       "DG_")
  dum_res    <- mk_dummies_preserve(dat_, "residence",        "RES_")
  dum_live   <- mk_dummies_preserve(dat_, "living_situation", "LIV_")

  dat_ <- dplyr::bind_cols(dat_, dum_gender, dum_year, dum_dept, dum_res, dum_live)

  # Sanitize names
  names(dat_) <- sanitize_names(names(dat_))

  # Ensure ordered indicators
  d_items <- sanitize_names(c("dQ1S","dQ2A","dQ3D","dQ4A","dQ5D","dQ6S","dQ7A","dQ8S","dQ9A",
                              "dQ10D","dQ11S","dQ12S","dQ13D","dQ14S","dQ15A","dQ16D","dQ17D",
                              "dQ18S","dQ19A","dQ20A","dQ21D"))
  b_items <- sanitize_names(c("rQ1","rQ3","rQ5","rQ2_r","rQ4_r","rQ6_r"))
  dat_[d_items] <- lapply(dat_[d_items], function(x) factor(as.integer(as.character(x)), levels = 0:3, ordered = TRUE))
  dat_[b_items] <- lapply(dat_[b_items], function(x) factor(as.integer(as.character(x)), levels = 1:5, ordered = TRUE))

  # Drop ultra-sparse dummies for this dataset
  safe_exo <- grep("^(G_|AY_|DG_|RES_|LIV_)", names(dat_), value = TRUE)
  if (length(safe_exo)) {
    ones <- sapply(dat_[, safe_exo, drop = FALSE], function(v) sum(v == 1, na.rm = TRUE))
    drop_cols <- names(ones)[ones < 10]
    if (length(drop_cols)) dat_[drop_cols] <- NULL
  }

  dat_
}

analysis_data_full <- readRDS("data/processed/analysis_phase1_ordered.rds")
complete_list <- mice::complete(imp, "all")
dataList <- lapply(complete_list, prep_sem_dataset, full_data = analysis_data_full)




In [8]:
library(forcats)

# --- DROP-IN: build dummies + sanitize + ordered + drop ultra-sparse (per dataset) ---

mk_dummies_preserve <- function(df, var, prefix = var){
  v <- df[[var]]
  base <- names(sort(table(v), decreasing = TRUE))[1]
  keep <- !is.na(v)
  mm <- matrix(NA_real_, nrow = length(v), ncol = 0)
  if (any(keep)) {
    v_nonNA <- forcats::fct_relevel(v[keep], base)
    mm_non  <- model.matrix(~ v_nonNA)[, -1, drop = FALSE]
    colnames(mm_non) <- sub("^v_nonNA", prefix, colnames(mm_non))
    mm <- matrix(NA_real_, nrow = length(v), ncol = ncol(mm_non))
    mm[keep, ] <- mm_non
    colnames(mm) <- colnames(mm_non)
  }
  as.data.frame(mm)
}

prep_sem_dataset <- function(comp_covs, full_data){
  stopifnot(nrow(comp_covs) == nrow(full_data))
  dat_ <- full_data
  dat_[, c("cgpa","family_income")] <- comp_covs[, c("cgpa","family_income")]

  dat_ <- dat_ %>%
    mutate(
      gender           = haven::as_factor(gender, levels = "labels"),
      academic_year    = haven::as_factor(academic_year, levels = "labels"),
      dept_group       = haven::as_factor(dept_group, levels = "labels"),
      residence        = haven::as_factor(residence, levels = "labels"),
      living_situation = haven::as_factor(living_situation, levels = "labels")
    )

  dum_gender <- mk_dummies_preserve(dat_, "gender",           "G_")
  dum_year   <- mk_dummies_preserve(dat_, "academic_year",    "AY_")
  dum_dept   <- mk_dummies_preserve(dat_, "dept_group",       "DG_")
  dum_res    <- mk_dummies_preserve(dat_, "residence",        "RES_")
  dum_live   <- mk_dummies_preserve(dat_, "living_situation", "LIV_")

  dat_ <- dplyr::bind_cols(dat_, dum_gender, dum_year, dum_dept, dum_res, dum_live)
  names(dat_) <- sanitize_names(names(dat_))

  d_items <- sanitize_names(c("dQ1S","dQ2A","dQ3D","dQ4A","dQ5D","dQ6S","dQ7A","dQ8S","dQ9A",
                              "dQ10D","dQ11S","dQ12S","dQ13D","dQ14S","dQ15A","dQ16D","dQ17D",
                              "dQ18S","dQ19A","dQ20A","dQ21D"))
  b_items <- sanitize_names(c("rQ1","rQ3","rQ5","rQ2_r","rQ4_r","rQ6_r"))
  dat_[d_items] <- lapply(dat_[d_items], function(x) factor(as.integer(as.character(x)), levels = 0:3, ordered = TRUE))
  dat_[b_items] <- lapply(dat_[b_items], function(x) factor(as.integer(as.character(x)), levels = 1:5, ordered = TRUE))

  # Drop ultra-sparse dummies (<10 ones) to stabilize WLSMV
  safe_exo <- grep("^(G_|AY_|DG_|RES_|LIV_)", names(dat_), value = TRUE)
  if (length(safe_exo)) {
    ones <- sapply(dat_[, safe_exo, drop = FALSE], function(v) sum(v == 1, na.rm = TRUE))
    drop_cols <- names(ones)[ones < 10]
    if (length(drop_cols)) dat_[drop_cols] <- NULL
  }
  dat_
}


analysis_data_full <- readRDS("data/processed/analysis_phase1_ordered.rds")
complete_list <- mice::complete(imp, "all")
dataList <- lapply(complete_list, prep_sem_dataset, full_data = analysis_data_full)


In [9]:
library(lavaan.mi)

# --- DROP-IN: aggressive pruning to avoid collinearity/warnings ---

prune_exo_aggressive <- function(df){
  exo_all <- grep("^(G_|AY_|DG_|RES_|LIV_)", names(df), value = TRUE)

  # Keep only Gender + AY + RES (drop DG_ and LIV_ completely)
  exo <- unique(c(grep("^G_", exo_all, value = TRUE),
                  grep("^AY_", exo_all, value = TRUE),
                  grep("^RES_", exo_all, value = TRUE)))

  # Remove near-constant (variance < 1e-3)
  if (length(exo)) {
    vrs <- sapply(df[, exo, drop = FALSE], function(x) var(x, na.rm = TRUE))
    exo <- exo[vrs > 1e-3]
  }

  # Remove highly collinear dummies (|r| > .95)
  if (length(exo) > 2) {
    R <- suppressWarnings(cor(df[, exo, drop = FALSE], use = "pairwise.complete.obs"))
    drop <- character(0)
    for (i in seq_along(exo)) {
      if (exo[i] %in% drop) next
      hi <- setdiff(names(which(abs(R[exo[i], ]) > 0.95)), exo[i])
      drop <- union(drop, hi)
    }
    exo <- setdiff(exo, drop)
  }

  keep <- c(setdiff(names(df), grep("^(G_|AY_|DG_|RES_|LIV_)", names(df), value = TRUE)), exo)
  df[, keep, drop = FALSE]
}

dataList_pruned <- lapply(dataList, prune_exo_aggressive)

# Detect names from first pruned dataset
nm0 <- names(dataList_pruned[[4]])
dep_items <- intersect(nm0, grep("^dQ(3D|5D|10D|13D|16D|17D|21D)$", nm0, value = TRUE))
anx_items <- intersect(nm0, grep("^dQ(2A|4A|7A|9A|15A|19A|20A)$", nm0, value = TRUE))
str_items <- intersect(nm0, grep("^dQ(1S|6S|8S|11S|12S|14S|18S)$", nm0, value = TRUE))
brs_items <- intersect(nm0, c("rQ1","rQ3","rQ5","rQ2_r","rQ4_r","rQ6_r"))
dass_items <- c(dep_items, anx_items, str_items)
safe_exo   <- grep("^(G_|AY_|RES_)", nm0, value = TRUE)

rhs_ext <- paste(c("BRS", safe_exo, "cgpa", "family_income"), collapse = " + ")

model_ext <- paste0('
  Dep =~ ', paste(dep_items, collapse = " + "), '
  Anx =~ ', paste(anx_items, collapse = " + "), '
  Str =~ ', paste(str_items, collapse = " + "), '
  BRS =~ ', paste(brs_items, collapse = " + "), '
  Dep ~ ', rhs_ext, '
  Anx ~ ', rhs_ext, '
  Str ~ ', rhs_ext, '
  Dep ~~ Anx
  Dep ~~ Str
  Anx ~~ Str
')

library(lavaan.mi)

fit_ext_mi <- lavaan.mi::lavaan.mi(
  model = model_ext,
  data  = dataList_pruned,       # CORRECT argument for lavaan.mi
  estimator = "WLSMV",
  parameterization = "theta",
  std.lv = TRUE,
  ordered = c(dass_items, brs_items)
)

summary(fit_ext_mi, fit.measures = TRUE, standardized = TRUE)



 
###################################################################
This is lavaan.mi 0.1-0
See the README file on github.com/TDJorgensen/lavaan.mi
for a table comparing it with deprecated semTools features.
###################################################################

Attaching package: ‘lavaan.mi’

The following objects are masked from ‘package:semTools’:

    calculate.D2, cfa.mi, growth.mi, lavaan.mi, lavTestLRT.mi, lavTestScore.mi, lavTestWald.mi,
    modificationindices.mi, modificationIndices.mi, modindices.mi, sem.mi



1: lavaan->lav_model_vcov():  
   The variance-covariance matrix of the estimated parameters (vcov) does not appear to be positive 
   definite! The smallest eigenvalue (= 2.181245e-13) is close to zero. This may be a symptom that the model 
   is not identified. 
2: lavaan->lav_model_vcov():  
   The variance-covariance matrix of the estimated parameters (vcov) does not appear to be positive 
   definite! The smallest eigenvalue (= 2.181245e-13) is close to zero. This may be a symptom that the model 
   is not identified. 
3: lavaan->lav_model_vcov():  
   The variance-covariance matrix of the estimated parameters (vcov) does not appear to be positive 
   definite! The smallest eigenvalue (= 2.181245e-13) is close to zero. This may be a symptom that the model 
   is not identified. 
4: lavaan->lav_model_vcov():  
   The variance-covariance matrix of the estimated parameters (vcov) does not appear to be positive 
   definite! The smallest eigenvalue (= 2.181245e-13) is close to zero. T

In [10]:
library(lavaan.mi)                # guarantees the *.mi methods are visible

# --- DROP-IN: pooled extraction using lavaan.mi methods ---

pe_pool  <- parameterEstimates.mi(fit_ext_mi, standardized = FALSE, ci = TRUE)
std_pool <- standardizedSolution.mi(fit_ext_mi)   # has std.all

focus <- subset(std_pool,
                op  == "~" &
                lhs %in% c("Dep","Anx","Str") &
                rhs %in% c("BRS","cgpa","family_income"))

safe_write_csv(focus, "outputs/SEM/extended_sem_pooled_paths.csv")
safe_write_csv(as.data.frame(t(fitMeasures(fit_ext_mi, c("cfi","tli","rmsea","srmr","chisq","df")))),
               "outputs/SEM/extended_sem_fit.csv")


lavaan->lav_model_vcov():  
   The variance-covariance matrix of the estimated parameters (vcov) does not appear to be positive 
   definite! The smallest eigenvalue (= 2.181245e-13) is close to zero. This may be a symptom that the model 
   is not identified. 


"D3" and "D4" only available using maximum likelihood estimation. Changed to pool.method = "D2".


lavaan->lav_model_vcov():  
   The variance-covariance matrix of the estimated parameters (vcov) does not appear to be positive 
   definite! The smallest eigenvalue (= 2.181245e-13) is close to zero. This may be a symptom that the model 
   is not identified. 


In [11]:
fit_ext_mi_dwls <- lavaan.mi::lavaan.mi(
  model = model_ext,
  data  = dataList_pruned,
  estimator = "DWLS",
  parameterization = "theta",
  std.lv = TRUE,
  ordered = c(dass_items, brs_items)
)
# Compare key paths BRS/cgpa/family_income across WLSMV vs DWLS to confirm robustness




In [12]:
# Core (already fitted as fit_core)
core_fit   <- as.data.frame(t(fitMeasures(fit_core, c("cfi","tli","rmsea","srmr","chisq","df"))))
core_paths <- standardizedSolution(fit_core) |>
  dplyr::filter(op=="~", lhs %in% c("Dep","Anx","Str"), rhs %in% c("BRS"))
print(core_fit); print(core_paths)

# Extended pooled (fit_ext_mi)
ext_fit  <- as.data.frame(t(fitMeasures(fit_ext_mi, c("cfi","tli","rmsea","srmr","chisq","df"))))
ext_paths_focus <- std_pool |>
  dplyr::filter(op=="~", lhs %in% c("Dep","Anx","Str"),
                rhs %in% c("BRS","cgpa","family_income"))
print(ext_fit); print(ext_paths_focus)


  t(fitMeasures(fit_core, c("cfi", "tli", "rmsea", "srmr", "chisq", "df"))).cfi
1                                                                  9.234998e-01
2                                                                          <NA>
3                                                                          <NA>
4                                                                          <NA>
5                                                                          <NA>
6                                                                          <NA>
  t(fitMeasures(fit_core, c("cfi", "tli", "rmsea", "srmr", "chisq", "df"))).tli
1                                                                  9.573785e-01
2                                                                          <NA>
3                                                                          <NA>
4                                                                          <NA>
5                                       

In format.data.frame(if (omit) x[seq_len(n0), , drop = FALSE] else x,  :
  corrupt data frame: columns will be truncated or padded with NAs


  lhs op rhs est.std    se       z pvalue ci.lower ci.upper
1 Dep  ~ BRS  -0.490 0.039 -12.486      0   -0.567   -0.413
2 Anx  ~ BRS  -0.448 0.039 -11.595      0   -0.523   -0.372
3 Str  ~ BRS  -0.432 0.043  -9.947      0   -0.517   -0.347
"D3" and "D4" only available using maximum likelihood estimation. Changed to pool.method = "D2".


lavaan->lav_model_vcov():  
   The variance-covariance matrix of the estimated parameters (vcov) does not appear to be positive 
   definite! The smallest eigenvalue (= 2.181245e-13) is close to zero. This may be a symptom that the model 
   is not identified. 


  t(fitMeasures(fit_ext_mi, c("cfi", "tli", "rmsea", "srmr", "chisq", "df"))).cfi
1                                                                    9.588214e-01
2                                                                            <NA>
3                                                                            <NA>
4                                                                            <NA>
5                                                                            <NA>
6                                                                            <NA>
  t(fitMeasures(fit_ext_mi, c("cfi", "tli", "rmsea", "srmr", "chisq", "df"))).tli
1                                                                    9.757895e-01
2                                                                            <NA>
3                                                                            <NA>
4                                                                            <NA>
5               

In format.data.frame(if (omit) x[seq_len(n0), , drop = FALSE] else x,  :
  corrupt data frame: columns will be truncated or padded with NAs


                  lhs op           rhs est.std    se       z pvalue ci.lower ci.upper
Dep~BRS           Dep  ~           BRS  -0.514 0.040 -12.800  0.000   -0.593   -0.435
Dep~cgpa          Dep  ~          cgpa  -0.080 0.048  -1.647  0.100   -0.174    0.015
Dep~family_income Dep  ~ family_income  -0.011 0.045  -0.243  0.808   -0.099    0.077
Anx~BRS           Anx  ~           BRS  -0.443 0.040 -11.126  0.000   -0.521   -0.365
Anx~cgpa          Anx  ~          cgpa  -0.025 0.047  -0.523  0.601   -0.117    0.068
Anx~family_income Anx  ~ family_income   0.010 0.046   0.219  0.827   -0.081    0.101
Str~BRS           Str  ~           BRS  -0.446 0.044 -10.220  0.000   -0.532   -0.361
Str~cgpa          Str  ~          cgpa  -0.085 0.049  -1.759  0.079   -0.180    0.010
Str~family_income Str  ~ family_income  -0.021 0.045  -0.476  0.634   -0.109    0.066


In [14]:
dir.create("outputs/SEM/final", recursive = TRUE, showWarnings = FALSE)

# Core SEM
core_r2   <- as.data.frame(lavInspect(fit_core, "r2"))
core_std  <- standardizedSolution(fit_core)
safe_write_csv(core_fit, "outputs/SEM/final/core_fit.csv")
safe_write_csv(core_std %>% dplyr::filter(op=="~", lhs %in% c("Dep","Anx","Str")), "outputs/SEM/final/core_paths.csv")
safe_write_csv(core_r2,  "outputs/SEM/final/core_r2.csv")

# Extended (lavaan.mi pooled)

focus_ext <- std_pool %>% dplyr::filter(op=="~", lhs %in% c("Dep","Anx","Str"),
                                        rhs %in% c("BRS","cgpa","family_income"))

safe_write_csv(ext_fit,    "outputs/SEM/final/extended_fit.csv")
safe_write_csv(pe_pool,    "outputs/SEM/final/extended_param_table.csv")
safe_write_csv(focus_ext,  "outputs/SEM/final/extended_paths_focus.csv")

# Compact TXT summary to paste in thesis
sink("outputs/SEM/final/phase3_summary.txt")
cat("Phase 3 Core SEM fit (WLSMV, theta):\n"); print(core_fit)
cat("\nCore standardized BRS paths:\n"); print(core_std %>% dplyr::filter(op=="~", rhs=="BRS", lhs %in% c("Dep","Anx","Str")))

cat("\nExtended pooled SEM fit (WLSMV, theta; D2 pooling):\n"); print(ext_fit)
cat("\nExtended pooled standardized paths (BRS, cgpa, family_income):\n"); print(focus_ext)
sink()


In format.data.frame(if (omit) x[seq_len(n0), , drop = FALSE] else x,  :
  corrupt data frame: columns will be truncated or padded with NAs
In format.data.frame(if (omit) x[seq_len(n0), , drop = FALSE] else x,  :
  corrupt data frame: columns will be truncated or padded with NAs


In [15]:
# Gender
std_g <- standardizedSolution(fit_mg_gender)
paths_g <- std_g %>% dplyr::filter(op=="~", rhs=="BRS", lhs %in% c("Dep","Anx","Str")) %>%
  dplyr::select(group, lhs, est.std, se, pvalue)
means_g <- parameterEstimates(fit_mg_gender, standardized = TRUE, ci = TRUE) %>%
  dplyr::filter(op=="~1", lhs %in% c("Dep","Anx","Str")) %>%
  dplyr::select(group, lhs, est, se, ci.lower, ci.upper, pvalue)
safe_write_csv(paths_g, "outputs/SEM/final/mg_gender_BRS_paths.csv")
safe_write_csv(means_g, "outputs/SEM/final/mg_gender_latent_means.csv")

# Academic year
std_y <- standardizedSolution(fit_mg_year)
paths_y <- std_y %>% dplyr::filter(op=="~", rhs=="BRS", lhs %in% c("Dep","Anx","Str")) %>%
  dplyr::select(group, lhs, est.std, se, pvalue)
means_y <- parameterEstimates(fit_mg_year, standardized = TRUE, ci = TRUE) %>%
  dplyr::filter(op=="~1", lhs %in% c("Dep","Anx","Str")) %>%
  dplyr::select(group, lhs, est, se, ci.lower, ci.upper, pvalue)
safe_write_csv(paths_y, "outputs/SEM/final/mg_year_BRS_paths.csv")
safe_write_csv(means_y, "outputs/SEM/final/mg_year_latent_means.csv")


In [18]:
# Core effect-style summary
core_es <- core_std %>%
  dplyr::filter(op=="~", lhs %in% c("Dep","Anx","Str"), rhs=="BRS") %>%
  dplyr::transmute(Outcome = lhs, Predictor = rhs, Beta = round(est.std,3),
                   SE = round(se,3), z = round(z,2), p = signif(pvalue,3))
# Replace the old R2 code with this
r2_vec <- lavInspect(fit_core, "r2")              # named numeric vector
core_r2_tidy <- data.frame(
  Outcome = names(r2_vec),
  R2      = as.numeric(r2_vec),
  stringsAsFactors = FALSE
)
core_r2_tidy$R2 <- round(core_r2_tidy$R2, 3)

safe_write_csv(core_r2_tidy, "outputs/SEM/final/core_r2.csv")


# Extended pooled effect-style summary
ext_es <- std_pool %>%
  dplyr::filter(op=="~", lhs %in% c("Dep","Anx","Str"), rhs %in% c("BRS","cgpa","family_income")) %>%
  dplyr::transmute(Outcome = lhs, Predictor = rhs, Beta = round(est.std,3),
                   SE = round(se,3), z = round(z,2), p = signif(pvalue,3),
                   CI_low = round(ci.lower,3), CI_high = round(ci.upper,3))
safe_write_csv(ext_es, "outputs/SEM/final/extended_effectsize_table.csv")


In [20]:
# Gender: constrain BRS paths equal
model_mg_gender_eq <- gsub("Dep ~ BRS", "Dep ~ c(bd,bd)*BRS", model_mg_gender, fixed = TRUE)
model_mg_gender_eq <- gsub("Anx ~ BRS", "Anx ~ c(ba,ba)*BRS", model_mg_gender_eq, fixed = TRUE)
model_mg_gender_eq <- gsub("Str ~ BRS", "Str ~ c(bs,bs)*BRS", model_mg_gender_eq, fixed = TRUE)

fit_mg_gender_eq <- sem(model_mg_gender_eq, data = dat_g, group = "gender",
                        ordered = c(dass_items, brs_items), estimator = "WLSMV",
                        parameterization = "theta", std.lv = TRUE,
                        group.equal = c("thresholds","loadings"))

g_free <- fitMeasures(fit_mg_gender,    c("cfi","tli","rmsea"))
g_eq   <- fitMeasures(fit_mg_gender_eq, c("cfi","tli","rmsea"))
delta_g <- data.frame(Index = names(g_free), Free = unname(g_free),
                      Equal = unname(g_eq), Delta = unname(g_eq - g_free))
safe_write_csv(delta_g, "outputs/SEM/final/mg_gender_delta_indices.csv")

# Academic year: same idea (ensure correct c() length equals number of year levels)


In [22]:
writeLines(capture.output(sessionInfo()), "outputs/SEM/final/sessionInfo.txt")
writeLines(model_core,        "outputs/SEM/final/model_core_syntax.txt")
writeLines(model_mg_gender,   "outputs/SEM/final/model_mg_gender_syntax.txt")
writeLines(model_mg_year,     "outputs/SEM/final/model_mg_year_syntax.txt")
writeLines(model_ext,         "outputs/SEM/final/model_extended_syntax.txt")


In [23]:
# --- DROP-IN: robust mediation with auto-detected residence dummies ---

# 1) Detect residence dummy columns actually present (after sanitizing/pruning)
res_dums <- grep("^RES_", names(dat), value = TRUE)

# 2) Require at least two residence dummies; otherwise, skip mediation cleanly
if (length(res_dums) >= 2) {

  # Use the first two dummies as Urban and Rural proxies for illustration;
  # replace with specific ones if you know the exact labels you want.
  res1 <- res_dums[1]
  res2 <- res_dums[2]

  # 3) Build mediation model syntax using the detected names
  model_core_med <- paste0('
    Dep =~ ', paste(dep_items, collapse = " + "), '
    Anx =~ ', paste(anx_items, collapse = " + "), '
    Str =~ ', paste(str_items, collapse = " + "), '
    BRS =~ ', paste(brs_items, collapse = " + "), '

    BRS ~ a1*', res1, ' + a2*', res2, '
    Dep ~ b1*BRS + c1*', res1, ' + c2*', res2, '
    Anx ~ b2*BRS + d1*', res1, ' + d2*', res2, '
    Str ~ b3*BRS + e1*', res1, ' + e2*', res2, '

    Dep ~~ Anx
    Dep ~~ Str
    Anx ~~ Str

    ind_Dep_', res1, ' := a1*b1
    ind_Anx_', res1, ' := a1*b2
    ind_Str_', res1, ' := a1*b3

    ind_Dep_', res2, ' := a2*b1
    ind_Anx_', res2, ' := a2*b2
    ind_Str_', res2, ' := a2*b3
  ')

  fit_core_med <- sem(model_core_med, data = dat,
                      ordered = c(dass_items, brs_items),
                      estimator = "WLSMV", parameterization = "theta",
                      std.lv = TRUE)

  pe_med <- parameterEstimates(fit_core_med, standardized = TRUE, ci = TRUE)
  safe_write_csv(pe_med, "outputs/SEM/final/core_mediation_params.csv")

} else {
  message("Mediation skipped: fewer than two RES_ dummies available after pruning.")
  writeLines("Mediation skipped: not enough RES_ dummies detected in dat.",
             "outputs/SEM/final/core_mediation_params.txt")
}


In [24]:
# --- DROP-IN: minimal plotting model with only latents and BRS regressions ---

# 1) Ensure these items exist (they should; we used sanitized dep_items, anx_items, str_items, brs_items)
stopifnot(all(c(dep_items, anx_items, str_items, brs_items) %in% names(dat)))

model_core_plot <- paste0('
  Dep =~ ', paste(dep_items, collapse = " + "), '
  Anx =~ ', paste(anx_items, collapse = " + "), '
  Str =~ ', paste(str_items, collapse = " + "), '
  BRS =~ ', paste(brs_items, collapse = " + "), '

  Dep ~ BRS
  Anx ~ BRS
  Str ~ BRS

  Dep ~~ Anx
  Dep ~~ Str
  Anx ~~ Str
')

fit_core_plot <- sem(model_core_plot, data = dat,
                     ordered = c(dass_items, brs_items),
                     estimator = "WLSMV", parameterization = "theta",
                     std.lv = TRUE)

# 2) Plot with manifest nodes hidden to avoid clutter and dimnames issues
library(semPlot)
png("outputs/SEM/final/core_sem_paths_min.png", width = 1600, height = 1000, res = 200)
semPaths(
  fit_core_plot,
  what = "std", whatLabels = "est", style = "ram",
  layout = "tree", sizeMan = 1, sizeLat = 8,
  edge.label.cex = 0.9, residuals = FALSE, intercepts = FALSE,
  nCharNodes = 0, exoCov = FALSE, mar = c(5,5,5,5)
)
dev.off()


In [26]:
library(ggplot2); library(dplyr)

# Core forest (BRS only)
core_coef <- core_std %>%
  filter(op=="~", rhs=="BRS", lhs %in% c("Dep","Anx","Str")) %>%
  transmute(Outcome = lhs, Predictor = rhs, est = est.std, se = se) %>%
  mutate(ci_lo = est - 1.96*se, ci_hi = est + 1.96*se)

p_core_forest <- ggplot(core_coef, aes(x = Outcome, y = est)) +
  geom_hline(yintercept = 0, color = "grey60") +
  geom_point(color = "#2563eb", size = 2) +
  geom_errorbar(aes(ymin = ci_lo, ymax = ci_hi), width = .15, color = "#2563eb") +
  coord_flip() +
  labs(title = "Core SEM: standardized paths", y = "Beta (std)", x = "") +
  theme_minimal(base_size = 11)
ggsave("outputs/SEM/final/fig_core_forest.png", p_core_forest, width = 6, height = 3.6, dpi = 200,bg='white')

# Extended pooled forest (BRS, cgpa, family_income)
ext_std <- standardizedSolution.mi(fit_ext_mi)
ext_coef <- ext_std %>%
  filter(op=="~", lhs %in% c("Dep","Anx","Str"),
         rhs %in% c("BRS","cgpa","family_income")) %>%
  transmute(Outcome = lhs, Predictor = rhs, est = est.std, se = se,
            ci_lo = ci.lower, ci_hi = ci.upper)

p_ext_forest <- ggplot(ext_coef, aes(x = interaction(Outcome, Predictor),
                                     y = est, color = Predictor)) +
  geom_hline(yintercept = 0, color = "grey60") +
  geom_point(size = 2) +
  geom_errorbar(aes(ymin = ci_lo, ymax = ci_hi), width = .15) +
  coord_flip() +
  labs(title = "Extended SEM (pooled): standardized paths",
       y = "Beta (std)", x = "") +
  theme_minimal(base_size = 11)
ggsave("outputs/SEM/final/fig_extended_forest.png", p_ext_forest, width = 7.5, height = 4.2, dpi = 200,bg='white')


lavaan->lav_model_vcov():  
   The variance-covariance matrix of the estimated parameters (vcov) does not appear to be positive 
   definite! The smallest eigenvalue (= 2.181245e-13) is close to zero. This may be a symptom that the model 
   is not identified. 


In [27]:
r2_vec <- lavInspect(fit_core, "r2")
r2_df <- data.frame(Outcome = names(r2_vec), R2 = as.numeric(r2_vec))
p_r2 <- ggplot(r2_df, aes(x = Outcome, y = R2)) +
  geom_col(fill = "#059669") +
  geom_text(aes(label = sprintf("%.2f", R2)), vjust = -0.3, size = 3.4) +
  ylim(0, 1) +
  labs(title = "Core SEM: latent R²", y = "R²", x = "") +
  theme_minimal(base_size = 11)
ggsave("outputs/SEM/final/fig_core_r2.png", p_r2, width = 5.5, height = 3.3, dpi = 200,bg='white')


In [32]:
# Gender: BRS paths by group
std_g <- standardizedSolution(fit_mg_gender)
g_coef <- std_g %>%
  filter(op=="~", rhs=="BRS", lhs %in% c("Dep","Anx","Str")) %>%
  transmute(Group = as.factor(group), Outcome = lhs, est = est.std, se = se) %>%
  mutate(ci_lo = est - 1.96*se, ci_hi = est + 1.96*se)

p_gender_brs <- ggplot(g_coef, aes(x = Outcome, y = est, color = Group)) +
  geom_hline(yintercept = 0, color = "grey60") +
  geom_point(position = position_dodge(width = 0.5), size = 2) +
  geom_errorbar(aes(ymin = ci_lo, ymax = ci_hi),
                position = position_dodge(width = 0.5), width = .15) +
  coord_flip() +
  labs(title = "Gender: BRS → DASS (std)", y = "Beta (std)", x = "") +
  theme_minimal(base_size = 11)
ggsave("outputs/SEM/final/fig_mg_gender_brs.png", p_gender_brs, width = 6.5, height = 3.6, dpi = 200,bg='white')

# Gender: latent mean differences (ref group mean = 0)
pe_g <- parameterEstimates(fit_mg_gender, standardized = TRUE, ci = TRUE)
g_means <- pe_g %>%
  filter(op=="~1", lhs %in% c("Dep","Anx","Str")) %>%
  transmute(Group = as.factor(group), Outcome = lhs,
            mean = est, ci_lo = ci.lower, ci_hi = ci.upper)

p_gender_means <- ggplot(g_means, aes(x = Outcome, y = mean, color = Group)) +
  geom_hline(yintercept = 0, color = "grey60") +
  geom_point(position = position_dodge(width = 0.5), size = 2) +
  geom_errorbar(aes(ymin = ci_lo, ymax = ci_hi),
                position = position_dodge(width = 0.5), width = .15) +
  coord_flip() +
  labs(title = "Gender: latent means (reference group fixed at 0)",
       y = "Latent mean (unit)", x = "") +
  theme_minimal(base_size = 11)
ggsave("outputs/SEM/final/fig_mg_gender_means.png", p_gender_means, width = 6.5, height = 3.6, dpi = 200,bg='white')

# Academic year: repeat for year
std_y <- standardizedSolution(fit_mg_year)
y_coef <- std_y %>%
  filter(op=="~", rhs=="BRS", lhs %in% c("Dep","Anx","Str")) %>%
  transmute(Group = as.factor(group), Outcome = lhs, est = est.std, se = se) %>%
  mutate(ci_lo = est - 1.96*se, ci_hi = est + 1.96*se)
p_year_brs <- ggplot(y_coef, aes(x = Outcome, y = est, color = Group)) +
  geom_hline(yintercept = 0, color = "grey60") +
  geom_point(position = position_dodge(width = 0.6), size = 2) +
  geom_errorbar(aes(ymin = ci_lo, ymax = ci_hi),
                position = position_dodge(width = 0.6), width = .15) +
  coord_flip() +
  labs(title = "Academic year: BRS → DASS (std)", y = "Beta (std)", x = "") +
  theme_minimal(base_size = 11)
ggsave("outputs/SEM/final/fig_mg_year_brs.png", p_year_brs, width = 7.5, height = 4.2, dpi = 200,bg='white')

pe_y <- parameterEstimates(fit_mg_year, standardized = TRUE, ci = TRUE)
y_means <- pe_y %>%
  filter(op=="~1", lhs %in% c("Dep","Anx","Str")) %>%
  transmute(Group = as.factor(group), Outcome = lhs,
            mean = est, ci_lo = ci.lower, ci_hi = ci.upper)
p_year_means <- ggplot(y_means, aes(x = Outcome, y = mean, color = Group)) +
  geom_hline(yintercept = 0, color = "grey60") +
  geom_point(position = position_dodge(width = 0.6), size = 2) +
  geom_errorbar(aes(ymin = ci_lo, ymax = ci_hi),
                position = position_dodge(width = 0.6), width = .15) +
  coord_flip() +
  labs(title = "Academic year: latent means (reference group fixed at 0)",
       y = "Latent mean (unit)", x = "") +
  theme_minimal(base_size = 11)
ggsave("outputs/SEM/final/fig_mg_year_means.png", p_year_means, width = 7.5, height = 4.2, dpi = 200,bg='white')


In [29]:
library(semPlot)
png("outputs/SEM/final/fig_core_latent_diagram.png", width = 1600, height = 1000, res = 200,bg='white')
semPaths(
  fit_core_plot, what="std", whatLabels="est", style="ram",
  layout="tree", sizeMan=1, sizeLat=8, residuals=FALSE, intercepts=FALSE,
  edge.label.cex=.9, exoCov=FALSE, mar=c(5,5,5,5)
)
dev.off()


In [31]:
# Build a quick audit of available vs used exogenous dummies in extended model
nm0 <- names(dataList[[4]])
available_blocks <- c("Gender" = any(grepl("^G_",   nm0)),
                      "AcadYear"= any(grepl("^AY_",  nm0)),
                      "Dept"    = any(grepl("^DG_",  nm0)),
                      "Residence"=any(grepl("^RES_", nm0)),
                      "Living"  = any(grepl("^LIV_", nm0)))

used_nm <- names(dataList_pruned[[4]])
used_blocks <- c("Gender" = any(grepl("^G_",   used_nm)),
                 "AcadYear"= any(grepl("^AY_",  used_nm)),
                 "Dept"    = any(grepl("^DG_",  used_nm)),
                 "Residence"=any(grepl("^RES_", used_nm)),
                 "Living"  = any(grepl("^LIV_", used_nm)))

audit <- data.frame(Block = names(available_blocks),
                    Available = as.logical(available_blocks),
                    Used = as.logical(used_blocks))
safe_write_csv(audit, "outputs/SEM/final/pruning_audit.csv")


In [35]:
# Indices of rows used in fit_core
idx_used <- lavInspect(fit_core, "case.idx")  # integer vector of row numbers
length(idx_used)            # should equal nrow(fs)

# Build a scores data frame with ID and labels from the original 'dat'
fs <- lavPredict(fit_core, type = "lv", label = TRUE)
fs <- as.data.frame(fs)
stopifnot(nrow(fs) == length(idx_used))

# Pull the ID and low-missing labels for those specific rows
vars_label <- intersect(c("id","gender","academic_year","residence"), names(dat))
labels_used <- dat[idx_used, vars_label, drop = FALSE]

# Bind labels to scores (same order by construction)
phase4_df <- cbind(labels_used, fs)

# Optional: reset row names to avoid confusion
rownames(phase4_df) <- NULL


In [36]:
# Indices of rows used in fit_core
idx_used <- lavInspect(fit_core, "case.idx")  # integer vector of row numbers
length(idx_used)            # should equal nrow(fs)

# Build a scores data frame with ID and labels from the original 'dat'
fs <- lavPredict(fit_core, type = "lv", label = TRUE)
fs <- as.data.frame(fs)
stopifnot(nrow(fs) == length(idx_used))

# Pull the ID and low-missing labels for those specific rows
vars_label <- intersect(c("id","gender","academic_year","residence"), names(dat))
labels_used <- dat[idx_used, vars_label, drop = FALSE]

# Bind labels to scores (same order by construction)
phase4_df <- cbind(labels_used, fs)

# Optional: reset row names to avoid confusion
rownames(phase4_df) <- NULL


In [37]:
# Drop any residual missing latent rows (rare)
phase4_df <- phase4_df[stats::complete.cases(phase4_df[, c("Dep","Anx","Str","BRS")]), ]
dir.create("outputs/Phase4", recursive = TRUE, showWarnings = FALSE)
safe_write_csv(phase4_df, "outputs/Phase4/phase4_latent_inputs.csv")


In [39]:
# 5) Archive final Phase 3 objects and metadata
dir.create("outputs/Phase3_archive", recursive = TRUE, showWarnings = FALSE)

# Core fit indices and standardized paths (already built earlier; re-save if needed)
core_fit <- as.data.frame(t(fitMeasures(fit_core, c("cfi","tli","rmsea","srmr","chisq","df"))))
safe_write_csv(core_fit, "outputs/Phase3_archive/core_fit.csv")

core_std <- standardizedSolution(fit_core)
safe_write_csv(core_std, "outputs/Phase3_archive/core_standardized_solution.csv")

# Extended pooled (lavaan.mi) parameter table and fit
pe_pool <- parameterEstimates.mi(fit_ext_mi, standardized = FALSE, ci = TRUE)
safe_write_csv(pe_pool, "outputs/Phase3_archive/extended_pooled_params.csv")

ext_fit <- as.data.frame(t(fitMeasures(fit_ext_mi, c("cfi","tli","rmsea","srmr","chisq","df"))))
safe_write_csv(ext_fit, "outputs/Phase3_archive/extended_fit.csv")

# Multi-group summaries (if not already saved)
std_g <- standardizedSolution(fit_mg_gender)
safe_write_csv(std_g, "outputs/Phase3_archive/mg_gender_standardized_solution.csv")
std_y <- standardizedSolution(fit_mg_year)
safe_write_csv(std_y, "outputs/Phase3_archive/mg_year_standardized_solution.csv")

# Model syntax files (ensure the corresponding objects exist in your environment)
writeLines(model_core,        "outputs/Phase3_archive/model_core_syntax.txt")
writeLines(model_mg_gender,   "outputs/Phase3_archive/model_mg_gender_syntax.txt")
writeLines(model_mg_year,     "outputs/Phase3_archive/model_mg_year_syntax.txt")
writeLines(model_ext,         "outputs/Phase3_archive/model_extended_syntax.txt")

# Session info
writeLines(capture.output(sessionInfo()), "outputs/Phase3_archive/sessionInfo.txt")


"D3" and "D4" only available using maximum likelihood estimation. Changed to pool.method = "D2".


lavaan->lav_model_vcov():  
   The variance-covariance matrix of the estimated parameters (vcov) does not appear to be positive 
   definite! The smallest eigenvalue (= 2.181245e-13) is close to zero. This may be a symptom that the 
   model is not identified. 


In [40]:
# 6) Quick sanity checks for Phase 4
stopifnot(all(colSums(is.na(phase4_df[, c("Dep","Anx","Str","BRS")])) == 0))
apply(scale(phase4_df[, c("Dep","Anx","Str","BRS")]), 2, function(v) c(mean=mean(v), sd=sd(v)))  # should be ~0, ~1

# Set a global seed for later PCA/clustering runs
set.seed(20250909)  # reproducible initialization for downstream steps
