In [1]:
library(dplyr); library(haven); library(forcats); library(lavaan); library(semTools)
library(mice); library(stringr)

sanitize_names <- function(x){
  x <- gsub("[^A-Za-z0-9_]", "_", x)
  x <- gsub("_+", "_", x); x <- gsub("^_+","",x); x <- gsub("_+$","",x)
  ifelse(grepl("^[A-Za-z]", x), x, paste0("X_", x)) |> make.unique(sep = "_")
}

# NA‑preserving K−1 dummies (no “Missing” category)
mk_dummies_na <- function(df, var, prefix = var) {
  # set a stable baseline = most frequent non-missing level
  base <- names(sort(table(df[[var]]), decreasing = TRUE))[6]
  v <- forcats::fct_relevel(df[[var]], base)
  mf <- model.frame(reformulate("v"), data = data.frame(v), na.action = na.pass)
  mm <- model.matrix(~ v, data = mf)
  mm <- mm[, colnames(mm) != "(Intercept)", drop = FALSE]
  colnames(mm) <- sub("^v", prefix, colnames(mm))
  as.data.frame(mm)
}

safe_write_csv <- function(x, path) {
  if (!inherits(x, "data.frame")) x <- as.data.frame(x, stringsAsFactors = FALSE)
  bad <- sapply(x, function(col) is.list(col) || is.matrix(col) || is.array(col))
  if (any(bad)) {
    x[bad] <- lapply(x[bad], function(col) {
      if (is.list(col)) {
        if (all(vapply(col, length, 1L) <= 1L)) unlist(col) else NULL
      } else if (is.matrix(col) && ncol(col) == 1L) as.vector(col) else NULL
    })
    x <- x[!vapply(x, is.null, logical(1))]
  }
  if (ncol(x) == 0) x <- data.frame(note = "no data", stringsAsFactors = FALSE)
  if (nrow(x) == 0) x[1,] <- NA
  colnames(x) <- make.names(colnames(x), unique = TRUE)
  utils::write.csv(x, path, row.names = FALSE)
}



Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

This is lavaan 0.6-19
lavaan is FREE software! Please report any bugs.
 
###############################################################################
This is semTools 0.5-7
All users of R (or SEM) are invited to submit functions or ideas for functions.
###############################################################################

Attaching package: ‘mice’

The following object is masked from ‘package:stats’:

    filter

The following objects are masked from ‘package:base’:

    cbind, rbind



In [2]:
# Load Phase 2 analysis file
dat0 <- readRDS("data/processed/analysis_phase1_ordered.rds") %>%
  mutate(
    gender           = haven::as_factor(gender, levels = "labels"),
    academic_year    = haven::as_factor(academic_year, levels = "labels"),
    dept_group       = haven::as_factor(dept_group, levels = "labels"),
    residence        = haven::as_factor(residence, levels = "labels"),
    living_situation = haven::as_factor(living_situation, levels = "labels")
  )

# Dummies (no Missing category)
dum_gender <- mk_dummies_na(dat0, "gender",           "G_")
dum_year   <- mk_dummies_na(dat0, "academic_year",    "AY_")
dum_dept   <- mk_dummies_na(dat0, "dept_group",       "DG_")
dum_res    <- mk_dummies_na(dat0, "residence",        "RES_")
dum_live   <- mk_dummies_na(dat0, "living_situation", "LIV_")

stopifnot(nrow(dum_gender)==nrow(dat0), nrow(dum_year)==nrow(dat0),
          nrow(dum_dept)==nrow(dat0),   nrow(dum_res)==nrow(dat0),
          nrow(dum_live)==nrow(dat0))

dat <- dplyr::bind_cols(dat0, dum_gender, dum_year, dum_dept, dum_res, dum_live)

# Sanitize all column names for lavaan
names(dat) <- sanitize_names(names(dat))

# Item sets after sanitization
dep_items  <- sanitize_names(c("dQ3D","dQ5D","dQ10D","dQ13D","dQ16D","dQ17D","dQ21D"))
anx_items  <- sanitize_names(c("dQ2A","dQ4A","dQ7A","dQ9A","dQ15A","dQ19A","dQ20A"))
str_items  <- sanitize_names(c("dQ1S","dQ6S","dQ8S","dQ11S","dQ12S","dQ14S","dQ18S"))
brs_items  <- sanitize_names(c("rQ1","rQ3","rQ5","rQ2_r","rQ4_r","rQ6_r"))
dass_items <- c(dep_items, anx_items, str_items)

# Ensure ordered types
dat[dass_items] <- lapply(dat[dass_items], function(x) factor(as.integer(as.character(x)), levels = 0:3, ordered = TRUE))
dat[brs_items]  <- lapply(dat[brs_items],  function(x) factor(as.integer(as.character(x)), levels = 1:5, ordered = TRUE))

# Exogenous dummy candidates
safe_exo <- grep("^(G_|AY_|DG_|RES_|LIV_)", names(dat), value = TRUE)


[1m[22m1 unknown level in `f`: NA 
[1m[22m1 unknown level in `f`: NA 
[1m[22m1 unknown level in `f`: NA 
[1m[22m1 unknown level in `f`: NA 


In [3]:
# Remove ultra-sparse dummies to reduce WLSMV singularity risk
dummy_ones <- sapply(dat[, safe_exo, drop = FALSE], function(v) sum(v == 1, na.rm = TRUE))
safe_exo_core <- names(dummy_ones)[dummy_ones >= 10]

rhs_core <- paste(c("BRS", safe_exo_core), collapse = " + ")

model_core <- paste0('
  Dep =~ ', paste(dep_items, collapse = " + "), '
  Anx =~ ', paste(anx_items, collapse = " + "), '
  Str =~ ', paste(str_items, collapse = " + "), '
  BRS =~ ', paste(brs_items, collapse = " + "), '

  Dep ~ ', rhs_core, '
  Anx ~ ', rhs_core, '
  Str ~ ', rhs_core, '

  Dep ~~ Anx
  Dep ~~ Str
  Anx ~~ Str
')

fit_core <- sem(model_core, data = dat,
                ordered = c(dass_items, brs_items),
                estimator = "WLSMV",
                parameterization = "theta",
                std.lv = TRUE,
                control = list(iter.max = 20000, rel.tol = 1e-6))

summary(fit_core, fit.measures = TRUE, standardized = TRUE, rsquare = TRUE)

# Save core outputs
dir.create("outputs/SEM", showWarnings = FALSE, recursive = TRUE)
safe_write_csv(as.data.frame(t(fitMeasures(fit_core, c("cfi","tli","rmsea","srmr","chisq","df")))),
               "outputs/SEM/core_sem_fit.csv")
std_core <- standardizedSolution(fit_core)
safe_write_csv(std_core %>% filter(op=="~", lhs %in% c("Dep","Anx","Str")),
               "outputs/SEM/core_sem_paths.csv")
safe_write_csv(as.data.frame(lavInspect(fit_core, "r2")), "outputs/SEM/core_sem_r2.csv")


lavaan->muthen1984():  
   trouble constructing W matrix; used generalized inverse for A11 submatrix 


In [4]:
# Prepare trimmed exogenous sets per grouping
exo_no_gender <- setdiff(safe_exo_core, grep("^G_", safe_exo_core, value = TRUE))
exo_no_year   <- setdiff(safe_exo_core, grep("^AY_", safe_exo_core, value = TRUE))

drop_group_constant <- function(data, group_var, exo_names){
  keep <- sapply(exo_names, function(v){
    byv <- split(data[[v]], data[[group_var]])
    all(sapply(byv, function(x) {
      x <- x[!is.na(x)]
      if (length(x) == 0L) FALSE else (length(unique(x)) > 1L)
    }))
  })
  exo_names[keep]
}

# Gender multi-group
dat_g <- dat %>% filter(!is.na(gender))
exo_gender_final <- drop_group_constant(dat_g, "gender", exo_no_gender)
rhs_gender <- paste(c("BRS", exo_gender_final), collapse = " + ")

model_mg_gender <- paste0('
  Dep =~ ', paste(dep_items, collapse = " + "), '
  Anx =~ ', paste(anx_items, collapse = " + "), '
  Str =~ ', paste(str_items, collapse = " + "), '
  BRS =~ ', paste(brs_items, collapse = " + "), '

  Dep ~ ', rhs_gender, '
  Anx ~ ', rhs_gender, '
  Str ~ ', rhs_gender, '

  Dep ~~ Anx
  Dep ~~ Str
  Anx ~~ Str
')

fit_mg_gender <- sem(model_mg_gender, data = dat_g,
                     group = "gender",
                     ordered = c(dass_items, brs_items),
                     estimator = "WLSMV",
                     parameterization = "theta",
                     std.lv = TRUE,
                     group.equal = c("thresholds","loadings"))
summary(fit_mg_gender, fit.measures = TRUE, standardized = TRUE, rsquare = TRUE)

# Extract paths and latent means by gender
std_g <- standardizedSolution(fit_mg_gender)
safe_write_csv(std_g %>% filter(op=="~", rhs=="BRS", lhs %in% c("Dep","Anx","Str")),
               "outputs/SEM/mg_gender_BRS_paths_by_group.csv")
pe_g <- parameterEstimates(fit_mg_gender, standardized = TRUE, ci = TRUE)
safe_write_csv(subset(pe_g, op=="~1" & lhs %in% c("Dep","Anx","Str")),
               "outputs/SEM/mg_gender_latent_means.csv")
safe_write_csv(as.data.frame(t(fitMeasures(fit_mg_gender, c("cfi","tli","rmsea","srmr","chisq","df")))),
               "outputs/SEM/mg_gender_fit.csv")

# Academic year multi-group
dat_y <- dat %>% filter(!is.na(academic_year))
exo_year_final <- drop_group_constant(dat_y, "academic_year", exo_no_year)
rhs_year <- paste(c("BRS", exo_year_final), collapse = " + ")

model_mg_year <- paste0('
  Dep =~ ', paste(dep_items, collapse = " + "), '
  Anx =~ ', paste(anx_items, collapse = " + "), '
  Str =~ ', paste(str_items, collapse = " + "), '
  BRS =~ ', paste(brs_items, collapse = " + "), '

  Dep ~ ', rhs_year, '
  Anx ~ ', rhs_year, '
  Str ~ ', rhs_year, '

  Dep ~~ Anx
  Dep ~~ Str
  Anx ~~ Str
')

fit_mg_year <- sem(model_mg_year, data = dat_y,
                   group = "academic_year",
                   ordered = c(dass_items, brs_items),
                   estimator = "WLSMV",
                   parameterization = "theta",
                   std.lv = TRUE,
                   group.equal = c("thresholds","loadings"))
summary(fit_mg_year, fit.measures = TRUE, standardized = TRUE, rsquare = TRUE)

std_y <- standardizedSolution(fit_mg_year)
safe_write_csv(std_y %>% filter(op=="~", rhs=="BRS", lhs %in% c("Dep","Anx","Str")),
               "outputs/SEM/mg_year_BRS_paths_by_group.csv")
pe_y <- parameterEstimates(fit_mg_year, standardized = TRUE, ci = TRUE)
safe_write_csv(subset(pe_y, op=="~1" & lhs %in% c("Dep","Anx","Str")),
               "outputs/SEM/mg_year_latent_means.csv")
safe_write_csv(as.data.frame(t(fitMeasures(fit_mg_year, c("cfi","tli","rmsea","srmr","chisq","df")))),
               "outputs/SEM/mg_year_fit.csv")


1: lavaan->lav_object_post_check():  
   covariance matrix of latent variables is not positive definite in group 2; use lavInspect(fit, 
   "cov.lv") to investigate. 
2: lavaan->lav_object_post_check():  
   covariance matrix of latent variables is not positive definite in group 3; use lavInspect(fit, 
   "cov.lv") to investigate. 


In [None]:
library(mice)

mi_vars  <- c("cgpa","family_income")
aux_preds<- c("gender","academic_year","dept_group","residence","living_situation")

mi_dat <- dat0[, c(mi_vars, aux_preds), drop = FALSE]

meth <- make.method(mi_dat)
meth["cgpa"]          <- "pmm"
meth["family_income"] <- "pmm"

pred <- make.predictorMatrix(mi_dat); pred[,] <- 0
pred["cgpa", aux_preds]          <- 1
pred["family_income", aux_preds] <- 1

set.seed(20250908)
imp <- mice(mi_dat, m = 20, maxit = 20, method = meth, predictorMatrix = pred, printFlag = FALSE)
saveRDS(imp, "outputs/SEM/mi_covariates_cgpa_income_m20.rds")


Number of logged events: 800 


In [12]:
library(forcats)

# Safe dummy builder that preserves N, avoids NA as level, and no “unknown level” warnings
mk_dummies_preserve <- function(df, var, prefix = var){
  v <- df[[var]]
  base <- names(sort(table(v), decreasing = TRUE))[7]
  # Non-NA rows
  keep <- !is.na(v)
  mm <- matrix(NA_real_, nrow = length(v), ncol = 0)
  if (any(keep)) {
    v_nonNA <- forcats::fct_relevel(v[keep], base)
    mm_non  <- model.matrix(~ v_nonNA)[, -1, drop = FALSE]   # K-1 dummies
    colnames(mm_non) <- sub("^v_nonNA", prefix, colnames(mm_non))
    # place back into full-N matrix
    mm <- matrix(NA_real_, nrow = length(v), ncol = ncol(mm_non))
    mm[keep, ] <- mm_non
    colnames(mm) <- colnames(mm_non)
  }
  as.data.frame(mm)
}

prep_sem_dataset <- function(comp_covs, full_data){
  stopifnot(nrow(comp_covs) == nrow(full_data))
  dat_ <- full_data
  dat_[, c("cgpa","family_income")] <- comp_covs[, c("cgpa","family_income")]

  # Ensure factors
  dat_ <- dat_ %>%
    mutate(
      gender           = haven::as_factor(gender, levels = "labels"),
      academic_year    = haven::as_factor(academic_year, levels = "labels"),
      dept_group       = haven::as_factor(dept_group, levels = "labels"),
      residence        = haven::as_factor(residence, levels = "labels"),
      living_situation = haven::as_factor(living_situation, levels = "labels")
    )

  # Dummies
  dum_gender <- mk_dummies_preserve(dat_, "gender",           "G_")
  dum_year   <- mk_dummies_preserve(dat_, "academic_year",    "AY_")
  dum_dept   <- mk_dummies_preserve(dat_, "dept_group",       "DG_")
  dum_res    <- mk_dummies_preserve(dat_, "residence",        "RES_")
  dum_live   <- mk_dummies_preserve(dat_, "living_situation", "LIV_")

  dat_ <- dplyr::bind_cols(dat_, dum_gender, dum_year, dum_dept, dum_res, dum_live)

  # Sanitize names
  names(dat_) <- sanitize_names(names(dat_))

  # Ensure ordered indicators
  d_items <- sanitize_names(c("dQ1S","dQ2A","dQ3D","dQ4A","dQ5D","dQ6S","dQ7A","dQ8S","dQ9A",
                              "dQ10D","dQ11S","dQ12S","dQ13D","dQ14S","dQ15A","dQ16D","dQ17D",
                              "dQ18S","dQ19A","dQ20A","dQ21D"))
  b_items <- sanitize_names(c("rQ1","rQ3","rQ5","rQ2_r","rQ4_r","rQ6_r"))
  dat_[d_items] <- lapply(dat_[d_items], function(x) factor(as.integer(as.character(x)), levels = 0:3, ordered = TRUE))
  dat_[b_items] <- lapply(dat_[b_items], function(x) factor(as.integer(as.character(x)), levels = 1:5, ordered = TRUE))

  # Drop ultra-sparse dummies for this dataset
  safe_exo <- grep("^(G_|AY_|DG_|RES_|LIV_)", names(dat_), value = TRUE)
  if (length(safe_exo)) {
    ones <- sapply(dat_[, safe_exo, drop = FALSE], function(v) sum(v == 1, na.rm = TRUE))
    drop_cols <- names(ones)[ones < 10]
    if (length(drop_cols)) dat_[drop_cols] <- NULL
  }

  dat_
}

analysis_data_full <- readRDS("data/processed/analysis_phase1_ordered.rds")
complete_list <- mice::complete(imp, "all")
dataList <- lapply(complete_list, prep_sem_dataset, full_data = analysis_data_full)




In [13]:
library(forcats)

# Safe dummy builder that preserves N, avoids NA as level, and no “unknown level” warnings
mk_dummies_preserve <- function(df, var, prefix = var){
  v <- df[[var]]
  base <- names(sort(table(v), decreasing = TRUE))[7]
  # Non-NA rows
  keep <- !is.na(v)
  mm <- matrix(NA_real_, nrow = length(v), ncol = 0)
  if (any(keep)) {
    v_nonNA <- forcats::fct_relevel(v[keep], base)
    mm_non  <- model.matrix(~ v_nonNA)[, -1, drop = FALSE]   # K-1 dummies
    colnames(mm_non) <- sub("^v_nonNA", prefix, colnames(mm_non))
    # place back into full-N matrix
    mm <- matrix(NA_real_, nrow = length(v), ncol = ncol(mm_non))
    mm[keep, ] <- mm_non
    colnames(mm) <- colnames(mm_non)
  }
  as.data.frame(mm)
}

prep_sem_dataset <- function(comp_covs, full_data){
  stopifnot(nrow(comp_covs) == nrow(full_data))
  dat_ <- full_data
  dat_[, c("cgpa","family_income")] <- comp_covs[, c("cgpa","family_income")]

  # Ensure factors
  dat_ <- dat_ %>%
    mutate(
      gender           = haven::as_factor(gender, levels = "labels"),
      academic_year    = haven::as_factor(academic_year, levels = "labels"),
      dept_group       = haven::as_factor(dept_group, levels = "labels"),
      residence        = haven::as_factor(residence, levels = "labels"),
      living_situation = haven::as_factor(living_situation, levels = "labels")
    )

  # Dummies
  dum_gender <- mk_dummies_preserve(dat_, "gender",           "G_")
  dum_year   <- mk_dummies_preserve(dat_, "academic_year",    "AY_")
  dum_dept   <- mk_dummies_preserve(dat_, "dept_group",       "DG_")
  dum_res    <- mk_dummies_preserve(dat_, "residence",        "RES_")
  dum_live   <- mk_dummies_preserve(dat_, "living_situation", "LIV_")

  dat_ <- dplyr::bind_cols(dat_, dum_gender, dum_year, dum_dept, dum_res, dum_live)

  # Sanitize names
  names(dat_) <- sanitize_names(names(dat_))

  # Ensure ordered indicators
  d_items <- sanitize_names(c("dQ1S","dQ2A","dQ3D","dQ4A","dQ5D","dQ6S","dQ7A","dQ8S","dQ9A",
                              "dQ10D","dQ11S","dQ12S","dQ13D","dQ14S","dQ15A","dQ16D","dQ17D",
                              "dQ18S","dQ19A","dQ20A","dQ21D"))
  b_items <- sanitize_names(c("rQ1","rQ3","rQ5","rQ2_r","rQ4_r","rQ6_r"))
  dat_[d_items] <- lapply(dat_[d_items], function(x) factor(as.integer(as.character(x)), levels = 0:3, ordered = TRUE))
  dat_[b_items] <- lapply(dat_[b_items], function(x) factor(as.integer(as.character(x)), levels = 1:5, ordered = TRUE))

  # Drop ultra-sparse dummies for this dataset
  safe_exo <- grep("^(G_|AY_|DG_|RES_|LIV_)", names(dat_), value = TRUE)
  if (length(safe_exo)) {
    ones <- sapply(dat_[, safe_exo, drop = FALSE], function(v) sum(v == 1, na.rm = TRUE))
    drop_cols <- names(ones)[ones < 10]
    if (length(drop_cols)) dat_[drop_cols] <- NULL
  }

  dat_
}

analysis_data_full <- readRDS("data/processed/analysis_phase1_ordered.rds")
complete_list <- mice::complete(imp, "all")
dataList <- lapply(complete_list, prep_sem_dataset, full_data = analysis_data_full)




In [None]:
library(lavaan.mi)

# Names derived from the first completed dataset (already created above)
nm0 <- names(dataList[[14]])
dep_items <- intersect(nm0, grep("^dQ(3D|5D|10D|13D|16D|17D|21D)$", nm0, value = TRUE))
anx_items <- intersect(nm0, grep("^dQ(2A|4A|7A|9A|15A|19A|20A)$", nm0, value = TRUE))
str_items <- intersect(nm0, grep("^dQ(1S|6S|8S|11S|12S|14S|18S)$", nm0, value = TRUE))
brs_items <- intersect(nm0, c("rQ1","rQ3","rQ5","rQ2_r","rQ4_r","rQ6_r"))
dass_items <- c(dep_items, anx_items, str_items)
safe_exo   <- grep("^(G_|AY_|DG_|RES_|LIV_)", nm0, value = TRUE)

rhs_ext <- paste(c("BRS", safe_exo, "cgpa", "family_income"), collapse = " + ")

model_ext <- paste0('
  Dep =~ ', paste(dep_items, collapse = " + "), '
  Anx =~ ', paste(anx_items, collapse = " + "), '
  Str =~ ', paste(str_items, collapse = " + "), '
  BRS =~ ', paste(brs_items, collapse = " + "), '

  Dep ~ ', rhs_ext, '
  Anx ~ ', rhs_ext, '
  Str ~ ', rhs_ext, '

  Dep ~~ Anx
  Dep ~~ Str
  Anx ~~ Str
')

library(lavaan.mi)

# Fit pooled SEM to imputed data (list of completed data frames)
fit_ext_mi <- lavaan.mi::lavaan.mi(
  model = model_ext,
  data  = dataList,                      # CORRECT: pass list via 'data', not 'dataList'
  estimator = "WLSMV",
  parameterization = "theta",
  std.lv = TRUE,
  ordered = c(dass_items, brs_items)     # use the detected item names for this list
)

summary(fit_ext_mi, fit.measures = TRUE, standardized = TRUE)



1: lavaan->lav_model_vcov():  
   The variance-covariance matrix of the estimated parameters (vcov) does not appear to be positive 
   definite! The smallest eigenvalue (= 2.151805e-13) is close to zero. This may be a symptom that 
   the model is not identified. 
2: lavaan->lav_model_vcov():  
   The variance-covariance matrix of the estimated parameters (vcov) does not appear to be positive 
   definite! The smallest eigenvalue (= 2.151805e-13) is close to zero. This may be a symptom that 
   the model is not identified. 
3: lavaan->lav_model_vcov():  
   The variance-covariance matrix of the estimated parameters (vcov) does not appear to be positive 
   definite! The smallest eigenvalue (= 2.151805e-13) is close to zero. This may be a symptom that 
   the model is not identified. 
4: lavaan->lav_model_vcov():  
   The variance-covariance matrix of the estimated parameters (vcov) does not appear to be positive 
   definite! The smallest eigenvalue (= 2.151805e-13) is close to zero. T

: [1m[33mError[39m in `lav_standardize_all()`:[22m
[33m![39m no slot of name "implied" for this object of class "lavaan.mi"

In [20]:
library(lavaan.mi)                # guarantees the *.mi methods are visible

std <- standardizedSolution.mi(fit_ext_mi)
focus <- subset(std,
                op  == "~" &
                lhs %in% c("Dep","Anx","Str") &
                rhs %in% c("BRS","cgpa","family_income"))

safe_write_csv(focus, "outputs/SEM/extended_sem_pooled_paths.csv")

fit_pool <- fitMeasures(fit_ext_mi, c("cfi","tli","rmsea","srmr","chisq","df"))
safe_write_csv(as.data.frame(t(fit_pool)), "outputs/SEM/extended_sem_fit.csv")


lavaan->lav_model_vcov():  
   The variance-covariance matrix of the estimated parameters (vcov) does not appear to be positive 
   definite! The smallest eigenvalue (= 2.151805e-13) is close to zero. This may be a symptom that 
   the model is not identified. 


"D3" and "D4" only available using maximum likelihood estimation. Changed to pool.method = "D2".


lavaan->lav_model_vcov():  
   The variance-covariance matrix of the estimated parameters (vcov) does not appear to be positive 
   definite! The smallest eigenvalue (= 2.151805e-13) is close to zero. This may be a symptom that 
   the model is not identified. 
