In [19]:
library(dplyr)
library(haven)
library(forcats)
library(lavaan)

# Load the phase-1 analysis data set.
dat <- readRDS("data/processed/analysis_phase1_ordered.rds")

# 1) Replace the labelled demographic columns with regular factors whose
#    levels are the value labels (avoid haven_labelled downstream).
dat <- dat %>%
  mutate(
    across(
      all_of(c("gender", "academic_year", "dept_group",
               "residence", "living_situation")),
      ~ haven::as_factor(.x, levels = "labels")
    )
  )

# 2) Choose a sensible baseline = most frequent non-missing level (prevents accidental alphabetical reference)
#' Most frequent non-missing value of a vector.
#'
#' @param x A factor or atomic vector.
#' @return A length-one character vector naming the value with the highest
#'   count. `table()` ignores `NA` by default, so missing values never win.
#'   Returns `NA_character_` when `x` has no non-missing values.
most_freq_level <- function(x) {
  counts <- sort(table(x), decreasing = TRUE)
  # The first element after a decreasing sort is the mode.
  names(counts)[1L]
}

# Move the level returned by most_freq_level() to the front of each
# demographic factor so it becomes the first (baseline) level.
dat <- dat %>%
  mutate(
    across(
      all_of(c("gender", "academic_year", "dept_group",
               "residence", "living_situation")),
      ~ fct_relevel(.x, most_freq_level(.x))
    )
  )

# 3) NA‑preserving K−1 dummies (no “Missing” level)
#' Expand one column into indicator columns while keeping NA rows.
#'
#' @param df Data frame holding the column named by `var`.
#' @param var Name (string) of the column to expand.
#' @param prefix String that replaces the leading `var` in the generated
#'   column names. Defaults to `var`, i.e. names are left unchanged.
#' @return A data frame with the same number of rows as `df`, containing the
#'   model-matrix columns for `var` minus the intercept. Rows where `var` is
#'   NA are kept and are NA in every indicator.
mk_dummies_na <- function(df, var, prefix = var) {
  # na.pass stops model.frame() from dropping rows with a missing `var`
  frame_with_na <- model.frame(reformulate(var), data = df, na.action = na.pass)
  design <- model.matrix(reformulate(var), data = frame_with_na)

  # Remove the intercept; the remaining columns are the non-baseline indicators
  is_intercept <- colnames(design) == "(Intercept)"
  dummies <- design[, !is_intercept, drop = FALSE]

  colnames(dummies) <- sub(paste0("^", var), prefix, colnames(dummies))
  as.data.frame(dummies)
}

# Expand each demographic factor into indicator columns with a short prefix
dum_gender <- mk_dummies_na(dat, var = "gender",           prefix = "G_")
dum_year   <- mk_dummies_na(dat, var = "academic_year",    prefix = "AY_")
dum_dept   <- mk_dummies_na(dat, var = "dept_group",       prefix = "DG_")
dum_res    <- mk_dummies_na(dat, var = "residence",        prefix = "RES_")
dum_live   <- mk_dummies_na(dat, var = "living_situation", prefix = "LIV_")

# 4) Sanity check: every dummy block must line up row-for-row with dat
stopifnot(
  nrow(dum_gender) == nrow(dat),
  nrow(dum_year) == nrow(dat),
  nrow(dum_dept) == nrow(dat),
  nrow(dum_res) == nrow(dat),
  nrow(dum_live) == nrow(dat)
)

# 5) Append the indicator columns and remember their names
dat <- dplyr::bind_cols(dat, dum_gender, dum_year, dum_dept, dum_res, dum_live)
exo_covars <- unlist(
  lapply(list(dum_gender, dum_year, dum_dept, dum_res, dum_live), colnames),
  use.names = FALSE
)


In [20]:
library(dplyr)
library(stringi)

# Function to sanitize names (letters, digits, underscore only)
#' Make names safe for model syntax (letters, digits, underscore only).
#'
#' @param x Character vector of names.
#' @return Character vector of the same length as `x`: every character outside
#'   `[A-Za-z0-9_]` becomes `_`, runs of `_` are collapsed, leading/trailing
#'   `_` are stripped, names that do not start with a letter get an `X_`
#'   prefix, and duplicates are disambiguated with `_1`, `_2`, ...
#'   Zero-length input returns `character(0)`.
sanitize_names <- function(x) {
  x <- gsub("[^A-Za-z0-9_]", "_", x)     # replace any disallowed char with _
  x <- gsub("_+", "_", x)                # collapse multiple _
  x <- gsub("^_+", "", x)                # no leading _
  x <- gsub("_+$", "", x)                # no trailing _
  # Prefix by logical index rather than ifelse(): ifelse() returns logical(0)
  # for empty input, which make.unique() rejects.
  needs_prefix <- !grepl("^[A-Za-z]", x)
  x[needs_prefix] <- paste0("X_", x[needs_prefix])
  make.unique(x, sep = "_")
}

# Sanitize every column name in dat and track what each dummy column is
# called afterwards.
old_names <- names(dat)
# Identify which columns are dummy predictors (names as created, pre-rename)
exo_covars <- c(colnames(dum_gender), colnames(dum_year),
                colnames(dum_dept),  colnames(dum_res), colnames(dum_live))

# Sanitize all names in one call so make.unique() de-duplicates against the
# full set of columns, then rename dat.
new_names <- sanitize_names(old_names)
names(dat) <- new_names

# Map each dummy to its renamed column BY POSITION. Matching the original
# names against the sanitized names(dat) would silently lose every dummy whose
# name was altered by sanitizing (e.g. labels containing spaces or hyphens).
exo_pos  <- match(exo_covars, old_names)
name_map <- setNames(new_names[exo_pos], exo_covars)
# Dummies that cannot be located in dat are left out of the predictor list
safe_exo <- new_names[exo_pos[!is.na(exo_pos)]]

# Also re-define the item vectors using sanitized names
dep_items <- sanitize_names(c("dQ3D","dQ5D","dQ10D","dQ13D","dQ16D","dQ17D","dQ21D"))
anx_items <- sanitize_names(c("dQ2A","dQ4A","dQ7A","dQ9A","dQ15A","dQ19A","dQ20A"))
str_items <- sanitize_names(c("dQ1S","dQ6S","dQ8S","dQ11S","dQ12S","dQ14S","dQ18S"))
dass_items <- c(dep_items, anx_items, str_items)

brs_items <- sanitize_names(c("rQ1","rQ3","rQ5","rQ2_r","rQ4_r","rQ6_r"))

# Ensure these columns exist post-rename
stopifnot(all(dass_items %in% names(dat)), all(brs_items %in% names(dat)))


In [21]:
# Recode the questionnaire items as ordered factors with fixed level sets
# (DASS: 0-3, BRS: 1-5); values outside the level set become NA.
dat[dass_items] <- lapply(dat[dass_items], function(item) {
  ordered(as.integer(as.character(item)), levels = 0:3)
})
dat[brs_items] <- lapply(dat[brs_items], function(item) {
  ordered(as.integer(as.character(item)), levels = 1:5)
})


In [22]:
# Shared right-hand side of the structural regressions: the BRS factor plus
# all sanitized dummy covariates
rhs <- paste0(c("BRS", safe_exo), collapse = " + ")

# lavaan syntax: four measurement models, three regressions on `rhs`, and
# residual covariances among the three DASS factors
model_core <- paste0('
  Dep =~ ', paste0(dep_items, collapse = " + "), '
  Anx =~ ', paste0(anx_items, collapse = " + "), '
  Str =~ ', paste0(str_items, collapse = " + "), '
  BRS =~ ', paste0(brs_items, collapse = " + "), '

  Dep ~ ', rhs, '
  Anx ~ ', rhs, '
  Str ~ ', rhs, '

  Dep ~~ Anx
  Dep ~~ Str
  Anx ~~ Str
')

# Fit with all questionnaire items declared as ordered
fit_core <- lavaan::sem(
  model            = model_core,
  data             = dat,
  estimator        = "WLSMV",
  parameterization = "theta",
  ordered          = c(dass_items, brs_items),
  std.lv           = TRUE,
  control          = list(iter.max = 20000, rel.tol = 1e-6)
)

summary(fit_core, fit.measures = TRUE, standardized = TRUE, rsquare = TRUE)


lavaan->muthen1984():  
   trouble constructing W matrix; used generalized inverse for A11 submatrix 


In [23]:
# Reuse: dat (with sanitized names), dep_items/anx_items/str_items/brs_items,
# dass_items, and safe_exo (the SANITIZED dummy names).
# Split exogenous lists for each grouping: a multi-group model must not also
# contain the grouping variable's own dummies. These lists are later used to
# index columns of dat and to build lavaan syntax, so they must be derived
# from the sanitized names rather than from the pre-rename exo_covars.
exo_no_gender <- safe_exo[!grepl("^G_", safe_exo)]
exo_no_year   <- safe_exo[!grepl("^AY_", safe_exo)]


In [25]:
library(stringi)

# 1) Sanitize every column name in dat
#' Make names safe for model syntax (letters, digits, underscore only).
#'
#' @param x Character vector of names.
#' @return Character vector of the same length as `x`: every character outside
#'   `[A-Za-z0-9_]` becomes `_`, runs of `_` are collapsed, leading/trailing
#'   `_` are stripped, names that do not start with a letter get an `X_`
#'   prefix, and duplicates are disambiguated with `_1`, `_2`, ...
#'   Zero-length input returns `character(0)`.
sanitize_names <- function(x) {
  x <- gsub("[^A-Za-z0-9_]", "_", x)   # replace any non [A-Za-z0-9_] with _
  x <- gsub("_+", "_", x)              # collapse multiple _
  x <- gsub("^_+", "", x)              # remove leading _
  x <- gsub("_+$", "", x)              # remove trailing _
  # Prefix by logical index rather than ifelse(): ifelse() returns logical(0)
  # for empty input, which make.unique() rejects.
  needs_prefix <- !grepl("^[A-Za-z]", x)
  x[needs_prefix] <- paste0("X_", x[needs_prefix])  # must start with a letter
  make.unique(x, sep = "_")
}

# Apply the sanitizer to every column name of dat
old_names <- names(dat)
new_names <- sanitize_names(old_names)
dat <- setNames(dat, new_names)

# 2) Item name vectors, passed through the same sanitizer
dep_items <- sanitize_names(c("dQ3D","dQ5D","dQ10D","dQ13D","dQ16D","dQ17D","dQ21D"))
anx_items <- sanitize_names(c("dQ2A","dQ4A","dQ7A","dQ9A","dQ15A","dQ19A","dQ20A"))
str_items <- sanitize_names(c("dQ1S","dQ6S","dQ8S","dQ11S","dQ12S","dQ14S","dQ18S"))
dass_items <- c(dep_items, anx_items, str_items)

brs_items <- sanitize_names(c("rQ1","rQ3","rQ5","rQ2_r","rQ4_r","rQ6_r"))

# 3) Dummy predictors = every column whose name starts with a dummy prefix
safe_exo <- names(dat)[grepl("^(G_|AY_|DG_|RES_|LIV_)", names(dat))]

# 4) Re-apply the ordered coding of the items (DASS: 0-3, BRS: 1-5)
dat[dass_items] <- lapply(dat[dass_items], function(item) {
  ordered(as.integer(as.character(item)), levels = 0:3)
})
dat[brs_items] <- lapply(dat[brs_items], function(item) {
  ordered(as.integer(as.character(item)), levels = 1:5)
})


In [26]:
# Drop dummy predictors that are 1 in fewer than 10 rows (NAs are not counted)
is_dummy <- names(dat) %in% safe_exo
dummy_counts <- sapply(dat[is_dummy], function(col) sum(col == 1, na.rm = TRUE))
# threshold can be adjusted
keep <- names(dummy_counts)[!(dummy_counts < 10)]
safe_exo <- intersect(safe_exo, keep)


In [27]:
# Regression right-hand side: BRS plus the dummy predictors currently held
# in safe_exo
rhs <- paste0(c("BRS", safe_exo), collapse = " + ")

# Same model layout as before: measurement part, regressions on `rhs`,
# residual covariances among Dep/Anx/Str
model_core <- paste0('
  Dep =~ ', paste0(dep_items, collapse = " + "), '
  Anx =~ ', paste0(anx_items, collapse = " + "), '
  Str =~ ', paste0(str_items, collapse = " + "), '
  BRS =~ ', paste0(brs_items, collapse = " + "), '

  Dep ~ ', rhs, '
  Anx ~ ', rhs, '
  Str ~ ', rhs, '

  Dep ~~ Anx
  Dep ~~ Str
  Anx ~~ Str
')

fit_core <- lavaan::sem(
  model            = model_core,
  data             = dat,
  estimator        = "WLSMV",
  parameterization = "theta",
  ordered          = c(dass_items, brs_items),
  std.lv           = TRUE,
  control          = list(iter.max = 20000, rel.tol = 1e-6)
)
summary(fit_core, fit.measures = TRUE, standardized = TRUE, rsquare = TRUE)


lavaan->muthen1984():  
   trouble constructing W matrix; used generalized inverse for A11 submatrix 


In [29]:
# Rows usable for the gender multi-group model (grouping variable observed)
dat_g <- dplyr::filter(dat, !is.na(gender))
# Rows usable for the academic-year multi-group model
dat_y <- dplyr::filter(dat, !is.na(academic_year))


In [30]:
# exo_no_gender (or exo_no_year) was your sanitized dummy list excluding the group’s own dummies

#' Keep only covariates that vary within every observed group.
#'
#' @param data Data frame containing `group_var` and the columns in
#'   `exo_names`.
#' @param group_var Name (string) of the grouping column.
#' @param exo_names Character vector of candidate covariate column names.
#' @return The subset of `exo_names` (original order) for which every observed
#'   group has at least two distinct non-missing values. A covariate that is
#'   entirely missing in some group is dropped. Empty input returns an empty
#'   vector.
drop_group_constant <- function(data, group_var, exo_names) {
  groups <- data[[group_var]]
  # vapply() guarantees a logical index even when exo_names is empty
  # (sapply() would return list() and break the subsetting below).
  varies_everywhere <- vapply(exo_names, function(v) {
    # drop = TRUE: factor levels with no rows are not groups, so they must
    # not veto a covariate.
    by_group <- split(data[[v]], groups, drop = TRUE)
    all(vapply(by_group, function(x) {
      length(unique(x[!is.na(x)])) > 1L
    }, logical(1)))
  }, logical(1), USE.NAMES = FALSE)
  exo_names[varies_everywhere]
}

# Covariates for the gender model: must vary within each gender group
exo_gender_final <- drop_group_constant(
  data      = dat_g,
  group_var = "gender",
  exo_names = exo_no_gender
)

# Covariates for the academic-year model: must vary within each year group
exo_year_final <- drop_group_constant(
  data      = dat_y,
  group_var = "academic_year",
  exo_names = exo_no_year
)


In [31]:
# Regression right-hand side for the gender multi-group model
rhs_gender <- paste0(c("BRS", exo_gender_final), collapse = " + ")

model_mg_gender_free <- paste0('
  Dep =~ ', paste0(dep_items, collapse = " + "), '
  Anx =~ ', paste0(anx_items, collapse = " + "), '
  Str =~ ', paste0(str_items, collapse = " + "), '
  BRS =~ ', paste0(brs_items, collapse = " + "), '

  Dep ~ ', rhs_gender, '
  Anx ~ ', rhs_gender, '
  Str ~ ', rhs_gender, '

  Dep ~~ Anx
  Dep ~~ Str
  Anx ~~ Str
')

# Multi-group fit by gender; thresholds and loadings are constrained equal
# across groups via group.equal
fit_mg_gender_free <- lavaan::sem(
  model            = model_mg_gender_free,
  data             = dat_g,
  group            = "gender",
  group.equal      = c("thresholds","loadings"),
  estimator        = "WLSMV",
  parameterization = "theta",
  ordered          = c(dass_items, brs_items),
  std.lv           = TRUE
)
summary(fit_mg_gender_free, fit.measures = TRUE, standardized = TRUE, rsquare = TRUE)


In [32]:
# Regression right-hand side for the academic-year multi-group model
rhs_year <- paste0(c("BRS", exo_year_final), collapse = " + ")

model_mg_year_free <- paste0('
  Dep =~ ', paste0(dep_items, collapse = " + "), '
  Anx =~ ', paste0(anx_items, collapse = " + "), '
  Str =~ ', paste0(str_items, collapse = " + "), '
  BRS =~ ', paste0(brs_items, collapse = " + "), '

  Dep ~ ', rhs_year, '
  Anx ~ ', rhs_year, '
  Str ~ ', rhs_year, '

  Dep ~~ Anx
  Dep ~~ Str
  Anx ~~ Str
')

# Multi-group fit by academic year; thresholds and loadings are constrained
# equal across groups via group.equal
fit_mg_year_free <- lavaan::sem(
  model            = model_mg_year_free,
  data             = dat_y,
  group            = "academic_year",
  group.equal      = c("thresholds","loadings"),
  estimator        = "WLSMV",
  parameterization = "theta",
  ordered          = c(dass_items, brs_items),
  std.lv           = TRUE
)
summary(fit_mg_year_free, fit.measures = TRUE, standardized = TRUE, rsquare = TRUE)


1: lavaan->lav_object_post_check():  
   covariance matrix of latent variables is not positive definite in group 2; use lavInspect(fit, 
   "cov.lv") to investigate. 
2: lavaan->lav_object_post_check():  
   covariance matrix of latent variables is not positive definite in group 3; use lavInspect(fit, 
   "cov.lv") to investigate. 


In [43]:
# Create the output folder once. recursive = TRUE also creates "outputs" when
# it does not exist yet; without it the nested path cannot be created and,
# with warnings suppressed, that failure would go unnoticed until a later
# write fails.
dir.create("outputs/SEM", recursive = TRUE, showWarnings = FALSE)

#' Write a table to CSV, tolerating list/matrix columns and empty tables.
#'
#' @param x A data frame, or anything `as.data.frame()` can convert.
#' @param path Output file path. Missing parent directories are created.
#' @return Called for its side effect of writing `path`.
#'
#' List columns whose elements all have length <= 1 are flattened (empty
#' elements become NA); single-column matrix columns are converted to vectors;
#' any other list/matrix/array column is dropped. A table left with no columns
#' is written as a one-cell `note = "no data"` table, and a table with no rows
#' is written with a single all-NA row. Column names are passed through
#' `make.names(unique = TRUE)`.
safe_write_csv <- function(x, path) {
  # Convert to data frame if not already
  if (!inherits(x, "data.frame")) {
    x <- as.data.frame(x, stringsAsFactors = FALSE)
  }
  n_rows <- nrow(x)

  # Turn one non-atomic column into a plain vector of length n_rows,
  # or return NULL when that is not possible.
  flatten_column <- function(col) {
    flat <- NULL
    if (is.list(col)) {
      if (is.data.frame(col)) col <- as.list(col)
      lens <- lengths(col)
      if (all(lens <= 1L)) {
        # unlist() silently discards empty elements, which would shorten the
        # column; pad them with NA first.
        col[lens == 0L] <- list(NA)
        flat <- unlist(col, use.names = FALSE)
      }
    } else if (is.matrix(col) && ncol(col) == 1L) {
      flat <- as.vector(col)
    }
    if (is.null(flat) || length(flat) != n_rows) NULL else flat
  }

  bad_idx <- which(vapply(
    x, function(col) is.list(col) || is.array(col), logical(1)
  ))
  if (length(bad_idx) > 0L) {
    flattened <- lapply(bad_idx, function(j) flatten_column(x[[j]]))
    droppable <- vapply(flattened, is.null, logical(1))
    # Replace flattenable columns in place; positions stay stable because
    # nothing is removed until afterwards.
    for (k in which(!droppable)) {
      x[[bad_idx[k]]] <- flattened[[k]]
    }
    x <- x[setdiff(seq_along(x), bad_idx[droppable])]
  }

  # If zero columns, create a placeholder
  if (ncol(x) == 0) {
    x <- data.frame(note = "no data", stringsAsFactors = FALSE)
  }

  # If zero rows, write a one-row NA placeholder with the same columns
  if (nrow(x) == 0) {
    x[1, ] <- NA
  }

  # Ensure column names are unique and syntactic
  colnames(x) <- make.names(colnames(x), unique = TRUE)

  # Make sure the target folder exists so write.csv() cannot fail on it
  out_dir <- dirname(path)
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
  }

  utils::write.csv(x, path, row.names = FALSE)
}


In [45]:
library(dplyr)

# 1) Gender model: standardized BRS regression paths, one row per group/outcome
std_g <- standardizedSolution(fit_mg_gender_free)
brs_paths_gender <- dplyr::select(
  dplyr::filter(std_g, op == "~", rhs == "BRS", lhs %in% c("Dep","Anx","Str")),
  group, lhs, rhs, est.std, se, pvalue
)
safe_write_csv(brs_paths_gender, "outputs/SEM/mg_gender_BRS_paths_by_group.csv")

# 2) Gender model: "~1" rows for the three DASS factors, by group
pe_g <- parameterEstimates(fit_mg_gender_free, standardized = TRUE, ci = TRUE)
means_g <- dplyr::select(
  subset(pe_g, op == "~1" & lhs %in% c("Dep","Anx","Str")),
  group, lhs, est, se, ci.lower, ci.upper, pvalue
)
safe_write_csv(means_g, "outputs/SEM/mg_gender_latent_means.csv")

# 3) Gender model: selected fit indices as a one-row table
fits_gender <- data.frame(t(
  fitMeasures(fit_mg_gender_free, fit.measures = c("cfi","tli","rmsea","srmr","chisq","df"))
))
safe_write_csv(fits_gender, "outputs/SEM/mg_gender_fit.csv")

# 4) Academic-year model: the same three exports
std_y <- standardizedSolution(fit_mg_year_free)
brs_paths_year <- dplyr::select(
  dplyr::filter(std_y, op == "~", rhs == "BRS", lhs %in% c("Dep","Anx","Str")),
  group, lhs, rhs, est.std, se, pvalue
)
safe_write_csv(brs_paths_year, "outputs/SEM/mg_year_BRS_paths_by_group.csv")

pe_y <- parameterEstimates(fit_mg_year_free, standardized = TRUE, ci = TRUE)
means_y <- dplyr::select(
  subset(pe_y, op == "~1" & lhs %in% c("Dep","Anx","Str")),
  group, lhs, est, se, ci.lower, ci.upper, pvalue
)
safe_write_csv(means_y, "outputs/SEM/mg_year_latent_means.csv")

fits_year <- data.frame(t(
  fitMeasures(fit_mg_year_free, fit.measures = c("cfi","tli","rmsea","srmr","chisq","df"))
))
safe_write_csv(fits_year, "outputs/SEM/mg_year_fit.csv")


In [None]:
library(mice)
library(semTools)
library(dplyr)
library(haven)

# Variables to impute (policy_effectiveness removed)
mi_vars <- c("cgpa","family_income")

# Predictors that help imputation
aux_preds <- c("gender","academic_year","dept_group","residence","living_situation")

# Base data: re-read the processed file (column names not yet sanitized) and
# convert the labelled demographic columns to regular factors
dat_base <- readRDS("data/processed/analysis_phase1_ordered.rds") %>%
  mutate(across(all_of(aux_preds), ~ haven::as_factor(.x, levels = "labels")))

# Restrict the imputation data to targets + auxiliary predictors
keep_for_mi <- c(mi_vars, aux_preds)
mi_dat <- dat_base[, keep_for_mi, drop = FALSE]

# Predictive mean matching for both imputation targets
meth <- make.method(mi_dat)
meth[mi_vars] <- "pmm"

# Start from an all-zero predictor matrix, then let each target be predicted
# by the auxiliary variables only
pred <- make.predictorMatrix(mi_dat)
pred[,] <- 0
pred[mi_vars, aux_preds] <- 1

# Fixed seed so the 20 imputations are reproducible
set.seed(20250907)
imp <- mice(
  data            = mi_dat,
  m               = 20,
  maxit           = 20,
  method          = meth,
  predictorMatrix = pred,
  printFlag       = FALSE
)
saveRDS(imp, "outputs/SEM/mi_covariates_m20.rds")


Number of logged events: 800 
