In [13]:
library(dplyr); library(readr)

# Phase C data prep: build the complete-case item matrix for the shortlisted
# DASS items (main + alternates) and save it with a metadata list describing
# variable roles.

# Load item-level ordered data (same coding as CFA/SEM)
items <- readRDS("outputs/PhaseA_prep/items_ordered_wide.rds")

# Load finalized shortlists (main + alternates)
dass_main <- read_csv("outputs/PhaseA_A3/final_shortlists/DASS_shortlist_main.csv", show_col_types = FALSE)
dass_alt  <- read_csv("outputs/PhaseA_A3/final_shortlists/DASS_shortlist_alternates.csv", show_col_types = FALSE)

# Both shortlists are read through their `item` column below; fail early if it
# is absent instead of letting `$item` quietly yield NULL.
stopifnot("item" %in% names(dass_main), "item" %in% names(dass_alt))

# Candidate item names (primary set + alternates for swap if needed)
shortlisted_items <- unique(c(dass_main$item, dass_alt$item))

# Confirm availability in dataset, reporting (not silently dropping) any
# shortlisted item that has no matching column.
missing_items <- setdiff(shortlisted_items, names(items))
if (length(missing_items) > 0) {
  warning(
    "Shortlisted items not found in item data and excluded: ",
    paste(missing_items, collapse = ", "),
    call. = FALSE
  )
}
cand_items <- intersect(shortlisted_items, names(items))
if (length(cand_items) == 0) {
  stop("None of the shortlisted items are present in the item data.", call. = FALSE)
}

# Prepare grouping variables for DIF (the `id` column is also required below)
grp_vars <- c("gender","academic_year")
stopifnot(all(c("id", grp_vars) %in% names(items)))

# Keep rows with complete responses on candidate items for stable GRM (document the drop count)
dat_phaseC0 <- items[, c("id", grp_vars, cand_items), drop = FALSE]
n0 <- nrow(dat_phaseC0)
dat_phaseC  <- dat_phaseC0[stats::complete.cases(dat_phaseC0[, cand_items, drop = FALSE]), ]
message("Phase C baseline N=", nrow(dat_phaseC), " (dropped ", n0 - nrow(dat_phaseC), " rows with missing in candidate items)")

# Save the clean matrix and a separate meta file with variable roles
dir.create("outputs/PhaseC_prep", recursive = TRUE, showWarnings = FALSE)
saveRDS(dat_phaseC, "outputs/PhaseC_prep/phaseC_item_matrix.rds")

meta <- list(
  item_names = cand_items,
  groups = grp_vars,
  # Restrict to items actually present so primary_items + alternate_items
  # always partition item_names.
  primary_items = intersect(dass_main$item, cand_items),
  alternate_items = setdiff(cand_items, dass_main$item),
  coding_note = "Items retain original ordered categories (DASS 0-3); do not recode before GRM."
)
saveRDS(meta, "outputs/PhaseC_prep/phaseC_meta.rds")
message("Phase C prep complete. Data and metadata saved to outputs/PhaseC_prep/")

Phase C baseline N=791 (dropped 60 rows with missing in candidate items)
Phase C prep complete. Data and metadata saved to outputs/PhaseC_prep/


In [14]:
# Verify factor levels and build a numeric matrix for mirt
phaseC <- readRDS("outputs/PhaseC_prep/phaseC_item_matrix.rds")
meta   <- readRDS("outputs/PhaseC_prep/phaseC_meta.rds")

item_cols <- phaseC[, meta$item_names, drop = FALSE]

# Check levels for each item (NA for columns that are not factors).
# vapply pins the result to a character vector regardless of input size.
lvl_tbl <- vapply(item_cols, function(x) {
  if (is.factor(x)) paste(levels(x), collapse = ",") else NA_character_
}, character(1))
readr::write_csv(
  data.frame(item = names(lvl_tbl), levels = as.vector(lvl_tbl)),
  "outputs/PhaseC_prep/levels_report.csv"
)

# Convert to numeric scores (keeping the original category numbers).
# Factors go through their labels, not their internal integer codes.
to_num <- function(x) {
  if (is.factor(x)) as.numeric(as.character(x)) else as.numeric(x)
}
X_items <- as.data.frame(lapply(item_cols, to_num))
# as.data.frame() may rewrite names via make.names(); restore the originals so
# columns keep matching meta$item_names.
names(X_items) <- meta$item_names

# Non-numeric labels coerce to NA; stop rather than save a matrix whose
# responses were silently blanked out by the conversion.
introduced_na <- vapply(seq_along(item_cols), function(j) {
  any(is.na(X_items[[j]]) & !is.na(item_cols[[j]]))
}, logical(1))
if (any(introduced_na)) {
  stop(
    "Numeric conversion introduced NAs (non-numeric category labels?) in: ",
    paste(meta$item_names[introduced_na], collapse = ", "),
    call. = FALSE
  )
}

# Assemble group vectors
grp_gender <- as.factor(phaseC$gender)
grp_year   <- as.factor(phaseC$academic_year)

# Save matrices ready for mirt
saveRDS(list(X = X_items, gender = grp_gender, year = grp_year, id = phaseC$id),
        "outputs/PhaseC_prep/mirt_inputs.rds")


In [15]:
# Model spec strings for mirt
# Builds a unidimensional spec and a bifactor spec over meta$item_names and
# saves both together with the per-domain item index vectors.
n_items <- length(meta$item_names)

# Unidimensional general distress (preferred if fit is adequate)
model_uni <- mirt::mirt.model(paste0("G = 1-", n_items))

# Bifactor: General + three domain specifics (Dep, Anx, Str) if items are labeled with suffix D/A/S
# Build index vectors by suffix
# NOTE(review): assumes item names end in D, A or S to mark the domain; confirm
# against the shortlist naming before relying on these indices.
dep_idx <- which(grepl("D$", meta$item_names))
anx_idx <- which(grepl("A$", meta$item_names))
str_idx <- which(grepl("S$", meta$item_names))

# Only define spec groups that have >= 2 items
spec_lines <- character(0)
if (length(dep_idx) >= 2) spec_lines <- c(spec_lines, paste0("Dep = ", paste(dep_idx, collapse = ",")))
if (length(anx_idx) >= 2) spec_lines <- c(spec_lines, paste0("Anx = ", paste(anx_idx, collapse = ",")))
if (length(str_idx) >= 2) spec_lines <- c(spec_lines, paste0("Str = ", paste(str_idx, collapse = ",")))

if (length(spec_lines) == 0) {
  warning(
    "No domain-specific factor has >= 2 items; the bifactor spec reduces to the general factor only.",
    call. = FALSE
  )
}

# General factor loads on all items; specifics on domain subsets.
# No COV line on purpose: in mirt.model syntax, COV lists covariances to be
# freely ESTIMATED, so omitting it keeps G and the specifics orthogonal, as a
# bifactor model requires. It also avoids referencing a specific factor that
# was skipped above for having fewer than 2 items.
model_bi_txt <- c(
  paste0("G = 1-", n_items),
  spec_lines
)
model_bi <- mirt::mirt.model(paste(model_bi_txt, collapse = "\n"))

saveRDS(list(model_uni = model_uni, model_bi = model_bi,
             dep_idx = dep_idx, anx_idx = anx_idx, str_idx = str_idx),
        "outputs/PhaseC_prep/mirt_models.rds")


In [16]:
# Decision rules for Phase C, assembled from thematic groups into one flat
# named list (same names, order and values) and saved alongside the prep data.
phaseC_rules <- c(
  # Item diagnostics
  list(
    min_discrimination = 1.0,          # prefer a >= 1.0
    prefer_ordered_thresholds = TRUE,
    max_residual_corr = 0.20           # LD threshold; investigate pairs above
  ),
  # Precision target (screening band)
  list(
    theta_band = c(0, 2),              # moderate-to-high trait region
    min_reliability_band = 0.80        # marginal reliability target in band
  ),
  # DIF testing
  list(
    dif_contrasts = c("gender", "academic_year"),
    dif_method = "multipleGroup_LRT",  # baseline equal slopes/thresholds, free means/vars
    p_adjust = "BH",                   # FDR control if many tests
    swap_preference = TRUE             # prefer swapping over parameter adjustments
  ),
  # Calibration and ROC
  list(
    calibration_target = "SEM_latent",    # link to Phase-3 latent distress
    roc_criterion = "cluster_or_latent",  # either Phase-4 cluster label or SEM threshold
    auc_floor = 0.80,
    cut_selection = "Youden_and_resource" # report Youden and a resource-constrained option
  )
)
saveRDS(object = phaseC_rules, file = "outputs/PhaseC_prep/phaseC_rules.rds")


In [17]:
# Output folders for the downstream Phase C stages, one per line.
dirs <- c(
  "outputs/PhaseC_fit",
  "outputs/PhaseC_DIF",
  "outputs/PhaseC_info",
  "outputs/PhaseC_scoring",
  "outputs/PhaseC_calibration",
  "outputs/PhaseC_ROC"
)
# Create each folder (parents included), staying quiet if it already exists.
invisible(vapply(
  dirs,
  dir.create,
  logical(1),
  recursive = TRUE,
  showWarnings = FALSE
))

# Readiness marker plus a record of the R session used for the prep step.
writeLines(
  text = "Phase C prep completed: data, models, and rules ready.",
  con = "outputs/PhaseC_prep/README.txt"
)
writeLines(
  text = capture.output(sessionInfo()),
  con = "outputs/PhaseC_prep/sessionInfo_phaseC_prep.txt"
)
