In [5]:
library("effsize")     # For effect size calculations like Cohen's D
library(lme4)          # For linear mixed models (not used here)
library(lmerTest)      # Adds p-values to lme4 models (not used here)
library(stringr)       # String operations (not used here)
library(effects)       # For model effect visualization (not used here)
library(dplyr)

Loading required package: Matrix


Attaching package: ‘lmerTest’


The following object is masked from ‘package:lme4’:

    lmer


The following object is masked from ‘package:stats’:

    step


Loading required package: carData

lattice theme set by effectsTheme()
See ?effectsTheme for details.


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [39]:
# Run per-feature linear regressions against `disease_activity_category`
# and write the collected statistics to a TSV file.
#
# For every column of `data_df` from `first_feature_col` to the last column,
# the function computes:
#   * Cohen's d and the log2 ratio of group means between rows with
#     disease_activity_category == 1 and rows with
#     disease_activity_category == 0;
#   * the coefficient and p-value of disease_activity_category in a marginal
#     model (feature ~ disease_activity_category);
#   * the same two numbers from a model adjusted for sex, age, bmi, smoking,
#     pred, bdmard and all_csdmard.
# The adjusted p-values are then FDR-corrected across all features.
#
# Args:
#   data_df: data frame holding `disease_activity_category`, the covariates
#     listed above, and the feature columns.
#   output_dir: directory that receives the TSV (must already exist).
#   output_suffix: label embedded in the output file name and echoed in the
#     console summary line.
#   first_feature_col: position of the first feature column; every column
#     from here to the end is treated as a feature. Defaults to 38, the value
#     that used to be hard-coded.
#
# Returns:
#   The results data frame (one row per feature), invisibly.
run_linear_model <- function(data_df, output_dir, output_suffix,
                             first_feature_col = 38) {

    outcome <- "disease_activity_category"
    covariates <- c("sex", "age", "bmi", "smoking", "pred", "bdmard",
                    "all_csdmard")

    # Fail early with a readable message instead of a cryptic lm() error.
    missing_cols <- setdiff(c(outcome, covariates), colnames(data_df))
    if (length(missing_cols) > 0) {
        stop("data_df is missing required column(s): ",
             paste(missing_cols, collapse = ", "), call. = FALSE)
    }

    # `a:b` counts DOWN when a > b, so validate the start column explicitly.
    if (length(first_feature_col) != 1 || is.na(first_feature_col) ||
        first_feature_col < 1 || first_feature_col > ncol(data_df)) {
        stop("first_feature_col (", first_feature_col,
             ") is outside the column range of data_df (1..",
             ncol(data_df), ")", call. = FALSE)
    }
    feature_cols <- seq(first_feature_col, ncol(data_df))

    # Row positions of the two groups; which() drops rows whose category is
    # NA.
    category <- data_df[[outcome]]
    rows_cat1 <- which(category == 1)
    rows_cat0 <- which(category == 0)

    # Both formulas look up `y` in this function's environment, so they can
    # be built once and reused while `y` is reassigned for every feature.
    marginal_formula <- y ~ disease_activity_category
    adjusted_formula <- as.formula("y ~ disease_activity_category + sex + age + bmi + smoking + pred + bdmard + all_csdmard")

    # Collect one single-row data frame per feature and bind them once at
    # the end rather than growing a data frame inside the loop.
    per_feature <- vector("list", length(feature_cols))

    for (k in seq_along(feature_cols)) {
        col_idx <- feature_cols[k]
        feature_name <- colnames(data_df)[col_idx]

        # Always address the feature by position: with check.names = FALSE
        # column names may repeat, and lookup by name would silently return
        # the first match. `[[` also yields a plain vector for tibbles.
        y <- data_df[[col_idx]]
        values_cat1 <- y[rows_cat1]
        values_cat0 <- y[rows_cat0]

        # Effect size and log2 fold-change (category 1 over category 0)
        cohend <- cohen.d(values_cat1, values_cat0, na.rm = TRUE)
        log2fc <- log2(mean(values_cat1, na.rm = TRUE) /
                       mean(values_cat0, na.rm = TRUE))

        # Marginal model (no covariates)
        marginal_coefs <- coef(summary(lm(marginal_formula, data = data_df)))

        # Covariate-adjusted model
        adjusted_coefs <- coef(summary(lm(adjusted_formula, data = data_df)))

        per_feature[[k]] <- data.frame(
            feature = feature_name,
            cohen_d = unname(cohend$estimate),
            log2fc = log2fc,
            raw_coef = marginal_coefs[outcome, "Estimate"],
            raw_pval = marginal_coefs[outcome, "Pr(>|t|)"],
            adj_coef = adjusted_coefs[outcome, "Estimate"],
            adj_pval = adjusted_coefs[outcome, "Pr(>|t|)"],
            stringsAsFactors = FALSE
        )
    }

    results <- do.call(rbind, per_feature)

    # FDR-adjust the covariate-adjusted p-values for multiple testing
    results$adj_pval_fdr <- p.adjust(results$adj_pval, method = "fdr")

    # The count below uses the covariate-adjusted p-values BEFORE FDR
    # correction; na.rm keeps an undefined p-value from turning it into NA.
    cat(output_suffix, "Number of p-values under 0.01:",
        sum(results$adj_pval < 0.01, na.rm = TRUE), "\n")

    # Save results as TSV
    output_file <- file.path(
        output_dir,
        paste0("linear_regression.", output_suffix, ".metabolomics.tsv")
    )
    write.table(results, file = output_file, sep = "\t", row.names = FALSE,
                quote = FALSE)

    invisible(results)
}

In [40]:
# Split the cohort by ACPA code, derive a binary disease-activity outcome,
# and run the per-feature regressions once per ACPA subgroup.
#
# Rows with acpa == 2 are analysed as `acpa_neg` and rows with acpa == 1 as
# `acpa_pos`. In each subgroup `disease_activity_category` is 1 when
# das28crp > 3.2 and 0 otherwise (NA when das28crp is missing). The new
# column is placed right after the third column, so every later column moves
# one position to the right.
#
# Args:
#   input_data_df: data frame with `acpa`, `das28crp`, the model covariates
#     and the feature columns.
#   output_dir: directory passed through to run_linear_model() for the TSVs.
#
# Returns:
#   Whatever the last run_linear_model() call returns; the function is
#   called for its side effects (console summary lines and TSV files).
main <- function(input_data_df, output_dir) {

    # Build the analysis table for one ACPA code. `.env$` pins `acpa_code`
    # to this function's argument even if the data had a same-named column.
    prepare_subgroup <- function(acpa_code) {
        input_data_df %>%
            filter(acpa == .env$acpa_code) %>%
            mutate(disease_activity_category = if_else(das28crp > 3.2, 1, 0)) %>%
            relocate(disease_activity_category, .after = 3)
    }

    acpa_neg <- prepare_subgroup(2)
    acpa_pos <- prepare_subgroup(1)

    # Run models for each group comparison (high vs low disease activity)
    run_linear_model(acpa_neg, output_dir, "acpa_neg_highVSlow")
    run_linear_model(acpa_pos, output_dir, "acpa_pos_highVSlow")
}


In [41]:
# Where the regression result tables are written
output_dir <- "../../revision_analysis"

# Read the tab-separated metabolomics table; check.names = FALSE keeps the
# column names exactly as they appear in the file.
df <- read.delim(
    "../../preprocessed_data/metabolomics/metabolites.patient_info.tsv",
    header = TRUE,
    check.names = FALSE
)
input_data_df <- df

# Kick off the analysis for both ACPA subgroups
main(input_data_df, output_dir)

acpa_neg_highVSlow Number of p-values under 0.01: 35 
acpa_pos_highVSlow Number of p-values under 0.01: 5 
