In [1]:
# Load required libraries
library("effsize")     # For effect size calculations like Cohen's D
library(lme4)          # For linear mixed models (not used here)
library(lmerTest)      # Adds p-values to lme4 models (not used here)
library(stringr)       # String operations (not used here)
library(effects)       # For model effect visualization (not used here)
library(dplyr)         # Data manipulation

Loading required package: Matrix


Attaching package: ‘lmerTest’


The following object is masked from ‘package:lme4’:

    lmer


The following object is masked from ‘package:stats’:

    step


Loading required package: carData

lattice theme set by effectsTheme()
See ?effectsTheme for details.


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
#' Run per-feature linear regressions against the binary `acpa` group label.
#'
#' For every feature column (position `first_feature_col` onward) this
#' computes Cohen's d and the log2 fold-change between the rows coded
#' `acpa == 1` and the rows coded `acpa == 0`, fits an unadjusted model
#' (`y ~ acpa`) and a covariate-adjusted model, FDR-corrects the adjusted
#' p-values, prints a one-line summary and writes the table to
#' `<output_dir>/linear_regression.<output_suffix>.metabolomics.tsv`.
#'
#' @param data_df Data frame holding an `acpa` column coded 0/1, the
#'   covariate columns named in the adjusted formula, and the feature columns.
#' @param output_dir Directory for the TSV output; created when missing.
#' @param output_suffix Label of the comparison. `"negVSpos"` additionally
#'   adjusts for `das28crp`; any other value uses the shorter covariate set.
#' @param first_feature_col Position of the first feature column. Defaults to
#'   37, the offset this function has always used.
#' @return The results data frame (one row per feature), invisibly.
run_linear_model <- function(data_df, output_dir, output_suffix,
                             first_feature_col = 37) {

    # `37:ncol(data_df)` silently counts backwards on a narrow frame, so the
    # feature range is validated explicitly instead.
    n_cols <- ncol(data_df)
    if (!is.numeric(first_feature_col) || length(first_feature_col) != 1 ||
        is.na(first_feature_col) || first_feature_col < 1 ||
        first_feature_col > n_cols) {
        stop("first_feature_col (", first_feature_col,
             ") does not point at a column of data_df (", n_cols,
             " columns); no features to model.", call. = FALSE)
    }
    feature_idx <- seq.int(first_feature_col, n_cols)

    # The covariate set depends only on the comparison, so build it once.
    if (output_suffix == "negVSpos") {
        formula_str <- "y ~ acpa + sex + age + bmi + smoking + pred + bdmard + all_csdmard + das28crp"
    } else {
        formula_str <- "y ~ acpa + sex + age + bmi + smoking + pred + bdmard + all_csdmard"
    }
    # The formula captures this function's environment, which is where `y` is
    # (re)assigned on every iteration below. NOTE: a column literally named
    # `y` in data_df would take precedence over that local variable.
    adjusted_formula <- as.formula(formula_str)

    # which() drops rows whose acpa is NA instead of turning them into NA rows.
    rows_coded_1 <- which(data_df$acpa == 1)
    rows_coded_0 <- which(data_df$acpa == 0)

    # Preallocate one slot per feature rather than growing a frame with rbind.
    rows <- vector("list", length(feature_idx))

    for (k in seq_along(feature_idx)) {
        i <- feature_idx[k]
        feature_name <- colnames(data_df)[i]
        y <- data_df[[i]]

        # Feature values of the two groups (what 1 and 0 stand for depends on
        # how the caller recoded `acpa`). Missing values are removed once so
        # the effect size, the fold-change and lm() (which omits incomplete
        # rows by default) all describe the same observations.
        condition_a_list <- y[rows_coded_1]
        condition_b_list <- y[rows_coded_0]
        condition_a_list <- condition_a_list[!is.na(condition_a_list)]
        condition_b_list <- condition_b_list[!is.na(condition_b_list)]

        # Effect size and log2 fold-change (group 1 over group 0)
        cohend_value <- cohen.d(condition_a_list, condition_b_list)$estimate
        log2fc <- log2(mean(condition_a_list) / mean(condition_b_list))

        # Marginal model (no covariates)
        marginal_coefs <- coef(summary(lm(y ~ acpa, data = data_df)))

        # Covariate-adjusted model
        adjusted_coefs <- coef(summary(lm(adjusted_formula, data = data_df)))

        rows[[k]] <- data.frame(
            feature = feature_name,
            cohen_d = cohend_value,
            log2fc = log2fc,
            raw_coef = marginal_coefs["acpa", "Estimate"],
            raw_pval = marginal_coefs["acpa", "Pr(>|t|)"],
            adj_coef = adjusted_coefs["acpa", "Estimate"],
            adj_pval = adjusted_coefs["acpa", "Pr(>|t|)"],
            stringsAsFactors = FALSE
        )
    }
    results <- do.call(rbind, rows)

    # FDR-adjust the covariate-adjusted p-values for multiple testing
    results$adj_pval_fdr <- p.adjust(results$adj_pval, method = "fdr")
    # The count below uses the covariate-adjusted p-values *before* FDR
    # correction; na.rm keeps it numeric when a model yields an NA/NaN p-value.
    cat(output_suffix, "Number of p-values under 0.01:",
        sum(results$adj_pval < 0.01, na.rm = TRUE), "\n")

    # Save results as TSV, creating the target directory when needed
    dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)
    output_file <- file.path(
        output_dir,
        paste0("linear_regression.", output_suffix, ".metabolomics.tsv")
    )
    write.table(results, file = output_file, sep = "\t", row.names = FALSE, quote = FALSE)

    invisible(results)
}

In [None]:
# Entry point: derive the pairwise group comparisons from the combined table
# and run the regression pipeline on each of them.
#
# According to the recodes performed here, `acpa` in the input is coded
# 0 = control, 1 = ACPA-positive, 2 = ACPA-negative. Every comparison frame
# is reduced to a 0/1 `acpa` column before it is passed on:
#   - control vs ACPA-negative : 0 = control, 1 = neg
#   - control vs ACPA-positive : 0 = control, 1 = pos
#   - ACPA-neg vs ACPA-pos     : 0 = pos,     1 = neg
main <- function(input_data_df, output_dir){

    # Overwrite one `acpa` code with another inside a comparison frame
    recode_acpa <- function(group_df, from, to) {
        group_df$acpa[group_df$acpa == from] <- to
        group_df
    }

    # Controls and ACPA-negative patients; ACPA-negative (2) becomes 1
    cohort_control_neg <- recode_acpa(
        filter(input_data_df, acpa %in% c(0, 2)),
        from = 2, to = 1
    )

    # Controls and ACPA-positive patients; codes are already 0/1
    cohort_control_pos <- filter(input_data_df, acpa %in% c(0, 1))

    # ACPA-positive and ACPA-negative patients; pos (1) -> 0 first, then
    # neg (2) -> 1, in that order so the two recodes cannot collide
    cohort_neg_pos <- filter(input_data_df, acpa %in% c(1, 2))
    cohort_neg_pos <- recode_acpa(cohort_neg_pos, from = 1, to = 0)
    cohort_neg_pos <- recode_acpa(cohort_neg_pos, from = 2, to = 1)

    # Fit the models for each comparison (neg-vs-pos is currently disabled)
    run_linear_model(cohort_control_neg, output_dir, "cVSneg")
    run_linear_model(cohort_control_pos, output_dir, "cVSpos")
    # run_linear_model(cohort_neg_pos, output_dir, "negVSpos")
}


In [4]:
# Read the tab-separated metabolomics table (metabolite columns joined with
# patient information). check.names = FALSE keeps the column headers exactly
# as they appear in the file.
df <- read.delim(
    "../../../preprocessed_data/metabolomics/metabolites.patient_info.tsv",
    header = TRUE,
    check.names = FALSE
)

# Destination directory for the regression result tables
output_dir <- "../../../analysis/statistics/linear_regression/metabolomics"

# Hand the table to the analysis pipeline
input_data_df <- df
main(input_data_df, output_dir)

cVSneg Number of p-values under 0.01: 38 
cVSpos Number of p-values under 0.01: 7 
negVSpos Number of p-values under 0.01: 8 
