In [1]:
# Load necessary libraries
library("effsize")     # For computing effect size (e.g., Cohen's D)
library(lme4)          # For linear mixed-effects models (not used in this script)
library(lmerTest)      # Adds p-values to lmer models (not used in this script)
library(stringr)       # String handling (not used in this script)
library(effects)       # For visualizing model effects (not used here)
library(dplyr)         # For data wrangling (used for filtering)

Loading required package: Matrix


Attaching package: ‘lmerTest’


The following object is masked from ‘package:lme4’:

    lmer


The following object is masked from ‘package:stats’:

    step


Loading required package: carData

lattice theme set by effectsTheme()
See ?effectsTheme for details.


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [2]:
# Fit per-feature linear models for a binary `acpa` contrast and save the results.
#
# For every feature column (from `first_feature_col` through the last column)
# this computes:
#   * Cohen's d and the log2 fold change between the acpa == 1 rows and the
#     acpa == 0 rows (missing values are ignored in both statistics),
#   * the `acpa` coefficient and p-value of a marginal model (y ~ acpa),
#   * the `acpa` coefficient and p-value of a covariate-adjusted model
#     (y ~ acpa + sex + age + bmi + smoking).
# The covariate-adjusted p-values are additionally FDR-corrected across
# features. The table is written as a tab-separated file to
# <output_dir>/linear_regression.<output_suffix>.proteomics.tsv.
#
# Args:
#   data_df:           Data frame containing the columns acpa, sex, age, bmi,
#                      smoking, plus the feature columns. `acpa` is expected
#                      to be numeric and coded 0/1 so that its coefficient row
#                      is named "acpa".
#   output_dir:        Existing directory that receives the result table.
#   output_suffix:     Label used in the output file name and in the printed
#                      summary line.
#   first_feature_col: Index of the first feature column; all columns from
#                      here to the end are treated as features (default 37,
#                      the value previously hard-coded).
#   p_thresh:          Threshold used for the printed count (default 0.01).
#
# Returns:
#   The results data frame, invisibly (one row per feature).
#
# NOTE(review): the printed count is based on the covariate-adjusted p-values
# (`adj_pval`), not on the FDR-corrected column (`adj_pval_fdr`) -- confirm
# that this is the intended summary.
run_linear_model <- function(data_df, output_dir, output_suffix,
                             first_feature_col = 37, p_thresh = 0.01){

    covariates <- c("sex", "age", "bmi", "smoking")
    model_vars <- c("acpa", covariates)

    # Fail early with a clear message instead of an obscure lm() error
    missing_vars <- model_vars[!model_vars %in% colnames(data_df)]
    if (length(missing_vars) > 0) {
        stop("data_df is missing required column(s): ",
             paste(missing_vars, collapse = ", "), call. = FALSE)
    }
    if (first_feature_col < 1 || first_feature_col > ncol(data_df)) {
        # Guards against `a:b` silently counting backwards when b < a
        stop("first_feature_col (", first_feature_col,
             ") is outside the column range of data_df (1..",
             ncol(data_df), ")", call. = FALSE)
    }

    feature_idx <- seq(first_feature_col, ncol(data_df))
    feature_names <- colnames(data_df)[feature_idx]

    # Loop-invariant pieces: group membership, model frame and formula.
    # which() drops rows whose acpa is NA instead of propagating NA rows.
    rows_a <- which(data_df$acpa == 1)
    rows_b <- which(data_df$acpa == 0)
    model_df <- as.data.frame(data_df[, model_vars, drop = FALSE])
    adj_formula <- reformulate(model_vars, response = "y")

    stat_names <- c("cohen_d", "log2fc", "raw_coef", "raw_pval",
                    "adj_coef", "adj_pval")

    # One column of statistics per feature; vapply avoids growing a data
    # frame with rbind() inside a loop.
    stats <- vapply(feature_idx, function(i) {
        y <- data_df[[i]]
        group_a <- y[rows_a]
        group_b <- y[rows_b]

        # Effect size and fold change, both ignoring missing values
        cohend_value <- unname(cohen.d(group_a, group_b, na.rm = TRUE)$estimate)
        log2fc <- log2(mean(group_a, na.rm = TRUE) / mean(group_b, na.rm = TRUE))

        # The response lives in the model frame, so a data column that
        # happens to be called "y" can never shadow the current feature.
        model_df$y <- y

        # Marginal model (no covariates)
        marginal_coefs <- coef(summary(lm(y ~ acpa, data = model_df)))
        # Full model with covariates
        adjusted_coefs <- coef(summary(lm(adj_formula, data = model_df)))

        c(cohend_value,
          log2fc,
          marginal_coefs["acpa", "Estimate"],
          marginal_coefs["acpa", "Pr(>|t|)"],
          adjusted_coefs["acpa", "Estimate"],
          adjusted_coefs["acpa", "Pr(>|t|)"])
    }, FUN.VALUE = setNames(numeric(length(stat_names)), stat_names))

    results <- data.frame(feature = feature_names, t(stats),
                          stringsAsFactors = FALSE)

    # Adjust p-values for multiple testing using FDR
    results$adj_pval_fdr <- p.adjust(results$adj_pval, method = "fdr")
    n_below <- sum(results$adj_pval < p_thresh, na.rm = TRUE)
    cat(output_suffix, "Number of adjusted p-values under",
        paste0(p_thresh, ":"), n_below, "\n")

    # Save result table
    output_file <- file.path(
        output_dir,
        paste0("linear_regression.", output_suffix, ".proteomics.tsv")
    )
    write.table(results, file = output_file, sep = "\t",
                row.names = FALSE, quote = FALSE)

    invisible(results)
}

In [3]:
# Main function: build the pairwise comparison datasets and run the models.
#
# The input is first restricted to rows with all_csdmard == 0, bdmard == 0 and
# pred == 0 (presumably the treatment-naive subset -- confirm against the data
# dictionary). The `acpa` column is expected to be coded as
#   0 = control, 1 = ACPA-positive, 2 = ACPA-negative
# and is recoded to 0/1 per comparison so that run_linear_model() always sees
# a binary contrast:
#   cVSneg   : 0 = control,       1 = ACPA-negative
#   cVSpos   : 0 = control,       1 = ACPA-positive
#   negVSpos : 0 = ACPA-positive, 1 = ACPA-negative
#
# Args:
#   input_data_df:      Data frame with patient information and feature columns.
#   output_dir:         Directory passed on to run_linear_model() for results.
#   include_neg_vs_pos: If TRUE, additionally run the ACPA-negative vs
#                       ACPA-positive comparison ("negVSpos"). Defaults to
#                       FALSE, which matches the previous behaviour where this
#                       comparison was disabled; its dataset is now only
#                       built when it is actually needed.
#
# Returns:
#   NULL, invisibly. Called for its side effects (result files, console output).
main <- function(input_data_df, output_dir, include_neg_vs_pos = FALSE){

    # dplyr::filter is namespace-qualified so that stats::filter (or a
    # package attached later) can never be picked up by accident.
    subset_data_df <- dplyr::filter(
        input_data_df,
        all_csdmard == 0 & bdmard == 0 & pred == 0
    )

    # Prepare control vs ACPA-negative
    control_vs_acpa_neg <- dplyr::filter(subset_data_df, acpa == 0 | acpa == 2)
    control_vs_acpa_neg$acpa[control_vs_acpa_neg$acpa == 2] <- 1   # Recode ACPA-neg as 1

    # Prepare control vs ACPA-positive (already coded 0/1)
    control_vs_acpa_pos <- dplyr::filter(subset_data_df, acpa == 0 | acpa == 1)

    # Run models for each comparison
    run_linear_model(control_vs_acpa_neg, output_dir, "cVSneg")
    run_linear_model(control_vs_acpa_pos, output_dir, "cVSpos")

    if (isTRUE(include_neg_vs_pos)) {
        # Prepare ACPA-negative vs ACPA-positive.
        # Order matters: 1 -> 0 must happen before 2 -> 1, otherwise the
        # freshly recoded ACPA-negative rows would be turned into 0 as well.
        acpa_neg_vs_acpa_pos <- dplyr::filter(subset_data_df, acpa == 1 | acpa == 2)
        acpa_neg_vs_acpa_pos$acpa[acpa_neg_vs_acpa_pos$acpa == 1] <- 0   # ACPA-pos -> 0
        acpa_neg_vs_acpa_pos$acpa[acpa_neg_vs_acpa_pos$acpa == 2] <- 1   # ACPA-neg -> 1
        run_linear_model(acpa_neg_vs_acpa_pos, output_dir, "negVSpos")
    }

    invisible(NULL)
}



In [4]:
# Where the per-comparison result tables are written
protomics_output_dir  <- "../../../analysis/statistics/linear_regression_treatment_naive/proteomics"

# Load the tab-separated proteomics table; check.names = FALSE keeps the
# original column headers untouched
proteomics_df <- read.delim("../../../preprocessed_data/proteomics/proteomics.patient_info.tsv", check.names = FALSE)
input_data_df <- proteomics_df

# Run the analysis on the loaded table
main(input_data_df, protomics_output_dir)

cVSneg Number of adjusted p-values under 0.01: 404 
cVSpos Number of adjusted p-values under 0.01: 71 
