In [2]:
#01_DifferentialAbundance_and_cohens_D
#
#Use a linear model to identify differentially abundant features (for each omics dataset).
#[1] Features (responses) whose ACPA-status coefficient is significant (p < 0.05; p < 0.01)
#           -> i.e., whether ACPA status (predictor) has affected the abundance of the feature (response).
#[2] Features with a Cohen's d above medium (i.e., 0.5).
#Features that fulfill both [1] and [2] are considered differentially abundant.

library("effsize")
library(lme4)
library(lmerTest)
library(stringr)
library(effects)
library(dplyr)


In [3]:
# Run the four ACPA group comparisons for one omics dataset.
#
# Both input files are read as tab-separated tables whose first column holds the
# row names. The first row of the patient info table (used below as the `acpa`
# column) is stacked on top of the feature table and the result is transposed,
# so each row is a sample and each column is `acpa` or a feature.
#
# Group codes, as implied by the comparison labels below:
#   0 = control, 1 = ACPA-positive, 2 = ACPA-negative.
# Every comparison is recoded to 0 vs 1 before being handed to
# run_cohenD_and_glm(), which writes one TSV per comparison.
#
# Args:
#   input_data_file:   path to the feature table (features in rows, samples in columns).
#   patient_info_file: path to the patient info table (first row = ACPA status).
#   output_dir:        directory prefix for the output files.
#   output_str:        omics label used as the output file name stem.
main <- function(input_data_file, patient_info_file, output_dir, output_str){

    feature_table <- read.csv(input_data_file, sep="\t", header=TRUE, row.names=1)
    patient_table <- read.csv(patient_info_file, sep="\t", header=TRUE, row.names=1)

    # prepend the acpa row, then flip to samples-in-rows
    samples_df <- as.data.frame(t(rbind(patient_table[1,], feature_table)))

    # replace one acpa code with another and hand back the modified copy
    relabel <- function(df, from, to){
        df$acpa[df$acpa == from] <- to
        df
    }

    # file-name suffix -> two-group data frame (always coded 0 vs 1)
    comparisons <- list(
        # control vs RA: merge both RA classes (2 -> 1)
        '.cVSra' = relabel(samples_df, 2, 1),
        # control vs acpa neg: keep 0 and 2, then 2 -> 1
        '.cVSneg' = relabel(filter(samples_df, acpa == 0 | acpa == 2), 2, 1),
        # control vs acpa pos: already 0 vs 1
        '.cVSpos' = filter(samples_df, acpa == 0 | acpa == 1),
        # acpa neg vs acpa pos: keep 1 and 2, then 1 -> 0 followed by 2 -> 1
        '.negVSpos' = relabel(relabel(filter(samples_df, acpa == 1 | acpa == 2), 1, 0), 2, 1)
    )

    for (suffix in names(comparisons)){
        run_cohenD_and_glm(comparisons[[suffix]], 0, 1, output_dir, output_str, suffix)
    }

}

# Compare two ACPA groups feature by feature and write the results to a TSV file.
#
# For every column of `data_df` other than `acpa`, one output row is produced with
#   coef   - estimate of the group term from glm(feature ~ acpa) (glm's default family),
#   cohenD - absolute value of Cohen's d between the two groups (cohen.d from effsize),
#   pval   - 4th column of the glm coefficient table for the group term.
# `coef` and `pval` are NA when the model has no group term (e.g. only one group present).
#
# The file is written to paste0(output_dir, output_str, output_type, ".tsv"); any
# existing file at that path is replaced.
#
# Args:
#   data_df:         data frame with one row per sample, a column named `acpa`
#                    holding the group codes, and feature values in all other columns.
#   condition_a_num: `acpa` value identifying the first group for Cohen's d.
#   condition_b_num: `acpa` value identifying the second group for Cohen's d.
#   output_dir:      directory prefix; concatenated as-is, so it must end with a path separator.
#   output_str:      file name stem (e.g. the omics label).
#   output_type:     suffix identifying the comparison (e.g. '.cVSra').
#
# Returns:
#   The output file path, invisibly.
run_cohenD_and_glm <- function(data_df, condition_a_num, condition_b_num, output_dir, output_str, output_type){

    if (!("acpa" %in% colnames(data_df))) {
        stop("data_df must contain a column named 'acpa'", call. = FALSE)
    }

    # Use the named `acpa` column for BOTH the group split and the regression so the
    # two statistics can never refer to different columns.
    group <- data_df[["acpa"]]
    in_group_a <- group %in% condition_a_num   # %in% never yields NA, so NA codes are excluded
    in_group_b <- group %in% condition_b_num
    feature_idx <- which(colnames(data_df) != "acpa")

    output_txt <- paste0(output_dir, output_str, output_type, ".tsv")
    if (file.exists(output_txt)) {
        # drop stale results up front so a failed run cannot leave an old table behind
        file.remove(output_txt)
    }

    result_rows <- vapply(feature_idx, function(j) {
        values <- data_df[[j]]

        #calculate cohens D between two population
        cohend_value <- abs(cohen.d(values[in_group_a], values[in_group_b])$estimate)

        #calculate the significance of the linear model (summary computed once)
        coef_table <- coef(summary(glm(values ~ group)))
        has_group_term <- nrow(coef_table) >= 2
        feature_coef <- if (has_group_term) coef_table[2, 1] else NA
        feature_pval <- if (has_group_term) coef_table[2, 4] else NA

        paste(colnames(data_df)[j], feature_coef, cohend_value, feature_pval, sep = "\t")
    }, character(1))

    # single write instead of one file append per feature
    writeLines(c("\tcoef\tcohenD\tpval", result_rows), con = output_txt)

    invisible(output_txt)
}


In [6]:
#Main
# Output location for the differential-abundance tables. The trailing slash matters:
# run_cohenD_and_glm() concatenates this string directly with the file name.
output_dir <- '../../../analysis/statistics/linear_model/differential_abundance_v2/'

#make directory if it does not exist
# recursive = TRUE: the path is several levels deep; without it dir.create() only
# warns when a parent directory is missing and every later write would fail.
if (!dir.exists(output_dir)){
    dir.create(output_dir, recursive = TRUE)
} else {
    print("Dir already exists!")
}

# Preprocessed input tables, one per omics platform, plus the shared patient metadata.
autoantibody_data_file <- '../../../preprocessed_data/autoantibody/sengenics_qnorm_data.v2.tsv'
proteomics_data_file <- '../../../preprocessed_data/proteomics/somascan_anml.T.v2.tsv'
metabolomics_data_file <- '../../../preprocessed_data/metabolomics/metabolone_raw_norm_preprocessed.v2.tsv'
patient_info_file <- '../../../preprocessed_data/meta/patient_info.ML_ready.tsv'

[1] "Dir already exists!"


In [7]:
# Run the full set of comparisons once per omics dataset; the list name doubles as
# the output file name stem passed to main().
omics_inputs <- list(
    autoantibody = autoantibody_data_file,
    metabolomics = metabolomics_data_file,
    proteomics = proteomics_data_file
)
for (omics_name in names(omics_inputs)){
    main(omics_inputs[[omics_name]], patient_info_file, output_dir, omics_name)
}
