In [40]:
#01_DifferentialAbundance_and_cohens_D
#
#use linear model to identify differentially abundant features (for each omics)
#[1] features (or predictors) that have significant coefficients (p < 0.05; P < 0.01)
#           -> whether ACPA status (predictor) has affected the abundance of the feature (response).
#[2] features with Cohen's d above medium (i.e., 0.5)
#features that fulfill [1] + [2] will be considered differentially abundant.

library("effsize")
library(lme4)
library(lmerTest)
library(stringr)
library(effects)
library(dplyr)


In [41]:
# Run the young-vs-old comparison for a single input table.
#
# Args:
#   input_data_file: path to a tab-separated file with a header line; its first
#                    column supplies the row names.
#   output_dir:      prefix of the output path (passed through unchanged).
#   output_str:      label inserted into the output file name.
#
# Returns whatever run_cohenD_and_glm() returns.
main <- function(input_data_file, output_dir, output_str){

    # Read the table and transpose it, so that input rows (e.g. the
    # `age_status` row) become columns of the working data frame.
    raw_table <- read.delim(input_data_file, row.names = 1)
    transposed_df <- as.data.frame(t(raw_table))

    # young vs old: keep only rows whose age_status is 1 or 0
    two_group_df <- filter(transposed_df, age_status == 1 | age_status == 0)

    # Group 1 is compared against group 0; '.youngVSold' tags the output file.
    run_cohenD_and_glm(two_group_df, 1, 0, output_dir, output_str, '.youngVSold')
}

# For every feature column, compute Cohen's d between two age_status groups and
# fit a linear model of the feature on the predictor column, then write one
# result row per feature to a tab-separated file.
#
# Args:
#   data_df:         data frame containing an `age_status` column. Columns 1-3
#                    are skipped; every column from the 4th onward is treated
#                    as a feature.
#   condition_a_num: age_status value selecting the first group for Cohen's d.
#   condition_b_num: age_status value selecting the second group for Cohen's d.
#   output_dir, output_str, output_type:
#                    concatenated without separator and suffixed with '.tsv'
#                    to form the output path.
#
# Output file: header "\tcoef\tcohenD\tpval", then one line per feature with
# the feature name, the model coefficient of the predictor, the (signed)
# Cohen's d estimate, and the p-value of that coefficient. Any existing file
# at the output path is overwritten.
#
# Returns NULL invisibly.
run_cohenD_and_glm <- function(data_df, condition_a_num, condition_b_num, output_dir, output_str, output_type){

    FIRST_FEATURE_COL <- 4  # columns 1-3 are not analysed as features
    PREDICTOR_COL <- 2      # NOTE(review): presumably the age_status column -- confirm

    # the per-group subsets are only needed for Cohen's d
    temp_condition_a_df <- filter(data_df, age_status == condition_a_num)
    temp_condition_b_df <- filter(data_df, age_status == condition_b_num)

    output_txt <- paste0(output_dir, output_str, output_type, '.tsv')

    # Opening in "w" mode truncates a pre-existing file, so no explicit delete
    # is needed, and the file is opened once instead of once per feature.
    output_con <- file(output_txt, open = "w")
    on.exit(close(output_con), add = TRUE)

    cat("\tcoef\tcohenD\tpval\n", file = output_con)

    column_idx <- seq_len(ncol(data_df))
    feature_idx <- column_idx[column_idx >= FIRST_FEATURE_COL]

    for (i in feature_idx) {
        feature <- colnames(data_df)[i]

        # Cohen's d between the two populations (sign is kept, not abs())
        cohend <- cohen.d(temp_condition_a_df[, i], temp_condition_b_df[, i])
        cohend_value <- cohend$estimate

        # linear model of the feature on the predictor column (default
        # gaussian family); summarise once and reuse the coefficient table
        glm_results <- glm(data_df[, i] ~ data_df[, PREDICTOR_COL])
        coef_table <- coef(summary(glm_results))

        # second entry = the predictor term (first is the intercept);
        # vector indexing yields NA instead of an error if the term is missing
        feature_coef <- coef_table[, 1][2]
        feature_pval <- coef_table[, 4][2]

        output_string <- paste0(feature, "\t", feature_coef, "\t", cohend_value, "\t", feature_pval, "\n")
        cat(output_string, file = output_con)
    }

    invisible(NULL)
}


In [42]:
#Main
# Runs the young-vs-old differential abundance analysis separately for the
# ACPA-positive, ACPA-negative and control input tables; all results are
# written into output_dir.
output_dir <- '../../../analysis/age_stratified/differential_abundance_acpa_specific/'

#make directory if it does not exist
# recursive = TRUE also creates missing parent directories; without it
# dir.create() only warns and the later file writes fail.
if (!dir.exists(output_dir)){
    dir.create(output_dir, recursive = TRUE)
} else {
    print("Dir already exists!")
}

data_file <- '../../../preprocessed_data/age_stratified/high_low_age_3_omics.acpa_pos.tsv'
main(data_file, output_dir, 'high_low_acpa_pos')

data_file <- '../../../preprocessed_data/age_stratified/high_low_age_3_omics.acpa_neg.tsv'
main(data_file, output_dir, 'high_low_acpa_neg')

data_file <- '../../../preprocessed_data/age_stratified/high_low_age_3_omics.control.tsv'
main(data_file, output_dir, 'high_low_control')


[1] "Dir already exists!"
