In [1]:
#01_DifferentialAbundance_and_cohens_D
#
#use a linear model to identify differentially abundant features (for each omics)
#[1] features (or predictors) that have significant coefficients (p < 0.05; p < 0.01)
#           -> whether ACPA status (predictor) has affected the abundance of the feature (response).
#[2] features with a Cohen's d above medium (i.e., 0.5)
#[3] adjust for confounding effects if they are significant in the marginal model.
#features that fulfill [1] + [2] will be considered differentially abundant.

library("effsize")
library(lme4)
library(lmerTest)
library(stringr)
library(effects)
library(dplyr)


Loading required package: Matrix


Attaching package: ‘lmerTest’


The following object is masked from ‘package:lme4’:

    lmer


The following object is masked from ‘package:stats’:

    step


Loading required package: carData

lattice theme set by effectsTheme()
See ?effectsTheme for details.


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union




In [4]:
# Run the ACPA-negative vs. ACPA-positive comparison for one omics table.
#
# Both input files are tab-separated with a header line and row names in the
# first column. Only the columns whose names occur in BOTH tables are taken
# from the patient table (presumably these are sample identifiers - confirm
# against the preprocessing step). The stacked table is transposed so that the
# former rows become columns; the patient table must therefore contain a row
# named "acpa".
#
# Args:
#   input_data_file:   path to the omics table.
#   patient_info_file: path to the patient metadata table.
#   output_dir:        output directory prefix, forwarded to run_cohenD_and_glm().
#   output_str:        output file stem, forwarded to run_cohenD_and_glm().
#
# Returns:
#   The value of run_cohenD_and_glm(); the results are written to a file.
main <- function(input_data_file, patient_info_file, output_dir, output_str){

    input_data_df <- read.csv(input_data_file, sep="\t", header=TRUE, row.names=1)
    patient_info_df <- read.csv(patient_info_file, sep="\t", header=TRUE, row.names=1)

    # Find common column names
    common_columns <- intersect(names(input_data_df), names(patient_info_df))
    if (length(common_columns) == 0) {
        stop("No column names are shared between '", input_data_file,
             "' and '", patient_info_file, "'.", call. = FALSE)
    }
    # drop = FALSE keeps a data frame even when only one column is shared;
    # without it the selection collapses to a plain vector.
    selected_patient_info <- patient_info_df[, common_columns, drop = FALSE]

    # Combine rows with patient_info_df at the top
    merged_data_df <- bind_rows(selected_patient_info, input_data_df, .id = "source")

    # The helper "source" column is not needed; afterwards rows that are exact
    # duplicates (identical values in every column) are collapsed.
    merged_data_df <- select(merged_data_df, -source)
    merged_data_df <- distinct(merged_data_df, .keep_all = TRUE)

    #2nd module
    merged_data_df <- as.data.frame(t(merged_data_df))
    if (!"acpa" %in% colnames(merged_data_df)) {
        stop("No 'acpa' row found in '", patient_info_file, "'.", call. = FALSE)
    }

    # Keep only the two ACPA classes coded 1 and 2 and recode them to 0/1.
    # The order matters: 1 -> 0 must happen before 2 -> 1, otherwise the
    # freshly written 1s would be overwritten.
    acpa_neg_vs_acpa_pos <- filter(merged_data_df, acpa == 1 | acpa == 2)
    acpa_neg_vs_acpa_pos$acpa[acpa_neg_vs_acpa_pos$acpa == 1] <- 0   #changing class 1 -> 0
    acpa_neg_vs_acpa_pos$acpa[acpa_neg_vs_acpa_pos$acpa == 2] <- 1   #changing class 2 -> 1
    run_cohenD_and_glm(acpa_neg_vs_acpa_pos, 1, 0, output_dir, output_str, '.negVSpos')

}

# Compute per-feature effect size and model statistics and write them to
# "<output_dir><output_str><output_type>.tsv".
#
# For every feature column the following are written as one tab-separated row:
#   cohenD          - Cohen's d between condition a and condition b
#   fc_case_control - log2(mean(condition a) / mean(condition b))
#   coeff, pval     - acpa coefficient and p-value of the marginal model
#                     feature ~ acpa
#   all_adj_pval    - p-value of the feature term in the logistic model
#                     acpa ~ feature + sex + age + bmi + smoking + pred +
#                            bdmard + all_csdmard
#
# Args:
#   data_df:           data frame with an `acpa` column, the covariate columns
#                      named above and the feature columns.
#   condition_a_num:   acpa value of the first group (numerator of the ratio).
#   condition_b_num:   acpa value of the second group.
#   output_dir:        directory prefix; pasted verbatim in front of the file
#                      name, so it must end with a path separator.
#   output_str:        file name stem.
#   output_type:       suffix appended to the stem (before ".tsv").
#   first_feature_col: index of the first feature column. Defaults to 20,
#                      i.e. columns 1-19 are skipped (presumably patient
#                      metadata - confirm against the patient info table).
#
# Returns:
#   NULL, invisibly. An existing output file is overwritten.
#
# NOTE(review): the logistic fits can warn about fitted probabilities of 0 or 1
# or about non-convergence; p-values from such fits should be treated with care.
run_cohenD_and_glm <- function(data_df, condition_a_num, condition_b_num, output_dir, output_str, output_type, first_feature_col = 20){

    num_columns <- ncol(data_df)
    # The per-condition subsets are needed for Cohen's d and the fold change.
    condition_a_df <- filter(data_df, acpa == condition_a_num)
    condition_b_df <- filter(data_df, acpa == condition_b_num)

    output_txt <- paste0(output_dir, output_str, output_type, '.tsv')

    # Opening in write mode truncates a previous result file, and a single
    # connection avoids re-opening the file for every row.
    output_con <- file(output_txt, open = "wt")
    on.exit(close(output_con), add = TRUE)

    cat("\tcohenD\tfc_case_control\tcoeff\tpval\tall_adj_pval\n", file = output_con)

    # seq() is guarded so that a table without feature columns yields only the
    # header instead of a descending sequence.
    feature_columns <- if (num_columns >= first_feature_col) {
        seq(first_feature_col, num_columns)
    } else {
        integer(0)
    }

    # Name under which the current feature enters the models; it is also the
    # row name used to look the term up in the coefficient tables.
    feature_term <- ".feature_value"

    for (i in feature_columns){
        feature <- colnames(data_df)[i]

        #calculate cohens D between two population
        condition_a_list <- condition_a_df[, i]
        condition_b_list <- condition_b_df[, i]
        cohend_value <- cohen.d(condition_a_list, condition_b_list)$estimate

        log2fc <- log2(mean(condition_a_list) / mean(condition_b_list))

        model_df <- data_df
        model_df[[feature_term]] <- data_df[, i]

        #ACPA marginal model
        marginal_fit <- glm(.feature_value ~ acpa, data = model_df)
        marginal_table <- coef(summary(marginal_fit))
        # acpa is the only predictor, so it can only be row 2; the row is
        # missing altogether when the coefficient could not be estimated.
        if (nrow(marginal_table) >= 2) {
            feature_coef <- marginal_table[2, 1]
            feature_pval <- marginal_table[2, 4]
        } else {
            feature_coef <- NA_real_
            feature_pval <- NA_real_
        }

        #ACPA all adjust model
        adjusted_fit <- glm(acpa ~ .feature_value + sex + age + bmi + smoking + pred + bdmard + all_csdmard,
                            data = model_df, family = binomial)
        adjusted_table <- coef(summary(adjusted_fit))
        # Look the feature up by name: aliased coefficients are left out of
        # the table, so a positional index could silently return the p-value
        # of the next covariate instead.
        if (feature_term %in% rownames(adjusted_table)) {
            all_adj_pval <- adjusted_table[feature_term, 4]
        } else {
            all_adj_pval <- NA_real_
        }

        output_row <- paste(feature, cohend_value, log2fc, feature_coef,
                            feature_pval, all_adj_pval, sep = "\t")
        cat(output_row, "\n", sep = "", file = output_con)
    }

    invisible(NULL)
}

In [5]:
#Main
# Directory prefix for all result files; the trailing "/" is required because
# the file name is appended to it verbatim.
output_dir <- '../../../analysis/age_stratified/differential_abundance_logit/'

#make directory if it does not exist
# recursive = TRUE also creates missing parent directories of the nested path.
if (!dir.exists(output_dir)){
    dir.create(output_dir, recursive = TRUE)
} else {
    print("Dir already exists!")
}

# Age-stratified omics tables (one file per age stratum).
high_age_data_file <- '../../../preprocessed_data/age_stratified/high_age_3_omics.tsv'
low_age_data_file <- '../../../preprocessed_data/age_stratified/low_age_3_omics.tsv'
# med_age_data_file = '../../../preprocessed_data/age_stratified/mid_age_3_omics.tsv'

# Patient metadata table used for all strata.
patient_info_file <- '../../../preprocessed_data/meta/patient_info_for_statistics.v3.T.tsv'

[1] "Dir already exists!"


In [6]:
# Run the analysis once per age stratum; the last argument becomes the stem
# of the result file name.
main(input_data_file = high_age_data_file,
     patient_info_file = patient_info_file,
     output_dir = output_dir,
     output_str = 'high_age')
main(input_data_file = low_age_data_file,
     patient_info_file = patient_info_file,
     output_dir = output_dir,
     output_str = 'low_age')
# main(med_age_data_file, patient_info_file, output_dir, 'mid_age')

“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: algorithm did not converge”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“glm.fit: fitted probabilities numerically 0 or 1 occurred”
“g