In [179]:
#02_report_differential_abundance_adjusting_drug.ipynb
#
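#Purpose: Summarize the output from 01_DifferentialAbundance_and_cohens_D_adjust_effect:
#         count the features passing each p-value threshold (with and without a
#         Cohen's d filter) per omics type and comparison, and save the counts as a table.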
#

import pandas as pd
import seaborn as sns

In [186]:
def main(data_dir, omics_type, p_threshold, data_dict):
    """Count features below a p-value cutoff for every disease comparison.

    For each comparison in ('cVSneg', 'cVSpos', 'negVSpos') the tab-separated
    table ``<data_dir>/<omics_type>.<comparison>.tsv`` is loaded (first column
    as index) and two subsets are taken:

    * "marginal"   -- rows whose ``pval`` is below ``p_threshold``
    * "adjust_all" -- rows whose ``all_adj_pval`` is below ``p_threshold``

    For both subsets the number of rows with positive and negative
    ``fc_case_control`` is printed, followed by the subset sizes.

    Parameters
    ----------
    data_dir : str
        Directory containing the input tables.
    omics_type : str
        File-name prefix of the tables (e.g. 'proteomics').
    p_threshold : float
        Strict upper bound applied to both p-value columns.
    data_dict : dict
        Accumulator, updated in place. Subset sizes are stored under the keys
        ``(omics_type, comparison, model, "NO_cohenD", p_threshold)``.

    Returns
    -------
    dict
        The same ``data_dict`` object that was passed in.
    """

    def report_directions(hits):
        # Split the selected rows by the sign of their fold change.
        fold_change = hits['fc_case_control']
        print ("log2fc > 0:", fold_change.gt(0).sum())
        print ("log2fc < 0:", fold_change.lt(0).sum())

    for comparison in ('cVSneg', 'cVSpos', 'negVSpos'):
        print (f">>>>>>>> {comparison} <<<<<<<<")
        results = pd.read_csv(
            f"{data_dir}/{omics_type}.{comparison}.tsv", sep="\t", index_col=0
        )

        # Marginal model: filter on the plain p-value column.
        # (The subset could be saved to <omics>.<comparison>.sig.tsv or its
        # p-values plotted with sns.histplot here if needed.)
        marginal_hits = results.loc[results['pval'].lt(p_threshold)]
        print (f"marginal pval < {p_threshold}")
        report_directions(marginal_hits)

        print ("---------")

        # Adjusted model: filter on the all-covariates-adjusted p-value column.
        adjusted_hits = results.loc[results['all_adj_pval'].lt(p_threshold)]
        print (f"all adj pval < {p_threshold}")
        report_directions(adjusted_hits)

        n_marginal = len(marginal_hits)
        n_adjusted = len(adjusted_hits)
        print(f"\nMarginal P < {p_threshold}: {n_marginal}")
        print(f"Adjusting ALL < {p_threshold}: {n_adjusted}")

        # Marginal first, then adjusted, so the dict keeps its insertion order.
        data_dict[(omics_type, comparison, "marginal", "NO_cohenD", p_threshold)] = n_marginal
        data_dict[(omics_type, comparison, "adjust_all", "NO_cohenD", p_threshold)] = n_adjusted

    return data_dict



In [187]:
def main2(data_dir, omics_type, p_threshold, data_dict, cohen_d_threshold=0.5):
    """Count features passing both a p-value cutoff and a Cohen's d cutoff.

    For each comparison in ('cVSneg', 'cVSpos', 'negVSpos') the tab-separated
    table ``<data_dir>/<omics_type>.<comparison>.tsv`` is loaded (first column
    as index). Rows are kept when the absolute value of ``cohenD`` is strictly
    greater than ``cohen_d_threshold`` AND

    * "marginal"   -- ``pval`` is below ``p_threshold``
    * "adjust_all" -- ``all_adj_pval`` is below ``p_threshold``

    For both subsets the number of rows with positive and negative
    ``fc_case_control`` is printed, followed by the subset sizes.

    Parameters
    ----------
    data_dir : str
        Directory containing the input tables; output files are written here too.
    omics_type : str
        File-name prefix of the tables (e.g. 'proteomics').
    p_threshold : float
        Strict upper bound applied to both p-value columns.
    data_dict : dict
        Accumulator, updated in place. Subset sizes are stored under the keys
        ``(omics_type, comparison, model, "with_cohenD", p_threshold)``.
        The key does not record ``cohen_d_threshold``.
    cohen_d_threshold : float, default 0.5
        Strict lower bound on ``abs(cohenD)``. The default reproduces the
        previously hard-coded cutoff.

    Returns
    -------
    dict
        The same ``data_dict`` object that was passed in.

    Side effects
    ------------
    Writes the "adjust_all" subset to
    ``<data_dir>/<omics_type>.<comparison>.sig.cohenD.tsv``.
    NOTE(review): the file name contains neither threshold, so calling this
    function repeatedly for the same omics type overwrites the file and only
    the last call's subset remains on disk -- confirm this is intended.
    """
    condition_list = ['cVSneg','cVSpos','negVSpos']

    for condition_type in condition_list:

        print (">>>>>>>> %s <<<<<<<<" % condition_type)
        data_file = "%s/%s.%s.tsv" % (data_dir, omics_type, condition_type)
        data_df = pd.read_csv(data_file, sep="\t", index_col=0)

        # The effect-size filter is shared by both models, so build it once.
        large_effect = data_df['cohenD'].abs() > cohen_d_threshold

        # Marginal model: plain p-value plus effect size.
        marginal_df = data_df[(data_df['pval'] < p_threshold) & large_effect]
        marginal_count = len(marginal_df)

        print ("marginal pval < %s" % p_threshold)
        print ("log2fc > 0:", (marginal_df['fc_case_control'] > 0).sum())
        print ("log2fc < 0:", (marginal_df['fc_case_control'] < 0).sum())

        print ("---------")

        # Adjusted model: all-covariates-adjusted p-value plus effect size.
        adjusted_df = data_df[(data_df['all_adj_pval'] < p_threshold) & large_effect]
        adjusted_count = len(adjusted_df)

        print ("all adj pval < %s" % p_threshold)
        print ("log2fc > 0:", (adjusted_df['fc_case_control'] > 0).sum())
        print ("log2fc < 0:", (adjusted_df['fc_case_control'] < 0).sum())

        # Only the adjusted-model subset is saved (see the overwrite note above).
        adjusted_df.to_csv("%s/%s.%s.sig.cohenD.tsv" % (data_dir, omics_type, condition_type), sep="\t")

        print("\nMarginal P < %s: %s" % (p_threshold, marginal_count))
        print("Adjusting ALL < %s: %s" % (p_threshold, adjusted_count))

        data_dict[omics_type, condition_type, "marginal", "with_cohenD", p_threshold] = marginal_count
        data_dict[omics_type, condition_type, "adjust_all", "with_cohenD", p_threshold] = adjusted_count

    return data_dict


In [188]:
# Directory holding the per-comparison tables (<omics>.<comparison>.tsv) that
# main() and main2() read.
data_dir = '../../../analysis/statistics/linear_model/differential_abundance_linear'

omics_list = ['metabolomics','autoantibody','proteomics']

# Accumulates counts keyed by (omics, comparison, model, Cohen's d flag, p-threshold).
data_dict = {}

# Proteomics: at each p-value cutoff run the plain summary (main) and then the
# Cohen's d filtered summary (main2).
for proteomics_cutoff in (0.05, 0.03, 0.01):
    for summarize in (main, main2):
        data_dict = summarize(data_dir, 'proteomics', proteomics_cutoff, data_dict)

>>>>>>>> cVSneg <<<<<<<<
marginal pval < 0.05
log2fc > 0: 126
log2fc < 0: 204
---------
all adj pval < 0.05
log2fc > 0: 665
log2fc < 0: 233

Marginal P < 0.05: 330
Adjusting ALL < 0.05: 898
>>>>>>>> cVSpos <<<<<<<<
marginal pval < 0.05
log2fc > 0: 151
log2fc < 0: 217
---------
all adj pval < 0.05
log2fc > 0: 435
log2fc < 0: 93

Marginal P < 0.05: 368
Adjusting ALL < 0.05: 528
>>>>>>>> negVSpos <<<<<<<<
marginal pval < 0.05
log2fc > 0: 105
log2fc < 0: 98
---------
all adj pval < 0.05
log2fc > 0: 114
log2fc < 0: 64

Marginal P < 0.05: 203
Adjusting ALL < 0.05: 178
>>>>>>>> cVSneg <<<<<<<<
marginal pval < 0.05
log2fc > 0: 72
log2fc < 0: 129
---------
all adj pval < 0.05
log2fc > 0: 40
log2fc < 0: 79

Marginal P < 0.05: 201
Adjusting ALL < 0.05: 119
>>>>>>>> cVSpos <<<<<<<<
marginal pval < 0.05
log2fc > 0: 78
log2fc < 0: 125
---------
all adj pval < 0.05
log2fc > 0: 28
log2fc < 0: 14

Marginal P < 0.05: 203
Adjusting ALL < 0.05: 42
>>>>>>>> negVSpos <<<<<<<<
marginal pval < 0.05
log2fc > 0

In [189]:
# Metabolomics: at each p-value cutoff run the plain summary (main) and then
# the Cohen's d filtered summary (main2), adding the counts to data_dict.
for metabolomics_cutoff in (0.05, 0.03, 0.01):
    for summarize in (main, main2):
        data_dict = summarize(data_dir, 'metabolomics', metabolomics_cutoff, data_dict)

>>>>>>>> cVSneg <<<<<<<<
marginal pval < 0.05
log2fc > 0: 34
log2fc < 0: 79
---------
all adj pval < 0.05
log2fc > 0: 54
log2fc < 0: 78

Marginal P < 0.05: 113
Adjusting ALL < 0.05: 132
>>>>>>>> cVSpos <<<<<<<<
marginal pval < 0.05
log2fc > 0: 21
log2fc < 0: 83
---------
all adj pval < 0.05
log2fc > 0: 10
log2fc < 0: 32

Marginal P < 0.05: 104
Adjusting ALL < 0.05: 42
>>>>>>>> negVSpos <<<<<<<<
marginal pval < 0.05
log2fc > 0: 17
log2fc < 0: 34
---------
all adj pval < 0.05
log2fc > 0: 10
log2fc < 0: 25

Marginal P < 0.05: 51
Adjusting ALL < 0.05: 35
>>>>>>>> cVSneg <<<<<<<<
marginal pval < 0.05
log2fc > 0: 23
log2fc < 0: 54
---------
all adj pval < 0.05
log2fc > 0: 12
log2fc < 0: 33

Marginal P < 0.05: 77
Adjusting ALL < 0.05: 45
>>>>>>>> cVSpos <<<<<<<<
marginal pval < 0.05
log2fc > 0: 19
log2fc < 0: 54
---------
all adj pval < 0.05
log2fc > 0: 7
log2fc < 0: 17

Marginal P < 0.05: 73
Adjusting ALL < 0.05: 24
>>>>>>>> negVSpos <<<<<<<<
marginal pval < 0.05
log2fc > 0: 5
log2fc < 0: 22

In [190]:
# Autoantibody: at each p-value cutoff run the plain summary (main) and then
# the Cohen's d filtered summary (main2), adding the counts to data_dict.
for autoantibody_cutoff in (0.05, 0.03, 0.01):
    for summarize in (main, main2):
        data_dict = summarize(data_dir, 'autoantibody', autoantibody_cutoff, data_dict)

>>>>>>>> cVSneg <<<<<<<<
marginal pval < 0.05
log2fc > 0: 16
log2fc < 0: 10
---------
all adj pval < 0.05
log2fc > 0: 49
log2fc < 0: 59

Marginal P < 0.05: 26
Adjusting ALL < 0.05: 108
>>>>>>>> cVSpos <<<<<<<<
marginal pval < 0.05
log2fc > 0: 48
log2fc < 0: 35
---------
all adj pval < 0.05
log2fc > 0: 42
log2fc < 0: 29

Marginal P < 0.05: 83
Adjusting ALL < 0.05: 71
>>>>>>>> negVSpos <<<<<<<<
marginal pval < 0.05
log2fc > 0: 104
log2fc < 0: 75
---------
all adj pval < 0.05
log2fc > 0: 83
log2fc < 0: 39

Marginal P < 0.05: 179
Adjusting ALL < 0.05: 122
>>>>>>>> cVSneg <<<<<<<<
marginal pval < 0.05
log2fc > 0: 4
log2fc < 0: 3
---------
all adj pval < 0.05
log2fc > 0: 2
log2fc < 0: 2

Marginal P < 0.05: 7
Adjusting ALL < 0.05: 4
>>>>>>>> cVSpos <<<<<<<<
marginal pval < 0.05
log2fc > 0: 17
log2fc < 0: 17
---------
all adj pval < 0.05
log2fc > 0: 4
log2fc < 0: 4

Marginal P < 0.05: 34
Adjusting ALL < 0.05: 8
>>>>>>>> negVSpos <<<<<<<<
marginal pval < 0.05
log2fc > 0: 61
log2fc < 0: 43
-----

In [191]:
# Flatten the accumulated counts into one table. Every key of data_dict is the
# 5-tuple (omics, comparison, model, Cohen's d flag, p-threshold) set by
# main()/main2(), and every value is the number of rows that passed the filter.
# Building the rows directly (rather than through a temporary tuple column)
# also yields a well-formed, empty table when data_dict has no entries.
summary_columns = ['omics', 'disease_compare', 'model_type', 'CohenD', 'P-value', 'DEGS']
data_df = pd.DataFrame(
    [key + (count,) for key, count in data_dict.items()],
    columns=summary_columns,
)

# Write next to the input tables through data_dir rather than a user-specific
# absolute path, so the notebook runs on other machines.
# NOTE(review): assumes data_dir resolves to the directory previously given as
# /Users/m221138/RA_ACPA_multiomics/analysis/statistics/linear_model/differential_abundance_linear
# -- confirm the notebook's working directory.
data_df.to_csv("%s/data_investigation.linear.csv" % data_dir, index=False)