In [4]:
#04_summarize_every_results_ALTERNATIVE
#
# Leverages the results in *.padj.tsv (created by "02_calculated_BH_pvalues"):
#   - adds a fold-change value for every feature
#   - applies thresholds on Cohen's D and the p-value
#   - summarizes (filters) the results based on Cohen's D and the p-value

import pandas as pd
import statistics
import math

In [5]:
def make_foldchange_dict(data_profile_file, patient_info_file):
    """Collect, per feature and ACPA group, the values of that group's patients.

    Parameters
    ----------
    data_profile_file : str
        Tab-separated table read with its first column as index. Index
        entries are used as feature IDs and column labels as patient IDs.
    patient_info_file : str
        Tab-separated table whose first column holds a row label and whose
        remaining column labels are patient IDs. Only rows labelled "acpa"
        are used; each of their values is converted with int() and used as
        the ACPA status of the patient in that column.

    Returns
    -------
    dict
        fc_dict[feature, group_index] -> list of values, where group_index is
        0, 1 or 2 (patients with that ACPA status) or 3 (patients with
        status 2 followed by patients with status 1). These indices are the
        ones get_fc_value() maps 'c', 'pos', 'neg' and 'ra' to.

    Raises
    ------
    KeyError
        If no patient has status 0, 1 or 2, or if a patient listed in the
        patient info has no column in the data profile.
    """
    input_data_df = pd.read_csv(data_profile_file, sep="\t", index_col=0)
    patient_info_df = pd.read_csv(patient_info_file, sep="\t")

    featureID_list = input_data_df.index.values
    data_patientID_list = input_data_df.columns.values

    #make: dict[feature, patientID] = value
    # Index the underlying array directly instead of df.iloc[i][j]: the latter
    # builds a Series per cell and relies on integer-key positional fallback
    # of Series.__getitem__, which is deprecated in recent pandas.
    data_matrix = input_data_df.to_numpy()
    data_profile_dict = {}
    for i, feature in enumerate(featureID_list):
        for j, patientID in enumerate(data_patientID_list):
            data_profile_dict[feature, patientID] = data_matrix[i, j]

    #make: dict[class] = [patientID list]
    info_patientID_list = patient_info_df.columns.values[1:]
    patient_class_info_dict = {}
    is_acpa_row = patient_info_df.iloc[:, 0] == "acpa"
    for row in patient_info_df[is_acpa_row].itertuples(index=False, name=None):
        # row[0] is the row label; the rest line up with info_patientID_list.
        for patientID, raw_status in zip(info_patientID_list, row[1:]):
            patient_class_info_dict.setdefault(int(raw_status), []).append(patientID)

    fc_dict = {}
    for feature in featureID_list:
        # One list per individual ACPA status, computed exactly once.
        for acpa_status in (0, 1, 2):
            fc_dict = make_abundance_list(feature, fc_dict, patient_class_info_dict,
                                          data_profile_dict, acpa_status, acpa_status)

        # Group 3 must contain BOTH status 2 and status 1 patients.
        # make_abundance_list() assigns (it does not extend), so calling it
        # twice with acpa_index=3 would keep only the second status; the
        # combined list is therefore built by concatenation here.
        fc_dict[feature, 3] = fc_dict[feature, 2] + fc_dict[feature, 1]

    return fc_dict


def make_abundance_list(feature, fc_dict, patient_class_info_dict, data_profile_dict, acpa_status, acpa_index):
    """Store the values of one feature for all patients of one ACPA status.

    Looks up data_profile_dict[feature, patientID] for every patient listed in
    patient_class_info_dict[acpa_status] and assigns the resulting list to
    fc_dict[feature, acpa_index], replacing any list already stored under
    that key.

    Returns
    -------
    dict
        The same fc_dict object that was passed in (modified in place).

    Raises
    ------
    KeyError
        If acpa_status has no patient list, or a (feature, patientID) pair is
        missing from data_profile_dict.
    """
    fc_dict[feature, acpa_index] = [
        data_profile_dict[feature, patientID]
        for patientID in patient_class_info_dict[acpa_status]
    ]

    return fc_dict

def get_fc_value(acpa_group, feature,  fc_dict):
    """Return the mean of the values stored for a feature in one ACPA group.

    Parameters
    ----------
    acpa_group : str
        Group label: 'c', 'pos', 'neg' or 'ra'. It is translated to the
        integer group index used as the second element of fc_dict keys
        ('c' -> 0, 'pos' -> 1, 'neg' -> 2, 'ra' -> 3).
    feature
        Feature identifier used as the first element of fc_dict keys.
    fc_dict : dict
        Mapping (feature, group_index) -> list of values.

    Returns
    -------
    The arithmetic mean (statistics.mean) of fc_dict[feature, group_index].

    Raises
    ------
    ValueError
        If acpa_group is not one of the known labels (previously this
        surfaced as an UnboundLocalError on acpa_index).
    KeyError
        If fc_dict has no entry for (feature, group_index).
    statistics.StatisticsError
        If the stored list is empty.
    """
    group_to_index = {'c': 0, 'pos': 1, 'neg': 2, 'ra': 3}

    if acpa_group not in group_to_index:
        raise ValueError(
            "unknown acpa_group %r (expected one of %s)"
            % (acpa_group, sorted(group_to_index))
        )

    return statistics.mean(fc_dict[feature, group_to_index[acpa_group]])

In [6]:
# For every (condition, comparison) pair: read the *.padj.tsv result table,
# attach a log2 fold change of the group means to every feature, and write
# the features passing the Cohen's D / p-value thresholds to *.padj.v2.med.tsv.
analysis_result_dir = '../../../analysis/statistics/linear_model/differential_abundance_v2/'
patient_info_file = '../../../preprocessed_data/meta/patient_info.ML_ready.tsv'

comparison_list = ['cVSneg', 'cVSpos','cVSra','negVSpos']
condition_list = ['autoantibody','metabolomics','proteomics']

# Data profile read for each condition.
data_profile_file_dict = {
    'metabolomics': '../../../preprocessed_data/metabolomics/metabolone_raw_norm_preprocessed.v2.tsv',
    'proteomics': '../../../preprocessed_data/proteomics/somascan_anml.T.v2.tsv',
    'autoantibody': '../../../preprocessed_data/autoantibody/sengenics_qnorm_data.v2.tsv',
}

# Filtering thresholds applied when writing the summary files.
cohenD_threshold = 0.5
pval_threshold = 0.05

for condition in condition_list:

    data_profile_file = data_profile_file_dict[condition]

    fc_dict = make_foldchange_dict(data_profile_file, patient_info_file) #get fc information
    feature_info_dict = {}

    for comparison in comparison_list:
        data_file = '%s%s.%s.padj.tsv' % (analysis_result_dir, condition, comparison)
        data_df = pd.read_csv(data_file, sep="\t", index_col=0)
        feature_list = data_df.index.values

        # e.g. 'cVSneg' -> group_a='c', group_b='neg'; same for every row.
        group_a, group_b = comparison.split('VS')

        for i, feature in enumerate(feature_list):
            # .iloc makes the positional lookup explicit; plain series[i] on a
            # label-indexed Series is deprecated positional fallback.
            cohenD = data_df["cohenD"].iloc[i]
            pval = data_df["pval"].iloc[i]
            padj = data_df["padj"].iloc[i]

            group_a_mean = get_fc_value(group_a, feature, fc_dict)
            group_b_mean = get_fc_value(group_b, feature, fc_dict)

            # log2 fc of group_b over group_a. A zero or negative ratio has no
            # logarithm; record NaN for that feature instead of aborting the
            # whole run.
            try:
                feature_fc = math.log2(group_b_mean / group_a_mean)
            except (ValueError, ZeroDivisionError):
                feature_fc = float('nan')

            feature_info_dict[comparison, feature] = [cohenD, pval, padj, feature_fc]

        output_file = '%s%s.%s.padj.v2.med.tsv' % (analysis_result_dir, condition, comparison)
        # Context manager guarantees the file is closed even if a write fails.
        with open(output_file, 'w') as output_txt:
            # The 'fc(case/control)' column holds the log2 fold change above.
            output_txt.write('\tcohenD\tfc(case/control)\tpval\tpadj\n')
            for feature in feature_list:
                cohenD, pval, padj, fc = feature_info_dict[comparison, feature]

                # NOTE(review): only positive effect sizes pass this filter;
                # confirm whether abs(cohenD) was intended.
                if cohenD >= cohenD_threshold and pval < pval_threshold:
                    output_txt.write('%s\t%s\t%s\t%s\t%s\n' % (feature, cohenD, fc, pval, padj))