In [1]:
#STEP01_PREPROCESS_Ternary_Plot
#
#Prepare plot-ready matrices (per-feature group means) for ternary plots

import pandas as pd
import statistics

In [2]:
def make_omics_dict(data_file, patient_info_dict, output_file, omics_type_str):
    """Write per-feature group means of one omics table to a TSV file.

    ``data_file`` is read as a tab-separated table whose first column is the
    index (one feature per row) and whose remaining columns are patient IDs.
    Every value is grouped by the status that ``patient_info_dict`` assigns
    to its patient column, and the mean of each group is written to
    ``'<output_file>.<omics_type_str>.tsv'``.

    Output columns are ``feature``, ``control`` (status 0), ``acpa_neg``
    (status 2) and ``acpa_pos`` (status 1), in that order.

    Parameters
    ----------
    data_file : str
        Path of the tab-separated omics table.
    patient_info_dict : dict
        Maps each patient ID (column label of ``data_file``) to its status.
    output_file : str
        Path prefix of the output file; the omics type and ``.tsv`` are appended.
    omics_type_str : str
        Label printed to stdout and embedded in the output file name.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a patient column is missing from ``patient_info_dict`` or a feature
        has no values for status 0, 1 or 2.
    """
    data_df = pd.read_csv(data_file, sep="\t", index_col=0)
    feature_list = data_df.index.values
    patient_ID_list = data_df.columns.values

    print("Omics type:", omics_type_str)
    print('Sample Size:', len(patient_ID_list))
    print('Feature Size:', len(feature_list))

    # The status of a column does not depend on the row, so resolve it once
    # instead of once per cell.
    status_by_column = [patient_info_dict[patient_ID] for patient_ID in patient_ID_list]

    # (feature, status) -> list of values, in column order.
    data_dict = {}
    for i, feature in enumerate(feature_list):
        # Take the row once and index it positionally. The previous
        # ``data_df.iloc[i][j]`` relied on ``Series[int]`` falling back to
        # positional access, which pandas has deprecated.
        row_values = data_df.iloc[i]
        for j, sample_status in enumerate(status_by_column):
            data_dict.setdefault((feature, sample_status), []).append(row_values.iloc[j])

    output_file = '%s.%s.tsv' % (output_file, omics_type_str)
    print(output_file)

    # ``with`` guarantees the handle is closed even if a group is missing or
    # the mean cannot be computed.
    with open(output_file, 'w') as output_txt:
        output_txt.write('feature\tcontrol\tacpa_neg\tacpa_pos\n')
        for feature in feature_list:
            control_value = statistics.mean(data_dict[feature, 0])
            acpa_neg_value = statistics.mean(data_dict[feature, 2])
            acpa_pos_value = statistics.mean(data_dict[feature, 1])

            output_txt.write('%s\t%s\t%s\t%s\n' % (feature, control_value, acpa_neg_value, acpa_pos_value))
    
def make_patient_info_dict(data_file):
    """Map each patient ID to the value in the first data row of a table.

    ``data_file`` is read as a tab-separated table whose first column is the
    index and whose remaining column labels are patient IDs. Only the first
    data row is used; any further rows are ignored.

    Parameters
    ----------
    data_file : str
        Path of the tab-separated patient information table.

    Returns
    -------
    dict
        ``{patient_ID: value}`` in column order, where ``value`` is the entry
        of the first data row for that patient.

    Raises
    ------
    IndexError
        If the table has patient columns but no data rows.
    """
    data_df = pd.read_csv(data_file, sep="\t", index_col=0)
    patient_list = data_df.columns.values

    # Nothing to map; returning early keeps the first-row lookup below from
    # raising on a table without patient columns.
    if len(patient_list) == 0:
        return {}

    # Take the row once and index it positionally. The previous
    # ``data_df.iloc[0][j]`` relied on ``Series[int]`` falling back to
    # positional access, which pandas has deprecated.
    first_row = data_df.iloc[0]
    return {
        patient_ID: first_row.iloc[j]
        for j, patient_ID in enumerate(patient_list)
    }
    

In [3]:
# Patient metadata table; make_patient_info_dict keeps its first data row.
patient_info_file = '../../../preprocessed_data/meta/patient_info.ML_ready.tsv'
patient_info_dict = make_patient_info_dict(patient_info_file)

# Omics tables, one per platform.
proteomics_data_file = '../../../preprocessed_data/proteomics/somascan_anml.T.v2.tsv'
autoantibody_data_file = '../../../preprocessed_data/autoantibody/sengenics_qnorm_data.v2.tsv'
metabolomics_data_file = '../../../preprocessed_data/metabolomics/metabolone_raw_norm_preprocessed.v2.tsv'


In [9]:
#Path prefix of the output tables; make_omics_dict appends '.<omics_type>.tsv' to it
output_file = '../../../analysis/statistics/ternary_plots/ternary_plots'

In [10]:
# Export the group-mean table of every platform, in a fixed order.
for omics_data_file, omics_type in (
    (proteomics_data_file, 'proteomics'),
    (autoantibody_data_file, 'autoantibody'),
    (metabolomics_data_file, 'metabolomics'),
):
    make_omics_dict(omics_data_file, patient_info_dict, output_file, omics_type)

Omics type: proteomics
Sample Size: 120
Feature Size: 7272
../../../analysis/statistics/ternary_plots/ternary_plots.proteomics.tsv
Omics type: autoantibody
Sample Size: 120
Feature Size: 1610
../../../analysis/statistics/ternary_plots/ternary_plots.autoantibody.tsv
Omics type: metabolomics
Sample Size: 120
Feature Size: 1061
../../../analysis/statistics/ternary_plots/ternary_plots.metabolomics.tsv
