In [13]:
#01_prepare_boxplot_format         23.02.16
#
#Prepare the input for boxplots of 14 metabolites from the age-stratified analysis.
#The final goal is to create boxplots of the 14 metabolites for the old population and the young population.
#The old population will be on the bottom, while the young population will be on the top.
#For each population there will be three boxplots per metabolite (control, acpa-neg, acpa-pos);
#thus, on each side (either top or bottom) there will be 3 * 14 boxplots.

import pandas as pd

def get_metabolite_list(data_file):
    """Read a list of names from a text file, one name per line.

    Parameters
    ----------
    data_file : str
        Path to the text file.

    Returns
    -------
    list of str
        The lines of the file in order, with the newline removed.
        Lines that are empty or whitespace-only are skipped, so a trailing
        blank line does not produce an empty name.
    """
    temp_list = []

    # The context manager guarantees the handle is closed, even on error.
    with open(data_file, 'r') as data_open:
        for line in data_open:
            name = line.replace('\n', '')

            # A blank entry can never match a feature name; drop it here
            # rather than let it fail later as a missing key.
            if not name.strip():
                continue

            temp_list.append(name)

    return temp_list

def get_data_dict(data_file, temp_list):
    """Load a tab-separated table and extract selected rows plus class labels.

    The table is read with its first column as the index. Index entries are
    treated as feature names and column headers as patient IDs.

    Parameters
    ----------
    data_file : str
        Path to the tab-separated file.
    temp_list : list
        Feature names whose values should be collected.

    Returns
    -------
    data_dict : dict
        Maps ``(feature, patient_ID)`` to the table value, for every feature
        that appears in ``temp_list``.
    patient_class_dict : dict
        Maps ``patient_ID`` to a label derived from the ``"acpa"`` row:
        0 -> "control", 1 -> "acpa_pos", 2 -> "acpa_neg". Any other value is
        stored unchanged. Empty if the table has no ``"acpa"`` row.
    """
    data_df = pd.read_csv(data_file, sep="\t", index_col=0)

    patient_id_list = data_df.columns.values
    feature_list = data_df.index.values

    wanted_features = set(temp_list)
    class_labels = {0: "control", 1: "acpa_pos", 2: "acpa_neg"}

    patient_class_dict = {}
    data_dict = {}

    for i, feature in enumerate(feature_list):
        # Both checks depend only on the row, so evaluate them once per row.
        is_wanted = feature in wanted_features
        is_class_row = feature == "acpa"
        if not (is_wanted or is_class_row):
            continue

        row = data_df.iloc[i]

        for j, patient_ID in enumerate(patient_id_list):
            # Explicit positional access: integer keys on a label-indexed
            # Series (row[j]) are deprecated in recent pandas.
            value = row.iloc[j]

            if is_wanted:
                data_dict[feature, patient_ID] = value

            if is_class_row:
                patient_class_dict[patient_ID] = class_labels.get(value, value)

    return data_dict, patient_class_dict

def make_boxplot_ready_file(metabolite_list, age_dict, age_patient_dict, age_of_interest, work_dir):
    """Write a long-format TSV with one row per (metabolite, patient) pair.

    The file is written to ``<work_dir>/metabolites.<age_of_interest>.ready.tsv``
    with the columns ``metabolite``, ``acpa_status_age``, ``abundance`` and
    ``age_group``.

    Parameters
    ----------
    metabolite_list : list
        Metabolite names to write, in output order.
    age_dict : dict
        Maps ``(metabolite, patient_ID)`` to the abundance value.
    age_patient_dict : dict
        Maps ``patient_ID`` to the patient's acpa status label.
    age_of_interest : str
        Label written in the ``age_group`` column and used in the file name.
    work_dir : str
        Directory in which the output file is created.

    Raises
    ------
    KeyError
        If ``age_dict`` has no entry for a (metabolite, patient_ID) pair.
        The file is still closed, holding the rows written up to that point.
    """
    #X = age
    #y = abundance
    #hue = acpa status
    output_file = "%s/metabolites.%s.ready.tsv" % (work_dir, age_of_interest)

    # The context manager flushes and closes the file even if a lookup fails.
    with open(output_file, 'w') as output_txt:
        output_txt.write("metabolite\tacpa_status_age\tabundance\tage_group\n")

        for metabolite in metabolite_list:
            for patient_ID, acpa_status in age_patient_dict.items():
                value = age_dict[metabolite, patient_ID]

                output_txt.write("%s\t%s\t%s\t%s\n" % (metabolite, acpa_status, value, age_of_interest))

In [14]:
# Working directory: the metabolite list is read from here and the
# boxplot-ready tables are written here.
work_dir = "../../../analysis/age_stratified/visualize_14metabolites"

# Age-stratified input tables (low age -> "young", high age -> "old").
low_age_file = '../../../preprocessed_data/age_stratified/low_age_3_omics.tsv'
old_age_file = '../../../preprocessed_data/age_stratified/high_age_3_omics.tsv'

# Names of the metabolites to plot, one per line in the list file.
metabolite_list_file = work_dir + "/" + "metabolite.list"
metabolite_list = get_metabolite_list(metabolite_list_file)

# For each age group: abundance values keyed by (metabolite, patient) and
# the acpa class label of each patient.
low_age_dict, low_age_patient_dict = get_data_dict(low_age_file, metabolite_list)
old_age_dict, high_age_patient_dict = get_data_dict(old_age_file, metabolite_list)

# One long-format TSV per age group.
make_boxplot_ready_file(
    metabolite_list, low_age_dict, low_age_patient_dict, "young", work_dir
)
make_boxplot_ready_file(
    metabolite_list, old_age_dict, high_age_patient_dict, "old", work_dir
)

