In [3]:
#01_PREPROCESS_RA_ART_metabolomics.ipynb

import pandas as pd
import numpy as np
import math
import statistics

In [4]:
def main(data_file,output_file):
    count_nan = 0
    data_dict = {}
    metabolite_dict = {}
    
    data_df = pd.read_csv(data_file, sep="\t", index_col=0, thousands=',')
    r, c = data_df.shape
    
    metabolite_list = data_df.index.values
    sample_list = data_df.columns.values
    
    print (len(metabolite_list))
    print (len(sample_list))
    nan_cutoff = int(len(sample_list) * 0.2) #if metabolites have more than 20% of N/A values, it will be removed 
    
    #output
    output_txt = open(output_file,'w')
    for sample in sample_list:
        output_txt.write('\t%s' % sample)
    output_txt.write('\n')
    
    #check which metabolites does not meet criteria (too many N/As)
    for i in range(r):
        metabolite = metabolite_list[i]
        value_list = data_df.iloc[i]
        normalized_value_list = []
        temp_list = []

        if metabolite == "ccp":
            output_txt.write("acpa")
            for value in value_list:
                output_txt.write('\t%s' % value)
            output_txt.write('\n')
        else:
            
            num_nan = value_list.isna().sum()
            metabolite_median = value_list.median()
            
            if num_nan < nan_cutoff:
                output_txt.write(metabolite)
                for value in value_list:
                    #if isinstance(value, np.float) == True or isinstance(value, np.float64) == True:
                    if math.isnan(value) == True:
                        if math.isnan(value) == True:
                            normalized_value_list.append('nan')
                        else:
                            normalized_value = float(value) / float(metabolite_median)
                            normalized_value_list.append(normalized_value)
                            temp_list.append(normalized_value)
    
                    else: #for np.object
                        normalized_value = float(value) / float(metabolite_median)
                        normalized_value_list.append(normalized_value)
                        temp_list.append(normalized_value)
    
                #replace nan to mimimum value
                min_value = min(temp_list)
                for i in range(len(normalized_value_list)):
                    value = normalized_value_list[i]
                    if value == 'nan':
                        normalized_value_list[i] = min_value
                        
                for value in normalized_value_list:
                    output_txt.write('\t%s' % value)
                output_txt.write('\n')
            else:
                print ("%s has too many nan (%s/%s). This will be excluded" % (metabolite, num_nan, nan_cutoff))
                count_nan += 1
            
    print (count_nan)
    output_txt.close()

In [10]:
#get ST, sample ID link
accrual_report_file = "../../../analysis_revision/external_validation/ra_art_metabolomics/raw_data/accrual_report_hd4.csv"
accrual_report_df = pd.read_csv(accrual_report_file, dtype=str)
accrual_report_df = accrual_report_df[["Sample Id","External_Participant_Id"]]

#get sample ID, ACPA link
ra_biobank_file = "../../../analysis_revision/external_validation/ra_art_metabolomics/raw_data/RA_biobank_2019June17_sheet1.csv"
ra_biobank_df = pd.read_csv(ra_biobank_file)
ra_biobank_df = ra_biobank_df[["External_Participant_Id", "ccp"]]

#get sample ID from plamsa data
plasma_m_file = "../../../analysis_revision/external_validation/ra_art_metabolomics/raw_data/RA_plasma_metabolomics.csv"
plasma_m_df = pd.read_csv(plasma_m_file, header=None, index_col=0)
plasma_m_df = plasma_m_df.T
plasma_m_sample_ID_list = list(plasma_m_df["sample_id"])

In [24]:
accrual_report_df = accrual_report_df[accrual_report_df['Sample Id'].isin(plasma_m_sample_ID_list)]

merged_df = pd.merge(accrual_report_df, ra_biobank_df, on='External_Participant_Id', how='inner')
merged_df = merged_df.drop_duplicates()

# merged_df.to_csv("../../analysis_addressing_feedback/external_validation/ra_plasma_metabolomics/data/RA_plasma_metabolomics.patient_info.csv", index=False)

final_df = pd.merge(merged_df, plasma_m_df, left_on='Sample Id', right_on='sample_id',  how='left')
final_df = final_df.dropna(subset=['ccp'])
# final_df["ccp"].replace({"positive": 1, "negative": 2}, inplace=True)

final_df["ccp"] = final_df["ccp"].replace({"positive": 1, "negative": 2}).infer_objects()
final_df = final_df.drop(columns=["External_Participant_Id","sample_id","PARENT SAMPLE ID","SAMPLE NAME","RESPONDER STATUS","SAMPLENUMBER","SAMPLE AMOUNT","SUBJECT ID","TIME POINT","etc"])
final_df = final_df.T
final_df.to_csv("../../../analysis_revision/external_validation/ra_art_metabolomics/data/RA_art_metabolomics.patient_info.tsv", sep="\t", header=False)



  final_df["ccp"] = final_df["ccp"].replace({"positive": 1, "negative": 2}).infer_objects()


In [25]:
data_file = '../../../analysis_revision/external_validation/ra_art_metabolomics/data/RA_art_metabolomics.patient_info.tsv'
output_file = '../../../analysis_revision/external_validation/ra_art_metabolomics/data/RA_art_metabolomics.scaled.patient_info.tsv'

In [26]:
main(data_file, output_file)

895
131
(2,4 or 2,5)-dimethylphenol sulfate has too many nan (48/26). This will be excluded
1-arachidonoyl-GPA (20:4) has too many nan (41/26). This will be excluded
1-hydroxy-2-naphthalenecarboxylate has too many nan (124/26). This will be excluded
1-oleoyl-GPS (18:1) has too many nan (58/26). This will be excluded
1-palmitoyl-GPA (16:0) has too many nan (63/26). This will be excluded
1-stearoyl-GPS (18:0)* has too many nan (39/26). This will be excluded
10-hydroxywarfarin has too many nan (123/26). This will be excluded
11-ketoetiocholanolone glucuronide has too many nan (67/26). This will be excluded
1H-indole-7-acetic acid has too many nan (43/26). This will be excluded
2'-deoxyuridine has too many nan (29/26). This will be excluded
2-acetamidophenol sulfate has too many nan (72/26). This will be excluded
2-hydroxyacetaminophen sulfate* has too many nan (71/26). This will be excluded
2-hydroxyibuprofen has too many nan (101/26). This will be excluded
2-isopropylmalate has too many 