In [32]:
#PREPROCESS_identify_cytokines_from_significant_proteins
#
#Purpose: identify differentially abundant cytokines from differentially abundant proteins.

#cytokines were defined by immport: https://www.immport.org/shared/genelists
#differentially abundant proteins were pre-identified from previous step.

import pandas as pd

In [33]:
def get_list_from_file(data_file):
    
    data_df = pd.read_csv(data_file, sep ="\t", header=None)
    feature_list = []
    r, c = data_df.shape
    
    for i in range(r):
        value = data_df.iloc[i][0]
        feature_list.append(value)
        
    return feature_list

def get_cytokine_list_from_file(data_file):
    
    data_df = pd.read_csv(data_file, sep ="\t")
    feature_list = []
    r, c = data_df.shape
    
    for i in range(r):
        value = data_df.iloc[i][1]
        feature_list.append(value)
        
    return feature_list

def find_significant_cytokine(data_dict, cytokine_list):
    
    all_cytokine_list = []
    for data_type in list(data_dict.keys()):
        significant_protein_list = data_dict[data_type]
        temp_cytokine_list = []
        
        for protein_symbol_with_barcode in significant_protein_list:
            protein_symbol = protein_symbol_with_barcode.split('_')[0]
            if protein_symbol in cytokine_list:
                temp_cytokine_list.append(protein_symbol_with_barcode)
                all_cytokine_list.append(protein_symbol_with_barcode)
        
    all_cytokine_list = list(set(all_cytokine_list))
    return (all_cytokine_list)
    

In [34]:
cytokine_file = '../../../etc/Cytokines.txt'
cytokine_list = get_cytokine_list_from_file(cytokine_file)

In [35]:
data_dir = '../../../analysis/statistics/gse/proteomics'
condition_list = ["cVSneg","cVSpos", "negVSpos"]
abundance_type_list = ["up","down"]

data_dict = {}
for abundance_type in abundance_type_list:
    for condition in condition_list:
        data_file = '%s/proteomics.diff.%s.%s.barcode.tsv' % (data_dir, condition, abundance_type)
        feature_list = get_list_from_file(data_file)  
        
        data_type = "%s_%s" %(condition, abundance_type)
        data_dict[data_type] = feature_list

all_cytokine_list = find_significant_cytokine(data_dict, cytokine_list)

In [36]:
data_dir = '../../../analysis/statistics/gse/proteomics'
condition_list = ["cVSneg","cVSpos", "negVSpos"]
abundance_type_list = ["up","down"]

data_dict = {}
for abundance_type in abundance_type_list:
    for condition in condition_list:
        data_file = '%s/proteomics.diff.%s.%s.barcode.tsv' % (data_dir, condition, abundance_type)
        feature_list = get_list_from_file(data_file)  
        
        data_type = "%s_%s" %(condition, abundance_type)
        data_dict[data_type] = feature_list

all_cytokine_list = find_significant_cytokine(data_dict, cytokine_list)

proteomic_file = '../../../preprocessed_data/proteomics/somascan_anml.T.v2.tsv'
protein_df = pd.read_csv(proteomic_file, sep="\t", index_col=0)
subset_protein_df = protein_df.loc[all_cytokine_list]

patient_info_file = '../../../preprocessed_data/meta/patient_info.ML_ready.tsv'
patient_df = pd.read_csv(patient_info_file, sep="\t",index_col=0)

acpa_info = patient_df.loc["acpa"]

subset_protein_df = pd.concat([acpa_info.to_frame().T, subset_protein_df, ], axis=0)
subset_protein_df.to_csv('../../../analysis/statistics/cytokine_boxplots/all_adj_cytokine_df.tsv', sep="\t")

In [37]:
all_cytokine_list

['NAMPT_5011-11',
 'HDGFL3_18899-82',
 'PDGFC_13658-31',
 'CXCL13_3487-32',
 'IL1RN_5353-89',
 'OGN_17224-12',
 'EDN3_15383-200',
 'EDN1_6495-14',
 'CAT_3488-64',
 'CCL15_14109-15',
 'C5_2381-52',
 'CCL15_18289-16',
 'CCL7_22969-12',
 'IL17C_9255-5',
 'NPPA_5443-62']