In [1]:
#PREPROCESS_make_correlation_ready_profile
#Preparation for the correlation calculation between cytokines/chemokines and patient information/other omics
#
#[1] make profile of 120 samples (Healthy, ACPA+, ACPA-)
#[2] make profile of 80 samples (ACPA+, ACPA-)

import pandas as pd
from scipy import stats
import statistics
import numpy as np

In [13]:
def make_feature_list(data_file, common_feature_list):
    """Collect the unique feature symbols listed in a tab-separated file.

    Parameters
    ----------
    data_file : str
        Path to a tab-separated file that has a "Symbol" column.
    common_feature_list : list
        Symbols gathered so far. The list is only read; it is NOT modified.
        Its entries are merged with the symbols found in `data_file`.

    Returns
    -------
    list
        De-duplicated symbols in first-seen order: the entries of
        `common_feature_list` first, followed by the symbols of the file.
    """
    data_df = pd.read_csv(data_file, sep="\t")

    # Build a new list instead of appending to the argument, so the caller's
    # list is not changed as a side effect.
    merged_symbols = list(common_feature_list) + data_df["Symbol"].tolist()

    # dict.fromkeys() removes duplicates like set() does, but keeps a
    # reproducible (first-seen) order instead of an arbitrary one.
    common_feature_list = list(dict.fromkeys(merged_symbols))

    print ("Cytokines in DB list: ",len(common_feature_list))

    return common_feature_list

def make_p_data_dict(data_file, common_feature_list):
    """Load the rows of a feature-by-patient table whose feature is in a given list.

    The file is read as tab-separated text; the first column becomes the row
    index (feature names) and the remaining column headers are used as
    patient IDs. A row is kept when the part of its feature name before the
    first "_" occurs in `common_feature_list`. Rows that share the same
    leading name are all kept.

    Parameters
    ----------
    data_file : str
        Path to the tab-separated table.
    common_feature_list : iterable of str
        Names to keep (compared with the text before the first "_" of each
        row name).

    Returns
    -------
    tuple
        ``(data_dict, kept_feature_list, patientID_list)`` where
        ``data_dict[(feature, patientID)]`` is the cell value,
        ``kept_feature_list`` holds the full row names that were kept (file
        order) and ``patientID_list`` is the list of all column headers.
    """
    data_dict = {}

    data_df = pd.read_csv(data_file, sep="\t", index_col=0)
    r, c = data_df.shape

    feature_list = data_df.index.values
    patientID_list = data_df.columns.values

    # Set lookup instead of scanning the whole list once per row.
    wanted_name_set = set(common_feature_list)

    temp_feature_list = []
    uniq_feature_list = []

    for i in range(r):
        feature = feature_list[i]
        temp_feature_name = feature.split('_')[0]

        if temp_feature_name not in wanted_name_set:
            continue

        temp_feature_list.append(feature)
        uniq_feature_list.append(temp_feature_name)

        # Fetch the row once and index it positionally with .iloc. The former
        # data_df.iloc[i][j] relied on integer keys falling back to positions
        # on a label-indexed Series, which pandas has deprecated, and it
        # rebuilt the row Series for every single cell.
        row = data_df.iloc[i]
        for j in range(c):
            data_dict[feature, patientID_list[j]] = row.iloc[j]

    print ("Cytokines from proteomics data [including duplicates]: ", len(temp_feature_list))
    print ("Cytokines from proteomics data [removing duplicates]: ", len(list(set(uniq_feature_list))))

    return data_dict, temp_feature_list, list(patientID_list)

def make_data_dict(data_file):
    """Load a feature-by-patient table into a dict keyed by (feature, patientID).

    The file is read as tab-separated text; the first column becomes the row
    index (feature names) and the remaining column headers are used as
    patient IDs.

    Parameters
    ----------
    data_file : str
        Path to the tab-separated table.

    Returns
    -------
    tuple
        ``(data_dict, feature_list, patientID_list)`` where
        ``data_dict[(feature, patientID)]`` is the cell value,
        ``feature_list`` is the array of row names (``index.values``) and
        ``patientID_list`` is the list of column headers.
    """
    data_dict = {}

    data_df = pd.read_csv(data_file, sep="\t", index_col=0)
    r, c = data_df.shape

    feature_list = data_df.index.values
    patientID_list = data_df.columns.values

    for i in range(r):
        feature = feature_list[i]

        # Fetch the row once and index it positionally with .iloc. The former
        # data_df.iloc[i][j] relied on integer keys falling back to positions
        # on a label-indexed Series, which pandas has deprecated, and it
        # rebuilt the row Series for every single cell.
        row = data_df.iloc[i]
        for j in range(c):
            data_dict[feature, patientID_list[j]] = row.iloc[j]

    return data_dict, feature_list, list(patientID_list)

def main(patient_info_file, data_dict, feature_list, output_file):
    """Correlate every feature with the clinical covariates and write a TSV matrix.

    The patient table is restricted to rows whose 'acpa' value is 1 or 2.
    For each feature and each clinical covariate a Spearman correlation is
    computed over those patients (NaN handling: ``nan_policy="omit"``).

    The output has one header line (a leading tab followed by the covariate
    names) and one line per feature. A cell holds the correlation coefficient
    when its p-value is below 0.05 and ``0`` otherwise (including when the
    p-value is NaN). No multiple-testing correction is applied here.

    Parameters
    ----------
    patient_info_file : str
        Tab-separated file with a "sample_ID" column, an "acpa" column and
        the clinical covariate columns listed below.
    data_dict : dict
        Maps ``(feature, patientID)`` to a value, where ``patientID`` is the
        sample ID formatted as ``str(int(sample_ID))``.
    feature_list : sequence of str
        Features to correlate; also the row order of the output.
    output_file : str
        Path of the tab-separated file to write.
    """
    patient_info_df = pd.read_csv(patient_info_file, sep="\t")
    # considering RA samples only (acpa 1 or 2); because healthy samples do not have full information.
    patient_info_df = patient_info_df[patient_info_df['acpa'].isin([1,2])]
    patientID_list = patient_info_df["sample_ID"]

    clinical_covariate_list = ["acpa","sex","bmi","smoking","crp","esr","cdai","sdai","das28esr","das28crp"]

    # data_dict is keyed by the sample ID written as an integer string.
    patient_key_list = [str(int(patientID)) for patientID in patientID_list]

    # The covariate vectors are identical for every feature, so build them
    # once here instead of once per feature. As before, the first row that
    # matches each sample ID supplies the value.
    covariate_value_dict = {}
    for clinical_covariate in clinical_covariate_list:
        covariate_column = patient_info_df[clinical_covariate]
        covariate_value_dict[clinical_covariate] = [
            covariate_column.loc[patientID_list == patientID].values[0]
            for patientID in patientID_list
        ]

    correlation_dict = {}
    for feature in feature_list:
        # set1 of the correlation: the feature's value for every patient
        feature_value_list = [data_dict[feature, key] for key in patient_key_list]

        for clinical_covariate in clinical_covariate_list:
            # set2 of the correlation: the covariate's value for every patient
            corr, pval = stats.spearmanr(
                feature_value_list,
                covariate_value_dict[clinical_covariate],
                nan_policy="omit",
            )
            correlation_dict[feature, clinical_covariate] = [corr, pval]

    #write output; the context manager closes the file even if a write fails
    with open(output_file,'w') as output_txt:
        for clinical_covariate in clinical_covariate_list:
            output_txt.write('\t%s' % clinical_covariate)
        output_txt.write('\n')

        for feature in feature_list:
            output_txt.write(feature)
            for clinical_covariate in clinical_covariate_list:
                corr, pvalue = correlation_dict[feature, clinical_covariate]

                # Non-significant (or undefined) correlations are reported as 0.
                if pvalue < 0.05:
                    output_txt.write('\t%s' % corr)
                else:
                    output_txt.write('\t0')

            output_txt.write('\n')


def main_v2(patient_info_file, p_data_dict, p_feature_list, aa_data_dict, aa_feature_list, output_file):
    """Correlate every feature of one data set with every feature of another.

    The patient table is restricted to rows whose 'acpa' value is 0, 1 or 2.
    For each pair (p_feature, aa_feature) a Spearman correlation is computed
    over those patients (NaN handling: ``nan_policy="omit"``).

    The output has one header line (a leading tab followed by the
    ``aa_feature_list`` names) and one line per entry of ``p_feature_list``.
    A cell holds the correlation coefficient when its p-value is below 0.05
    and ``0`` otherwise (including when the p-value is NaN). No
    multiple-testing correction is applied here.

    Parameters
    ----------
    patient_info_file : str
        Tab-separated file with "sample_ID" and "acpa" columns.
    p_data_dict, aa_data_dict : dict
        Map ``(feature, patientID)`` to a value, where ``patientID`` is the
        sample ID formatted as ``str(int(sample_ID))``.
    p_feature_list : sequence of str
        Row features (looked up in ``p_data_dict``).
    aa_feature_list : sequence of str
        Column features (looked up in ``aa_data_dict``).
    output_file : str
        Path of the tab-separated file to write.
    """
    patient_info_df = pd.read_csv(patient_info_file, sep="\t")
    # Unlike main(), acpa code 0 is kept here as well as 1 and 2
    # (presumably the healthy samples -- TODO confirm the coding).
    patient_info_df = patient_info_df[patient_info_df['acpa'].isin([0, 1, 2])]
    patientID_list = patient_info_df["sample_ID"]

    # Both dicts are keyed by the sample ID written as an integer string.
    patient_key_list = [str(int(patientID)) for patientID in patientID_list]

    # The column-feature vectors are identical for every row feature, so
    # build them once here instead of once per p_feature.
    aa_value_dict = {}
    for aa_feature in aa_feature_list:
        aa_value_dict[aa_feature] = [aa_data_dict[aa_feature, key] for key in patient_key_list]

    correlation_dict = {}
    for p_feature in p_feature_list:
        # set1 of the correlation: the row feature's value for every patient
        p_feature_value_list = [p_data_dict[p_feature, key] for key in patient_key_list]

        for aa_feature in aa_feature_list:
            # set2 of the correlation: the column feature's value for every patient
            corr, pval = stats.spearmanr(
                p_feature_value_list,
                aa_value_dict[aa_feature],
                nan_policy="omit",
            )
            correlation_dict[p_feature, aa_feature] = [corr, pval]

    #write output; the context manager closes the file even if a write fails
    with open(output_file,'w') as output_txt:
        for aa_feature in aa_feature_list:
            output_txt.write('\t%s' % aa_feature)
        output_txt.write('\n')

        for p_feature in p_feature_list:
            output_txt.write(p_feature)
            for aa_feature in aa_feature_list:
                corr, pvalue = correlation_dict[p_feature, aa_feature]

                # Non-significant (or undefined) correlations are reported as 0.
                if pvalue < 0.05:
                    output_txt.write('\t%s' % corr)
                else:
                    output_txt.write('\t0')

            output_txt.write('\n')
    

In [14]:
# Input locations (relative to this notebook).
cytokine_file = "../../../etc/Cytokines.txt"
# ../../../etc/Chemokines.txt is not loaded: biologically chemokines should be
# a subset of the cytokines (22.05.04: confirmed).
patient_info_file = "../../../preprocessed_data/meta/patient_info_for_statistics.tsv"
p_file = "../../../preprocessed_data/proteomics/somascan_anml.T.v2.tsv"
aa_file = "../../../preprocessed_data/autoantibody/sengenics_qnorm_data.v2.tsv"

# Symbols of the cytokine list, starting from an empty list.
common_feature_list = make_feature_list(cytokine_file, [])

# Proteomics rows restricted to the cytokine symbols.
p_data_dict, p_feature_list, patientID_list = make_p_data_dict(p_file, common_feature_list)

# Full autoantibody table. NOTE: patientID_list is rebound here to the
# autoantibody file's columns, replacing the proteomics one from above.
aa_data_dict, aa_feature_list, patientID_list = make_data_dict(aa_file)

Cytokines in DB list:  456
Cytokines from proteomics data [including duplicates]:  507
Cytokines from proteomics data [removing duplicates]:  379


In [7]:
#number of proteomics rows kept by make_p_data_dict (rows sharing the same leading symbol are all counted)
len(p_feature_list)

507

In [12]:
#correlation between proteins and clinical information
#main() keeps the patients with acpa 1 or 2 and writes the Spearman coefficient where p < 0.05, 0 otherwise
output_file = "../../../analysis/statistics/cytokine_correlation/cytokine_clinical_covariates_corr.tsv"
main(patient_info_file, p_data_dict, p_feature_list, output_file)


In [15]:
#correlation between proteins and autoantibodies
#main_v2() keeps the patients with acpa 0, 1 or 2 and writes the Spearman coefficient where p < 0.05, 0 otherwise
output_file = "../../../analysis/statistics/cytokine_correlation/cytokine_autoantibody_corr.tsv"
main_v2(patient_info_file, p_data_dict, p_feature_list, aa_data_dict, aa_feature_list, output_file)