In [1]:
import pandas as pd
from scipy import stats
import statistics
import numpy as np
import operator

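# Correlation plan:
#   Omics feature abundance within a group ~ clinical covariate within the same group,
#   e.g. protein abundance of ACPA– RA ~ BMI of ACPA– RA
#   -> tests whether protein abundance in ACPA– RA is related to BMI in ACPA– RA.
#
# Clinical covariates and the groups listed for each:
#   BMI:       ACPA–, ACPA+, Control
#   AGE:       ACPA–, ACPA+, Control
#   DAS28-CRP: ACPA–, ACPA+
#   CRP:       ACPA–, ACPA+
#   ESR:       ACPA–, ACPA+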


In [2]:
def make_data_dict(data_file):
    """Load a tab-separated feature-by-sample table into a flat lookup dict.

    The file is read with its first column as the row index (features) and
    its header row as the column labels (patient IDs).

    Parameters
    ----------
    data_file : str or path-like
        Path to the TSV file.

    Returns
    -------
    data_dict : dict
        Maps ``(feature, patientID)`` to the table cell at that row/column.
    feature_list : numpy.ndarray
        Row labels, in file order.
    patientID_list : list
        Column labels, in file order.
    """
    data_df = pd.read_csv(data_file, sep="\t", index_col=0)
    n_rows, n_cols = data_df.shape

    feature_list = data_df.index.values
    patientID_list = data_df.columns.values

    data_dict = {}
    for i in range(n_rows):
        feature = feature_list[i]
        for j in range(n_cols):
            # .iat is a purely positional scalar lookup. The earlier
            # data_df.iloc[i][j] materialised a whole row Series for every
            # cell and then depended on Series[int] falling back to positional
            # access on a label index, which pandas has deprecated.
            data_dict[feature, patientID_list[j]] = data_df.iat[i, j]

    return data_dict, feature_list, list(patientID_list)


def make_correlation_dict(data_dict, patient_info_df, feature_list, clinical_covariate_list, sample_type, correlation_dict):
    """Spearman-correlate every feature with every clinical covariate for one group.

    Parameters
    ----------
    data_dict : dict
        ``(feature, patientID)`` -> value. Patient IDs are looked up as
        ``str(int(sample_ID))``, so the dict keys must be integer-like strings.
    patient_info_df : pandas.DataFrame
        Needs the columns ``'acpa'`` and ``'sample_ID'`` plus one column per
        entry of ``clinical_covariate_list``.
    feature_list : iterable
        Features to correlate.
    clinical_covariate_list : iterable of str
        Column names in ``patient_info_df`` to correlate against.
    sample_type : int
        ``'acpa'`` code of the group; also the third element of the result key.
    correlation_dict : dict
        Updated in place and returned, so results of several calls accumulate.

    Returns
    -------
    dict
        ``correlation_dict`` with ``[rho, pval]`` stored under
        ``(feature, clinical_covariate, sample_type)``.
    """
    # NOTE(review): this keeps rows whose 'acpa' is 0 OR sample_type, so for a
    # non-zero sample_type the acpa == 0 rows are pooled into the group. The
    # previous comment here said "considering RA samples only", which does not
    # match the filter -- confirm whether `== sample_type` was intended.
    group_df = patient_info_df[patient_info_df['acpa'].isin([0, sample_type])]
    sample_ids = list(group_df["sample_ID"])

    # Everything below is independent of the feature, so build it once instead
    # of once per feature (the per-patient mask lookup was the dominant cost).
    data_keys = [str(int(sample_id)) for sample_id in sample_ids]

    covariate_values = {}
    for clinical_covariate in clinical_covariate_list:
        # The first row seen for a sample_ID wins, matching the earlier
        # `.loc[sample_ID == id].values[0]` lookup when IDs repeat.
        first_value_by_id = {}
        for sample_id, value in zip(sample_ids, group_df[clinical_covariate].values):
            first_value_by_id.setdefault(sample_id, value)
        covariate_values[clinical_covariate] = [first_value_by_id[sample_id] for sample_id in sample_ids]

    for feature in feature_list:
        feature_value_list = [data_dict[feature, key] for key in data_keys]

        for clinical_covariate in clinical_covariate_list:
            # nan_policy="omit": pairs with a missing value are dropped.
            corr, pval = stats.spearmanr(
                feature_value_list,
                covariate_values[clinical_covariate],
                nan_policy="omit",
            )
            correlation_dict[feature, clinical_covariate, sample_type] = [corr, pval]

    return correlation_dict

def make_output(correlation_dict, feature_list, clinical_covariate, output_dir, omics_type):
    """Write rank-ordered Spearman rho values for groups 2 and 1 to a TSV file.

    The file ``<output_dir>/<omics_type>_<clinical_covariate>.rho.tsv`` is meant
    for a scatter plot with rank on the X-axis and rho on the Y-axis. Features
    are ranked by descending rho in group 2 (the ACPA– group named in the
    header); each feature then contributes one row for group 2 (ACPA– RA) and
    one row for group 1 (ACPA+ RA), both carrying the same rank.

    Parameters
    ----------
    correlation_dict : dict
        ``(feature, clinical_covariate, sample_type)`` -> ``[rho, pval]``.
    feature_list : iterable
        Features to write; repeated names are written once.
    clinical_covariate : str
        Covariate whose correlations are written.
    output_dir : str
        Existing directory for the output file.
    omics_type : str
        Prefix of the output file name.
    """
    output_file = '%s/%s_%s.rho.tsv' % (output_dir, omics_type, clinical_covariate)

    # Rank on the group-2 rho. A dict drops repeated feature names while
    # keeping first-seen order.
    correlation_rank_dict = {}
    for feature in feature_list:
        correlation_rank_dict[feature] = correlation_dict[feature, clinical_covariate, 2][0]

    def descending_nan_last(item):
        rho_value = item[1]
        # NaN compares False against everything, which silently breaks a plain
        # sort and can mis-rank the non-NaN features; park NaN at the end.
        if rho_value != rho_value:
            return (1, 0.0)
        return (0, -rho_value)

    # The sort is stable, so ties keep their feature_list order.
    ranked_features = [
        feature
        for feature, _ in sorted(correlation_rank_dict.items(), key=descending_nan_last)
    ]

    # `with` closes the file even if a lookup below raises; UTF-8 is pinned
    # because the header contains a non-ASCII dash.
    with open(output_file, 'w', encoding='utf-8') as output_txt:
        output_txt.write('Rank(ACPA–)\tRho\tRA_phenotype\n')

        # rank is the X-axis index shared by both groups of a feature.
        for rank, feature in enumerate(ranked_features, start=1):
            for sample_type in (2, 1):  # ACPA– RA first, then ACPA+ RA
                rho = correlation_dict[feature, clinical_covariate, sample_type][0]
                output_txt.write('%s\t%s\t%s\n' % (rank, rho, sample_type))

In [3]:
def main_v2(patient_info_file, data_dict, feature_list, output_dir, omics_type):
    """Correlate one omics table with BMI and age per group and write the rankings.

    Reads the tab-separated patient table, runs ``make_correlation_dict`` for
    the 'acpa' groups 1, 2 and 0 (in that order) into one shared dict, then
    calls ``make_output_v2`` once for 'bmi' and once for 'age'.

    Parameters
    ----------
    patient_info_file : str
        Path to the tab-separated patient information table.
    data_dict : dict
        ``(feature, patientID)`` -> value for the omics table.
    feature_list : iterable
        Features of the omics table to correlate.
    output_dir : str
        Directory the ``.rho.tsv`` files are written to.
    omics_type : str
        Prefix used in the output file names.
    """
    patient_info_df = pd.read_csv(patient_info_file, sep="\t")
    clinical_covariate_list = ["bmi", "age"]

    # One dict collects the results of all three groups.
    correlation_dict = {}
    for sample_type in (1, 2, 0):
        correlation_dict = make_correlation_dict(
            data_dict,
            patient_info_df,
            feature_list,
            clinical_covariate_list,
            sample_type,
            correlation_dict,
        )

    # One output file per covariate.
    for clinical_covariate in clinical_covariate_list:
        make_output_v2(correlation_dict, feature_list, clinical_covariate, output_dir, omics_type)
    
def make_output_v2(correlation_dict, feature_list, clinical_covariate, output_dir, omics_type):
    """Write rank-ordered Spearman rho values for groups 2, 1 and 0 to a TSV file.

    The file ``<output_dir>/<omics_type>_<clinical_covariate>.rho.tsv`` is meant
    for a scatter plot with rank on the X-axis and rho on the Y-axis. Features
    are ranked by descending rho in group 2 (the ACPA– group named in the
    header); each feature then contributes three rows sharing that rank:
    group 2 (ACPA– RA), group 1 (ACPA+ RA) and group 0 (presumably the
    control group -- confirm against the 'acpa' coding).

    Parameters
    ----------
    correlation_dict : dict
        ``(feature, clinical_covariate, sample_type)`` -> ``[rho, pval]``.
    feature_list : iterable
        Features to write; repeated names are written once.
    clinical_covariate : str
        Covariate whose correlations are written.
    output_dir : str
        Existing directory for the output file.
    omics_type : str
        Prefix of the output file name.
    """
    output_file = '%s/%s_%s.rho.tsv' % (output_dir, omics_type, clinical_covariate)

    # Rank on the group-2 rho. A dict drops repeated feature names while
    # keeping first-seen order.
    correlation_rank_dict = {}
    for feature in feature_list:
        correlation_rank_dict[feature] = correlation_dict[feature, clinical_covariate, 2][0]

    def descending_nan_last(item):
        rho_value = item[1]
        # NaN compares False against everything, which silently breaks a plain
        # sort and can mis-rank the non-NaN features; park NaN at the end.
        if rho_value != rho_value:
            return (1, 0.0)
        return (0, -rho_value)

    # The sort is stable, so ties keep their feature_list order.
    ranked_features = [
        feature
        for feature, _ in sorted(correlation_rank_dict.items(), key=descending_nan_last)
    ]

    # `with` closes the file even if a lookup below raises; UTF-8 is pinned
    # because the header contains a non-ASCII dash.
    with open(output_file, 'w', encoding='utf-8') as output_txt:
        output_txt.write('Rank(ACPA–)\tRho\tRA_phenotype\n')

        # rank is the X-axis index shared by all groups of a feature.
        for rank, feature in enumerate(ranked_features, start=1):
            for sample_type in (2, 1, 0):
                rho = correlation_dict[feature, clinical_covariate, sample_type][0]
                output_txt.write('%s\t%s\t%s\n' % (rank, rho, sample_type))

In [4]:
# Shared input/output locations used by every main_v2 run in this notebook.
# Both are relative paths, so they resolve against the notebook's working directory.
patient_info_file = '../../../preprocessed_data/meta/patient_info_for_statistics.tsv'
output_dir = "../../../analysis/statistics/omics_clinical_feature_correlation"

# NOTE(review): the lines below are disabled. They call `main(...)` with an
# `output_file` argument; no function named `main` is defined in the visible
# notebook -- presumably an earlier version of main_v2. Confirm before re-enabling.
# p_file = "../../../preprocessed_data/proteomics/somascan_anml.T.v2.tsv"
# p_data_dict, p_feature_list, patientID_list = make_data_dict(p_file)

# aa_file = '/Users/m221138/RA_acpa_multiomics/preprocessed_data/autoantibody/sengenics_qnorm_data.v2.tsv'
# aa_data_dict, aa_feature_list, patientID_list = make_data_dict(aa_file)

# #correlation between proteins and clinical information
# output_file = "../../../analysis/statistics/omics_clinical_feature_correlation/autoantibody_patient_info_correlation..tsv"
# main(patient_info_file, aa_data_dict, aa_feature_list, output_file)


# Metabolomics run: load the feature-by-sample table, then write
# <output_dir>/metabolomics_bmi.rho.tsv and <output_dir>/metabolomics_age.rho.tsv.
# NOTE(review): this is an absolute, user-specific path while the paths above
# are relative; it will not resolve on another machine.
m_file = '/Users/m221138/RA_acpa_multiomics/preprocessed_data/metabolomics/metabolone_raw_norm_preprocessed.v2.tsv'
m_data_dict, m_feature_list, patientID_list = make_data_dict(m_file)

main_v2(patient_info_file, m_data_dict, m_feature_list, output_dir, 'metabolomics')




In [52]:

# Autoantibody run: load the feature-by-sample table, then write the BMI and
# age rho rankings under output_dir.
# NOTE(review): absolute, user-specific paths; they will not resolve on another machine.
aa_file = '/Users/m221138/RA_acpa_multiomics/preprocessed_data/autoantibody/sengenics_qnorm_data.v2.tsv'
aa_data_dict, aa_feature_list, patientID_list = make_data_dict(aa_file)

main_v2(patient_info_file, aa_data_dict, aa_feature_list, output_dir, 'autoantibody')

# Proteomics run. The table has to be loaded here: p_file was assigned but
# never read, and p_data_dict / p_feature_list were not defined by any active
# code, so this cell raised NameError on a fresh kernel (Restart & Run All).
p_file = '/Users/m221138/RA_acpa_multiomics/preprocessed_data/proteomics/somascan_anml.T.v2.tsv'
p_data_dict, p_feature_list, patientID_list = make_data_dict(p_file)

main_v2(patient_info_file, p_data_dict, p_feature_list, output_dir, 'proteomics')
