In [None]:
import pandas as pd
import numpy as np

## Reading in ISB Paper Data

### ISB clinical data for INCOV cohort

https://data.mendeley.com/datasets/96v329bg7g/1

In [2]:
# Read the second worksheet of Table S1 (sheet_name is 0-indexed) and make the
# headers lower-case with spaces replaced by underscores. Only the space
# character is replaced, so other whitespace inside a header is left as-is.
isb_incov_clinical = pd.read_excel("data/Table S1.xlsx", sheet_name=1)
isb_incov_clinical.columns = (
    isb_incov_clinical.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
)
isb_incov_clinical

Unnamed: 0,study_subject_id,blood_draw,observation_days_since_enrollment,onset_of_symptoms_days_before_enrollment,observation_days_since_onset_of_symptoms,encounter_location,who_ordinal_scale,mechanical_ventilation,respiratory_support,systolic_bp,...,isoplexis_cd8_assay,isoplexis_cd4_assay,isoplexis_monocyte_assay,isoplexis_nk_assay,sars-cov-2_antibody_assay,autoantibody_\nassay,sara-cov-2_rnaemia_assay,sara-cov-2_nasal_swab_assay,ebv_viremia_assay,cmv_viremia_assay
0,INCOV001,INCOV001-T1,0.92,16.04,16.96,Hospital,3,0,,139.0,...,No,No,No,No,No,Yes,Yes,No,Yes,Yes
1,INCOV001,INCOV001-T2,5.92,16.04,21.96,Hospital,3,0,,140.0,...,No,No,No,No,No,No,Yes,No,No,No
2,INCOV001,INCOV001-T3,105.21,16.04,121.25,Home (mobile phlebotomy),<=2,0,,,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,No,No
3,INCOV002,INCOV002-T1,0.71,6.33,7.04,ICU,5,0,High flow nasal cannula (HFNC),96.0,...,No,No,No,No,No,Yes,Yes,No,Yes,Yes
4,INCOV002,INCOV002-T2,5.75,6.33,12.08,ICU,7,1,Other,125.0,...,No,No,No,No,No,No,Yes,No,Yes,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520,INCOV212,INCOV212-T1,0.67,12.62,13.29,Hospital,3,0,,140.0,...,,,,,No,No,Yes,Yes,No,No
521,INCOV212,INCOV212-T2,18.96,12.62,31.58,Clinic,1 or 2,0,,,...,,,,,No,No,Yes,Yes,No,No
522,INCOV213,INCOV213-T1,0.54,9.17,9.71,ICU,5,0,High flow nasal cannula (HFNC),108.0,...,,,,,No,No,Yes,No,No,No
523,INCOV213,INCOV213-T2,3.54,9.17,12.71,ICU,5,0,High flow nasal cannula (HFNC),125.0,...,,,,,No,No,Yes,No,No,No


In [3]:
# Map the INCOV column names onto the shorter names used downstream.
# rename() silently skips keys that are not present in the frame.
# The leftover template entries ('diabetes' -> 'new_column5' and
# 'old_column6' -> 'new_column6') were dropped: they carried no meaning and
# would have replaced a real 'diabetes' column with a placeholder label.
incov_rename = {
    'who_ordinal_scale': 'who_severity',
    'age_at_baseline': 'age',
    'chronic_kidney_disease': 'kidney_disease',  # common feature name with the other dataset
    'chronic_obstructive_pulmonary_disease': 'copd',
}

# Apply the mapping; rename() returns a new frame, which is bound back to the same name.
isb_incov_clinical = isb_incov_clinical.rename(columns=incov_rename)
isb_incov_clinical

Unnamed: 0,study_subject_id,blood_draw,observation_days_since_enrollment,onset_of_symptoms_days_before_enrollment,observation_days_since_onset_of_symptoms,encounter_location,who_severity,mechanical_ventilation,respiratory_support,systolic_bp,...,isoplexis_cd8_assay,isoplexis_cd4_assay,isoplexis_monocyte_assay,isoplexis_nk_assay,sars-cov-2_antibody_assay,autoantibody_\nassay,sara-cov-2_rnaemia_assay,sara-cov-2_nasal_swab_assay,ebv_viremia_assay,cmv_viremia_assay
0,INCOV001,INCOV001-T1,0.92,16.04,16.96,Hospital,3,0,,139.0,...,No,No,No,No,No,Yes,Yes,No,Yes,Yes
1,INCOV001,INCOV001-T2,5.92,16.04,21.96,Hospital,3,0,,140.0,...,No,No,No,No,No,No,Yes,No,No,No
2,INCOV001,INCOV001-T3,105.21,16.04,121.25,Home (mobile phlebotomy),<=2,0,,,...,Yes,Yes,Yes,Yes,Yes,Yes,Yes,No,No,No
3,INCOV002,INCOV002-T1,0.71,6.33,7.04,ICU,5,0,High flow nasal cannula (HFNC),96.0,...,No,No,No,No,No,Yes,Yes,No,Yes,Yes
4,INCOV002,INCOV002-T2,5.75,6.33,12.08,ICU,7,1,Other,125.0,...,No,No,No,No,No,No,Yes,No,Yes,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
520,INCOV212,INCOV212-T1,0.67,12.62,13.29,Hospital,3,0,,140.0,...,,,,,No,No,Yes,Yes,No,No
521,INCOV212,INCOV212-T2,18.96,12.62,31.58,Clinic,1 or 2,0,,,...,,,,,No,No,Yes,Yes,No,No
522,INCOV213,INCOV213-T1,0.54,9.17,9.71,ICU,5,0,High flow nasal cannula (HFNC),108.0,...,,,,,No,No,Yes,No,No,No
523,INCOV213,INCOV213-T2,3.54,9.17,12.71,ICU,5,0,High flow nasal cannula (HFNC),125.0,...,,,,,No,No,Yes,No,No,No


For analysis, only the first timepoint is needed.

In [4]:
# Keep only the rows whose blood_draw label ends in '-T1'.
isb_incov_clinical_t1_data = isb_incov_clinical.loc[
    lambda frame: frame['blood_draw'].str.endswith('-T1')
]

Also, only the observations that have both proteomic and metabolomic data are needed.

In [5]:
# Drop every row flagged 'No' for either assay; a row is kept when neither the
# 'proteomics' nor the 'metabolomics' column equals 'No'.
isb_incov_clinical_t1_data = isb_incov_clinical_t1_data.loc[
    lambda frame: ~((frame['proteomics'] == 'No') | (frame['metabolomics'] == 'No'))
]
isb_incov_clinical_t1_data

Unnamed: 0,study_subject_id,blood_draw,observation_days_since_enrollment,onset_of_symptoms_days_before_enrollment,observation_days_since_onset_of_symptoms,encounter_location,who_severity,mechanical_ventilation,respiratory_support,systolic_bp,...,isoplexis_cd8_assay,isoplexis_cd4_assay,isoplexis_monocyte_assay,isoplexis_nk_assay,sars-cov-2_antibody_assay,autoantibody_\nassay,sara-cov-2_rnaemia_assay,sara-cov-2_nasal_swab_assay,ebv_viremia_assay,cmv_viremia_assay
0,INCOV001,INCOV001-T1,0.92,16.04,16.96,Hospital,3,0,,139.0,...,No,No,No,No,No,Yes,Yes,No,Yes,Yes
3,INCOV002,INCOV002-T1,0.71,6.33,7.04,ICU,5,0,High flow nasal cannula (HFNC),96.0,...,No,No,No,No,No,Yes,Yes,No,Yes,Yes
6,INCOV003,INCOV003-T1,0.54,16.54,17.08,ICU,7,1,Other,89.0,...,No,No,No,No,No,Yes,Yes,No,Yes,Yes
9,INCOV004,INCOV004-T1,0.75,2.21,2.96,Hospital,4,0,Nasal cannula,137.0,...,No,No,No,No,No,Yes,Yes,No,No,No
11,INCOV005,INCOV005-T1,0.00,14.21,14.21,Hospital,4,0,Nasal cannula,126.0,...,No,No,No,No,No,Yes,Yes,No,Yes,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,INCOV198,INCOV198-T1,0.67,8.21,8.88,Hospital,6,1,Other,113.0,...,,,,,No,Yes,Yes,Yes,Yes,Yes
495,INCOV199,INCOV199-T1,0.62,27.12,27.74,ICU,7,1,Other,132.0,...,,,,,No,No,Yes,Yes,No,No
497,INCOV200,INCOV200-T1,0.62,19.42,20.04,ICU,7,1,Other,114.0,...,No,No,No,No,No,Yes,Yes,Yes,No,No
499,INCOV201,INCOV201-T1,1.62,7.08,8.70,ICU,7,1,Other,116.0,...,,,,,No,Yes,Yes,Yes,Yes,Yes


In [6]:
# Use the same key name ('subject_id') as the omics tables.
# The frame was produced by boolean indexing, so an in-place rename raises
# SettingWithCopyWarning; rebinding the result of rename() avoids that.
isb_incov_clinical_t1_data = isb_incov_clinical_t1_data.rename(
    columns={'study_subject_id': 'subject_id'}
)

In [7]:
# Print column dtypes and non-null counts for the filtered first-timepoint clinical frame.
isb_incov_clinical_t1_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 198 entries, 0 to 502
Data columns (total 47 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   subject_id                                198 non-null    object 
 1   blood_draw                                198 non-null    object 
 2   observation_days_since_enrollment         198 non-null    float64
 3   onset_of_symptoms_days_before_enrollment  198 non-null    float64
 4   observation_days_since_onset_of_symptoms  198 non-null    float64
 5   encounter_location                        198 non-null    object 
 6   who_severity                              198 non-null    object 
 7   mechanical_ventilation                    198 non-null    int64  
 8   respiratory_support                       198 non-null    object 
 9   systolic_bp                               139 non-null    float64
 10  diastolic_bp                          

In [8]:
# Sanity check: tally the values left in the 'proteomics' column after the filter above.
isb_incov_clinical_t1_data["proteomics"].value_counts()

Yes    198
Name: proteomics, dtype: int64

Next, I check the percentage of missing values in each column. The columns with missing values aren't needed for the analysis, so I can move on.

In [9]:
# Percentage of missing values per column: null count * 100 / number of rows.
(
    isb_incov_clinical_t1_data
    .isna()
    .sum()
    .mul(100)
    .div(len(isb_incov_clinical_t1_data))
)

subject_id                                   0.000000
blood_draw                                   0.000000
observation_days_since_enrollment            0.000000
onset_of_symptoms_days_before_enrollment     0.000000
observation_days_since_onset_of_symptoms     0.000000
encounter_location                           0.000000
who_severity                                 0.000000
mechanical_ventilation                       0.000000
respiratory_support                          0.000000
systolic_bp                                 29.797980
diastolic_bp                                29.797980
temperature                                 29.797980
pulse                                       29.797980
blood_oxygenation                           29.797980
chest_xray_ct                               27.777778
bmi_at_baseline                             29.292929
sex                                          0.000000
age                                          0.000000
ethnicity                   

### ISB proteomics data

In [13]:
# Read the second worksheet (sheet_name is 0-indexed) of Table S2 as the proteomics table.
isb_proteomics = pd.read_excel("data/Table S2.xlsx", sheet_name = 1)
isb_proteomics

Unnamed: 0,Patient Subject ID,Blood Draw,Healthy or INCOV,age,sex,BMI,LGALS9_CVD2,TNFRSF10A_CVD2,AGRP_CVD2,TNFRSF10B_CVD2,...,EGFL7_ODA,RCOR1_ODA,CA14_ODA,STX8_ODA,PRKAB1_ODA,MAP4K5_ODA,NUB1_ODA,ENTPD2_ODA,MAGED1_ODA,GPR56_ODA
0,1004350,,Healthy,32,Female,23.130000,0.018738,-0.177699,1.010091,0.094147,...,0.244369,0.704403,0.354924,1.781293,2.285133,0.972785,1.869674,-0.400720,1.120317,-0.606187
1,1010823,,Healthy,64,Female,21.660000,0.309203,-0.036492,0.189023,0.418735,...,-0.028634,-0.057806,-0.456894,0.162826,0.077889,0.689977,0.562815,-0.501666,-0.737454,0.390353
2,1013541,,Healthy,41,Female,23.460000,0.298717,-0.140791,0.018454,-1.417946,...,-0.131302,-0.450438,0.060958,1.749543,1.565091,0.695397,0.198925,-0.017642,-0.292817,-0.812126
3,1016929,,Healthy,49,Male,25.240000,0.667058,0.750248,0.480728,0.132137,...,-1.035753,-0.675164,-0.636819,0.244157,-0.158365,0.423878,0.684548,-0.486275,1.634976,-0.963599
4,1019071,,Healthy,53,Female,23.470000,0.222519,0.185525,0.300923,-0.531894,...,0.530495,0.199890,0.830008,1.251163,0.522049,1.008268,0.953621,0.094314,0.911377,0.335281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
812,INCOV210,INCOV210-T1,INCOV,43,Female,37.530864,-0.622214,0.556100,0.752388,0.668038,...,2.666567,1.668093,-0.420048,1.514181,1.813371,1.262130,1.341295,1.391966,1.885676,1.599052
813,INCOV212,INCOV212-T2,INCOV,42,Female,41.419527,2.255815,3.864351,3.532702,3.140393,...,2.941637,-0.453591,1.819288,0.422113,0.507713,0.341370,0.183331,0.291589,0.770418,0.263618
814,INCOV212,INCOV212-T1,INCOV,42,Female,41.419527,3.289477,5.586997,5.192877,5.303065,...,2.336255,-0.069623,2.434879,0.807309,0.808298,0.642704,0.941103,-0.467671,0.687298,0.325864
815,INCOV213,INCOV213-T2,INCOV,42,Male,33.926954,2.807695,2.944503,1.353525,5.737100,...,2.526673,2.025476,-3.181148,0.117583,-0.169242,0.336285,0.525564,3.155780,1.367215,4.922213


In [14]:
# Everything from the seventh column onwards is treated as a protein column;
# the six leading columns are Patient Subject ID, Blood Draw, Healthy or INCOV,
# age, sex and BMI.
protein_names = list(isb_proteomics.columns[6:])

def remove_tag(protein: str) -> str:
    """Return the text of ``protein`` that comes before its first underscore.

    A name without any underscore is returned unchanged, e.g.
    ``"LGALS9_CVD2"`` -> ``"LGALS9"`` and ``"GPR56"`` -> ``"GPR56"``.
    """
    prefix, _sep, _rest = protein.partition("_")
    return prefix

# Strip the suffix from every protein column name.
clean_protein_names = list(map(remove_tag, protein_names))

# Save the cleaned names, one per line.
with open('clean_proteins.txt', 'w') as names_file:
    names_file.writelines("%s\n" % name for name in clean_protein_names)

# Original column name -> cleaned column name.
# NOTE(review): if the same protein occurs under more than one suffix, the
# cleaned names collide and the frame ends up with duplicate column labels —
# confirm against the source table.
column_mapping = {
    original: cleaned
    for original, cleaned in zip(protein_names, clean_protein_names)
}

# Rename the protein columns on the loaded frame itself, then export it.
isb_proteomics.rename(columns=column_mapping, inplace=True)
isb_proteomics.to_excel('updated_isb_proteomics.xlsx', index=False)

isb_proteomics

Unnamed: 0,Patient Subject ID,Blood Draw,Healthy or INCOV,age,sex,BMI,LGALS9,TNFRSF10A,AGRP,TNFRSF10B,...,EGFL7,RCOR1,CA14,STX8,PRKAB1,MAP4K5,NUB1,ENTPD2,MAGED1,GPR56
0,1004350,,Healthy,32,Female,23.130000,0.018738,-0.177699,1.010091,0.094147,...,0.244369,0.704403,0.354924,1.781293,2.285133,0.972785,1.869674,-0.400720,1.120317,-0.606187
1,1010823,,Healthy,64,Female,21.660000,0.309203,-0.036492,0.189023,0.418735,...,-0.028634,-0.057806,-0.456894,0.162826,0.077889,0.689977,0.562815,-0.501666,-0.737454,0.390353
2,1013541,,Healthy,41,Female,23.460000,0.298717,-0.140791,0.018454,-1.417946,...,-0.131302,-0.450438,0.060958,1.749543,1.565091,0.695397,0.198925,-0.017642,-0.292817,-0.812126
3,1016929,,Healthy,49,Male,25.240000,0.667058,0.750248,0.480728,0.132137,...,-1.035753,-0.675164,-0.636819,0.244157,-0.158365,0.423878,0.684548,-0.486275,1.634976,-0.963599
4,1019071,,Healthy,53,Female,23.470000,0.222519,0.185525,0.300923,-0.531894,...,0.530495,0.199890,0.830008,1.251163,0.522049,1.008268,0.953621,0.094314,0.911377,0.335281
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
812,INCOV210,INCOV210-T1,INCOV,43,Female,37.530864,-0.622214,0.556100,0.752388,0.668038,...,2.666567,1.668093,-0.420048,1.514181,1.813371,1.262130,1.341295,1.391966,1.885676,1.599052
813,INCOV212,INCOV212-T2,INCOV,42,Female,41.419527,2.255815,3.864351,3.532702,3.140393,...,2.941637,-0.453591,1.819288,0.422113,0.507713,0.341370,0.183331,0.291589,0.770418,0.263618
814,INCOV212,INCOV212-T1,INCOV,42,Female,41.419527,3.289477,5.586997,5.192877,5.303065,...,2.336255,-0.069623,2.434879,0.807309,0.808298,0.642704,0.941103,-0.467671,0.687298,0.325864
815,INCOV213,INCOV213-T2,INCOV,42,Male,33.926954,2.807695,2.944503,1.353525,5.737100,...,2.526673,2.025476,-3.181148,0.117583,-0.169242,0.336285,0.525564,3.155780,1.367215,4.922213


Once again, only the first timepoint is needed. Also, I will only be using COVID patients and not healthy controls.

In [15]:
# Keep INCOV rows whose Blood Draw label ends in '-T1'.
# Rows with no Blood Draw value would make .str.endswith() return NaN;
# na=False turns those into False so the mask is strictly boolean.
# .copy() yields an independent frame, so later modifications do not raise
# SettingWithCopyWarning.
isb_proteomics_t1 = isb_proteomics[
    isb_proteomics['Blood Draw'].str.endswith('-T1', na=False)
    & (isb_proteomics['Healthy or INCOV'] == 'INCOV')
].copy()
isb_proteomics_t1

Unnamed: 0,Patient Subject ID,Blood Draw,Healthy or INCOV,age,sex,BMI,LGALS9,TNFRSF10A,AGRP,TNFRSF10B,...,EGFL7,RCOR1,CA14,STX8,PRKAB1,MAP4K5,NUB1,ENTPD2,MAGED1,GPR56
289,INCOV001,INCOV001-T1,INCOV,77,Female,33.657783,2.668531,3.099609,0.705863,2.141790,...,0.638182,-0.705450,-1.612659,-0.593469,-0.596764,-0.796593,-1.206721,-0.530558,0.610169,4.567199
292,INCOV002,INCOV002-T1,INCOV,39,Male,27.900747,3.607274,1.528715,1.750012,3.131619,...,-0.131315,2.049932,-4.237300,0.280148,0.322202,1.118726,1.311234,-2.340461,1.856097,-0.021463
295,INCOV003,INCOV003-T1,INCOV,64,Male,24.546988,2.953096,4.105836,1.221295,3.712461,...,1.477143,1.079921,-5.751694,-0.192531,0.375482,0.825320,0.532453,-1.744499,1.552228,3.638741
298,INCOV004,INCOV004-T1,INCOV,76,Male,23.215788,3.016487,3.330754,3.318305,2.609300,...,2.344226,-0.222570,-1.477135,-1.342922,-0.186438,-0.450782,-0.730134,-1.431956,0.857295,3.260736
300,INCOV005,INCOV005-T1,INCOV,75,Male,29.703121,2.613899,4.373015,2.135831,5.139291,...,0.046332,1.334650,-1.970992,-1.129451,0.038011,0.398932,-0.386438,1.232084,1.440381,-0.306113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
807,INCOV206,INCOV206-T1,INCOV,67,Male,33.037475,3.925704,4.481338,2.950986,4.424560,...,2.197081,2.351290,-4.032202,1.334862,1.614995,1.730729,1.966007,1.251564,2.722881,1.567108
810,INCOV209,INCOV209-T1,INCOV,35,Female,40.854454,3.532917,13.356042,10.040362,14.369688,...,5.109097,9.918594,5.845989,3.024152,3.143168,1.257952,2.020599,2.041171,2.665735,1.684295
812,INCOV210,INCOV210-T1,INCOV,43,Female,37.530864,-0.622214,0.556100,0.752388,0.668038,...,2.666567,1.668093,-0.420048,1.514181,1.813371,1.262130,1.341295,1.391966,1.885676,1.599052
814,INCOV212,INCOV212-T1,INCOV,42,Female,41.419527,3.289477,5.586997,5.192877,5.303065,...,2.336255,-0.069623,2.434879,0.807309,0.808298,0.642704,0.941103,-0.467671,0.687298,0.325864


In [None]:
# Use the same key name ('subject_id') as the clinical table.
# The frame was produced by boolean indexing, so an in-place rename raises
# SettingWithCopyWarning; rebinding the result of rename() avoids that.
isb_proteomics_t1 = isb_proteomics_t1.rename(
    columns={'Patient Subject ID': 'subject_id'}
)

In [17]:
# Load the UniProt lookup table; the columns used later are 'From' and 'Entry'.
uniprot = pd.read_excel("uniprot-ids.xlsx")
uniprot

  warn("Workbook contains no default style, apply openpyxl's default")


Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length
0,LILRA5,A6NI73,reviewed,LIRA5_HUMAN,Leukocyte immunoglobulin-like receptor subfami...,LILRA5 ILT11 LILRB7 LIR9,Homo sapiens (Human),299
1,SNAP23,O00161,reviewed,SNP23_HUMAN,Synaptosomal-associated protein 23 (SNAP-23) (...,SNAP23,Homo sapiens (Human),211
2,LGALS9,O00182,reviewed,LEG9_HUMAN,Galectin-9 (Gal-9) (Ecalectin) (Tumor antigen ...,LGALS9,Homo sapiens (Human),355
3,STXBP3,O00186,reviewed,STXB3_HUMAN,Syntaxin-binding protein 3 (Platelet Sec1 prot...,STXBP3,Homo sapiens (Human),592
4,TNFRSF10A,O00220,reviewed,TR10A_HUMAN,Tumor necrosis factor receptor superfamily mem...,TNFRSF10A APO2 DR4 TRAILR1,Homo sapiens (Human),468
...,...,...,...,...,...,...,...,...
453,ENTPD2,Q9Y5L3,reviewed,ENTP2_HUMAN,Ectonucleoside triphosphate diphosphohydrolase...,ENTPD2 CD39L1,Homo sapiens (Human),495
454,MAGED1,Q9Y5V3,reviewed,MAGD1_HUMAN,Melanoma-associated antigen D1 (MAGE tumor ant...,MAGED1 NRAGE PP2250 PRO2292,Homo sapiens (Human),778
455,GPR56,Q9Y653,reviewed,AGRG1_HUMAN,Adhesion G-protein coupled receptor G1 (G-prot...,ADGRG1 GPR56 TM7LN4 TM7XN1 UNQ540/PRO1083,Homo sapiens (Human),693
456,IKBKG,Q9Y6K9,reviewed,NEMO_HUMAN,NF-kappa-B essential modulator (NEMO) (FIP-3) ...,IKBKG FIP3 NEMO,Homo sapiens (Human),419


In [18]:
# Collapse the lookup to one row per value of 'From', keeping the first
# non-null 'Entry' listed for it, and save the result for reuse.
uniprot_unique = uniprot.groupby('From', as_index=False)['Entry'].first()
uniprot_unique.to_excel('unique_uniprot_mapping.xlsx', index=False)

Now, I need to filter the proteomic dataset to match the INCOV clinical dataset subject IDs.

In [19]:
# Subjects present in both the proteomics and the clinical first-timepoint tables.
shared_ids = set(isb_proteomics_t1['subject_id']) & set(isb_incov_clinical_t1_data['subject_id'])

# Restrict each table to those shared subjects.
mod_isb_proteomics_t1 = isb_proteomics_t1.loc[
    isb_proteomics_t1['subject_id'].isin(shared_ids)
]
mod_isb_incov_clinical_t1_data = isb_incov_clinical_t1_data.loc[
    isb_incov_clinical_t1_data['subject_id'].isin(shared_ids)
]

In [20]:
# Display the clinical rows kept after restricting to subjects shared with the proteomics table.
mod_isb_incov_clinical_t1_data

Unnamed: 0,subject_id,blood_draw,observation_days_since_enrollment,onset_of_symptoms_days_before_enrollment,observation_days_since_onset_of_symptoms,encounter_location,who_severity,mechanical_ventilation,respiratory_support,systolic_bp,...,isoplexis_cd8_assay,isoplexis_cd4_assay,isoplexis_monocyte_assay,isoplexis_nk_assay,sars-cov-2_antibody_assay,autoantibody_\nassay,sara-cov-2_rnaemia_assay,sara-cov-2_nasal_swab_assay,ebv_viremia_assay,cmv_viremia_assay
0,INCOV001,INCOV001-T1,0.92,16.04,16.96,Hospital,3,0,,139.0,...,No,No,No,No,No,Yes,Yes,No,Yes,Yes
3,INCOV002,INCOV002-T1,0.71,6.33,7.04,ICU,5,0,High flow nasal cannula (HFNC),96.0,...,No,No,No,No,No,Yes,Yes,No,Yes,Yes
6,INCOV003,INCOV003-T1,0.54,16.54,17.08,ICU,7,1,Other,89.0,...,No,No,No,No,No,Yes,Yes,No,Yes,Yes
9,INCOV004,INCOV004-T1,0.75,2.21,2.96,Hospital,4,0,Nasal cannula,137.0,...,No,No,No,No,No,Yes,Yes,No,No,No
11,INCOV005,INCOV005-T1,0.00,14.21,14.21,Hospital,4,0,Nasal cannula,126.0,...,No,No,No,No,No,Yes,Yes,No,Yes,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,INCOV198,INCOV198-T1,0.67,8.21,8.88,Hospital,6,1,Other,113.0,...,,,,,No,Yes,Yes,Yes,Yes,Yes
495,INCOV199,INCOV199-T1,0.62,27.12,27.74,ICU,7,1,Other,132.0,...,,,,,No,No,Yes,Yes,No,No
497,INCOV200,INCOV200-T1,0.62,19.42,20.04,ICU,7,1,Other,114.0,...,No,No,No,No,No,Yes,Yes,Yes,No,No
499,INCOV201,INCOV201-T1,1.62,7.08,8.70,ICU,7,1,Other,116.0,...,,,,,No,Yes,Yes,Yes,Yes,Yes


In [21]:
# Display the proteomics rows kept after restricting to subjects shared with the clinical table.
mod_isb_proteomics_t1

Unnamed: 0,subject_id,Blood Draw,Healthy or INCOV,age,sex,BMI,LGALS9,TNFRSF10A,AGRP,TNFRSF10B,...,EGFL7,RCOR1,CA14,STX8,PRKAB1,MAP4K5,NUB1,ENTPD2,MAGED1,GPR56
289,INCOV001,INCOV001-T1,INCOV,77,Female,33.657783,2.668531,3.099609,0.705863,2.141790,...,0.638182,-0.705450,-1.612659,-0.593469,-0.596764,-0.796593,-1.206721,-0.530558,0.610169,4.567199
292,INCOV002,INCOV002-T1,INCOV,39,Male,27.900747,3.607274,1.528715,1.750012,3.131619,...,-0.131315,2.049932,-4.237300,0.280148,0.322202,1.118726,1.311234,-2.340461,1.856097,-0.021463
295,INCOV003,INCOV003-T1,INCOV,64,Male,24.546988,2.953096,4.105836,1.221295,3.712461,...,1.477143,1.079921,-5.751694,-0.192531,0.375482,0.825320,0.532453,-1.744499,1.552228,3.638741
298,INCOV004,INCOV004-T1,INCOV,76,Male,23.215788,3.016487,3.330754,3.318305,2.609300,...,2.344226,-0.222570,-1.477135,-1.342922,-0.186438,-0.450782,-0.730134,-1.431956,0.857295,3.260736
300,INCOV005,INCOV005-T1,INCOV,75,Male,29.703121,2.613899,4.373015,2.135831,5.139291,...,0.046332,1.334650,-1.970992,-1.129451,0.038011,0.398932,-0.386438,1.232084,1.440381,-0.306113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
792,INCOV198,INCOV198-T1,INCOV,40,Female,48.906250,2.724864,10.007715,5.673322,7.681026,...,4.092034,5.720195,-1.787520,2.262770,3.850830,1.704246,3.060482,1.068625,4.436255,1.308288
795,INCOV199,INCOV199-T1,INCOV,57,Male,28.095734,3.117667,6.406123,3.428378,5.976971,...,1.137260,2.992898,-4.655404,1.986283,2.249505,1.926585,3.077505,-0.283804,3.354330,1.632752
797,INCOV200,INCOV200-T1,INCOV,51,Male,31.577343,3.494963,7.675585,4.119787,9.449792,...,2.314785,2.765097,-2.855228,1.717302,1.869355,1.482842,1.360112,-0.862702,2.952369,1.537086
799,INCOV201,INCOV201-T1,INCOV,50,Female,39.304611,2.152516,0.860845,-1.971908,1.576786,...,1.989141,2.443872,-6.584477,1.423127,1.192990,0.999786,1.140251,1.298974,1.905874,1.578177


In [22]:
# Reload the saved one-row-per-'From' lookup and turn it into a plain dict
# ('From' value -> 'Entry' value).
isb_mapping_dataframe = pd.read_excel('unique_uniprot_mapping.xlsx')
isb_mapping_dict = dict(
    zip(isb_mapping_dataframe['From'], isb_mapping_dataframe['Entry'])
)

# Swap each column label for its mapped 'Entry'; labels without a mapping
# (e.g. subject_id, age, sex) are kept as they are.
new_columns = list(
    map(lambda label: isb_mapping_dict.get(label, label), mod_isb_proteomics_t1.columns)
)
mod_isb_proteomics_t1.columns = new_columns
mod_isb_proteomics_t1

Unnamed: 0,subject_id,Blood Draw,Healthy or INCOV,age,sex,BMI,O00182,O00220,O00253,O14763,...,Q9UHF1,Q9UKL0,Q9ULX7,Q9UNK0,Q9Y478,Q9Y4K4,Q9Y5A7,Q9Y5L3,Q9Y5V3,Q9Y653
289,INCOV001,INCOV001-T1,INCOV,77,Female,33.657783,2.668531,3.099609,0.705863,2.141790,...,0.638182,-0.705450,-1.612659,-0.593469,-0.596764,-0.796593,-1.206721,-0.530558,0.610169,4.567199
292,INCOV002,INCOV002-T1,INCOV,39,Male,27.900747,3.607274,1.528715,1.750012,3.131619,...,-0.131315,2.049932,-4.237300,0.280148,0.322202,1.118726,1.311234,-2.340461,1.856097,-0.021463
295,INCOV003,INCOV003-T1,INCOV,64,Male,24.546988,2.953096,4.105836,1.221295,3.712461,...,1.477143,1.079921,-5.751694,-0.192531,0.375482,0.825320,0.532453,-1.744499,1.552228,3.638741
298,INCOV004,INCOV004-T1,INCOV,76,Male,23.215788,3.016487,3.330754,3.318305,2.609300,...,2.344226,-0.222570,-1.477135,-1.342922,-0.186438,-0.450782,-0.730134,-1.431956,0.857295,3.260736
300,INCOV005,INCOV005-T1,INCOV,75,Male,29.703121,2.613899,4.373015,2.135831,5.139291,...,0.046332,1.334650,-1.970992,-1.129451,0.038011,0.398932,-0.386438,1.232084,1.440381,-0.306113
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
792,INCOV198,INCOV198-T1,INCOV,40,Female,48.906250,2.724864,10.007715,5.673322,7.681026,...,4.092034,5.720195,-1.787520,2.262770,3.850830,1.704246,3.060482,1.068625,4.436255,1.308288
795,INCOV199,INCOV199-T1,INCOV,57,Male,28.095734,3.117667,6.406123,3.428378,5.976971,...,1.137260,2.992898,-4.655404,1.986283,2.249505,1.926585,3.077505,-0.283804,3.354330,1.632752
797,INCOV200,INCOV200-T1,INCOV,51,Male,31.577343,3.494963,7.675585,4.119787,9.449792,...,2.314785,2.765097,-2.855228,1.717302,1.869355,1.482842,1.360112,-0.862702,2.952369,1.537086
799,INCOV201,INCOV201-T1,INCOV,50,Female,39.304611,2.152516,0.860845,-1.971908,1.576786,...,1.989141,2.443872,-6.584477,1.423127,1.192990,0.999786,1.140251,1.298974,1.905874,1.578177


### ISB metabolomics data

In [23]:
# Read the third worksheet (sheet_name is 0-indexed) of Table S2 as the metabolomics table.
isb_metabolomics = pd.read_excel("data/Table S2.xlsx", sheet_name = 2)
isb_metabolomics

Unnamed: 0,Patient Subject ID,Blood Draw,Healthy or INCOV,age,sex,BMI,S-1-pyrroline-5-carboxylate,spermidine,1-methylnicotinamide,"12,13-DiHOME",...,X - 24328,X - 24334,X - 24337,X - 24544,X - 24546,X - 24549,X - 24556,X - 24588,X - 24728,X - 24812
0,1003758,,Healthy,32,Female,23.780000,-0.622529,-0.394431,0.698251,1.477890,...,-1.857690,-2.729039,-1.146087,-1.199086,-1.068852,-0.083940,-1.328494,-0.643512,-0.473671,-0.363236
1,1008097,,Healthy,59,Male,31.460000,0.156314,-0.194857,0.479207,-1.669247,...,0.015472,-0.135790,-0.431602,0.092435,1.734445,1.187141,1.670724,-0.262267,-0.181727,-0.264447
2,1008631,,Healthy,68,Female,28.830000,0.435353,-1.311269,-1.099885,-0.265720,...,-0.829879,0.353224,-0.046810,-1.364595,-0.813300,0.240120,-0.086881,1.157493,0.004717,-0.644791
3,1012545,,Healthy,65,Female,21.120000,-0.804976,-0.144867,1.087421,1.563804,...,-1.580401,0.143999,0.157220,-0.218786,-1.883384,0.347607,1.303321,-5.742557,-0.852678,0.155332
4,1022407,,Healthy,60,Female,39.950000,-0.867989,0.929882,1.162524,-0.548043,...,1.343469,-0.711064,-1.363756,-0.521077,0.265962,-0.239208,-0.529915,0.530060,-1.562613,0.926657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
678,INCOV201,INCOV201-T3,INCOV,50,Female,39.304611,0.497230,1.825152,-1.204183,0.543891,...,-0.877750,0.182126,-0.593760,-3.842966,-1.674621,0.941804,-3.478461,1.176148,0.585056,-1.051671
679,INCOV202,INCOV202-T2,INCOV,60,Female,31.506773,0.441976,2.035292,-0.990884,-0.444287,...,-4.155680,3.454273,2.298541,-3.158653,-1.177325,0.188264,-3.927164,2.595386,0.466174,1.492759
680,INCOV202,INCOV202-T1,INCOV,60,Female,31.506773,-1.858486,0.428411,-1.236431,-0.305914,...,-1.455341,3.164866,3.462950,-0.834531,-0.026876,0.072260,-3.858769,1.869304,1.695597,2.111666
681,INCOV203,INCOV203-T2,INCOV,87,Male,26.132248,0.107257,-0.422334,0.951580,0.765640,...,-1.939484,1.840646,-0.352211,-1.990198,1.370559,-0.711186,-0.298813,1.239272,0.896582,-1.756949


Again, only rows from INCOV patients at timepoint 1 are needed. 

In [24]:
# Keep INCOV rows whose Blood Draw label ends in '-T1'.
# Rows with no Blood Draw value would make .str.endswith() return NaN;
# na=False turns those into False so the mask is strictly boolean.
# .copy() yields an independent frame, so later modifications do not raise
# SettingWithCopyWarning.
isb_metabolomics_t1 = isb_metabolomics[
    isb_metabolomics['Blood Draw'].str.endswith('-T1', na=False)
    & (isb_metabolomics['Healthy or INCOV'] == 'INCOV')
].copy()
isb_metabolomics_t1

Unnamed: 0,Patient Subject ID,Blood Draw,Healthy or INCOV,age,sex,BMI,S-1-pyrroline-5-carboxylate,spermidine,1-methylnicotinamide,"12,13-DiHOME",...,X - 24328,X - 24334,X - 24337,X - 24544,X - 24546,X - 24549,X - 24556,X - 24588,X - 24728,X - 24812
178,INCOV001,INCOV001-T1,INCOV,77,Female,33.657783,0.068685,0.665893,-1.679621,-0.703882,...,0.701431,1.912282,-0.198062,-0.623344,1.779875,0.052900,0.671860,1.508485,1.568532,-0.041754
183,INCOV002,INCOV002-T1,INCOV,39,Male,27.900747,-1.187555,-0.567794,1.871186,0.647271,...,-0.338354,1.643217,1.981271,0.097096,0.056467,1.763907,-2.628693,0.732005,0.920549,0.773528
185,INCOV003,INCOV003-T1,INCOV,64,Male,24.546988,-1.296606,0.403254,-1.307841,3.152668,...,0.930477,-0.282517,2.423767,1.482135,1.856671,-1.443675,-0.286963,1.662694,0.517780,-0.126247
187,INCOV004,INCOV004-T1,INCOV,76,Male,23.215788,-0.491871,-1.233147,-0.624956,-2.422120,...,0.080321,0.631064,1.885535,-2.725175,-1.175923,-0.053804,-0.327184,0.774229,1.189508,1.656377
189,INCOV005,INCOV005-T1,INCOV,75,Male,29.703121,-0.532420,1.351813,-1.534131,-0.431423,...,2.166012,1.620263,1.204338,1.025409,3.048954,-1.245780,-3.488464,-0.851633,0.423788,0.200879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
673,INCOV199,INCOV199-T1,INCOV,57,Male,28.095734,-0.729741,1.653161,-1.464627,1.344628,...,1.910986,1.519368,0.939460,0.964566,2.313483,2.025355,-0.290771,2.154851,2.547856,0.265263
674,INCOV200,INCOV200-T1,INCOV,51,Male,31.577343,0.116347,-0.200388,1.061998,0.320042,...,0.390832,2.478347,0.162621,-0.420097,1.151720,-7.625419,0.275502,1.773904,0.999825,1.834743
677,INCOV201,INCOV201-T1,INCOV,50,Female,39.304611,0.333550,0.512767,-0.818307,-0.125508,...,0.436248,0.695044,0.134369,-3.427230,-5.591746,0.293415,-3.732031,-0.152455,-0.024661,-0.901643
680,INCOV202,INCOV202-T1,INCOV,60,Female,31.506773,-1.858486,0.428411,-1.236431,-0.305914,...,-1.455341,3.164866,3.462950,-0.834531,-0.026876,0.072260,-3.858769,1.869304,1.695597,2.111666


In [None]:
# Use the same key name ('subject_id') as the clinical table.
# The frame was produced by boolean indexing, so an in-place rename raises
# SettingWithCopyWarning; rebinding the result of rename() avoids that.
isb_metabolomics_t1 = isb_metabolomics_t1.rename(
    columns={'Patient Subject ID': 'subject_id'}
)

In [26]:
# Subjects present in both the metabolomics and the clinical first-timepoint tables.
shared_ids = set(isb_metabolomics_t1['subject_id']).intersection(isb_incov_clinical_t1_data['subject_id'])

# Restrict the metabolomics table to those shared subjects. The original cell
# applied this same filter a second time to the already-filtered frame, which
# had no effect; the duplicate line was removed.
# NOTE(review): this set is computed independently of the proteomics/clinical
# intersection, so the three exported tables are only guaranteed to cover the
# same subjects if both omics tables contain every filtered clinical subject —
# confirm.
mod_isb_metabolomics_t1 = isb_metabolomics_t1[isb_metabolomics_t1['subject_id'].isin(shared_ids)]

In [27]:
# Display the metabolomics rows kept after restricting to subjects shared with the clinical table.
mod_isb_metabolomics_t1

Unnamed: 0,subject_id,Blood Draw,Healthy or INCOV,age,sex,BMI,S-1-pyrroline-5-carboxylate,spermidine,1-methylnicotinamide,"12,13-DiHOME",...,X - 24328,X - 24334,X - 24337,X - 24544,X - 24546,X - 24549,X - 24556,X - 24588,X - 24728,X - 24812
178,INCOV001,INCOV001-T1,INCOV,77,Female,33.657783,0.068685,0.665893,-1.679621,-0.703882,...,0.701431,1.912282,-0.198062,-0.623344,1.779875,0.052900,0.671860,1.508485,1.568532,-0.041754
183,INCOV002,INCOV002-T1,INCOV,39,Male,27.900747,-1.187555,-0.567794,1.871186,0.647271,...,-0.338354,1.643217,1.981271,0.097096,0.056467,1.763907,-2.628693,0.732005,0.920549,0.773528
185,INCOV003,INCOV003-T1,INCOV,64,Male,24.546988,-1.296606,0.403254,-1.307841,3.152668,...,0.930477,-0.282517,2.423767,1.482135,1.856671,-1.443675,-0.286963,1.662694,0.517780,-0.126247
187,INCOV004,INCOV004-T1,INCOV,76,Male,23.215788,-0.491871,-1.233147,-0.624956,-2.422120,...,0.080321,0.631064,1.885535,-2.725175,-1.175923,-0.053804,-0.327184,0.774229,1.189508,1.656377
189,INCOV005,INCOV005-T1,INCOV,75,Male,29.703121,-0.532420,1.351813,-1.534131,-0.431423,...,2.166012,1.620263,1.204338,1.025409,3.048954,-1.245780,-3.488464,-0.851633,0.423788,0.200879
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
670,INCOV198,INCOV198-T1,INCOV,40,Female,48.906250,0.851562,1.679351,0.920220,5.953539,...,3.691353,3.398422,3.154111,1.580332,2.683657,-1.199948,3.619145,4.316469,3.869013,1.055983
673,INCOV199,INCOV199-T1,INCOV,57,Male,28.095734,-0.729741,1.653161,-1.464627,1.344628,...,1.910986,1.519368,0.939460,0.964566,2.313483,2.025355,-0.290771,2.154851,2.547856,0.265263
674,INCOV200,INCOV200-T1,INCOV,51,Male,31.577343,0.116347,-0.200388,1.061998,0.320042,...,0.390832,2.478347,0.162621,-0.420097,1.151720,-7.625419,0.275502,1.773904,0.999825,1.834743
677,INCOV201,INCOV201-T1,INCOV,50,Female,39.304611,0.333550,0.512767,-0.818307,-0.125508,...,0.436248,0.695044,0.134369,-3.427230,-5.591746,0.293415,-3.732031,-0.152455,-0.024661,-0.901643


## Final Datasets

#### ISB Clinical

In [48]:
# Write the filtered clinical table to Excel, omitting the DataFrame index.
mod_isb_incov_clinical_t1_data.to_excel("isb_clinical.xlsx", index=False)

#### ISB proteomics

In [49]:
# Write the filtered proteomics table to Excel, omitting the DataFrame index.
mod_isb_proteomics_t1.to_excel("isb_proteomics.xlsx", index=False)

#### ISB metabolomics

In [50]:
# Write the filtered metabolomics table to Excel, omitting the DataFrame index.
mod_isb_metabolomics_t1.to_excel("isb_metabolomics.xlsx", index=False)