In [1]:
import os
# move to dash dir
# The target is a relative path, so a plain os.chdir() is not idempotent:
# re-running this cell would resolve it against src/dash instead of the
# notebook's own directory. Remember the directory the kernel started in and
# always resolve the target against that.
DASH_DIR = "../../../src/dash/"
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, DASH_DIR))

In [2]:
### load shared data ###
from data import get_omics_data, get_biomolecule_names
import datetime


def _load_omics_matrix(dataset):
    """Load one omics data matrix (with metadata) and print its shape.

    Returns the pair produced by ``get_omics_data``: the data frame and the
    quant range value for that dataset.
    """
    print("Loading {} data...".format(dataset))
    frame, quant_range = get_omics_data(dataset=dataset, with_metadata=True)
    print("{} data shape: {}".format(dataset.capitalize(), frame.shape))
    return frame, quant_range


# timestamped banner, framed by blank lines
timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
print()
print(timestamp)
print("Loading data for app...")
print()

# load the data matrices (same order as the log output: metabolomics,
# lipidomics, proteomics, transcriptomics)
metabolomics_df, metabolomics_quant_range = _load_omics_matrix("metabolomics")
lipidomics_df, lipidomics_quant_range = _load_omics_matrix("lipidomics")
proteomics_df, proteomics_quant_range = _load_omics_matrix("proteomics")
transcriptomics_df, transcriptomics_quant_range = _load_omics_matrix("transcriptomics")

# biomolecule name lookups, one per dataset
metabolomics_biomolecule_names_dict = get_biomolecule_names(dataset="metabolomics")
lipidomics_biomolecule_names_dict = get_biomolecule_names(dataset="lipidomics")
proteomics_biomolecule_names_dict = get_biomolecule_names(dataset="proteomics")
transcriptomics_biomolecule_names_dict = get_biomolecule_names(dataset="transcriptomics")

# display label -> dataset key
dataset_dict = dict([
    ("Proteins", "proteomics"),
    ("Lipids", "lipidomics"),
    ("Metabolites", "metabolomics"),
    ("Transcripts", "transcriptomics"),
    ("Combined Biomolecules", "combined"),
])

# dataset key -> data frame
df_dict = dict(
    proteomics=proteomics_df,
    lipidomics=lipidomics_df,
    metabolomics=metabolomics_df,
    transcriptomics=transcriptomics_df,
)

# dataset key -> quant range
quant_value_range_dict = dict(
    proteomics=proteomics_quant_range,
    lipidomics=lipidomics_quant_range,
    metabolomics=metabolomics_quant_range,
    transcriptomics=transcriptomics_quant_range,
)

# dataset key -> biomolecule name lookup.
# "combined" merges the proteomics, lipidomics and metabolomics lookups (on a
# key collision the later one wins).
# NOTE(review): transcriptomics is not part of "combined" -- confirm that is intended.
global_names_dict = dict(
    proteomics=proteomics_biomolecule_names_dict,
    lipidomics=lipidomics_biomolecule_names_dict,
    metabolomics=metabolomics_biomolecule_names_dict,
    transcriptomics=transcriptomics_biomolecule_names_dict,
    combined={
        **proteomics_biomolecule_names_dict,
        **lipidomics_biomolecule_names_dict,
        **metabolomics_biomolecule_names_dict,
    },
)


2020-06-19 13:43:59
Loading data for app...

Loading metabolomics data...
Metabolomics data shape: (129, 174)
Loading lipidomics data...
Lipidomics data shape: (129, 3376)
Loading proteomics data...
Proteomics data shape: (129, 536)
Loading transcriptomics data...
Transcriptomics data shape: (125, 13282)
Getting biomolecule names for dataset: metabolomics
Getting biomolecule names for dataset: lipidomics
Getting biomolecule names for dataset: proteomics
Getting biomolecule names for dataset: transcriptomics


In [3]:
# Preview the first five rows of the proteomics frame.
proteomics_df.head(n=5)

Unnamed: 0_level_0,7593,7596,7597,7599,7600,7601,7602,7605,7606,7607,...,Vent_free_days,DM,Hospital_free_days_45,Ferritin_NG/ML,CRP_MG/L,DDIMER_mg/L_FEU,Procalcitonin_NG/ML,Lactate_MMol/L,Fibrinogen,SOFA
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,37.995543,27.176361,28.294477,24.783368,23.926601,23.490571,25.490816,29.287382,29.505121,29.909421,...,0,0,0,946,73.1,1.3,36.0,0.9,513.0,8.0
2.0,37.353091,29.419228,29.04851,27.573331,24.671039,22.343591,27.166913,29.576452,30.081443,29.514012,...,28,0,39,1060,,1.03,0.37,,,
3.0,37.527875,27.174171,28.953215,25.619161,24.528623,20.714471,25.589237,29.621968,29.398338,29.436129,...,28,1,18,1335,53.2,1.48,0.07,,513.0,
4.0,37.673128,28.879702,29.52746,23.893562,25.875326,24.170967,25.280226,30.96678,30.360452,30.041791,...,28,0,39,583,251.1,1.32,0.98,0.87,949.0,
5.0,37.983542,27.262485,28.605867,18.440251,20.888495,17.354782,25.654472,29.020542,29.175695,29.628988,...,23,0,27,800,355.8,0.69,4.92,1.48,929.0,7.0


In [178]:
# Translate every column label through the proteomics name lookup; labels
# without an entry in the lookup are kept as they are.
col_names = [
    proteomics_biomolecule_names_dict.get(column, column)
    for column in proteomics_df.columns.values
]

In [179]:
# Metadata fields that are carried forward into metadata_df.
keep_cols = [
    'COVID',
    'ICU_1',
    'Gender',
    'Age_less_than_90',
    'Hospital_free_days_45',
]

# drop_cols and metadata_columns are only referenced by the commented-out
# exploration below.
drop_cols = ['Sample_label', 'Albany_sampleID', 'DM']
# Every column from position proteomics_quant_range onward.
metadata_columns = proteomics_df.columns[proteomics_quant_range:]
#proteomics_df[metadata_columns.sort_values()].drop(drop_cols, axis=1).dropna(axis=1)

metadata_df = proteomics_df.loc[:, keep_cols]

In [180]:
import numpy as np
# Treat empty strings as missing values ...
metadata_df = metadata_df.replace(to_replace='', value=np.nan)
# ... then keep only the rows in which every kept field is present.
metadata_df = metadata_df.dropna(axis=0, how='any')

In [181]:
# Label strings that replace the 0/1 values in the COVID and ICU_1 columns.
int_bool_dict = {
    0:"False",
    1:"True"
}


def _flag_to_label(flag):
    """Return the "True"/"False" label for a 0/1 flag.

    A value that already is one of the labels is returned unchanged, so this
    cell can be re-run on an already-converted ``metadata_df`` (a plain dict
    lookup would raise ``KeyError`` on the second run). Any other unknown
    value still raises ``KeyError``.
    """
    if isinstance(flag, str) and flag in int_bool_dict.values():
        return flag
    return int_bool_dict[flag]


# Convert column-wise instead of walking the frame row by row with iterrows().
COVID_list = [_flag_to_label(flag) for flag in metadata_df['COVID']]
ICU_list = [_flag_to_label(flag) for flag in metadata_df['ICU_1']]
age_list = [int(age) for age in metadata_df['Age_less_than_90']]

# assign() builds a new frame with the three columns replaced (column order is
# unchanged) rather than writing into the existing frame column by column.
metadata_df = metadata_df.assign(
    COVID=COVID_list,
    ICU_1=ICU_list,
    Age_less_than_90=age_list,
)

In [182]:
# Preview the first five rows of the converted metadata.
metadata_df.head(n=5)

Unnamed: 0_level_0,COVID,ICU_1,Gender,Age_less_than_90,Hospital_free_days_45
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,True,False,M,39,0
2.0,True,False,M,63,39
3.0,True,False,M,33,18
4.0,True,False,M,49,39
5.0,True,False,M,49,27


In [183]:
# Rows: the samples that remain in metadata_df.
# Columns: the first proteomics_quant_range columns of the proteomics frame.
quant_df = proteomics_df.loc[
    metadata_df.index,
    proteomics_df.columns[:proteomics_quant_range],
]
quant_df.head(n=5)

Unnamed: 0_level_0,7593,7596,7597,7599,7600,7601,7602,7605,7606,7607,...,8319,8321,8322,8323,8328,8330,8334,8335,8336,8337
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,37.995543,27.176361,28.294477,24.783368,23.926601,23.490571,25.490816,29.287382,29.505121,29.909421,...,25.746683,22.593546,22.024047,23.733969,19.755864,29.920821,25.980045,22.915739,30.15774,22.821315
2.0,37.353091,29.419228,29.04851,27.573331,24.671039,22.343591,27.166913,29.576452,30.081443,29.514012,...,25.580983,19.026956,24.628786,22.508744,19.399144,30.791802,26.601871,22.352374,29.011826,20.147061
3.0,37.527875,27.174171,28.953215,25.619161,24.528623,20.714471,25.589237,29.621968,29.398338,29.436129,...,24.452494,17.439869,17.624139,20.428779,23.950737,30.332235,26.391377,21.239148,30.755732,21.002496
4.0,37.673128,28.879702,29.52746,23.893562,25.875326,24.170967,25.280226,30.96678,30.360452,30.041791,...,25.934676,22.237947,22.410592,22.664706,22.214572,30.400396,26.618349,20.050655,31.037739,19.830364
5.0,37.983542,27.262485,28.605867,18.440251,20.888495,17.354782,25.654472,29.020542,29.175695,29.628988,...,25.11577,23.014475,20.86944,23.92098,23.752258,30.274865,26.143827,24.123605,30.690791,18.2133


In [184]:
# update col names
# Take the ids from proteomics_df (the same column slice quant_df was built
# from) rather than from quant_df.columns: once this cell has run, quant_df's
# columns hold the formatted names, and looking those up again on a re-run
# would raise KeyError.
quant_column_ids = proteomics_df.columns[:proteomics_quant_range]

# Wrap each name as ('name'). An id that has no entry in the name lookup falls
# back to the id itself, matching the earlier col_names cell.
col_names = [
    "('" + proteomics_biomolecule_names_dict.get(col, str(col)) + "')"
    for col in quant_column_ids
]
quant_df.columns = col_names
quant_df.head()

Unnamed: 0_level_0,('[P] Alpha-1-antitrypsin'),('[P] Immunoglobulin lambda variable 4-69'),('[P] Immunoglobulin lambda variable 8-61'),('[P] Immunoglobulin lambda variable 10-54'),('[P] Immunoglobulin lambda variable 7-46'),('[P] Immunoglobulin lambda variable 5-37'),('[P] Immunoglobulin lambda variable 2-18'),('[P] Immunoglobulin lambda variable 3-10'),('[P] Immunoglobulin lambda variable 3-9'),('[P] Immunoglobulin kappa variable 2-28'),...,('[P] Prenylcysteine oxidase 1'),('[P] N-acetylglucosamine-1-phosphotransferase subunit gamma'),('[P] Coronin-1C'),('[P] Multiple inositol polyphosphate phosphatase 1'),('[P] Angiopoietin-related protein 3'),('[P] IgGFc-binding protein'),('[P] Histone H2B'),('[P] Low affinity immunoglobulin gamma Fc region receptor II-a (Fragment)'),('[P] Apolipoprotein A-II'),('[P] Neutrophil gelatinase-associated lipocalin')
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,37.995543,27.176361,28.294477,24.783368,23.926601,23.490571,25.490816,29.287382,29.505121,29.909421,...,25.746683,22.593546,22.024047,23.733969,19.755864,29.920821,25.980045,22.915739,30.15774,22.821315
2.0,37.353091,29.419228,29.04851,27.573331,24.671039,22.343591,27.166913,29.576452,30.081443,29.514012,...,25.580983,19.026956,24.628786,22.508744,19.399144,30.791802,26.601871,22.352374,29.011826,20.147061
3.0,37.527875,27.174171,28.953215,25.619161,24.528623,20.714471,25.589237,29.621968,29.398338,29.436129,...,24.452494,17.439869,17.624139,20.428779,23.950737,30.332235,26.391377,21.239148,30.755732,21.002496
4.0,37.673128,28.879702,29.52746,23.893562,25.875326,24.170967,25.280226,30.96678,30.360452,30.041791,...,25.934676,22.237947,22.410592,22.664706,22.214572,30.400396,26.618349,20.050655,31.037739,19.830364
5.0,37.983542,27.262485,28.605867,18.440251,20.888495,17.354782,25.654472,29.020542,29.175695,29.628988,...,25.11577,23.014475,20.86944,23.92098,23.752258,30.274865,26.143827,24.123605,30.690791,18.2133


In [190]:
# Build one tuple-like label string per sample, e.g.
#   ('1', 'COVID: True', 'ICU_1: False', ...)
# The first element is the integer sample id; it is followed by one
# 'field: value' entry for every column listed in keep_cols.
row_names = []
for sample_id, sample_meta in metadata_df.iterrows():
    annotations = [
        "'{}: {}'".format(field, sample_meta[field])
        for field in keep_cols
    ]
    # variant without the leading sample id:
    #   "(" + ", ".join(annotations) + ")"
    row_names.append(
        "('{}', {})".format(int(sample_id), ", ".join(annotations))
    )

In [191]:
# Replace the sample-id index with the label strings built from metadata_df;
# quant_df was selected with metadata_df.index, so the rows are in the same order.
quant_df.index = row_names

In [192]:
# Preview in the transposed layout: the first five columns of quant_df become rows.
quant_df.transpose().head(n=5)

Unnamed: 0,"('1', 'COVID: True', 'ICU_1: False', 'Gender: M', 'Age_less_than_90: 39', 'Hospital_free_days_45: 0')","('2', 'COVID: True', 'ICU_1: False', 'Gender: M', 'Age_less_than_90: 63', 'Hospital_free_days_45: 39')","('3', 'COVID: True', 'ICU_1: False', 'Gender: M', 'Age_less_than_90: 33', 'Hospital_free_days_45: 18')","('4', 'COVID: True', 'ICU_1: False', 'Gender: M', 'Age_less_than_90: 49', 'Hospital_free_days_45: 39')","('5', 'COVID: True', 'ICU_1: False', 'Gender: M', 'Age_less_than_90: 49', 'Hospital_free_days_45: 27')","('6', 'COVID: True', 'ICU_1: False', 'Gender: M', 'Age_less_than_90: 45', 'Hospital_free_days_45: 36')","('7', 'COVID: True', 'ICU_1: False', 'Gender: F', 'Age_less_than_90: 38', 'Hospital_free_days_45: 42')","('8', 'COVID: True', 'ICU_1: True', 'Gender: M', 'Age_less_than_90: 78', 'Hospital_free_days_45: 0')","('9', 'COVID: True', 'ICU_1: True', 'Gender: F', 'Age_less_than_90: 64', 'Hospital_free_days_45: 0')","('10', 'COVID: True', 'ICU_1: True', 'Gender: M', 'Age_less_than_90: 62', 'Hospital_free_days_45: 0')",...,"('120', 'COVID: False', 'ICU_1: True', 'Gender: F', 'Age_less_than_90: 84', 'Hospital_free_days_45: 41')","('121', 'COVID: False', 'ICU_1: True', 'Gender: M', 'Age_less_than_90: 88', 'Hospital_free_days_45: 0')","('122', 'COVID: False', 'ICU_1: True', 'Gender: F', 'Age_less_than_90: 66', 'Hospital_free_days_45: 29')","('123', 'COVID: False', 'ICU_1: True', 'Gender: F', 'Age_less_than_90: 62', 'Hospital_free_days_45: 40')","('124', 'COVID: False', 'ICU_1: False', 'Gender: M', 'Age_less_than_90: 71', 'Hospital_free_days_45: 36')","('125', 'COVID: False', 'ICU_1: False', 'Gender: M', 'Age_less_than_90: 63', 'Hospital_free_days_45: 43')","('126', 'COVID: False', 'ICU_1: False', 'Gender: F', 'Age_less_than_90: 42', 'Hospital_free_days_45: 40')","('127', 'COVID: False', 'ICU_1: False', 'Gender: F', 'Age_less_than_90: 32', 'Hospital_free_days_45: 43')","('128', 'COVID: False', 'ICU_1: False', 'Gender: M', 'Age_less_than_90: 62', 
'Hospital_free_days_45: 44')","('129', 'COVID: False', 'ICU_1: True', 'Gender: M', 'Age_less_than_90: 36', 'Hospital_free_days_45: 0')"
('[P] Alpha-1-antitrypsin'),37.995543,37.353091,37.527875,37.673128,37.983542,37.489959,37.615303,37.534702,37.525762,37.844902,...,37.73018,38.082377,37.661959,37.195828,37.365447,37.564109,37.683884,37.282698,37.655909,37.754401
('[P] Immunoglobulin lambda variable 4-69'),27.176361,29.419228,27.174171,28.879702,27.262485,29.535232,27.657446,30.323779,28.194885,29.253091,...,27.731627,27.621776,27.249976,27.403813,27.133006,27.793462,26.876193,27.715091,26.965212,28.376997
('[P] Immunoglobulin lambda variable 8-61'),28.294477,29.04851,28.953215,29.52746,28.605867,29.251895,26.943355,29.914134,28.94729,29.108465,...,28.977874,27.961132,29.085045,28.319069,29.418182,29.003245,28.002763,28.660543,28.665092,29.839541
('[P] Immunoglobulin lambda variable 10-54'),24.783368,27.573331,25.619161,23.893562,18.440251,25.033513,19.384221,22.054705,27.642444,27.163654,...,27.36578,26.182814,27.649039,25.888229,26.724164,26.577443,27.996742,21.713502,19.659693,20.145198
('[P] Immunoglobulin lambda variable 7-46'),23.926601,24.671039,24.528623,25.875326,20.888495,26.576723,25.110084,25.695856,26.054428,26.153098,...,25.276113,24.855716,24.183591,25.328755,24.374313,25.152285,24.717171,25.369895,25.790296,26.34611


In [195]:
from scipy import stats

# z-score every column of quant_df, then transpose so the original columns
# become rows, and round to two decimals.
clustergram_df = (
    quant_df
    .apply(stats.zscore, axis=0)
    .transpose()
    .round(decimals=2)
)
clustergram_df.head(n=5)

Unnamed: 0,"('1', 'COVID: True', 'ICU_1: False', 'Gender: M', 'Age_less_than_90: 39', 'Hospital_free_days_45: 0')","('2', 'COVID: True', 'ICU_1: False', 'Gender: M', 'Age_less_than_90: 63', 'Hospital_free_days_45: 39')","('3', 'COVID: True', 'ICU_1: False', 'Gender: M', 'Age_less_than_90: 33', 'Hospital_free_days_45: 18')","('4', 'COVID: True', 'ICU_1: False', 'Gender: M', 'Age_less_than_90: 49', 'Hospital_free_days_45: 39')","('5', 'COVID: True', 'ICU_1: False', 'Gender: M', 'Age_less_than_90: 49', 'Hospital_free_days_45: 27')","('6', 'COVID: True', 'ICU_1: False', 'Gender: M', 'Age_less_than_90: 45', 'Hospital_free_days_45: 36')","('7', 'COVID: True', 'ICU_1: False', 'Gender: F', 'Age_less_than_90: 38', 'Hospital_free_days_45: 42')","('8', 'COVID: True', 'ICU_1: True', 'Gender: M', 'Age_less_than_90: 78', 'Hospital_free_days_45: 0')","('9', 'COVID: True', 'ICU_1: True', 'Gender: F', 'Age_less_than_90: 64', 'Hospital_free_days_45: 0')","('10', 'COVID: True', 'ICU_1: True', 'Gender: M', 'Age_less_than_90: 62', 'Hospital_free_days_45: 0')",...,"('120', 'COVID: False', 'ICU_1: True', 'Gender: F', 'Age_less_than_90: 84', 'Hospital_free_days_45: 41')","('121', 'COVID: False', 'ICU_1: True', 'Gender: M', 'Age_less_than_90: 88', 'Hospital_free_days_45: 0')","('122', 'COVID: False', 'ICU_1: True', 'Gender: F', 'Age_less_than_90: 66', 'Hospital_free_days_45: 29')","('123', 'COVID: False', 'ICU_1: True', 'Gender: F', 'Age_less_than_90: 62', 'Hospital_free_days_45: 40')","('124', 'COVID: False', 'ICU_1: False', 'Gender: M', 'Age_less_than_90: 71', 'Hospital_free_days_45: 36')","('125', 'COVID: False', 'ICU_1: False', 'Gender: M', 'Age_less_than_90: 63', 'Hospital_free_days_45: 43')","('126', 'COVID: False', 'ICU_1: False', 'Gender: F', 'Age_less_than_90: 42', 'Hospital_free_days_45: 40')","('127', 'COVID: False', 'ICU_1: False', 'Gender: F', 'Age_less_than_90: 32', 'Hospital_free_days_45: 43')","('128', 'COVID: False', 'ICU_1: False', 'Gender: M', 'Age_less_than_90: 62', 
'Hospital_free_days_45: 44')","('129', 'COVID: False', 'ICU_1: True', 'Gender: M', 'Age_less_than_90: 36', 'Hospital_free_days_45: 0')"
('[P] Alpha-1-antitrypsin'),1.08,-0.71,-0.22,0.18,1.04,-0.33,0.02,-0.21,-0.23,0.66,...,0.34,1.32,0.15,-1.15,-0.68,-0.12,0.21,-0.91,0.13,0.41
('[P] Immunoglobulin lambda variable 4-69'),-1.04,1.2,-1.05,0.66,-0.96,1.31,-0.56,2.1,-0.03,1.03,...,-0.49,-0.6,-0.97,-0.82,-1.09,-0.43,-1.34,-0.51,-1.25,0.15
('[P] Immunoglobulin lambda variable 8-61'),-0.46,0.2,0.11,0.61,-0.19,0.37,-1.64,0.95,0.11,0.25,...,0.13,-0.75,0.23,-0.44,0.52,0.16,-0.71,-0.14,-0.14,0.89
('[P] Immunoglobulin lambda variable 10-54'),0.22,1.19,0.51,-0.08,-1.97,0.31,-1.64,-0.72,1.21,1.05,...,1.12,0.71,1.21,0.61,0.9,0.84,1.33,-0.84,-1.55,-1.38
('[P] Immunoglobulin lambda variable 7-46'),-0.54,-0.1,-0.19,0.61,-2.34,1.02,0.15,0.5,0.71,0.77,...,0.25,0.0,-0.39,0.28,-0.28,0.18,-0.08,0.31,0.56,0.89


In [196]:
# Write the matrix as a tab-separated file; the relative path is resolved
# against the current working directory.
clustergram_df.to_csv(
    path_or_buf="../../data/clustergrammer/proteomics.txt",
    sep='\t',
)

# Notes on formatting:

1. Each sample label apparently needs to include the sample number; otherwise we get an "internal server error".
2. Missing values are not allowed (at least not in the column names / row names).