# Build the get_multi_omics_data() function for dash app and general querying

Adapting from `src/dash/data.py` function `get_metabolomics_data()` and `1.0_get_lipidomics_data.ipynb`

In [24]:
from sqlalchemy import create_engine, MetaData, Table, select, join
import pandas as pd

# SQLite path (updated to include new version of proteomics data)
db_path = 'sqlite:///../../../data/SQLite Database/20200525/Covid-19 Study DB.sqlite'

def get_omics_data(with_metadata=False, dataset="proteomics"):

    omics_id_dict = {
        "proteomics":1,
        "lipidomics":2,
        "metabolomics":3,
        "transcriptomics":4
    }

    omics_id = omics_id_dict[dataset]
    
    # Create an engine that connects to the Covid-19 Study DB.sqlite file: engine
    engine = create_engine(db_path)

    # Establish connection
    connection = engine.connect()

    # pull table into df
    query = "SELECT * from {}_measurements".format(dataset)
    omics_measurements_df = pd.read_sql_query(query, connection)

    # pull table into df
    query = "SELECT * from {}_runs".format(dataset)
    omics_runs_df = pd.read_sql_query(query, connection)

    # pull table into df
    query = "SELECT * from rawfiles WHERE ome_id={} AND sample_ID<>-1 and keep=1".format(omics_id)
    rawfiles_df = pd.read_sql_query(query, connection)

    # pull table into df
    deidentified_patient_metadata_df = pd.read_sql_query("SELECT * from deidentified_patient_metadata", connection)

    # make sure the merge by columns are all the same type -> pandas seems sensitive to this
    omics_measurements_df = omics_measurements_df.astype({'replicate_id': 'int32'})
    omics_runs_df = omics_runs_df.astype({'replicate_id': 'int32', 'rawfile_id': 'int32'})
    rawfiles_df = rawfiles_df.astype({'rawfile_id': 'int32', 'sample_id': 'int32'})
    deidentified_patient_metadata_df = deidentified_patient_metadata_df.astype({'sample_id': 'int32'})

    joined_df = omics_measurements_df\
                .join(omics_runs_df.set_index('replicate_id'), on='replicate_id')\
                .join(rawfiles_df.set_index('rawfile_id'), on='rawfile_id')\
                .join(deidentified_patient_metadata_df.set_index('sample_id'), on='sample_id')

    # drop samples that are missing COVID or ICU status
    joined_df.dropna(subset=['ICU_1','COVID'], inplace=True)

    # pivot to wide format
    wide_df = joined_df.pivot_table(index='sample_id', columns='biomolecule_id', values='normalized_abundance')
    wide_df.columns = [str(col) for col in wide_df.columns]

    query = "SELECT * from biomolecules WHERE omics_id={}".format(omics_id)
    # get biomolecule names
    biomolecules_df = pd.read_sql_query(query, connection)

    # close DB connection
    connection.close()

    # build biomolecule name dict and drop list
    biomolecule_name_dict = {}
    biomolecule_drop_list = []
    for index, row in biomolecules_df.iterrows():
        biomolecule_id = str(row['biomolecule_id'])
        standardized_name = row['standardized_name']
        biomolecule_name_dict[biomolecule_id] = standardized_name

        keep = row['keep']
        if keep!="1":
            biomolecule_drop_list.append(biomolecule_id)

    # drop biomolecules
    wide_df.drop(biomolecule_drop_list, axis=1, inplace=True)

    # replace wide_df column names
    new_col_names = []
    for col in wide_df.columns:
        new_col_names.append(biomolecule_name_dict[str(col)])
    wide_df.columns = new_col_names
    
    # record quant value range 
    quant_value_range = wide_df.shape[1]

    # optional return matrix with clinical metadata
    if with_metadata:

        combined_df = wide_df.join(deidentified_patient_metadata_df.set_index('sample_id'), on='sample_id')#.dropna()
        return combined_df, quant_value_range

    return wide_df, quant_value_range


In [25]:
proteomics_df, quant_value_range = get_omics_data(dataset='proteomics')
proteomics_df.head()

Unnamed: 0_level_0,A0A024R6I7;A0A0G2JRN3,A0A075B6H9,A0A075B6I0,A0A075B6I4;A0A1W2PQ80,A0A075B6I9,A0A075B6J1,A0A075B6J9,A0A075B6K4,A0A075B6K5,A0A075B6P5;P01615,...,Q9UHG3;Q9UHG3-2,Q9UJJ9;H0YEA7;A0A087WWA2,Q9ULV4;Q9ULV4-2;Q9ULV4-3;B4E3S0,Q9UNW1;Q9UNW1-3;Q9UNW1-2;Q9UNW1-4,Q9Y5C1,Q9Y6R7;A0A087WXI2,U3KQK0;Q99879;Q99877;Q93079;Q5QNW6;P62807;P58876;P57053;O60814;Q5QNW6-2;Q99880;Q96A08,V9GY83;P12318-2;P12318;F5GXY9;F5GX41;A0A087WXE5;H0YGT0;P31995-4;P31995-3;P31995-2;P31994-5;P31994-2;P31994-3;P31994-4;P31994;P31995,V9GYM3;P02652;V9GYE3;V9GYG9,X6R8F3;P80188;P80188-2
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,37.710878,28.960165,29.486177,26.675595,19.370255,25.517883,26.493429,30.410945,31.08841,30.999944,...,26.111498,22.120729,21.529743,23.989798,22.891598,30.206645,26.352539,23.188534,28.569244,21.488319
2.0,37.240139,28.102745,28.188532,27.733115,19.9902,19.812219,24.84532,29.028257,31.023136,31.772071,...,25.419409,19.892124,20.184781,23.674653,17.394864,30.151342,27.400478,22.795547,29.451244,21.452066
3.0,37.637201,28.665126,27.422219,20.724576,24.93256,22.113618,25.717407,31.93793,30.425124,30.072398,...,26.430952,19.69028,17.899317,23.66958,18.550299,30.789783,25.951804,19.724862,29.252595,19.450616
4.0,38.145233,27.830079,29.649296,26.203592,26.040009,24.332789,26.232187,30.945938,29.962098,29.797982,...,26.023184,22.85331,22.356007,23.315618,22.881664,29.681153,27.352118,21.661894,29.594468,22.576394
5.0,37.896564,29.34363,29.503473,27.011693,25.772082,25.1694,26.94949,32.916129,32.403777,32.16294,...,23.095389,18.843929,21.258394,23.700071,23.480745,31.005242,27.209181,24.242418,29.649005,19.543226


In [26]:
proteomics_df.shape

(124, 517)

In [27]:
quant_value_range

517

In [None]:
metabolomics

In [28]:
lipidomics_df, quant_value_range = get_omics_data(dataset='lipidomics')
lipidomics_df.head()

Unnamed: 0_level_0,Unknown Lipid RT1.093 +_Duplicate,Unknown Lipid RT1.094 +_Duplicate,Unknown Lipid RT1.107 +_Duplicate,Unknown Lipid RT1.109 +_Duplicate,Unknown Lipid RT1.119 -_Duplicate,Unknown Lipid RT1.119 +_Duplicate,Unknown Lipid RT1.120 -_Duplicate,Unknown Lipid RT1.121 -_Duplicate,Unknown Lipid RT1.122 -_Duplicate,Unknown Lipid RT1.122 -_Duplicate,...,Unknown Lipid RT35.174 +,TG 18:1_18:1_24:0 +,Unknown Lipid RT35.179 +,Unknown Lipid RT35.373 +,Unknown Lipid RT35.391 +,Unknown Lipid RT35.392 +,TG 60:1 +,Unknown Lipid RT35.592 +,Unknown Lipid RT42.382 +,Unknown Lipid RT42.388 -
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,15.976462,19.03358,20.875503,16.584828,16.56637,15.861481,19.60165,15.468,18.107826,16.353128,...,15.282698,17.996415,14.874285,15.054255,14.150701,14.073726,16.298505,15.216885,16.091088,15.733284
2.0,13.687458,19.211487,20.978229,18.217534,18.007366,15.790173,17.426756,16.879111,17.960593,17.25146,...,14.568996,16.871878,14.586308,14.079343,14.63852,12.953482,14.994721,14.302267,16.351197,15.998386
3.0,17.979818,18.662757,20.678124,17.964309,15.720253,16.016798,17.037753,15.689912,17.204381,16.119386,...,17.223982,19.712136,16.51387,15.507988,15.15269,13.634892,17.583291,16.667818,15.977472,16.57695
4.0,17.467381,19.262132,20.969724,17.733312,15.043289,15.084114,19.073448,17.482569,17.209061,16.133293,...,17.328677,20.166055,17.689385,16.094259,15.617449,14.919465,18.043384,17.263973,15.385761,16.650355
5.0,15.554856,18.942669,20.718091,18.071819,18.516299,18.096767,16.834873,15.669407,17.200442,15.690189,...,17.156789,19.899135,17.430503,16.170194,15.29336,14.85979,17.946729,17.179654,15.176656,15.532393


In [37]:
quant_columns = lipidomics_df.columns[:quant_value_range].to_list()

In [38]:
quant_df = lipidomics_df[quant_columns]

In [39]:
quant_df.head()

Unnamed: 0_level_0,Unknown Lipid RT1.093 +_Duplicate,Unknown Lipid RT1.094 +_Duplicate,Unknown Lipid RT1.107 +_Duplicate,Unknown Lipid RT1.109 +_Duplicate,Unknown Lipid RT1.119 -_Duplicate,Unknown Lipid RT1.119 +_Duplicate,Unknown Lipid RT1.120 -_Duplicate,Unknown Lipid RT1.121 -_Duplicate,Unknown Lipid RT1.122 -_Duplicate,Unknown Lipid RT1.122 -_Duplicate,...,Unknown Lipid RT35.174 +,TG 18:1_18:1_24:0 +,Unknown Lipid RT35.179 +,Unknown Lipid RT35.373 +,Unknown Lipid RT35.391 +,Unknown Lipid RT35.392 +,TG 60:1 +,Unknown Lipid RT35.592 +,Unknown Lipid RT42.382 +,Unknown Lipid RT42.388 -
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,15.976462,19.03358,20.875503,16.584828,16.56637,15.861481,19.60165,15.468,18.107826,16.353128,...,15.282698,17.996415,14.874285,15.054255,14.150701,14.073726,16.298505,15.216885,16.091088,15.733284
2.0,13.687458,19.211487,20.978229,18.217534,18.007366,15.790173,17.426756,16.879111,17.960593,17.25146,...,14.568996,16.871878,14.586308,14.079343,14.63852,12.953482,14.994721,14.302267,16.351197,15.998386
3.0,17.979818,18.662757,20.678124,17.964309,15.720253,16.016798,17.037753,15.689912,17.204381,16.119386,...,17.223982,19.712136,16.51387,15.507988,15.15269,13.634892,17.583291,16.667818,15.977472,16.57695
4.0,17.467381,19.262132,20.969724,17.733312,15.043289,15.084114,19.073448,17.482569,17.209061,16.133293,...,17.328677,20.166055,17.689385,16.094259,15.617449,14.919465,18.043384,17.263973,15.385761,16.650355
5.0,15.554856,18.942669,20.718091,18.071819,18.516299,18.096767,16.834873,15.669407,17.200442,15.690189,...,17.156789,19.899135,17.430503,16.170194,15.29336,14.85979,17.946729,17.179654,15.176656,15.532393


In [34]:
len(lipidomics_df.columns[:quant_value_range])

7235

In [35]:
len(quant_columns)

7235

In [40]:
len(quant_columns)

7235

In [45]:
from collections import Counter

Counter(quant_df.columns.duplicated())

Counter({False: 6814, True: 1875})

In [46]:
quant_df = quant_df.loc[:,~quant_df.columns.duplicated()]
quant_df.shape

(128, 6814)