# Get proteomics data for linear regression analysis


In [1]:
from sqlalchemy import create_engine, MetaData, Table, select, join
import pandas as pd

# SQLAlchemy URL of the study's SQLite database, relative to this notebook
# (updated to include new version of proteomics data)
db_path = 'sqlite:///../../../data/SQLite Database/20200602/Covid-19 Study DB.sqlite'
# Alternative absolute location of the database, kept for reference:
#db_path = 'sqlite:////Volumes/projects/All_20200428_COVID_plasma_multiomics/SQLite Database/Covid-19 Study DB.sqlite'


def get_proteomics_data(with_metadata=False):
    """Build a wide sample-by-biomolecule abundance matrix from the study DB.

    Reads the ``proteomics_measurements``, ``proteomics_runs`` and
    ``rawfiles`` tables from the SQLite database at ``db_path``, joins them
    with the patient metadata CSV, drops rows that lack ``ICU_1`` or
    ``COVID``, and pivots ``normalized_abundance`` to one row per
    ``sample_id`` and one column per ``biomolecule_id``.

    Parameters
    ----------
    with_metadata : bool, default False
        When True, the patient metadata columns are joined onto the wide
        matrix before it is returned.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``sample_id``. Abundance columns are named by the
        biomolecule id converted to ``str``; metadata columns follow when
        ``with_metadata`` is True.
    """
    # Create an engine that connects to the Covid-19 Study DB.sqlite file
    engine = create_engine(db_path)

    # The context manager closes the connection even if one of the queries
    # raises, which a trailing connection.close() would not.
    with engine.connect() as connection:
        proteomics_measurements_df = pd.read_sql_query(
            "SELECT * from proteomics_measurements", connection)

        proteomics_runs_df = pd.read_sql_query(
            "SELECT * from proteomics_runs", connection)

        # Only raw files matching the WHERE clause are used
        # (ome_id=1, sample_ID other than -1, keep=1).
        rawfiles_df = pd.read_sql_query(
            "SELECT * from rawfiles WHERE ome_id=1 AND sample_ID<>-1 and keep=1", connection)

    ## NOTE : patient metadata is read from this CSV rather than from the
    ## deidentified_patient_metadata DB table because the CSV carries
    ## SAPS2, SOFA, PF_ratio and PF_ratio_numeric.
    deidentified_patient_metadata_path = "../../../reference/deidentified_patient_metadata_HFD_45_SAPS_SOFA_PF.csv"
    deidentified_patient_metadata_df = pd.read_csv(deidentified_patient_metadata_path)

    # make sure the merge by columns are all the same type -> pandas seems sensitive to this
    proteomics_measurements_df = proteomics_measurements_df.astype({'replicate_id': 'int32'})
    proteomics_runs_df = proteomics_runs_df.astype({'replicate_id': 'int32', 'rawfile_id': 'int32'})
    rawfiles_df = rawfiles_df.astype({'rawfile_id': 'int32', 'sample_id': 'int32'})
    deidentified_patient_metadata_df = deidentified_patient_metadata_df.astype({'sample_id': 'int32'})

    # Indexed once and reused for both the long-format join and the optional
    # metadata join on the wide matrix.
    metadata_by_sample = deidentified_patient_metadata_df.set_index('sample_id')

    # measurements -> runs -> raw files -> patient metadata (left joins)
    joined_df = proteomics_measurements_df\
                .join(proteomics_runs_df.set_index('replicate_id'), on='replicate_id')\
                .join(rawfiles_df.set_index('rawfile_id'), on='rawfile_id')\
                .join(metadata_by_sample, on='sample_id')

    # drop samples that are missing COVID or ICU status
    joined_df = joined_df.dropna(subset=['ICU_1', 'COVID'])

    # pivot to wide format; pivot_table's default aggregation (mean) applies
    # when a sample has several measurements of the same biomolecule
    wide_df = joined_df.pivot_table(index='sample_id', columns='biomolecule_id', values='normalized_abundance')
    wide_df.columns = [str(col) for col in wide_df.columns]

    # NOTE(review): an earlier version also queried the biomolecules table to
    # build a name lookup and a list of biomolecules with keep != "1", but
    # neither was ever applied to wide_df. That dead code was removed; no
    # biomolecule columns are renamed or filtered here. Confirm whether the
    # keep filter was meant to be applied.

    # optional return matrix with clinical metadata
    if with_metadata:
        return wide_df.join(metadata_by_sample, on='sample_id')

    return wide_df

In [2]:
# Build the wide abundance matrix with the patient metadata columns joined on.
proteomics_df = get_proteomics_data(with_metadata=True)
# Bare expression so the notebook displays the resulting frame.
proteomics_df

Unnamed: 0_level_0,7593,7594,7595,7596,7597,7598,7599,7600,7601,7602,...,APACHEII,Charlson_score,Mech_Ventilation,Vent_free_days,DM,Hospital_free_days_45,SAPS2,SOFA,PF_ratio,PF_ratio_numeric
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,37.995543,22.751400,22.122531,27.176361,28.294477,22.830609,24.783368,23.926601,23.490571,25.490816,...,15.0,0.0,1.0,0.0,0.0,0.0,47.0,8.0,134,134.0
2.0,37.353091,21.477142,21.106922,29.419228,29.048510,26.660849,27.573331,24.671039,22.343591,27.166913,...,,2.0,0.0,28.0,0.0,39.0,,,,
3.0,37.527875,20.647824,19.148225,27.174171,28.953215,20.212142,25.619161,24.528623,20.714471,25.589237,...,,2.0,0.0,28.0,1.0,18.0,,,,
4.0,37.673128,21.828886,23.396629,28.879702,29.527460,26.439300,23.893562,25.875326,24.170967,25.280226,...,,1.0,0.0,28.0,0.0,39.0,,,,
5.0,37.983542,18.769891,18.695882,27.262485,28.605867,22.957979,18.440251,20.888495,17.354782,25.654472,...,19.0,1.0,1.0,23.0,0.0,27.0,51.0,7.0,193,193.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124.0,37.365447,18.888922,22.348868,27.133006,29.418182,25.645154,26.724164,24.374313,24.805480,27.039675,...,,5.0,0.0,28.0,1.0,36.0,,,,
125.0,37.564109,19.975562,24.375109,27.793462,29.003245,26.466999,26.577443,25.152285,24.429947,26.009474,...,4.0,2.0,0.0,28.0,0.0,43.0,12.0,1.0,,
126.0,37.683884,18.226595,19.727447,26.876193,28.002763,20.797804,27.996742,24.717171,23.581184,26.701340,...,,2.0,0.0,28.0,0.0,40.0,,,,
127.0,37.282698,22.212080,20.632306,27.715091,28.660543,17.828118,21.713502,25.369895,24.291863,26.368563,...,,0.0,0.0,28.0,0.0,43.0,,,,


# Test functions

In [4]:
# get colors
# Group label for each (ICU_1, COVID) pair, keyed by the integer part of the
# flag rendered as a string (so 1, 1.0 and "1" all map to "1").
group_label_by_status = {
    ("1", "1"): "COVID_ICU",
    ("1", "0"): "NONCOVID_ICU",
    ("0", "1"): "COVID_NONICU",
    ("0", "0"): "NONCOVID_NONICU",
}
# Label used when a sample cannot be assigned to one of the four groups.
unclassified_label = "Col12"

color_list = []
for sample_id, row in proteomics_df.iterrows():

    icu_raw = row['ICU_1']
    covid_raw = row['COVID']

    # Test for missing values on the raw cell: once converted with str(),
    # NaN becomes the text "nan" and pd.isnull() can no longer detect it.
    if pd.isnull(icu_raw):
        color = unclassified_label
    else:
        ICU_1 = str(icu_raw).split(".")[0]
        COVID = str(covid_raw).split(".")[0]
        # Explicit fallback so an unrecognised combination never inherits the
        # label computed for the previous row.
        color = group_label_by_status.get((ICU_1, COVID), unclassified_label)

    color_list.append(color)

In [5]:
from collections import Counter

# Tally how many samples received each group label.
Counter(color_list)

Counter({'COVID_NONICU': 51,
         'COVID_ICU': 51,
         'NONCOVID_NONICU': 10,
         'NONCOVID_ICU': 15})

In [6]:
# Attach the group labels as a new column; color_list was built by iterating
# proteomics_df row by row, so it is already in the frame's row order.
proteomics_df['color_by'] = color_list

In [7]:
# Export the frame (abundances, clinical metadata and the color_by column) as CSV.
proteomics_df.to_csv("../../../data/proteomics_measurements_w_clinical_metadata.csv")