# Big Data for Health (CSE6250) 

Goal: Using Lightgbm model to predict sepsis onset using MIMIC III Data

Author: Zhensheng Wang
         
Created: 11/13/2021

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import roc_auc_score
from tableone import TableOne
import lightgbm as lgbm
from hyperopt import fmin, hp, tpe, STATUS_OK, Trials
from hyperopt.pyll.base import scope

import os, gc
# os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= os.path.join(os.getcwd(), "bdfh.json")
# from google.cloud import bigquery
# bqclient = bigquery.Client()

## Load Data

In [2]:
case_labs = pd.read_csv('./data_v2/case_48h_labs_ex3h.csv')
case_vitals = pd.read_csv('./data_v2/case_48h_vitals_ex3h.csv')
case_static = pd.read_csv('./data_v2/static_variables_cases_ex3h.csv')

control_labs = pd.read_csv('./data_v2/control_48h_labs_ex3h.csv')
control_vitals = pd.read_csv('./data_v2/control_48h_vitals_ex3h.csv')
control_static = pd.read_csv('./data_v2/static_variables_controls_ex3h.csv')

## Clean data

In [8]:
def race_recode(df):
    cond_white = df['ethnicity'].str.contains('WHITE')
    cond_black = df['ethnicity'].str.contains('BLACK')
    cond_asian = df['ethnicity'].str.contains('ASIAN')
    cond_hispa = df['ethnicity'].str.contains('HISPANIC')

    df.loc[cond_white, 'ethnicity'] = 'WHITE'
    df.loc[cond_black, 'ethnicity'] = 'BLACK'
    df.loc[cond_asian, 'ethnicity'] = 'ASIAN'
    df.loc[cond_hispa, 'ethnicity'] = 'HISPANIC'
    df.loc[~(cond_white | cond_black | cond_asian | cond_hispa), 'ethnicity'] = 'OTHER'
    
    df['ethnicity'] = df['ethnicity'].apply(lambda x: x[0] + x[1:].lower())
    return df

In [10]:
case_labs = case_labs.drop(columns=['chart_time', 'icustay_id', 'sepsis_onset', 'hr_feature'])
apply_dict = {col: ['mean', 'median', 'std'] for col in case_labs.columns if col != 'subject_id'}
case_labs = case_labs.groupby('subject_id').agg(apply_dict)
# case_labs = case_labs.fillna(case_labs.median())
case_labs.columns = ['_'.join(col) for col in case_labs.columns]

case_static = case_static[['subject_id', 'gender', 'admission_age', 'ethnicity']]
case_static['label'] = 1
case_static = race_recode(case_static)
case_static['ethnicity'] = LabelEncoder().fit_transform(case_static['ethnicity'])
case_static['gender'] = LabelEncoder().fit_transform(case_static['gender'])

case_vitals = case_vitals.drop(columns=['chart_time', 'icustay_id', 'sepsis_onset', 'hr_feature'])
apply_dict = {col: ['mean', 'median', 'std'] for col in case_vitals.columns if col != 'subject_id'}
case_vitals = case_vitals.groupby('subject_id').agg(apply_dict)
# case_vitals = case_vitals.fillna(case_vitals.median())
case_vitals.columns = ['_'.join(col) for col in case_vitals.columns]

case_all = case_static.merge(case_labs, on='subject_id').merge(case_vitals, on='subject_id')


control_labs = control_labs.drop(columns=['chart_time', 'icustay_id', 'control_onset_time', 'hr_feature'])
apply_dict = {col: ['mean', 'median', 'std'] for col in control_labs.columns if col != 'subject_id'}
control_labs = control_labs.groupby('subject_id').agg(apply_dict)
# control_labs = control_labs.fillna(control_labs.median())
control_labs.columns = ['_'.join(col) for col in control_labs.columns]

control_static = control_static[['subject_id', 'gender', 'admission_age', 'ethnicity']]
control_static['label'] = 0
control_static = race_recode(control_static)
control_static['ethnicity'] = LabelEncoder().fit_transform(control_static['ethnicity'])
control_static['gender'] = LabelEncoder().fit_transform(control_static['gender'])

control_vitals = control_vitals.drop(columns=['chart_time', 'icustay_id', 'control_onset_time', 'hr_feature'])
apply_dict = {col: ['mean', 'median', 'std'] for col in control_vitals.columns if col != 'subject_id'}
control_vitals = control_vitals.groupby('subject_id').agg(apply_dict)
# control_vitals = control_vitals.fillna(control_vitals.median())
control_vitals.columns = ['_'.join(col) for col in control_vitals.columns]

control_all = control_static.merge(control_labs, on='subject_id').merge(control_vitals, on='subject_id')



In [11]:
case_control_all = pd.concat((case_all, control_all), ignore_index=True).sample(frac=1).reset_index(drop=True)

X, y = case_control_all.drop(columns=['subject_id', 'label']), case_control_all.label
X = MinMaxScaler().fit_transform(X)

## Model training and prediction

In [12]:
print(f"Number of features included: {X.shape[1]}")
print(f"Number of observations: {X.shape[0]}")
print(y.value_counts())

Number of features included: 135
Number of observations: 5657
0    5041
1     616
Name: label, dtype: int64


## Lightgbm CV

In [16]:
lgbm_param = {
        'num_leaves': scope.int(hp.quniform('num_leaves', 2, 21, 1)), # hp.choice('num_leaves', np.arange(2, 21)),
        'learning_rate': hp.uniform('learning_rate', 0.005, 0.1),
        'feature_fraction': hp.uniform('feature_fraction', 0.01, 0.8),
        'max_depth': scope.int(hp.quniform('max_depth', 2, 11, 1)), #hp.choice('max_depth', np.arange(2, 11)),
        'objective': 'binary',
        # 'boosting_type': 'dart',
        'metric': 'auc',
        'verbose': -1,
        # 'device_type': 'gpu'
    }

def f_lgbm(params):
    tr_data = lgbm.Dataset(X, y)
    res = lgbm.cv(params, tr_data, num_boost_round=1000, early_stopping_rounds=100, seed=42, return_cvbooster=True)
    return {'loss': -np.mean(res['auc-mean']).round(5), 'status': STATUS_OK, 'boosters': res['cvbooster'].boosters}

In [17]:
trials = Trials()
lgbm_best = fmin(f_lgbm, lgbm_param, algo=tpe.suggest, rstate=np.random.RandomState(42), max_evals=50, trials=trials)

100%|██████████| 50/50 [01:07<00:00,  1.36s/trial, best loss: -0.76581]


In [18]:
lgbm_best

{'feature_fraction': 0.2053922909202318,
 'learning_rate': 0.013791375580212269,
 'max_depth': 10.0,
 'num_leaves': 21.0}

In [19]:
# dir(res['cvbooster'])
feature_imp = np.zeros((X.shape[1]))
for booster in trials.best_trial['result']['boosters']:
    feature_imp += booster.feature_importance() / 5
    

In [23]:
pd.DataFrame(dict(cols = case_control_all.drop(columns=['subject_id', 'label']).columns, feature_importance = feature_imp)). \
    sort_values('feature_importance', ascending=False, ignore_index=True).head(10)

Unnamed: 0,cols,feature_importance
0,SysBP_std,273.2
1,TempC_mean,258.8
2,WBC_std,258.2
3,HeartRate_std,251.4
4,POTASSIUM_mean,224.4
5,BUN_median,213.2
6,BILIRUBIN_mean,207.6
7,MeanBP_mean,203.4
8,SODIUM_mean,200.6
9,CREATININE_median,191.4
