# Big Data for Health (CSE6250) 
Goal: Sepsis prediction using MIMIC-III data

Author: Zhensheng Wang
         
Created: 10/19/2021



In [148]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import roc_auc_score
from tableone import TableOne
import os
# Point the Google client libraries at a service-account key file named
# "bdfh.json" in the current working directory. The variable is set before the
# client below is constructed.
# NOTE(review): this unconditionally overwrites any GOOGLE_APPLICATION_CREDENTIALS
# already present in the environment — confirm that is intended.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]= os.path.join(os.getcwd(), "bdfh.json")
from google.cloud import bigquery
# BigQuery client used by the (currently commented-out) download code in the
# data-loading cell.
# NOTE(review): presumably this fails when bdfh.json is missing, even if only the
# cached CSV files are used afterwards — confirm.
bqclient = bigquery.Client()

In [8]:
import lightgbm as lgbm

## Query the data

In [2]:
# Load the SQL text stored in sepsis_demographic.sql into `query_string`.
# The encoding is given explicitly so the file is decoded the same way on every
# platform instead of depending on the machine's locale default.
with open('cse-6250-group-project/bigquery/sepsis_demographic.sql', 'r', encoding='utf-8') as f:
    query_string = f.read()

In [3]:
# Put the whole SQL statement on one line by turning every newline into a space.
# NOTE(review): if the SQL file contains `--` line comments, flattening would
# comment out everything that follows the first one — confirm the file has none.
query_string = ' '.join(query_string.split('\n'))

In [135]:
# Load the cohort tables (sepsis / non-sepsis) and the per-subject Angus, SIRS
# and SOFA tables from local CSV snapshots under Data/.
#
# The snapshots are produced from BigQuery by the commented-out code below, which
# is kept for provenance. To refresh them, uncomment it and change the table
# names to your own project/dataset.

# Download query results. (change to your own query code)

# sepsis = bqclient.query("select * from cdcproject.BDFH.sepsis").result().to_dataframe()
# nonsepsis = bqclient.query("select * from cdcproject.BDFH.Nonsepsis").result().to_dataframe()
# angus_sepsis = bqclient.query("select * from cdcproject.BDFH.angus_sepsis").result().to_dataframe()
# sirs =  bqclient.query("select * from cdcproject.BDFH.sirs").result().to_dataframe()
# sofa =  bqclient.query("select * from cdcproject.BDFH.sofa").result().to_dataframe()

# sepsis.to_csv('Data/sepsis.csv', index=False)
# nonsepsis.to_csv('Data/nonsepsis.csv', index=False)
# angus_sepsis.to_csv('Data/angus_sepsis.csv', index=False)
# sirs.to_csv('Data/sirs.csv', index=False)
# sofa.to_csv('Data/sofa.csv', index=False)

# File names match the (lowercase) names written by the `to_csv` calls above, so
# the reads also succeed on case-sensitive filesystems.
# NOTE(review): if the file on disk is actually named 'Nonsepsis.csv', rename it
# to 'nonsepsis.csv' so that export and load agree.
sepsis = pd.read_csv('Data/sepsis.csv')
nonsepsis = pd.read_csv('Data/nonsepsis.csv')
angus_sepsis = pd.read_csv('Data/angus_sepsis.csv')
sirs = pd.read_csv('Data/sirs.csv')
sofa = pd.read_csv('Data/sofa.csv')


# Copy the dataframe

In [14]:
# Report how many rows each cohort table holds, one line per cohort.
print(
    f"Number of sepsis patients: {len(sepsis)}",
    f"Number of control patients: {len(nonsepsis)}",
    sep="\n",
)
# sepsis.head(5)

# df_sepsis = sepsis.copy()
# df_nonsepsis = nonsepsis.copy()

Number of sepsis patients: 5035
Number of control patients: 42836


## Clean the data

In [137]:
def data_clean(df):
    """Recode the demographic columns of a cohort table for reporting and modelling.

    The input frame is not modified; a cleaned copy is returned. The function is
    idempotent: cleaning an already-cleaned frame returns the same values, so
    re-running the notebook cell that calls it cannot corrupt the data.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ETHNICITY``, ``MARITAL_STATUS``, ``gender``
        and ``age_admit``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which

        * ``ETHNICITY`` is one of ``White``, ``Black``, ``Asian``, ``Hispanic``
          or ``Other`` (missing values become ``Other``);
        * ``MARITAL_STATUS`` has separated / life-partner grouped as ``Other``,
          unknown or missing as ``Unknown``, and every other value capitalised;
        * ``gender`` is ``Female`` for ``F`` (or ``Female``) and ``Male`` for
          anything else;
        * ``age_admit`` is capped at 85.
    """
    df = df.copy()  # never mutate the caller's frame

    # race recode -- matching is done on the upper-cased text so that already
    # recoded values ('White', ...) are recognised again on a second pass.
    ethnicity = df['ETHNICITY'].str.upper()
    cond_white = ethnicity.str.contains('WHITE', na=False)
    cond_black = ethnicity.str.contains('BLACK', na=False)
    cond_asian = ethnicity.str.contains('ASIAN', na=False)
    cond_hispa = ethnicity.str.contains('HISPANIC', na=False)

    # np.select takes the first matching condition. The order below reproduces
    # the precedence of the original sequential assignments, where a later
    # assignment overrode an earlier one (Hispanic > Asian > Black > White).
    df['ETHNICITY'] = np.select(
        [cond_hispa, cond_asian, cond_black, cond_white],
        ['Hispanic', 'Asian', 'Black', 'White'],
        default='Other',
    )

    # marital status recode
    marital = df['MARITAL_STATUS'].str.upper()
    cond_other_marital = marital.str.contains('SEPARATED|LIFE PARTNER', na=False)
    cond_unknown_marital = marital.str.contains('UNKNOWN', na=False) | marital.isna()

    # 'UNKNOWN' is applied last so it wins if both conditions match.
    marital = marital.mask(cond_other_marital, 'OTHER').mask(cond_unknown_marital, 'UNKNOWN')
    df['MARITAL_STATUS'] = marital.str.capitalize()

    # Anything that is not female (including a missing value) is labelled 'Male'.
    df['gender'] = np.where(df['gender'].isin(['F', 'Female']), 'Female', 'Male')

    # Cap the admission age at 85; missing ages stay missing.
    df['age_admit'] = df['age_admit'].clip(upper=85)

    return df

# Clean both cohorts, tag each with the binary outcome label
# (1 = sepsis, 0 = control) and stack them into a single analysis table.
sepsis = data_clean(sepsis)
nonsepsis = data_clean(nonsepsis)

sepsis['Sepsis'] = 1
nonsepsis['Sepsis'] = 0

# `axis` is passed by keyword: the positional form `pd.concat(objs, 0)` is
# deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
# `ignore_index=True` gives the stacked table a fresh 0..n-1 index.
df_table1 = pd.concat([sepsis, nonsepsis], axis=0, ignore_index=True)

## Table 1 Descriptive statistics

In [138]:
# Variables summarised in Table 1: the categorical ones first, followed by the
# two continuous ones.
categorical = ['ETHNICITY', 'gender', 'INSURANCE', 'MARITAL_STATUS']
columns = categorical + ['los', 'age_admit']

# Display order of the levels for the recoded categorical variables.
order = dict(
    ETHNICITY=['White', 'Black', 'Hispanic', 'Asian', 'Other'],
    MARITAL_STATUS=['Single', 'Married', 'Divorced', 'Widowed', 'Other', 'Unknown'],
)

# Human-readable row labels for the rendered table.
label = dict(
    age_admit='Age (yrs) at first admission',
    los='Length of stay (days)',
    ETHNICITY='Race/Ethnicity',
    MARITAL_STATUS='Marital status',
    gender='Gender',
    INSURANCE='Insurance',
)

# Options for the summary table: grouped by the Sepsis label, with length of
# stay declared as non-normally distributed.
table_one_options = dict(
    columns=columns,
    categorical=categorical,
    nonnormal=['los'],
    groupby='Sepsis',
    limit=6,
    order=order,
    pval=False,
    isnull=False,
    rename=label,
)
t1 = TableOne(df_table1, **table_one_options)

t1
# print(t1.tabulate(tablefmt="latex"))




Unnamed: 0_level_0,Unnamed: 1_level_0,Grouped by Sepsis,Grouped by Sepsis,Grouped by Sepsis
Unnamed: 0_level_1,Unnamed: 1_level_1,Overall,0,1
n,,47871,42836,5035
"Race/Ethnicity, n (%)",White,33193 (69.3),29530 (68.9),3663 (72.8)
"Race/Ethnicity, n (%)",Black,4066 (8.5),3548 (8.3),518 (10.3)
"Race/Ethnicity, n (%)",Hispanic,1727 (3.6),1563 (3.6),164 (3.3)
"Race/Ethnicity, n (%)",Asian,1733 (3.6),1568 (3.7),165 (3.3)
"Race/Ethnicity, n (%)",Other,7152 (14.9),6627 (15.5),525 (10.4)
"Gender, n (%)",Female,20988 (43.8),18748 (43.8),2240 (44.5)
"Gender, n (%)",Male,26883 (56.2),24088 (56.2),2795 (55.5)
"Insurance, n (%)",Government,1620 (3.4),1501 (3.5),119 (2.4)
"Insurance, n (%)",Medicaid,4624 (9.7),4151 (9.7),473 (9.4)


## Feature creation

In [183]:
def feature_create(df, angus_df=None, sofa_df=None, sirs_df=None, label_col='Sepsis'):
    """Build the numeric modelling table from the cleaned cohort table.

    Steps: attach the per-subject Angus flags, maximum SOFA score and maximum
    SIRS score; keep one row per subject; drop identifier and date columns;
    one-hot encode the categorical demographics; min-max scale every feature
    column to [0, 1].

    Parameters
    ----------
    df : pandas.DataFrame
        Cohort table with ``subject_id``, ``dob``, ``dod``, ``age_death``,
        ``admit_time``, ``gender``, ``ETHNICITY``, ``MARITAL_STATUS`` and
        ``INSURANCE``. All remaining columns must be numeric.
    angus_df, sofa_df, sirs_df : pandas.DataFrame, optional
        Source tables for the Angus flags, SOFA scores and SIRS scores. When
        omitted, the notebook-level ``angus_sepsis``, ``sofa`` and ``sirs``
        frames are used, so existing ``feature_create(df)`` calls are unchanged.
    label_col : str, default 'Sepsis'
        Outcome column that is left unscaled. Scaling it would turn a
        single-class label (e.g. all 1s) into all 0s.

    Returns
    -------
    pandas.DataFrame
        One row per subject. Subjects missing from any of the three source
        tables are dropped by the inner joins.
    """
    # Fall back to the notebook globals only when no table is passed explicitly.
    angus_df = angus_sepsis if angus_df is None else angus_df
    sofa_df = sofa if sofa_df is None else sofa_df
    sirs_df = sirs if sirs_df is None else sirs_df

    df_angus_sepsis = angus_df.drop(columns=['hadm_id', 'explicit_sepsis', 'angus']).drop_duplicates(ignore_index=True)
    # Worst (maximum) score per subject.
    df_sofa = sofa_df.groupby('subject_id')['SOFA'].max().reset_index()
    df_sirs = sirs_df.groupby('subject_id')['sirs'].max().reset_index()

    df = df.merge(df_angus_sepsis, how='inner', on='subject_id') \
        .merge(df_sofa, how='inner', on='subject_id') \
        .merge(df_sirs, how='inner', on='subject_id')
    # NOTE(review): a subject with several distinct Angus rows keeps only the
    # first one here, whereas SOFA and SIRS are aggregated with max() — confirm
    # that this is the intended choice.
    df = df.drop_duplicates('subject_id', ignore_index=True).drop(columns=['subject_id', 'dob', 'dod', 'age_death', 'admit_time'])

    # One-hot encode the categorical columns. The dict order fixes the order of
    # the dummy blocks in the output: gender, ethnicity, marital status, insurance.
    dummy_prefix = {
        'gender': 'gender',
        'ETHNICITY': 'ethnicity',
        'MARITAL_STATUS': 'marital_status',
        'INSURANCE': 'insurance',
    }
    dummies = [
        pd.get_dummies(df[col], dummy_na=False, prefix=prefix)
        for col, prefix in dummy_prefix.items()
    ]
    df = pd.concat([df.drop(columns=list(dummy_prefix)), *dummies], axis=1).reset_index(drop=True)

    # Scale the features only; the label keeps its original 0/1 values.
    # NOTE(review): the scaler is fitted on the full table before any
    # train/validation split; this is harmless for tree-based models but would
    # leak validation statistics into a scale-sensitive model.
    # NOTE(review): the Angus flags and length of stay may be closely tied to how
    # the Sepsis label was derived — confirm they are legitimate predictors.
    feature_cols = [col for col in df.columns if col != label_col]
    df[feature_cols] = MinMaxScaler().fit_transform(df[feature_cols])
    return df


## Model training and prediction

In [184]:
# One seed shared by the fold splitter and the model so the whole evaluation is
# reproducible from a fresh kernel.
SEED = 42

# Build the modelling matrix: every column except the label is a feature.
df_model = feature_create(df_table1)
X = df_model.drop(columns=['Sepsis']).values
y = df_model['Sepsis'].values


# Shuffled, stratified 5-fold cross-validation.
skf = StratifiedKFold(n_splits=5, random_state=SEED, shuffle=True)
avg_auc = 0
feature_importance = np.zeros((X.shape[1], ))

for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
    tr_x, tr_y = X[tr_idx], y[tr_idx]
    val_x, val_y = X[val_idx], y[val_idx]
    # Seeded: an unseeded GradientBoostingClassifier can give different trees
    # (and therefore different AUCs and importances) from run to run.
    model = GradientBoostingClassifier(random_state=SEED)
    model.fit(tr_x, tr_y)
    # Probability of the positive (Sepsis = 1) class.
    pred = model.predict_proba(val_x)[:, 1]
    fold_score = roc_auc_score(val_y, pred)

    # Running means over the folds.
    feature_importance += model.feature_importances_ / skf.n_splits
    print(f"Fold - {fold} AUC: {fold_score:.3f}")
    avg_auc += fold_score / skf.n_splits

print(f'Average AUC: {avg_auc:.3f}')


## Feature importance

In [186]:
# Fold-averaged importance of each feature, most important first.
feature_names = df_model.drop(columns=['Sepsis']).columns
pd.DataFrame(
    {'var': feature_names, 'importance': feature_importance}
).sort_values(by='importance', ascending=False)


Unnamed: 0,var,importance
2,infection,0.381073
5,SOFA,0.232552
3,organ_dysfunction,0.121912
0,los,0.120063
6,sirs,0.095356
1,age_admit,0.023491
12,ethnicity_Other,0.010303
18,marital_status_Unknown,0.003421
4,mech_vent,0.002747
22,insurance_Medicare,0.002687
