Import libraries

In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from numpy import array
from statistics import mean
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings("ignore")

Load data

In [2]:
train_df = pd.read_csv("TrainingWiDS2021.csv")
test_df = pd.read_csv("UnlabeledWiDS2021.csv")

Drop unnecessary columns

In [3]:
train_df = train_df.drop(columns=['Unnamed: 0'])
test_df = test_df.drop(columns=['Unnamed: 0'])

In [6]:
train_df.drop(['encounter_id', 'hospital_id', 'icu_id'], axis=1)

Unnamed: 0,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_stay_type,icu_type,...,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus
0,68.0,22.732803,0,Caucasian,M,180.3,Floor,Floor,admit,CTICU,...,,,0,0,0,0,0,0,0,1
1,77.0,27.421875,0,Caucasian,F,160.0,Floor,Floor,admit,Med-Surg ICU,...,51.0,51.0,0,0,0,0,0,0,0,1
2,25.0,31.952749,0,Caucasian,F,172.7,Emergency Department,Accident & Emergency,admit,Med-Surg ICU,...,,,0,0,0,0,0,0,0,0
3,81.0,22.635548,1,Caucasian,F,165.1,Operating Room,Operating Room / Recovery,admit,CTICU,...,337.0,337.0,0,0,0,0,0,0,0,0
4,19.0,,0,Caucasian,M,188.0,,Accident & Emergency,admit,Med-Surg ICU,...,,,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130152,50.0,29.287256,0,Caucasian,M,175.3,Emergency Department,Accident & Emergency,admit,Cardiac ICU,...,,,0,0,0,0,0,0,0,0
130153,79.0,29.653433,0,Caucasian,F,162.6,Direct Admit,Accident & Emergency,admit,MICU,...,,,0,0,0,0,0,0,0,0
130154,73.0,32.265371,0,African American,M,177.8,Emergency Department,Accident & Emergency,admit,Cardiac ICU,...,163.0,163.0,0,0,0,0,0,0,0,1
130155,81.0,24.408579,0,Caucasian,M,185.4,Emergency Department,Accident & Emergency,admit,Med-Surg ICU,...,,,0,0,0,0,0,0,0,0


Train data

In [7]:
X = train_df.drop('diabetes_mellitus', axis=1)
y = train_df['diabetes_mellitus']

kf = KFold(n_splits=5, random_state=42, shuffle=True)

auc = []

Gradient Boosting

In [None]:
k = 1

for train_index, val_index in kf.split(X):
    # Split data
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # Fill in NaNs
    train_mode = X_train[['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']].mode(axis=0).iloc[0]
    X_train[['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']] = X_train[['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']].fillna(train_mode)
    X_val[['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']] = X_val[['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']].fillna(train_mode)
        
    train_median = train_df.median(axis=0)
    X_train.fillna(train_median, inplace=True)
    X_val.fillna(train_median, inplace=True)
    
    # One-hot encode
    encoder = OneHotEncoder(handle_unknown='ignore') 
    train_encoded = encoder.fit_transform(array(X_train[['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']])).toarray()
    names = encoder.get_feature_names(['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type'])
    X_train[names] = train_encoded
    X_val[names] = encoder.transform(array(X_val[['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']])).toarray()
    X_train.drop(['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type'], axis=1, inplace=True)
    X_val.drop(['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type'], axis=1, inplace=True)
    
    # Fit model
    gradient_booster = GradientBoostingClassifier(learning_rate=0.1)
    gradient_booster.fit(X_train, y_train)
    
    prediction = gradient_booster.predict_proba(X_val)[:, 1]
    auc.append(roc_auc_score(y_val, prediction))
    print('AUC', str(k), ':', roc_auc_score(y_val, prediction))
    
    k += 1
    
print('Mean AUC with GBM:', mean(auc))

print(classification_report(y_val, gradient_booster.predict(X_val)))

XGBoost

In [12]:
k = 1

for train_index, val_index in kf.split(X):
    # Split data
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    # Fill in NaNs
    train_mode = X_train[['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']].mode(axis=0).iloc[0]
    X_train[['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']] = X_train[['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']].fillna(train_mode)
    X_val[['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']] = X_val[['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']].fillna(train_mode)
        
    train_median = train_df.median(axis=0)
    X_train.fillna(train_median, inplace=True)
    X_val.fillna(train_median, inplace=True)
    
    # One-hot encode
    encoder = OneHotEncoder(handle_unknown='ignore') 
    train_encoded = encoder.fit_transform(array(X_train[['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']])).toarray()
    names = encoder.get_feature_names(['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type'])
    X_train[names] = train_encoded
    X_val[names] = encoder.transform(array(X_val[['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']])).toarray()
    X_train.drop(['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type'], axis=1, inplace=True)
    X_val.drop(['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type'], axis=1, inplace=True)
    
    # Fit XGBoost
    xgb_classifier = XGBClassifier(learning_rate=0.1)
    xgb_classifier.fit(X_train, y_train)
    
    prediction = xgb_classifier.predict_proba(X_val)[:, 1]
    auc.append(roc_auc_score(y_val, prediction))
    print('AUC', str(k), ':', roc_auc_score(y_val, prediction))
    
    k += 1
    
print('Mean AUC with XGBoost:', mean(auc))

print(classification_report(y_val, xgb_classifier.predict(X_val)))

              precision    recall  f1-score   support

           0       0.86      0.94      0.90     20411
           1       0.65      0.43      0.52      5620

    accuracy                           0.83     26031
   macro avg       0.76      0.68      0.71     26031
weighted avg       0.81      0.83      0.81     26031

