In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from collections import Counter

In [2]:
# Lift pandas' display limits so that no columns or rows are elided in outputs.
for display_opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_opt, None)

In [3]:
# Single seed reused by the SMOTE sampler and by the random-forest grid further down.
random_state = 0

In [4]:
# Read the training CSV; the path is relative to the notebook's working directory.
df_train = pd.read_csv('dataset_diabetes/diabetic_data_train.csv')

In [5]:
# Columns excluded from modelling, listed group by group (order is preserved).
remove_cols = (
    # record / patient identifiers
    ['encounter_id', 'patient_nbr']
    # other non-medication columns that are dropped
    + ['weight', 'payer_code', 'medical_specialty']
    # several medication columns are dropped as well
    + ['examide', 'citoglipton', 'chlorpropamide', 'acetohexamide',
       'tolbutamide', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone']
)

In [6]:
# Keep every column that is not listed in remove_cols; names in remove_cols
# that are absent from the frame are ignored rather than raising.
df_train = df_train.drop(columns=remove_cols, errors='ignore')

In [7]:
# Display the columns that remain after the drop.
df_train.columns

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'glimepiride', 'glipizide',
       'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin', 'change',
       'diabetesMed', 'readmitted'],
      dtype='object')

## Create Labels

In [8]:
# Fraction of rows per raw `readmitted` value, most frequent first.
[(key, val/len(df_train.readmitted)) for (key, val) in Counter(df_train.readmitted).most_common()]

[('NO', 0.5393799439886012),
 ('>30', 0.3491868520611212),
 ('<30', 0.1114332039502776)]

In [9]:
def binarize_readmitted(x):
    """Return 1 when `x` is '<30' (readmitted in under 30 days), otherwise 0."""
    positive_values = ('<30',)  # readmitted in less than 30 days is the positive class
    return 1 if x in positive_values else 0

In [10]:
# Binary target: 1 for '<30', 0 otherwise.
# Series.map applies the function element-wise to the one column it needs,
# giving the same values as a row-wise DataFrame.apply(axis=1) without
# building a Series object for every row.
df_train['label'] = df_train['readmitted'].map(binarize_readmitted)

In [11]:
# Fraction of rows per binarised label (0/1), most frequent first.
[(key, val/len(df_train.readmitted)) for (key, val) in Counter(df_train['label']).most_common()]

[(0, 0.8885667960497224), (1, 0.1114332039502776)]

## Process Features

In [12]:
def map_age(x):
    """Map an age-bracket string such as '[40-50)' to the midpoint of the bracket.

    The nine brackets from '[0-10)' to '[80-90)' map to 5, 15, ..., 85.
    Every other value (including any unrecognised one) maps to 95.
    """
    bracket_midpoints = {
        '[0-10)': 5,
        '[10-20)': 15,
        '[20-30)': 25,
        '[30-40)': 35,
        '[40-50)': 45,
        '[50-60)': 55,
        '[60-70)': 65,
        '[70-80)': 75,
        '[80-90)': 85,
    }
    return bracket_midpoints.get(x, 95)

# Readable names for the admission type ids that have one.
admission_dict = {
    1: 'Emergency',
    2: 'Urgent',
    3: 'Elective',
    4: 'Newborn',
    7: 'Trauma Center',
}
def map_admission(x):
    """Translate an admission type id into a readable category.

    Ids 5, 6 and 8 are folded into 'Not Available'; any other id is looked up
    in `admission_dict`, raising KeyError when it is not present there.
    """
    unavailable_ids = (5, 6, 8)
    if x not in unavailable_ids:
        return admission_dict[x]
    return 'Not Available'
    
def discharged_home(x):
    """Return 1 when the discharge disposition id is 1 (discharged home), else 0."""
    return 1 if x == 1 else 0

def map_source(x):
    """Bucket an admission source id into 'Referral', 'Emergency Room' or 'Other'.

    Ids 1-3 are referrals, id 7 is the emergency room, everything else is 'Other'.
    """
    if x == 7:
        return 'Emergency Room'
    return 'Referral' if x in (1, 2, 3) else 'Other'

def map_diag(x):
    """Collapse a diagnosis code into one of nine coarse diagnosis groups.

    Parameters
    ----------
    x : str
        Diagnosis code such as '428', '250.83', 'V57' or '?'.

    Returns
    -------
    str
        One of 'circulatory', 'respiratory', 'digestive', 'diabetes',
        'injury', 'musculoskeletal', 'genitourinary', 'neoplasms' or 'other'.

    Notes
    -----
    Grouping is done on the whole-number part of the code, so a fractional
    sub-code (e.g. '785.5' or '459.9') lands in the same group as its parent
    code instead of falling through to 'other'. Missing values (None/NaN),
    empty strings and codes that do not start with a digit map to 'other'.
    """
    if isinstance(x, str):
        # Codes with a non-digit first character (and the '?' placeholder) are 'other'.
        if not x or not x[0].isdigit():
            return 'other'

    try:
        category = int(float(x))
    except (TypeError, ValueError, OverflowError):
        # None, NaN, or text that is not a plain number.
        return 'other'

    if 390 <= category <= 459 or category == 785:  # 390–459, 785
        return 'circulatory'
    elif 460 <= category <= 519 or category == 786:  # 460–519, 786
        return 'respiratory'
    elif 520 <= category <= 579 or category == 787:  # 520–579, 787
        return 'digestive'
    elif category == 250:  # 250.xx
        return 'diabetes'
    elif 800 <= category <= 999:  # 800–999
        return 'injury'
    elif 710 <= category <= 739:  # 710–739
        return 'musculoskeletal'
    elif 580 <= category <= 629 or category == 788:  # 580–629, 788
        return 'genitourinary'
    elif 140 <= category <= 239:  # 140–239
        return 'neoplasms'
    else:
        return 'other'

def binarize_yn(x):
    """Return 1 when `x` is 'Yes' or 'Ch', otherwise 0."""
    return int(x in ('Yes', 'Ch'))

In [13]:
# Derive the age / admission / discharge / source features.
# Each helper reads a single column, so Series.map is used instead of a
# row-wise DataFrame.apply(axis=1): same values, far less per-row overhead.
df_train['age_processed'] = df_train['age'].map(map_age)
df_train['admission_type_processed'] = df_train['admission_type_id'].map(map_admission)
df_train['discharged_processed'] = df_train['discharge_disposition_id'].map(discharged_home)
df_train['source_processed'] = df_train['admission_source_id'].map(map_source)

In [14]:
# Group the three diagnosis columns and binarise the two yes/no style columns.
# Each helper reads a single column, so Series.map is used instead of a
# row-wise DataFrame.apply(axis=1): same values, far less per-row overhead.
df_train['diag1_processed'] = df_train['diag_1'].map(map_diag)
df_train['diag2_processed'] = df_train['diag_2'].map(map_diag)
df_train['diag3_processed'] = df_train['diag_3'].map(map_diag)
df_train['change_processed'] = df_train['change'].map(binarize_yn)
df_train['diabetesMed_processed'] = df_train['diabetesMed'].map(binarize_yn)

## One Hot Encodings

In [15]:
# Categorical columns that get one-hot encoded (order is preserved).
ohe_cols = (
    # categories derived by the processing helpers
    ['admission_type_processed', 'source_processed',
     'diag1_processed', 'diag2_processed', 'diag3_processed']
    # raw categorical columns
    + ['race', 'gender', 'max_glu_serum', 'A1Cresult']
    # retained medication columns
    + ['metformin', 'repaglinide', 'nateglinide', 'glimepiride', 'glipizide',
       'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin']
)

Inspect the distinct values, and how often each occurs, for every categorical variable.

In [16]:
# Print each categorical column's name followed by its value frequencies.
for cat_col in ohe_cols:
    value_counts = Counter(df_train[cat_col])
    print(cat_col, f'{value_counts} \n', sep='\n')

admission_type_processed
Counter({'Emergency': 43218, 'Elective': 15052, 'Urgent': 14845, 'Not Available': 8274, 'Trauma Center': 13, 'Newborn': 10}) 

source_processed
Counter({'Emergency Room': 45990, 'Referral': 24668, 'Other': 10754}) 

diag1_processed
Counter({'circulatory': 24429, 'other': 14636, 'respiratory': 11530, 'digestive': 7549, 'diabetes': 6987, 'injury': 5491, 'genitourinary': 4122, 'musculoskeletal': 3929, 'neoplasms': 2739}) 

diag2_processed
Counter({'circulatory': 25456, 'other': 21617, 'diabetes': 10217, 'respiratory': 8679, 'genitourinary': 6717, 'digestive': 3318, 'neoplasms': 2039, 'injury': 1971, 'musculoskeletal': 1398}) 

diag3_processed
Counter({'other': 24576, 'circulatory': 24159, 'diabetes': 13759, 'respiratory': 5865, 'genitourinary': 5354, 'digestive': 3129, 'injury': 1560, 'musculoskeletal': 1517, 'neoplasms': 1493}) 

race
Counter({'Caucasian': 60871, 'AfricanAmerican': 15325, '?': 1824, 'Hispanic': 1650, 'Other': 1225, 'Asian': 517}) 

gender
Counter

In [17]:
# Drop the 2 rows whose gender is 'Unknown/Invalid' (probably doesn't matter, but just in case).
df_train = df_train.loc[df_train['gender'].ne('Unknown/Invalid')]
print(len(df_train))

81410


In [18]:
# Categorical slice of the training frame that is passed to the one-hot encoder.
df_train_sub = df_train[ohe_cols]

In [19]:
# Learn the categories from the categorical slice, then encode that same slice
# and convert the encoder's output to a dense array.
ohe = OneHotEncoder(categories='auto')
ohe.fit(df_train_sub)
train_ohe = ohe.transform(df_train_sub).toarray()

In [20]:
# Build '<column>_<category>' names for the one-hot columns, following the
# encoder's own column and category order.
feature_labels_ = ohe.categories_
feature_labels = [
    source_col + '_' + category
    for source_col, categories in zip(df_train_sub.columns, feature_labels_)
    for category in categories
]

In [32]:
# Wrap the dense one-hot array in a DataFrame with readable column names.
# No index is passed, so this frame does not carry df_train's index.
df_train_ohe = pd.DataFrame(train_ohe, columns = feature_labels)

In [35]:
# Columns used as-is (no one-hot encoding).
cols = [
    # derived by the processing helpers
    'age_processed', 'discharged_processed', 'change_processed', 'diabetesMed_processed',
    # numeric columns taken straight from the source data
    'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications',
    'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses',
]

df_train_num = df_train.loc[:, cols]

In [36]:
# Sanity check: the two frames should have the same number of rows before they are joined.
print(df_train_ohe.shape, df_train_num.shape)

(81410, 88) (81410, 12)


In [38]:
# df_train_ohe and df_train_num carry different indices (two invalid-gender
# rows were removed from df_train), so concatenating them as-is would create
# null entries. Give both a fresh 0..n-1 index first.
df_train_ohe = df_train_ohe.reset_index(drop=True)
df_train_num = df_train_num.reset_index(drop=True)

In [39]:
# Column-wise join: one-hot features first, then the numeric features.
X_train = pd.concat([df_train_ohe, df_train_num], axis=1)

In [40]:
# Print each final feature's name followed by its value frequencies.
for feature in X_train.columns:
    value_counts = Counter(X_train[feature])
    print(feature, f'{value_counts} \n', sep='\n')

admission_type_processed_Elective
Counter({0.0: 66358, 1.0: 15052}) 

admission_type_processed_Emergency
Counter({1.0: 43216, 0.0: 38194}) 

admission_type_processed_Newborn
Counter({0.0: 81400, 1.0: 10}) 

admission_type_processed_Not Available
Counter({0.0: 73136, 1.0: 8274}) 

admission_type_processed_Trauma Center
Counter({0.0: 81397, 1.0: 13}) 

admission_type_processed_Urgent
Counter({0.0: 66565, 1.0: 14845}) 

source_processed_Emergency Room
Counter({1.0: 45988, 0.0: 35422}) 

source_processed_Other
Counter({0.0: 70656, 1.0: 10754}) 

source_processed_Referral
Counter({0.0: 56742, 1.0: 24668}) 

diag1_processed_circulatory
Counter({0.0: 56981, 1.0: 24429}) 

diag1_processed_diabetes
Counter({0.0: 74423, 1.0: 6987}) 

diag1_processed_digestive
Counter({0.0: 73861, 1.0: 7549}) 

diag1_processed_genitourinary
Counter({0.0: 77288, 1.0: 4122}) 

diag1_processed_injury
Counter({0.0: 75921, 1.0: 5489}) 

diag1_processed_musculoskeletal
Counter({0.0: 77481, 1.0: 3929}) 

diag1_processed

In [48]:
# Target vector as a plain array, taken in df_train's row order.
y_train = df_train['label'].values

## Cross Validation Pipeline

In [49]:
# Two-step imblearn pipeline: a SMOTE sampling step followed by a random forest.
pipe = Pipeline(steps=[
    ('sampling', SMOTE(random_state=random_state)),
    ('model', RandomForestClassifier()),
])

# List the tunable parameter names (the '<step>__<param>' keys used in the grid).
pipe.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'sampling', 'model', 'sampling__k_neighbors', 'sampling__n_jobs', 'sampling__random_state', 'sampling__sampling_strategy', 'model__bootstrap', 'model__ccp_alpha', 'model__class_weight', 'model__criterion', 'model__max_depth', 'model__max_features', 'model__max_leaf_nodes', 'model__max_samples', 'model__min_impurity_decrease', 'model__min_impurity_split', 'model__min_samples_leaf', 'model__min_samples_split', 'model__min_weight_fraction_leaf', 'model__n_estimators', 'model__n_jobs', 'model__oob_score', 'model__random_state', 'model__verbose', 'model__warm_start'])

In [50]:
# Hyper-parameter grid for the pipeline's 'model' step: 2 x 5 x 1 = 10 candidates.
param_grid = dict(
    model__n_estimators=[100, 500],
    model__max_depth=[4, 6, 8, 10, 12],
    model__random_state=[random_state],
)

# Recall = TP / (TP + FN) -> of the cases that are truly positive, the share we flagged as positive.
# AUROC  = area under the curve of TPR = TP / (TP + FN) against FPR = FP / (FP + TN) at various thresholds.
scoring = ['accuracy', 'roc_auc', 'recall']

In [51]:
# 4-fold grid search that records every metric in `scoring` (train and test),
# with `refit='roc_auc'` naming the metric used for the final refit.
clf = GridSearchCV(pipe, param_grid, cv=4, verbose=5, scoring=scoring, refit='roc_auc', return_train_score=True)

In [52]:
# Run the grid search (the captured log reports 40 fits taking about 18 minutes).
# NOTE(review): scikit-learn's fit() conventionally returns the estimator itself, so
# `best_model` is presumably the fitted GridSearchCV object, not its best_estimator_ — confirm.
best_model = clf.fit(X_train, y_train)

Fitting 4 folds for each of 10 candidates, totalling 40 fits
[CV] model__max_depth=4, model__n_estimators=100, model__random_state=0 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  model__max_depth=4, model__n_estimators=100, model__random_state=0, accuracy=(train=0.846, test=0.844), recall=(train=0.085, test=0.082), roc_auc=(train=0.577, test=0.566), total=   6.0s
[CV] model__max_depth=4, model__n_estimators=100, model__random_state=0 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.6s remaining:    0.0s


[CV]  model__max_depth=4, model__n_estimators=100, model__random_state=0, accuracy=(train=0.843, test=0.842), recall=(train=0.093, test=0.085), roc_auc=(train=0.578, test=0.577), total=   6.1s
[CV] model__max_depth=4, model__n_estimators=100, model__random_state=0 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   13.4s remaining:    0.0s


[CV]  model__max_depth=4, model__n_estimators=100, model__random_state=0, accuracy=(train=0.833, test=0.833), recall=(train=0.110, test=0.122), roc_auc=(train=0.580, test=0.566), total=   5.9s
[CV] model__max_depth=4, model__n_estimators=100, model__random_state=0 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   20.0s remaining:    0.0s


[CV]  model__max_depth=4, model__n_estimators=100, model__random_state=0, accuracy=(train=0.841, test=0.843), recall=(train=0.100, test=0.091), roc_auc=(train=0.580, test=0.573), total=   6.0s
[CV] model__max_depth=4, model__n_estimators=500, model__random_state=0 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   26.6s remaining:    0.0s


[CV]  model__max_depth=4, model__n_estimators=500, model__random_state=0, accuracy=(train=0.842, test=0.842), recall=(train=0.092, test=0.090), roc_auc=(train=0.578, test=0.570), total=  24.8s
[CV] model__max_depth=4, model__n_estimators=500, model__random_state=0 
[CV]  model__max_depth=4, model__n_estimators=500, model__random_state=0, accuracy=(train=0.839, test=0.837), recall=(train=0.102, test=0.096), roc_auc=(train=0.579, test=0.575), total=  24.1s
[CV] model__max_depth=4, model__n_estimators=500, model__random_state=0 
[CV]  model__max_depth=4, model__n_estimators=500, model__random_state=0, accuracy=(train=0.835, test=0.833), recall=(train=0.110, test=0.112), roc_auc=(train=0.580, test=0.567), total=  25.3s
[CV] model__max_depth=4, model__n_estimators=500, model__random_state=0 
[CV]  model__max_depth=4, model__n_estimators=500, model__random_state=0, accuracy=(train=0.842, test=0.843), recall=(train=0.095, test=0.091), roc_auc=(train=0.580, test=0.573), total=  27.6s
[CV] mode

[CV]  model__max_depth=12, model__n_estimators=100, model__random_state=0, accuracy=(train=0.889, test=0.888), recall=(train=0.003, test=0.000), roc_auc=(train=0.763, test=0.602), total=  11.5s
[CV] model__max_depth=12, model__n_estimators=500, model__random_state=0 
[CV]  model__max_depth=12, model__n_estimators=500, model__random_state=0, accuracy=(train=0.889, test=0.889), recall=(train=0.002, test=0.000), roc_auc=(train=0.764, test=0.606), total=  52.1s
[CV] model__max_depth=12, model__n_estimators=500, model__random_state=0 
[CV]  model__max_depth=12, model__n_estimators=500, model__random_state=0, accuracy=(train=0.889, test=0.889), recall=(train=0.001, test=0.001), roc_auc=(train=0.767, test=0.604), total=  53.1s
[CV] model__max_depth=12, model__n_estimators=500, model__random_state=0 
[CV]  model__max_depth=12, model__n_estimators=500, model__random_state=0, accuracy=(train=0.889, test=0.889), recall=(train=0.001, test=0.000), roc_auc=(train=0.761, test=0.605), total=  52.0s
[C

[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed: 18.2min finished
