In [92]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
import random
import re
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, recall_score,roc_auc_score
from sklearn import preprocessing
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler 

In [93]:
data = pd.read_csv('./final_data/final_data.csv')
print(data.shape)
data.columns

(84432, 44)


Index(['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'acetohexamide',
       'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'citoglipton', 'insulin',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'target', 'self_pay',
       'medicaid/medicare', 'coverByInsurance', 'f_diag', 's_diag', 't_diag',
       'clustering3', 'clustering4', 'clustering2', 'medical_specialty_new',
       'discharge_id', 'admission_id'],
      dtype='object')

In [94]:
y = data['target']

In [95]:
data['admission_type_id'] = data['admission_type_id'].apply(str)

In [96]:
data.dtypes

race                      object
gender                    object
age                       object
admission_type_id         object
time_in_hospital           int64
num_lab_procedures         int64
num_procedures             int64
num_medications            int64
number_outpatient          int64
number_emergency           int64
number_inpatient           int64
number_diagnoses           int64
max_glu_serum             object
A1Cresult                 object
metformin                 object
repaglinide               object
nateglinide               object
chlorpropamide            object
acetohexamide             object
glipizide                 object
glyburide                 object
tolbutamide               object
pioglitazone              object
rosiglitazone             object
acarbose                  object
miglitol                  object
citoglipton               object
insulin                   object
metformin-pioglitazone    object
change                    object
diabetesMe

In [97]:
cols_num = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_emergency', 'number_inpatient',
           'number_diagnoses']

In [98]:
X = data.drop(['target'], axis = 1)

In [99]:
X.columns

Index(['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'acetohexamide',
       'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'citoglipton', 'insulin',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'self_pay',
       'medicaid/medicare', 'coverByInsurance', 'f_diag', 's_diag', 't_diag',
       'clustering3', 'clustering4', 'clustering2', 'medical_specialty_new',
       'discharge_id', 'admission_id'],
      dtype='object')

In [100]:
dummied_features = pd.get_dummies(X, drop_first = True)

In [101]:
dummied_features.dtypes

time_in_hospital                                      int64
num_lab_procedures                                    int64
num_procedures                                        int64
num_medications                                       int64
number_outpatient                                     int64
number_emergency                                      int64
number_inpatient                                      int64
number_diagnoses                                      int64
self_pay                                              int64
medicaid/medicare                                     int64
coverByInsurance                                      int64
clustering3                                           int64
clustering4                                           int64
clustering2                                           int64
race_Asian                                            uint8
race_Caucasian                                        uint8
race_Hispanic                           

In [102]:
X_train, X_test, y_train, y_test = train_test_split(dummied_features, y, test_size=0.3, random_state=42, stratify=y, shuffle = True)

In [103]:
scaler = preprocessing.StandardScaler()

In [104]:
scaler.fit(X_train[cols_num])

  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [105]:
print('Original dataset shape %s' % Counter(y_train))

Original dataset shape Counter({0: 52372, 1: 6730})


In [106]:
rus = RandomUnderSampler(random_state=42)

In [107]:
X_res, y_res = rus.fit_resample(X_train, y_train)

In [108]:
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({0: 6730, 1: 6730})


In [109]:
gbm = GradientBoostingClassifier(loss = "deviance")
max_feature = ["sqrt", "auto"]
max_depth = range(1,10,1)
n_est = range(200,1000,200)
learning_rate= [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2]
grid_param = {"max_depth": max_depth, "n_estimators": n_est, "learning_rate":learning_rate, 
               "min_samples_split": range(2,10,2),"min_samples_leaf": range(2,10,2), "max_features": max_feature}
n_folds = 3

rs = RandomizedSearchCV(gbm, grid_param, cv = n_folds, n_iter=10, scoring = 'roc_auc')

In [110]:
rs.fit(X_res, y_res)

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sampl...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'max_depth': range(1, 10), 'n_estimators': range(200, 1000, 200), 'learning_rate': [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2], 'min_samples_split': range(2, 10, 2), 'min_samples_leaf': range(2, 10, 2), 'max_features': ['sqrt', 'auto']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=0)

In [111]:
rs.best_params_

{'n_estimators': 400,
 'min_samples_split': 6,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 3,
 'learning_rate': 0.075}

In [112]:
gbm.set_params(max_features = "sqrt", learning_rate = 0.075, max_depth = 3, loss = "deviance",
                                n_estimators = 400, min_samples_split = 6, min_samples_leaf = 4)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.075, loss='deviance', max_depth=3,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=4, min_samples_split=6,
              min_weight_fraction_leaf=0.0, n_estimators=400,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [113]:
gbm.fit(X_res, y_res)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.075, loss='deviance', max_depth=3,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=4, min_samples_split=6,
              min_weight_fraction_leaf=0.0, n_estimators=400,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [114]:
print(roc_auc_score(y_test, gbm.predict(X_test)))

0.6124017384301236


In [115]:
print(confusion_matrix(y_test, gbm.predict(X_test)))

[[15074  7371]
 [ 1289  1596]]


In [116]:
gbm_coefs_df = pd.concat([pd.DataFrame(X_test.columns), pd.DataFrame(gbm.feature_importances_)], axis = 1)
gbm_coefs_df.columns = ["Col", "Coef"]
gbm_coefs_df = gbm_coefs_df.sort_values(by="Coef", ascending = False)

In [81]:
gbm_coefs_df

Unnamed: 0,Col,Coef
6,number_inpatient,0.345657
142,discharge_id_22,0.077450
3,num_medications,0.044597
5,number_emergency,0.043726
0,time_in_hospital,0.036288
146,discharge_id_5,0.034175
1,num_lab_procedures,0.029222
7,number_diagnoses,0.020691
144,discharge_id_3,0.015399
10,coverByInsurance,0.014107


In [117]:
import xgboost as xgb

In [118]:
clf_xgb = xgb.XGBClassifier(objective = 'binary:logistic')
param_dist = {'n_estimators': [1000,2000,3000],
              'learning_rate': [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
              'subsample': [0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
              'max_depth': [2,3,5],
              'max_feature' : ["sqrt", "auto"],
              "min_samples_split": [2,3,5,7],
              "min_samples_leaf": [2,3,5,7]
             }
numFolds = 3
clf = RandomizedSearchCV(clf_xgb, 
                         param_distributions = param_dist,
                         cv = numFolds,  
                         n_iter = 10,  
                         scoring = 'roc_auc')

In [47]:
X_train = X_train.rename({"max_glu_serum_>300":"max_glu_serum_greater300", "A1Cresult_>8":"A1Cresult_greater8"}, axis='columns')

In [48]:
X_train.columns = X_train.columns.str.replace(",", "")

In [49]:
X_train.columns = X_train.columns.str.replace(" ", "_")

In [50]:
X_train.columns = X_train.columns.str.replace("[", "")

In [51]:
X_train.columns = X_train.columns.str.replace("]", "")

In [52]:
X_test.columns = X_train.columns.str.replace(",", "")

In [53]:
X_test.columns = X_train.columns.str.replace(" ", "_")

In [119]:
clf.fit(X_res, y_res)

KeyboardInterrupt: 