In [28]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
import random
import re
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, recall_score, roc_auc_score
from collections import Counter

In [2]:
data = pd.read_csv('./final_data/final_data.csv')
print(data.shape)
data.columns

(84432, 44)


Index(['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'acetohexamide',
       'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'citoglipton', 'insulin',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'target', 'self_pay',
       'medicaid/medicare', 'coverByInsurance', 'f_diag', 's_diag', 't_diag',
       'clustering3', 'clustering4', 'clustering2', 'medical_specialty_new',
       'discharge_id', 'admission_id'],
      dtype='object')

In [3]:
data.dtypes

race                      object
gender                    object
age                       object
admission_type_id          int64
time_in_hospital           int64
num_lab_procedures         int64
num_procedures             int64
num_medications            int64
number_outpatient          int64
number_emergency           int64
number_inpatient           int64
number_diagnoses           int64
max_glu_serum             object
A1Cresult                 object
metformin                 object
repaglinide               object
nateglinide               object
chlorpropamide            object
acetohexamide             object
glipizide                 object
glyburide                 object
tolbutamide               object
pioglitazone              object
rosiglitazone             object
acarbose                  object
miglitol                  object
citoglipton               object
insulin                   object
metformin-pioglitazone    object
change                    object
diabetesMe

In [4]:
y = data['target']

In [5]:
data = data.drop(['target'], axis = 1)

In [6]:
cat_cols = list(data.select_dtypes(include=[object]).columns)
for col in cat_cols:
    data[col] = pd.Categorical(data[col])

In [7]:
le = preprocessing.LabelEncoder()
col_to_encode = data[list(data.select_dtypes(include=['category']).columns)]
for col in col_to_encode:
    data[col] = le.fit_transform(data[col])

In [8]:
data.dtypes

race                      int32
gender                    int32
age                       int32
admission_type_id         int64
time_in_hospital          int64
num_lab_procedures        int64
num_procedures            int64
num_medications           int64
number_outpatient         int64
number_emergency          int64
number_inpatient          int64
number_diagnoses          int64
max_glu_serum             int32
A1Cresult                 int32
metformin                 int32
repaglinide               int32
nateglinide               int32
chlorpropamide            int32
acetohexamide             int32
glipizide                 int32
glyburide                 int32
tolbutamide               int32
pioglitazone              int32
rosiglitazone             int32
acarbose                  int32
miglitol                  int32
citoglipton               int32
insulin                   int32
metformin-pioglitazone    int32
change                    int32
diabetesMed               int32
self_pay

In [11]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3, random_state=42, stratify=y, shuffle = True)

In [12]:
gbm = GradientBoostingClassifier(loss = "deviance")
max_feature = ["sqrt", "auto"]
max_depth = range(1,10,1)
n_est = range(200,1000,200)
learning_rate= [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2]
grid_param = {"max_depth": max_depth, "n_estimators": n_est, "learning_rate":learning_rate, 
               "min_samples_split": range(2,10,2),"min_samples_leaf": range(2,10,2), "max_features": max_feature}
n_folds = 3

rs = RandomizedSearchCV(gbm, grid_param, cv = n_folds, n_iter=10, scoring = 'roc_auc')

In [32]:
weight_ratio = float(len(y_train[y_train == 0]))/float(len(y_train[y_train == 
1]))
w_array = np.array([1]*y_train.shape[0])
w_array[y_train==1] = weight_ratio
w_array[y_train==0] = 1- weight_ratio

In [37]:
print('Original dataset shape %s' % Counter(w_array))

Original dataset shape Counter({-6: 52372, 7: 6730})


In [44]:
rs.fit(X_train, y_train)

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.025, loss='deviance', max_depth=7,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=8, min_s...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'max_depth': range(1, 10), 'n_estimators': range(200, 1000, 200), 'learning_rate': [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2], 'min_samples_split': range(2, 10, 2), 'min_samples_leaf': range(2, 10, 2), 'max_features': ['sqrt', 'auto']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=0)

In [45]:
rs.best_params_

{'n_estimators': 200,
 'min_samples_split': 8,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 6,
 'learning_rate': 0.01}

In [46]:
gbm.set_params(max_features = "auto", learning_rate = 0.01, max_depth = 6, loss = "deviance",
                                n_estimators = 200, min_samples_split = 8, min_samples_leaf = 2)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=6,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=2, min_samples_split=8,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [47]:
gbm.fit(X_train, y_train, sample_weight = w_array)

  self.prior = self.scale * np.log(pos / neg)
  np.sum(sample_weight * ((y * pred) - np.logaddexp(0.0, pred))))


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=6,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=2, min_samples_split=8,
              min_weight_fraction_leaf=0.0, n_estimators=200,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [48]:
print(roc_auc_score(y_test, gbm.predict(X_test)))

0.5


In [49]:
print(confusion_matrix(y_test, gbm.predict(X_test)))

[[22445     0]
 [ 2885     0]]


In [51]:
max_feature = ["sqrt", "auto"]
max_depth = [3,4,5,6]
n_est = range(50,150,5)
learning_rate= [0.01, 0.1, 0.3]
subsample=[0.9, 0.95, 1.0]
grid_param = [{"max_depth": max_depth, "n_estimators": n_est, "learning_rate":learning_rate, 
               "min_samples_split":[2,3,5,7],"min_samples_leaf": [2,3,5,7], 
               "subsample":subsample, "max_features": max_feature}]
n_folds = 5

clf = GridSearchCV(gbm, grid_param, cv = n_folds, refit = False, n_jobs= -1)
#add shuffle = True, add score = recall in gridsearch

In [52]:
clf.fit(X_train, y_train) 

KeyboardInterrupt: 

In [None]:
clf.best_params_