In [25]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
import random
import re
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, recall_score,roc_auc_score

In [2]:
data = pd.read_csv('new_data.csv')
print(data.shape)
data.columns

(86500, 53)


Index(['Unnamed: 0', 'race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'medical_specialty', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'self_pay',
       'medicaid/medicare', 'coverByInsurance', 'f_diag', 's_diag', 't_diag',
       'clustering3', 'clustering4', 'clustering2', 'target'],
      dtype='object')

In [3]:
y = data['target']

In [4]:
data = data.drop(['Unnamed: 0', 'target'], axis = 1)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.4, random_state=42, stratify=y, shuffle = True)

In [7]:
weight = pd.Series(np.zeros(len(y_train.index)), index=y_train.index)
for i in y_train.index:
    if y_train[i] == 1:
        weight[i] = 9
    elif y_train[i] == 0:
        weight[i] = 1

In [56]:
newGBM = GradientBoostingClassifier(max_features = "sqrt", learning_rate = 0.2, max_depth = 3, loss = "deviance", subsample = 0.95
                                   , n_estimators = 50, min_samples_split = 0.17272727272727273, min_samples_leaf = 0.13636363636363638)

In [57]:
newGBM.fit(X_train, y_train, sample_weight = weight)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.2, loss='deviance', max_depth=3,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=0.13636363636363638,
              min_samples_split=0.17272727272727273,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=0.95, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [10]:
newGBM.score(X_train, y_train)

0.5654720616570328

In [11]:
newGBM.score(X_test, y_test)

0.5684971098265896

In [20]:
y_scores_gb = newGBM.decision_function(X_test)
fpr_gb, tpr_gb, _ = roc_curve(y_test, y_scores_gb)
roc_auc_gb = auc(fpr_gb, tpr_gb)

print("Area under ROC curve", roc_auc_gb)

Area under ROC curve 0.6459600134573296


In [21]:
y_scores_gb_train = newGBM.decision_function(X_train)
fpr_gb_train, tpr_gb_train, _ = roc_curve(y_train, y_scores_gb_train)
roc_auc_gb_train = auc(fpr_gb_train, tpr_gb_train)

print("Area under ROC curve ", roc_auc_gb_train)

Area under ROC curve  0.6478821047012131


In [22]:
print(confusion_matrix(y_test, newGBM.predict(X_test)))

[[17128 13611]
 [ 1319  2542]]


#==========================test for final_data set=============================

In [2]:
data = pd.read_csv('./final_data/final_data.csv')
print(data.shape)
data.columns

(84432, 44)


Index(['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'acetohexamide',
       'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'citoglipton', 'insulin',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'target', 'self_pay',
       'medicaid/medicare', 'coverByInsurance', 'f_diag', 's_diag', 't_diag',
       'clustering3', 'clustering4', 'clustering2', 'medical_specialty_new',
       'discharge_id', 'admission_id'],
      dtype='object')

In [3]:
y = data['target']

In [4]:
cat_cols = list(data.select_dtypes(include=[object]).columns)
for col in cat_cols:
    data[col] = pd.Categorical(data[col])

In [5]:
le = preprocessing.LabelEncoder()
col_to_encode = data[list(data.select_dtypes(include=['category']).columns)]
for col in col_to_encode:
    data[col] = le.fit_transform(data[col])

In [6]:
data = data.drop(['target'], axis = 1)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.4, random_state=42, stratify=y, shuffle = True)

In [8]:
weight = pd.Series(np.zeros(len(y_train.index)), index=y_train.index)
for i in y_train.index:
    if y_train[i] == 1:
        weight[i] = 9
    elif y_train[i] == 0:
        weight[i] = 1

In [39]:
newGBM = GradientBoostingClassifier(max_features = "auto", learning_rate = 0.2, max_depth = 2, loss = "deviance", subsample = 0.95
                                   , n_estimators = 50, min_samples_split = 3, min_samples_leaf = 5)

In [40]:
newGBM.fit(X_train, y_train, sample_weight = weight)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.2, loss='deviance', max_depth=2,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=5, min_samples_split=3,
              min_weight_fraction_leaf=0.0, n_estimators=50,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=0.95, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [41]:
y_scores_gb_final = newGBM.decision_function(X_test)
fpr_gb_final, tpr_gb_final, _ = roc_curve(y_test, y_scores_gb_final)
roc_auc_gb_final = auc(fpr_gb_final, tpr_gb_final)

print("Area under ROC curve", roc_auc_gb_final)

Area under ROC curve 0.6532581813179968


In [42]:
print(confusion_matrix(y_test, newGBM.predict(X_test)))

[[15465 14462]
 [ 1143  2703]]


In [None]:
#====================XBM===============================

In [13]:
import xgboost as xgb

In [47]:
clf_xgb = xgb.XGBClassifier(objective = 'binary:logistic')
param_dist = {'n_estimators': [1000,2000,3000],
              'learning_rate': [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
              'subsample': [0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
              'max_depth': [2,3,5],
              'max_feature' : ["sqrt", "auto"],
              "min_samples_split": [2,3,5,7],
              "min_samples_leaf": [2,3,5,7]
             }
numFolds = 3
clf = RandomizedSearchCV(clf_xgb, 
                         param_distributions = param_dist,
                         cv = numFolds,  
                         n_iter = 10,  
                         scoring = 'recall')

In [48]:
clf.fit(X_train, y_train, sample_weight = weight)

RandomizedSearchCV(cv=3, error_score='raise-deprecating',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'n_estimators': [1000, 2000, 3000], 'learning_rate': [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2], 'subsample': [0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0], 'max_depth': [2, 3, 5], 'max_feature': ['sqrt', 'auto'], 'min_samples_split': [2, 3, 5, 7], 'min_samples_leaf': [2, 3, 5, 7]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='recall', verbose=0)

In [50]:
clf.best_params_

{'subsample': 0.9,
 'n_estimators': 2000,
 'min_samples_split': 3,
 'min_samples_leaf': 7,
 'max_feature': 'sqrt',
 'max_depth': 2,
 'learning_rate': 0.01}

In [51]:
clf_xgb.set_params(min_samples_split = 3, min_samples_leaf = 7, max_depth = 2, learning_rate = 0.01, n_estimators = 2000,
                  subsample = 0.9, max_feature = 'sqrt')

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=2, max_feature='sqrt', min_child_weight=1,
       min_samples_leaf=7, min_samples_split=3, missing=None,
       n_estimators=2000, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.9)

In [52]:
clf_xgb.fit(X_train, y_train, sample_weight = weight)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=2, max_feature='sqrt', min_child_weight=1,
       min_samples_leaf=7, min_samples_split=3, missing=None,
       n_estimators=2000, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.9)

In [53]:
y_pred = clf_xgb.predict_proba(X_test)[:,1]
print('auc for xgb:', roc_auc_score(y_test,y_pred))

auc for xgb: 0.6575631531960915


In [54]:
print(confusion_matrix(y_test, clf_xgb.predict(X_test)))

[[16085 13842]
 [ 1190  2656]]


In [None]:
#====================XBM with dummyfied the numeric===============================