In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
import random
import re
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, recall_score,roc_auc_score
from sklearn import preprocessing

In [2]:
data = pd.read_csv('./final_data/final_data.csv')
print(data.shape)
data.columns

(84432, 44)


Index(['race', 'gender', 'age', 'admission_type_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'acetohexamide',
       'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'citoglipton', 'insulin',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'target', 'self_pay',
       'medicaid/medicare', 'coverByInsurance', 'f_diag', 's_diag', 't_diag',
       'clustering3', 'clustering4', 'clustering2', 'medical_specialty_new',
       'discharge_id', 'admission_id'],
      dtype='object')

In [3]:
y = data['target']

In [4]:
data['admission_type_id'] = data['admission_type_id'].apply(str)

In [5]:
data.dtypes

race                      object
gender                    object
age                       object
admission_type_id         object
time_in_hospital           int64
num_lab_procedures         int64
num_procedures             int64
num_medications            int64
number_outpatient          int64
number_emergency           int64
number_inpatient           int64
number_diagnoses           int64
max_glu_serum             object
A1Cresult                 object
metformin                 object
repaglinide               object
nateglinide               object
chlorpropamide            object
acetohexamide             object
glipizide                 object
glyburide                 object
tolbutamide               object
pioglitazone              object
rosiglitazone             object
acarbose                  object
miglitol                  object
citoglipton               object
insulin                   object
metformin-pioglitazone    object
change                    object
diabetesMe

In [6]:
cols_num = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 'number_emergency', 'number_inpatient',
           'number_diagnoses']

In [7]:
data = data.drop(['target'], axis = 1)

In [8]:
dummied_features = pd.get_dummies(data, drop_first = True)

In [9]:
dummied_features.columns

Index(['time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses', 'self_pay', 'medicaid/medicare',
       ...
       'discharge_id_5', 'discharge_id_6', 'discharge_id_Other',
       'admission_id_17', 'admission_id_2', 'admission_id_4', 'admission_id_5',
       'admission_id_6', 'admission_id_7', 'admission_id_Other'],
      dtype='object', length=156)

In [10]:
dummied_features.dtypes

time_in_hospital                                      int64
num_lab_procedures                                    int64
num_procedures                                        int64
num_medications                                       int64
number_outpatient                                     int64
number_emergency                                      int64
number_inpatient                                      int64
number_diagnoses                                      int64
self_pay                                              int64
medicaid/medicare                                     int64
coverByInsurance                                      int64
clustering3                                           int64
clustering4                                           int64
clustering2                                           int64
race_Asian                                            uint8
race_Caucasian                                        uint8
race_Hispanic                           

In [11]:
X_train, X_test, y_train, y_test = train_test_split(dummied_features, y, test_size=0.3, random_state=42, stratify=y, shuffle = True)

In [12]:
scaler = preprocessing.StandardScaler()

In [13]:
scaler.fit(X_train[cols_num])

  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [14]:
weight = pd.Series(np.zeros(len(y_train.index)), index=y_train.index)
for i in y_train.index:
    if y_train[i] == 1:
        weight[i] = 9
    elif y_train[i] == 0:
        weight[i] = 1

In [15]:
X_test[cols_num] = scaler.transform(X_test[cols_num])

  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [16]:
import xgboost as xgb

In [17]:
clf_xgb = xgb.XGBClassifier(objective = 'binary:logistic', class_weight = 'balanced')
param_dist = {'n_estimators': [1000,2000,3000],
              'learning_rate': [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
              'subsample': [0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
              'max_depth': [2,3,5],
              'max_feature' : ["sqrt", "auto"],
              "min_samples_split": [2,3,5,7],
              "min_samples_leaf": [2,3,5,7]
             }
numFolds = 3
clf = RandomizedSearchCV(clf_xgb, 
                         param_distributions = param_dist,
                         cv = numFolds,  
                         n_iter = 10,  
                         scoring = 'roc_auc')

In [18]:
X_train = X_train.rename({"max_glu_serum_>300":"max_glu_serum_greater300", "A1Cresult_>8":"A1Cresult_greater8"}, axis='columns')

In [19]:
X_train.columns = X_train.columns.str.replace(",", "")

In [20]:
X_train.columns = X_train.columns.str.replace(" ", "_")

In [21]:
X_train.columns = X_train.columns.str.replace("[", "")

In [22]:
X_train.columns = X_train.columns.str.replace("]", "")

In [23]:
X_test.columns = X_train.columns.str.replace(",", "")

In [24]:
X_test.columns = X_train.columns.str.replace(" ", "_")

In [25]:
X_test.columns = X_train.columns.str.replace("[", "")

In [26]:
X_test.columns = X_train.columns.str.replace("]", "")

In [None]:
clf.fit(X_train, y_train, sample_weight = {0:1, 1:9})

In [None]:
clf.best_params_

In [None]:
#set best params to xgb
#clf_xgb.set()

In [None]:
#clf_xgb.fit(X_train, y_train, sample_weight = weight)

In [None]:
#y_score = clf_xgb.predict(X_test)

In [None]:
#print(roc_auc_score(y_test,y_score))

In [None]:
print("auc train score: ", roc_auc_score(y_train, clf_xgb.predict_proba(X_train)[:,1]))
print("auc test score: ",roc_auc_score(y_test, clf_xgb.predict_proba(X_test)[:,1]))
print("recall train score: ",recall_score(y_train, clf_xgb.predict(X_train)))
print("recall test score: ",recall_score(y_test, clf_xgb.predict(X_test)))
print("precision train score: ", precision_score(y_train, clf_xgb.predict(X_train)))
print("precision test score: ",precision_score(y_test, clf_xgb.predict(X_test)))
print("accuracy train score: ",accuracy_score(y_train, clf_xgb.predict(X_train)))
print("accuracy test score: ",accuracy_score(y_test, clf_xgb.predict(X_test)))
print(confusion_matrix(y_test, clf_xgb.predict(X_test)))