In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from lifelines import KaplanMeierFitter

In [2]:
from sklearn.model_selection import RandomizedSearchCV

In [4]:
X_test = pd.read_csv('encoded_X_test.csv')
X_test.head()

Unnamed: 0,age,time_in_hospital,num_medications,number_diagnoses,total_treatments,total_visits,total_procedures,race_AfricanAmerican,race_Caucasian,race_Other,...,glimepiride-pioglitazone_0,glimepiride-pioglitazone_1,metformin-rosiglitazone_0,metformin-rosiglitazone_1,metformin-pioglitazone_0,metformin-pioglitazone_1,change_No,change_Yes,diabetesMed_No,diabetesMed_Yes
0,-3.812915,-0.804767,-1.481259,-2.79697,-0.203068,-0.521816,-0.569333,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1,0.575936,-0.804767,0.002096,0.82082,0.882818,-0.08524,-0.569333,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
2,-0.051043,0.203343,0.991,-0.212834,-0.203068,-0.521816,0.290806,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,-0.051043,0.203343,0.002096,0.82082,-0.203068,-0.08524,-0.569333,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
4,-1.931979,-0.46873,-1.481259,0.82082,-1.288954,-0.521816,-0.569333,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


In [5]:
X_train = pd.read_csv('encoded_X_train.csv')
X_train.head()

Unnamed: 0,age,time_in_hospital,num_medications,number_diagnoses,total_treatments,total_visits,total_procedures,race_AfricanAmerican,race_Caucasian,race_Other,...,glimepiride-pioglitazone_0,glimepiride-pioglitazone_1,metformin-rosiglitazone_0,metformin-rosiglitazone_1,metformin-pioglitazone_0,metformin-pioglitazone_1,change_No,change_Yes,diabetesMed_No,diabetesMed_Yes
0,-0.051043,3.227674,5.317452,0.82082,0.882818,-0.521816,1.35333,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
1,-2.558958,0.53938,0.620161,0.82082,-0.203068,-0.521816,1.454523,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
2,-0.051043,1.883527,-0.492356,0.303993,-0.203068,-0.08524,-0.215158,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
3,-1.305,-1.140804,0.002096,0.82082,0.882818,1.224488,-0.316351,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
4,-0.051043,-0.804767,-0.24513,0.303993,-0.203068,-0.08524,0.341402,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0


In [7]:
y_test = pd.read_csv('binary_y_test.csv')
y_test.head()

Unnamed: 0,readmitted_target
0,0
1,0
2,1
3,0
4,0


In [6]:
y_train = pd.read_csv('binary_y_train.csv')
y_train.head()

Unnamed: 0,readmitted_target
0,0
1,0
2,1
3,1
4,0


In [8]:
# Create XGB Classifier object
xgb_clf = XGBClassifier()#tree_method = "gpu_exact", 
#                     predictor = "gpu_predictor", 
#                            eval_metric = ["logloss", "auc"]), 
                        
#                             objective = "multi:softmax")
# Create parameter grid
parameters = {"learning_rate": [0.1, 0.01, 0.001],
               "gamma" : [0.01, 0.1, 0.3, 0.5, 1, 1.5, 2],
               "max_depth": [2, 4, 7, 10],
               "colsample_bytree": [0.3, 0.6, 0.8, 1.0],
               "subsample": [0.2, 0.4, 0.5, 0.6, 0.7],
               "reg_alpha": [0, 0.5, 1],
               "reg_lambda": [1, 1.5, 2, 3, 4.5],
               "min_child_weight": [1, 3, 5, 7],
               "n_estimators": [100, 250, 500, 1000]}

# Create RandomizedSearchCV Object
xgb_rscv = RandomizedSearchCV(xgb_clf, param_distributions = parameters, 
                              scoring = ['recall', 'f1'],
                              refit = 'recall',
                             cv = 7, verbose = 3, random_state = 42,
                     return_train_score=True)

# Fit the model
model_xgboost = xgb_rscv.fit(X_train, y_train)

Fitting 7 folds for each of 10 candidates, totalling 70 fits
[CV] subsample=0.6, reg_lambda=1.5, reg_alpha=0, n_estimators=500, min_child_weight=5, max_depth=4, learning_rate=0.01, gamma=0.1, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  subsample=0.6, reg_lambda=1.5, reg_alpha=0, n_estimators=500, min_child_weight=5, max_depth=4, learning_rate=0.01, gamma=0.1, colsample_bytree=0.6, f1=(train=0.567, test=0.565), recall=(train=0.511, test=0.509), total= 4.0min
[CV] subsample=0.6, reg_lambda=1.5, reg_alpha=0, n_estimators=500, min_child_weight=5, max_depth=4, learning_rate=0.01, gamma=0.1, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.0min remaining:    0.0s


[CV]  subsample=0.6, reg_lambda=1.5, reg_alpha=0, n_estimators=500, min_child_weight=5, max_depth=4, learning_rate=0.01, gamma=0.1, colsample_bytree=0.6, f1=(train=0.568, test=0.563), recall=(train=0.514, test=0.510), total= 4.1min
[CV] subsample=0.6, reg_lambda=1.5, reg_alpha=0, n_estimators=500, min_child_weight=5, max_depth=4, learning_rate=0.01, gamma=0.1, colsample_bytree=0.6 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  8.1min remaining:    0.0s


[CV]  subsample=0.6, reg_lambda=1.5, reg_alpha=0, n_estimators=500, min_child_weight=5, max_depth=4, learning_rate=0.01, gamma=0.1, colsample_bytree=0.6, f1=(train=0.569, test=0.556), recall=(train=0.514, test=0.502), total= 4.2min
[CV] subsample=0.6, reg_lambda=1.5, reg_alpha=0, n_estimators=500, min_child_weight=5, max_depth=4, learning_rate=0.01, gamma=0.1, colsample_bytree=0.6 
[CV]  subsample=0.6, reg_lambda=1.5, reg_alpha=0, n_estimators=500, min_child_weight=5, max_depth=4, learning_rate=0.01, gamma=0.1, colsample_bytree=0.6, f1=(train=0.568, test=0.560), recall=(train=0.514, test=0.508), total= 4.2min
[CV] subsample=0.6, reg_lambda=1.5, reg_alpha=0, n_estimators=500, min_child_weight=5, max_depth=4, learning_rate=0.01, gamma=0.1, colsample_bytree=0.6 
[CV]  subsample=0.6, reg_lambda=1.5, reg_alpha=0, n_estimators=500, min_child_weight=5, max_depth=4, learning_rate=0.01, gamma=0.1, colsample_bytree=0.6, f1=(train=0.567, test=0.568), recall=(train=0.513, test=0.513), total= 3.8mi

[CV]  subsample=0.6, reg_lambda=2, reg_alpha=1, n_estimators=250, min_child_weight=7, max_depth=2, learning_rate=0.01, gamma=1, colsample_bytree=1.0, f1=(train=0.567, test=0.570), recall=(train=0.528, test=0.534), total= 1.8min
[CV] subsample=0.6, reg_lambda=2, reg_alpha=1, n_estimators=250, min_child_weight=7, max_depth=2, learning_rate=0.01, gamma=1, colsample_bytree=1.0 
[CV]  subsample=0.6, reg_lambda=2, reg_alpha=1, n_estimators=250, min_child_weight=7, max_depth=2, learning_rate=0.01, gamma=1, colsample_bytree=1.0, f1=(train=0.566, test=0.571), recall=(train=0.527, test=0.530), total= 1.7min
[CV] subsample=0.6, reg_lambda=2, reg_alpha=1, n_estimators=250, min_child_weight=7, max_depth=2, learning_rate=0.01, gamma=1, colsample_bytree=1.0 
[CV]  subsample=0.6, reg_lambda=2, reg_alpha=1, n_estimators=250, min_child_weight=7, max_depth=2, learning_rate=0.01, gamma=1, colsample_bytree=1.0, f1=(train=0.567, test=0.563), recall=(train=0.527, test=0.523), total= 1.7min
[CV] subsample=0.6

KeyboardInterrupt: 

In [None]:
print(model_xgboost.best_score_)
print(model_xgboost.best_params_)