In [1]:
import random
import pandas as pd
import numpy as np
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
SEED = 1
random.seed(SEED)
np.random.seed(SEED)

In [3]:
s_tr = pd.read_parquet('../dataset/dummy_scale_train.parquet')
s_te = pd.read_parquet('../dataset/dummy_scale_test.parquet')

In [4]:
from xgboost.sklearn import XGBClassifier
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, classification_report

In [5]:
x_train, x_test = train_test_split(s_tr, train_size=0.8, test_size=0.2, shuffle=True)

In [6]:
cols = list(x_train.columns.values)
cols.remove('encounter_id')
cols.remove('diabetes_mellitus')

x_tr = x_train[cols].values
y_tr = x_train['diabetes_mellitus'].values
x_te = x_test[cols].values
y_te = x_test['diabetes_mellitus'].values

In [7]:
tuned_parameters_xgb = [{'booster':['gbtree'], 'learning_rate':np.linspace(.05, 1, 20),
                         'min_child_weight':np.linspace(5, 1, 10), 'n_estimators':list(range(100, 300, 20)),
                    'objective':['binary:logistic'],'max_depth': list(range(4,10)), 'gamma': np.linspace(0,1.,20) }]

scores = {'f1': 'f1_macro', 'roc':'roc_auc_ovr' , 'prec': 'precision'}     

clf = RandomizedSearchCV(XGBClassifier(), tuned_parameters_xgb,
                   scoring=scores, 
                   refit= "f1",
                   cv=4,
                   verbose=1,
                   n_jobs=4,
                   n_iter= 80,)

clf.fit(x_tr, y_tr)
# Validation
pred = clf.best_estimator_.predict_proba(x_te)
pred_bin = clf.best_estimator_.predict(x_te)
roc = roc_auc_score(y_te, pred[:,1])
print("ROC test split = ", roc)
print(classification_report(y_te, pred_bin, digits=4 ))
print(clf.best_params_)

Fitting 4 folds for each of 80 candidates, totalling 320 fits






RandomizedSearchCV(cv=4,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=None, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints=None,
                                           n_estimators=100,...
       0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95, 1.  ]),
                                         'max_depth': [4, 5, 6, 7, 8, 9],
                                         'mi

ROC test split =  0.862838681284858
              precision    recall  f1-score   support

         0.0     0.8682    0.9368    0.9012     20413
         1.0     0.6780    0.4834    0.5644      5619

    accuracy                         0.8389     26032
   macro avg     0.7731    0.7101    0.7328     26032
weighted avg     0.8271    0.8389    0.8285     26032

{'objective': 'binary:logistic', 'n_estimators': 260, 'min_child_weight': 4.555555555555555, 'max_depth': 8, 'learning_rate': 0.1, 'gamma': 0.21052631578947367, 'booster': 'gbtree'}


In [8]:
clf.best_params_

{'objective': 'binary:logistic',
 'n_estimators': 260,
 'min_child_weight': 4.555555555555555,
 'max_depth': 8,
 'learning_rate': 0.1,
 'gamma': 0.21052631578947367,
 'booster': 'gbtree'}

In [11]:
model = XGBClassifier(**clf.best_params_)
model.fit(s_tr[cols], s_tr['diabetes_mellitus'].values)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0.21052631578947367,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=4.555555555555555, missing=nan,
              monotone_constraints='()', n_estimators=260, n_jobs=8,
              num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=1, subsample=1, tree_method='exact',
              validate_parameters=1, verbosity=None)

In [None]:
# with open('/content/drive/MyDrive/dataset/best_xgboost.txt', 'w') as file:
#   file.write(clf.best_params_)
# file.close()

In [12]:
y_pred = model.predict_proba(s_te[cols])

In [13]:
encounter_ids = pd.read_csv('../dataset/WiDS2021/UnlabeledWiDS2021.csv')[['encounter_id']].values
pd.DataFrame.from_dict({'encounter_id':encounter_ids[:, 0], 
                        'diabetes_mellitus': y_pred[:, 1]}).set_index(['encounter_id']).to_csv('../dataset/xgb_predictions_onehot.csv')