In [28]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import precision_recall_curve, precision_recall_fscore_support, roc_curve, auc

from library_code import XGBoostModelData
from sklearn.metrics import recall_score, precision_score, f1_score

In [8]:
# Load the pre-built train/validation container and create the classifier
# that the following cells fit and evaluate.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted source.
# NOTE(review): the object is presumably an XGBoostModelData instance (its
# `k_folds` attribute is read in the next cell) -- confirm against library_code.
with open('xgb_train_validation_data.p', 'rb') as data_file:
    # The context manager closes the handle even if unpickling raises.
    xgboost_data = pickle.load(data_file)

xgbc = XGBClassifier(n_estimators=100, max_depth=10)


In [33]:
# Fit and score the classifier once per pre-computed fold, printing
# accuracy / precision / recall / F1 for each fold's test partition.
data_folds = xgboost_data.k_folds
for fold in data_folds:
    split = data_folds[fold]

    # The same estimator object is refit on every fold's training partition.
    # NOTE(review): newer xgboost releases deprecate passing eval_metric to
    # fit() -- confirm against the installed version.
    xgbc.fit(X=split['X_train'], y=split['y_train'], eval_metric='logloss')

    # y_true / y_pred stay bound after the loop and are reused by later cells.
    y_true = list(split['y_test'])
    y_pred = xgbc.predict(split['X_test'])

    # Accuracy = matching predictions divided by the number of predictions.
    n_correct = sum(actual == predicted for actual, predicted in zip(y_true, y_pred))
    fold_accuracy = n_correct / len(y_pred)
    fold_precision = precision_score(y_true=y_true, y_pred=y_pred)
    fold_recall = recall_score(y_true=y_true, y_pred=y_pred)
    fold_f1 = f1_score(y_true=y_true, y_pred=y_pred)

    print(fold)
    print('Accuracy: {}'.format(fold_accuracy))
    print('Precision: {}'.format(fold_precision))
    print('Recall: {}'.format(fold_recall))
    print('F1 Score: {}'.format(fold_f1))
    print()

fold_0
Accuracy: 0.871976401179941
Precision: 0.5714285714285714
Recall: 0.05454545454545454
F1 Score: 0.0995850622406639

fold_1
Accuracy: 0.8707964601769912
Precision: 0.5384615384615384
Recall: 0.031818181818181815
F1 Score: 0.06008583690987123

fold_2
Accuracy: 0.872491145218418
Precision: 0.5882352941176471
Recall: 0.045662100456621
F1 Score: 0.0847457627118644

fold_3
Accuracy: 0.8695395513577332
Precision: 0.4444444444444444
Recall: 0.0365296803652968
F1 Score: 0.06751054852320675

fold_4
Accuracy: 0.8713105076741441
Precision: 0.5454545454545454
Recall: 0.0273972602739726
F1 Score: 0.05217391304347826



In [30]:
# Precision for whatever y_true / y_pred are currently bound in the kernel.
# The fold loop rebinds both names on every iteration, so after a top-to-bottom
# run these are the last fold's values.
precision_score(y_true=y_true, y_pred=y_pred)

0.5714285714285714

In [31]:
# Recall for whatever y_true / y_pred are currently bound in the kernel.
# The fold loop rebinds both names on every iteration, so after a top-to-bottom
# run these are the last fold's values.
recall_score(y_true=y_true, y_pred=y_pred)

0.05454545454545454

In [None]:
# Grid-search the number of boosting rounds for a default XGBClassifier,
# scored by negated log loss over 6 shuffled, stratified folds.
# NOTE(review): X and y_trans are not defined in the visible cells -- they are
# assumed to be created elsewhere; confirm before a Restart & Run All.
n_estimators = range(1, 401, 50)  # candidates: 1, 51, 101, ..., 351
param_grid = {'n_estimators': n_estimators}

model = XGBClassifier()
kfold = StratifiedKFold(n_splits=6, shuffle=True, random_state=7)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="neg_log_loss",
    n_jobs=-1,  # use every available core
    cv=kfold,
)
grid_result = grid_search.fit(X, y_trans)

In [None]:
# Report the winning configuration, then one line per candidate with the
# mean and standard deviation of its cross-validated score.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

cv_results = grid_result.cv_results_
# means / stds are also read by the plotting cell that follows.
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']

for summary in zip(means, stds, params):
    # Each zipped row is already a (mean, std, params) tuple for the format string.
    print("%f (%f) with: %r" % summary)

In [None]:
# Plot the mean cross-validated score, with one-standard-deviation error bars,
# against each candidate n_estimators value, and save the figure to disk.
# Uses the `plt` alias from the notebook's import cell rather than re-importing
# pyplot under a second name in the middle of the notebook.
fig, ax = plt.subplots()
ax.errorbar(n_estimators, means, yerr=stds)

# The grid search is scored with "neg_log_loss", so the plotted values are the
# negated log loss (higher is better); the labels say so explicitly.
ax.set_title("XGBoost n_estimators vs Negative Log Loss")
ax.set_xlabel('n_estimators')
ax.set_ylabel('Negative Log Loss')

fig.savefig('n_estimators.png')
plt.show()

In [None]:
# Refit the shared classifier on the complete X / y_trans set.
xgbc.fit(X, y_trans, eval_metric='logloss')

In [None]:
# Score the same X the model was just fit on:
# per-class probabilities first, then hard class labels.
y_proba = xgbc.predict_proba(X)
y_pred = xgbc.predict(X)

In [None]:
# Build the inputs for the precision-recall and ROC curves.
# Column 1 of predict_proba is taken as the positive-class probability
# (assumes a binary problem -- TODO confirm against y_trans).
positive_proba = y_proba[:, 1]

pr_data = precision_recall_curve(y_trans, positive_proba)

# roc_curve needs continuous scores: feeding it hard 0/1 predictions collapses
# the curve to a single operating point. The thresholds get a real name so the
# cell no longer rebinds `_`, IPython's last-output variable.
fpr, tpr, roc_thresholds = roc_curve(y_true=y_trans, y_score=positive_proba)

In [None]:
# Split the precision_recall_curve result into its three arrays.
# The third element holds the decision thresholds, kept under the name `probas`.
precision, recall, probas = pr_data

In [None]:
# Precision-recall scatter: recall on the x-axis, precision on the y-axis.
plt.scatter(recall, precision)

In [None]:
# Accuracy of y_pred against y_trans, i.e. measured on the same X the model
# was fit on, so this is a training-set figure rather than held-out.
n_matches = sum(truth == guess for truth, guess in zip(y_trans, y_pred))
n_matches / len(y_pred)

In [None]:
# Display the label vector passed as `y` to the fits above (shows the whole object).
y_trans