In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb
import pandas as pd
import metrics
import pickle

import Evaluation as eva
import utils

# File name used by the export cell when pickling the trained booster
# into `{utils._data_pth_}/models/`.
__filename__ = 'XGboost.model'

### load data

In [None]:
# Read the joined training table; the first CSV column becomes the index.
data = pd.read_csv(f'{utils._data_pth_}/processed/train_joined.csv', index_col=0)

# Separate the `isFraud` target from the remaining feature columns.
X = data.drop(columns=['isFraud'])
y = data['isFraud']

# Hold out 33% of the rows for testing; the split is seeded from `utils`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=utils._random_seed_,
)

### configure

In [None]:
# Set XGBoost's global verbosity to 2 (info-level messages) for the training cell.
xgb.set_config(verbosity=2)

### train


In [None]:
# Hyper-parameters handed to xgb.train.
# NOTE(review): no `evals` list is passed to xgb.train below, so the configured
# eval metrics are presumably never reported during training - confirm intent.
train_params = {
    'objective': 'binary:logistic',
    'eval_metric': ['auc', 'ams@0'],
    'max_depth': 50,
    'num_parallel_tree': 5,
}

# Wrap the train / test splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

# Fit the booster on the training matrix (number of rounds left at the library default).
booster = xgb.train(params=train_params, dtrain=dtrain)

### test

In [None]:
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report, roc_auc_score, roc_curve, confusion_matrix
from sklearn.metrics import precision_recall_curve

### Roc curve and PR curve
def roc_pr_curve(y_test, probs_predict):
    """Plot the ROC curve and the precision-recall curve of a binary classifier.

    Parameters
    ----------
    y_test : array-like
        True binary labels; forwarded unchanged to scikit-learn's
        ``roc_curve`` and ``precision_recall_curve``.
    probs_predict : array-like
        Predicted scores / probabilities for the positive class.

    Returns
    -------
    None
        Each figure is rendered with ``plt.show()``.
    """
    # ROC curve: false-positive rate (1 - specificity) vs. true-positive rate.
    fpr, tpr, _ = roc_curve(y_test, probs_predict)
    _, roc_ax = plt.subplots()
    roc_ax.plot(fpr, tpr)
    roc_ax.set_title("ROC curve")
    roc_ax.set_xlabel("1-SPEC")
    roc_ax.set_ylabel("SENS")
    # The original referenced `plt.show` without calling it, which is a no-op.
    plt.show()

    # Precision-recall curve: recall on the x axis, precision on the y axis.
    precision, recall, _ = precision_recall_curve(y_test, probs_predict)
    _, pr_ax = plt.subplots()
    pr_ax.plot(recall, precision)
    pr_ax.set_title("PR curve")
    pr_ax.set_xlabel("SENS (Recall)")
    pr_ax.set_ylabel("PPV (Precision)")
    plt.show()
    

# Score the hold-out matrix and plot both curves from the raw probabilities.
y_pred_probs = booster.predict(dtest)
roc_pr_curve(y_test, y_pred_probs)

# Threshold at 0.5 in place: from here on `y_pred_probs` holds hard 0/1 labels,
# not probabilities.
is_positive = y_pred_probs >= 0.5
is_negative = y_pred_probs < 0.5
y_pred_probs[is_positive] = 1
y_pred_probs[is_negative] = 0
metrics.conf_matrix(y_test, y_pred_probs)


In [None]:
# Debug inspection: shows the Python type of the prediction array as the cell output.
type(y_pred_probs)

## Export model

In [None]:
# Save the model to disk.
# The context manager guarantees the file handle is flushed and closed once the
# dump finishes (the original passed an `open(...)` result that was never closed).
with open(f'{utils._data_pth_}/models/{__filename__}', 'wb') as model_file:
    pickle.dump(booster, model_file)

## Evaluation

In [None]:
# Run the project's evaluation routine for the model identified as "XGboost".
# NOTE(review): presumably this loads the model pickled by the export cell -
# confirm against Evaluation.evaluate.
eva.evaluate("XGboost")

## Grid Search

In [None]:
import csv
import gc
import os

gc.enable()
xgb.set_config(verbosity=1)

# Both matrices are independent of the hyper-parameters, so build them once
# instead of rebuilding the training matrix on every iteration.
dtest = xgb.DMatrix(data=X_test, label=y_test)
dtrain = xgb.DMatrix(data=X_train, label=y_train)

fieldnames = ['max_depth', 'num_parallel_tree', 'TN', 'FP', 'FN', 'TP']
results_path = 'grid_search_result.csv'

# Results are appended across runs; only a new or empty file needs the header,
# otherwise every re-run would insert another header row in the middle of the data.
needs_header = not os.path.exists(results_path) or os.path.getsize(results_path) == 0

# `newline=''` is what the csv module expects for files it writes to.
with open(results_path, 'a', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    if needs_header:
        writer.writeheader()

    # Grid: max_depth in 3..7, num_parallel_tree in 9, 18, 27, 36, 45.
    # The original filter `tree_depth % 10 != 0` skipped every depth in
    # range(3, 8), so no model was ever trained; the filter is dropped here.
    for tree_depth in range(3, 8):
        for num_tree in range(9, 50, 9):
            # Distinct names keep the notebook-level `booster` and
            # `y_pred_probs` from earlier cells intact.
            candidate = xgb.train({
                'objective': 'binary:logistic',
                'eval_metric': ['auc', 'ams@0'],
                'max_depth': tree_depth,
                'num_parallel_tree': num_tree,
            }, dtrain=dtrain)

            # Threshold the predicted probabilities at 0.5 to get hard labels.
            candidate_probs = candidate.predict(dtest)
            candidate_labels = (candidate_probs >= 0.5).astype(int)
            tn, fp, fn, tp = confusion_matrix(y_test, candidate_labels).ravel()

            writer.writerow({fieldnames[0]: tree_depth,
                             fieldnames[1]: num_tree,
                             fieldnames[2]: tn,
                             fieldnames[3]: fp,
                             fieldnames[4]: fn,
                             fieldnames[5]: tp})
            # Flush per row so finished results survive an interrupted search.
            csvfile.flush()

            # Release the fitted model before training the next candidate.
            del candidate
            gc.collect()
        
