# Importing packages and libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

# Reading file and tidying

In [2]:
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

# Train test split

In [3]:
y_train = df_train[["attrition_flag"]]
x_train = df_train.drop("attrition_flag", axis=1)

y_test = df_test[["attrition_flag"]]
x_test = df_test.drop("attrition_flag", axis=1)

In [4]:
xgb = XGBClassifier(use_label_encoder=False,
                random_state=2021)
oversampler = SMOTE(random_state=2021)
x_train, y_train = oversampler.fit_resample(x_train, y_train)
xgb.fit(x_train, y_train, eval_metric='logloss')
y_pred = xgb.predict(x_test)

print(f"-------------------------TEST SCORES-----------------------")
print(f"Recall: {recall_score(y_test, y_pred)}")
print(f"Precision: {precision_score(y_test, y_pred)}")
print(f"F2-Score: {fbeta_score(y_test, y_pred, beta=2)}")
print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
print(f"AUC Score: {roc_auc_score(y_test, y_pred)}")
print()

-------------------------TEST SCORES-----------------------
Recall: 0.8092307692307692
Precision: 0.834920634920635
F2-Score: 0.8142414860681114
Accuracy score: 0.9437314906219151
AUC Score: 0.8893302582191471



#### Applying xgboost model

In [5]:
# # # https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

# cm = confusion_matrix(y_test, y_pred)
# print(cm)
# TN | FP
# FN| TP

In [6]:
# # https://towardsdatascience.com/doing-xgboost-hyper-parameter-tuning-the-smart-way-part-1-of-2-f6d255a45dde
# # {"learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
# #  "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
# #  "min_child_weight" : [ 1, 3, 5, 7 ], Minimum sum of instance weight(hessian) needed in a child.
# #  "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ], Minimum loss reduction required to make a further partition on a leaf node of the tree.
# #  "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ] } Subsample ratio of columns when constructing each tree.
# # GridSearch, Coordinate Descent

# model = XGBClassifier(n_jobs=-1, random_state=0)

# param_dist = {
#     "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
#     "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
#     'min_child_weight': [1, 3, 5, 7],
#     'gamma': [0.0, 0.1, 0.2 , 0.3, 0.4],
#     'colsample_bytree': [0.3, 0.4, 0.5 , 0.7],
# }

# grid_search = GridSearchCV(model, param_dist, n_jobs=-1, scoring="recall", error_score='raise')

# grid_search.fit(x_train, y_train)
# grid_search.best_params_

In [7]:
# gs_best_params = grid_search.best_estimator_
# print(gs_best_params)

# y_pred = gs_best_params.predict(x_test)
# false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)

# print(classification_report(y_test, y_pred))

# print("-------------------------TEST SCORES-----------------------")
# print("AUC:", auc(false_positive_rate, true_positive_rate))
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Precision:", precision_score(y_test, y_pred))
# print("Recall:", recall_score(y_test, y_pred, average='macro'))
# print("f1_score:", f1_score(y_test, y_pred, average='macro'))