In [None]:
#Importing Libraries
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn import metrics

In [None]:
# Load the churn dataset, keeping only the CSV columns at positions 1 through 20
df = pd.read_csv('BankChurners.csv').iloc[:, 1:21]

# Encode the target as integers: existing customers -> 0, attrited customers -> 1.
# Labels outside this mapping become NaN.
attrition_codes = {'Existing Customer': 0, 'Attrited Customer': 1}
df = df.assign(Attrition_Flag=df['Attrition_Flag'].map(attrition_codes))

In [None]:
# Separate the target vector (y) from the predictor columns (X)
y = df['Attrition_Flag']
X = df.drop(columns=['Attrition_Flag'])

In [None]:
# Categorical preprocessing: fill missing values with the most frequent category,
# then one-hot encode into a dense array (handle_unknown="ignore" keeps transform
# from failing on categories that were not present at fit time).
#
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name, so try the new spelling first and fall back to
# the old one on older releases.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

categorical_pipeline = Pipeline(
    steps=[
        ("impute", SimpleImputer(strategy="most_frequent")),
        ("oh-encode", one_hot_encoder),
    ]
)

In [None]:
# Numeric preprocessing: impute missing values with the column mean, then standardize
numeric_steps = [
    ("impute", SimpleImputer(strategy="mean")),
    ("scale", StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

In [None]:
# Partition the predictor columns by dtype: numeric vs. everything else
num_cols = X.select_dtypes(
    include="number",
).columns
cat_cols = X.select_dtypes(
    exclude="number",
).columns

In [None]:
# Combined preprocessor: routes numeric columns through numeric_pipeline and
# the remaining columns through categorical_pipeline
column_routes = [
    ("numeric", numeric_pipeline, num_cols),
    ("categorical", categorical_pipeline, cat_cols),
]
full_processor = ColumnTransformer(transformers=column_routes)

In [None]:
# Apply preprocessing without leaking test-set statistics into training.

# Impute missing target labels with the most frequent class, then flatten the
# (n, 1) imputer output to 1-D, which is the label shape estimators and metrics expect.
# NOTE(review): imputing *labels* is unusual; rows with a missing target are
# normally dropped -- confirm this is intended.
y_processed = (
    SimpleImputer(strategy="most_frequent")
    .fit_transform(y.values.reshape(-1, 1))
    .ravel()
)

# Split the raw rows first (stratified on the target) so that the imputers,
# scaler and encoder below only ever learn from training rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_processed, stratify=y_processed, random_state=219
)

# Fit the preprocessor on the training rows only, then reuse it for the test rows
X_train = full_processor.fit_transform(X_train_raw)
X_test = full_processor.transform(X_test_raw)

# Whole dataset transformed with the train-fitted preprocessor, kept for the
# cells that refer to X_processed
X_processed = full_processor.transform(X)

In [None]:
# Baseline: XGBoost classifier with every hyperparameter left at its default

# Build the classifier and train it on the training split
xgb_cl = xgb.XGBClassifier()
xgb_cl = xgb_cl.fit(X_train, y_train)

# Class predictions for the held-out split
preds = xgb_cl.predict(X_test)

# Baseline accuracy (displayed as the cell output)
baseline_accuracy = accuracy_score(y_test, preds)
baseline_accuracy

In [None]:
# Hyperparameter search space for the first grid search.
# subsample and colsample_bytree are held at a single value each.
param_grid = dict(
    max_depth=[3, 4, 5, 7],
    learning_rate=[0.1, 0.01, 0.05],
    gamma=[0, 0.25, 1],
    reg_lambda=[0, 1, 10],
    scale_pos_weight=[1, 3, 5],
    subsample=[0.8],
    colsample_bytree=[0.5],
)

In [None]:
# Classifier to tune (binary logistic objective)
xgb_cl = xgb.XGBClassifier(objective="binary:logistic")

# Exhaustive search over param_grid: 3-fold cross-validation, scored by ROC AUC,
# using all available cores
grid_cv = GridSearchCV(xgb_cl, param_grid, n_jobs=-1, cv=3, scoring="roc_auc")

# Tune on the training split only, so X_test / y_test stay unseen until the
# final evaluation; fitting on the full dataset would let the held-out rows
# influence the chosen hyperparameters.
_ = grid_cv.fit(X_train, y_train)

In [None]:
# Best ROC AUC score reported by the first grid search (grid_cv)
grid_cv.best_score_

In [None]:
# Best parameter combination found by the first grid search; used to pick the
# value ranges for the next search
grid_cv.best_params_

In [None]:
# Redefine the search space for the second grid search (updates param_grid in place)
param_grid.update(
    {
        # Parameters held at a single value
        "subsample": [0.8],
        "colsample_bytree": [0.5],
        "reg_lambda": [0],
        # Parameters searched over a new range of values
        "scale_pos_weight": [.25, .5, .75, 1],
        "gamma": [3, 5, 7],
        "max_depth": [9, 15, 20],
        "learning_rate": [0.1, 0.3, 0.6, 1],
    }
)

In [None]:
# Second grid search over the updated param_grid: 3-fold cross-validation scored
# by ROC AUC, run on all cores to match the first search.
# NOTE(review): the "Final Model" cell builds from grid_cv.best_params_, not from
# this search -- confirm that is intended.
grid_cv_2 = GridSearchCV(xgb_cl, param_grid, n_jobs=-1,
                         cv=3, scoring="roc_auc")

# Tune on the training split only so the held-out test rows do not influence
# the selected hyperparameters
_ = grid_cv_2.fit(X_train, y_train)

In [None]:
# Best ROC AUC score reported by the second grid search (grid_cv_2)
grid_cv_2.best_score_

In [None]:
# Final model: XGBoost classifier built from the best parameters of the first
# grid search, with the binary logistic objective.
# (The keyword was previously misspelled as `bjective`, so the objective was never
# set explicitly and an unknown parameter was passed to XGBoost instead.)
final_cl = xgb.XGBClassifier(
    **grid_cv.best_params_,
    objective="binary:logistic"
)

# Train on the training split
_ = final_cl.fit(X_train, y_train)

In [None]:
# Accuracy of the final model on the held-out test split.
# Author's note: not much better than the original, simple model.
# final_cl is already fitted in the "Final Model" cell, so it is not retrained here.

# Predict
preds = final_cl.predict(X_test)

# Score
accuracy_score(y_test, preds)

In [None]:
# Report accuracy, precision and recall of the test-set predictions
metric_table = (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
)
for metric_label, metric_fn in metric_table:
    print(metric_label, metric_fn(y_test, preds))

In [None]:
# ROC curve for the final model on the held-out test split.
# ROC / AUC need a continuous score to sweep thresholds over; hard 0/1 class
# predictions collapse the curve to a single operating point and distort the AUC.
# Use the predicted probability of the positive class (column 1) instead.
test_scores = final_cl.predict_proba(X_test)[:, 1]

fpr, tpr, _ = metrics.roc_curve(y_test, test_scores)
auc = metrics.roc_auc_score(y_test, test_scores)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="data 1, auc="+str(auc))
# Diagonal reference line: performance of a random classifier
ax.plot([0, 1], [0, 1], linestyle="--", color="grey")
ax.set_xlabel("False positive rate")
ax.set_ylabel("True positive rate")
ax.set_title("ROC curve (test split)")
ax.legend(loc=4)
plt.show()