# Importing packages and libraries
https://machinelearningmastery.com/data-preparation-gradient-boosting-xgboost-python/

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

# Reading file and tidying

In [2]:
df = pd.read_csv("../Data/BankChurners.csv")
df.drop(columns=["CLIENTNUM", "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1", "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2"], axis=1, inplace=True)

# tidy up and standardise column naming
tidied_cols = []
for col_name in df.columns:
    col_name = col_name.lower()
    if '_ct' in col_name:
        col_name = col_name.replace('_ct', '_count')
    if '_chng' in col_name:
        col_name = col_name.replace('_chng', '_change')
    tidied_cols.append( col_name )

df.columns = tidied_cols.copy()

numerical = list(df.describe().columns)

categorical = [i for i in df.columns if i not in numerical and i != "attrition_flag"]

# Create train test split

In [3]:
y = df[["attrition_flag"]]
x = df.drop("attrition_flag", axis=1)
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   customer_age              10127 non-null  int64  
 1   gender                    10127 non-null  object 
 2   dependent_count           10127 non-null  int64  
 3   education_level           10127 non-null  object 
 4   marital_status            10127 non-null  object 
 5   income_category           10127 non-null  object 
 6   card_category             10127 non-null  object 
 7   months_on_book            10127 non-null  int64  
 8   total_relationship_count  10127 non-null  int64  
 9   months_inactive_12_mon    10127 non-null  int64  
 10  contacts_count_12_mon     10127 non-null  int64  
 11  credit_limit              10127 non-null  float64
 12  total_revolving_bal       10127 non-null  int64  
 13  avg_open_to_buy           10127 non-null  float64
 14  total_

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 2021)

In [5]:
def impute_unknown(df_x):
    x = df_x.copy()

    marital_status_mode = x["marital_status"].mode()[0]
    x["marital_status"] = x["marital_status"].replace("Unknown", marital_status_mode)
    education_level_mode = x["education_level"].mode()[0]
    x["education_level"] = x["education_level"].replace("Unknown", education_level_mode)
    income_category_mode = x["income_category"].mode()[0]
    x["income_category"] = x["income_category"].replace("Unknown", income_category_mode)
    
    return x

# Preparing for modelling

In [6]:
def data_preprocessing(df_x, df_y):
    x = df_x.copy()
    y = df_y.copy()
    
    label_enc = LabelEncoder()
    y["attrition_flag"] = label_enc.fit_transform(y["attrition_flag"])
    x["gender"] = label_enc.fit_transform(x["gender"])
    
    onehot_enc = OneHotEncoder(handle_unknown='ignore', sparse=False)

    values = onehot_enc.fit_transform(x[["marital_status"]])
    labels = np.array(["divorced", "married", "single"]).ravel()
    marital_status_df = pd.DataFrame(values, columns=labels)

    x.reset_index(drop=True, inplace=True)
    marital_status_df.reset_index(drop=True, inplace=True)
    x = pd.concat([x, marital_status_df], axis=1)
    x.drop("marital_status", axis=1, inplace=True)
    
    edu_level_mapper = {"Doctorate": 1, "Post-Graduate": 2, "Graduate": 3, "College": 4, "High School": 5, "Uneducated": 6}
    x["education_level"] = x["education_level"].replace(edu_level_mapper)

    income_cat_mapper = {"$120K +": 1, "$80K - $120K":2, "$60K - $80K":3, "$40K - $60K": 4, "Less than $40K": 5}
    x["income_category"] = x["income_category"].replace(income_cat_mapper)

    card_cat_mapper = {"Platinum":1, "Gold":2, "Silver":3, "Blue": 4}
    x["card_category"] = x["card_category"].replace(card_cat_mapper)
    
    # Transformation and scaling
    skewed = ["credit_limit", "total_amt_change_q4_q1", "total_trans_amt", "total_count_change_q4_q1"]

    for skewed_col in skewed:
        x[skewed_col] = np.where(x[skewed_col] > 0 , np.log(x[skewed_col]), 0)
        # data_log[skewed_col] = np.log(data_log[skewed_col].mask(data_log[skewed_col] <=0)).fillna(0)

    scaler = MinMaxScaler()
    x[numerical] = scaler.fit_transform(x[numerical]) 
    
    return x, y

### Execute preprocessing steps

In [7]:
# Data cleaning - imputing unknown
x_train = impute_unknown(x_train)
x_test = impute_unknown(x_test)

# Dropping correlated features
df = df.drop(columns=["customer_age", "avg_open_to_buy", "total_trans_count"])

# Feature engineering and feature scaling
x_train, y_train = data_preprocessing(x_train, y_train)
x_test, y_test = data_preprocessing(x_test, y_test)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


# Running Model

In [8]:
def run_variation_model(x_train, y_train, x_test, y_test, variation):
    if "pca" in variation:
        pca = PCA(0.9, random_state=2021)
        x_train = pca.fit_transform(x_train)
        x_test = pca.fit_transform(x_test)
        
#     https://medium.com/analytics-vidhya/categorical-feature-selection-using-chi-squared-test-e4c0d0af6b7e
#     https://towardsdatascience.com/using-the-chi-squared-test-for-feature-selection-with-implementation-b15a4dad93f1
    if "chi_square" in variation:
        chi_scores = chi2(x_train, y_train)
        p_values = pd.Series(chi_scores[1], index = x_train.columns)
        p_values.sort_values(ascending = False , inplace = True)
#         p_values.plot.bar()
        x_train = x_train.drop(["card_category", "months_on_book", "divorced", "total_amt_change_q4_q1", "dependent_count", "single", "income_category", "married", "education_level", "credit_limit", "gender"], axis=1)
        x_test = x_test.drop(["card_category", "months_on_book", "divorced", "total_amt_change_q4_q1", "dependent_count", "single", "income_category", "married", "education_level", "credit_limit", "gender"], axis=1)
        
    if "smote" in variation:
        oversampler = SMOTE(random_state=2021)
        x_train, y_train = oversampler.fit_resample(x_train, y_train)
        
        
    xgb = XGBClassifier(use_label_encoder=False,
                    random_state=2021)
    xgb.fit(x_train, y_train, eval_metric='logloss')
    y_pred = xgb.predict(x_test)
    
    print(f"-------------------------TEST SCORES for {variation}-----------------------")
    print(f"Recall: {recall_score(y_test, y_pred)}")
    print(f"Precision: {precision_score(y_test, y_pred)}")
    print(f"F2-Score: {fbeta_score(y_test, y_pred, beta=2)}")
    print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
    print(f"AUC Score: {roc_auc_score(y_test, y_pred)}")
    print()

#### Applying xgboost model

In [15]:
# # # https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn


variations = ["base", "smote", "chi_square", "pca", "smote, chi_square", "smote, pca"]
# variations = ["chi_square"]
for variation in variations:
    run_variation_model(x_train, y_train, x_test, y_test, variation)

# xgb.fit(x_train, y_train, eval_metric='logloss')
# y_pred = xgb.predict(x_test)
# cm = confusion_matrix(y_test, y_pred)
# print(cm)
# TN | FP
# FN| TP

-------------------------TEST SCORES for base-----------------------
Recall: 0.9835390946502057
Precision: 0.9738067520372526
F2-Score: 0.9815770945787373
Accuracy score: 0.9639684106614018
AUC Score: 0.9225387780943337

-------------------------TEST SCORES for smote-----------------------
Recall: 0.9794238683127572
Precision: 0.9765533411488863
F2-Score: 0.9788484136310223
Accuracy score: 0.9629812438302073
AUC Score: 0.928173472617917

-------------------------TEST SCORES for chi_square-----------------------
Recall: 0.9776601998824221
Precision: 0.9747948417350527
F2-Score: 0.9770857814336076
Accuracy score: 0.9600197433366239
AUC Score: 0.922676253787365

-------------------------TEST SCORES for pca-----------------------
Recall: 0.9394473838918284
Precision: 0.8550026752273944
F2-Score: 0.9212498558745531
Accuracy score: 0.8153998025666338
AUC Score: 0.5528006150228373

-------------------------TEST SCORES for smote, chi_square-----------------------
Recall: 0.9682539682539683
Pre

In [10]:
# # https://towardsdatascience.com/doing-xgboost-hyper-parameter-tuning-the-smart-way-part-1-of-2-f6d255a45dde
# # {"learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
# #  "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
# #  "min_child_weight" : [ 1, 3, 5, 7 ], Minimum sum of instance weight(hessian) needed in a child.
# #  "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ], Minimum loss reduction required to make a further partition on a leaf node of the tree.
# #  "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ] } Subsample ratio of columns when constructing each tree.
# # GridSearch, Coordinate Descent

# model = XGBClassifier(n_jobs=-1, random_state=0)

# param_dist = {
#     "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
#     "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
#     'min_child_weight': [1, 3, 5, 7],
#     'gamma': [0.0, 0.1, 0.2 , 0.3, 0.4],
#     'colsample_bytree': [0.3, 0.4, 0.5 , 0.7],
# }

# grid_search = GridSearchCV(model, param_dist, n_jobs=-1, scoring="recall", error_score='raise')

# grid_search.fit(x_train, y_train)
# grid_search.best_params_

In [11]:
# gs_best_params = grid_search.best_estimator_
# print(gs_best_params)

# y_pred = gs_best_params.predict(x_test)
# false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)

# print(classification_report(y_test, y_pred))

# print("-------------------------TEST SCORES-----------------------")
# print("AUC:", auc(false_positive_rate, true_positive_rate))
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Precision:", precision_score(y_test, y_pred))
# print("Recall:", recall_score(y_test, y_pred, average='macro'))
# print("f1_score:", f1_score(y_test, y_pred, average='macro'))