# Importing packages and libraries
https://machinelearningmastery.com/data-preparation-gradient-boosting-xgboost-python/

In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, fbeta_score, accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV

# Reading file and tidying

In [10]:
df = pd.read_csv("../Data/BankChurners.csv")
df.drop(columns=["CLIENTNUM", "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1", "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2"], axis=1, inplace=True)
df = df.drop(columns=["Customer_Age", "Avg_Open_To_Buy", "Total_Trans_Ct"])
df.columns = ['attrition_flag', 'gender', 'dependent_count', 'education_level', 
              'marital_status', 'income_category', 'card_category', 'months_on_book', 
              'total_relationship_count', 'months_inactive_12_month', 'contacts_count_12_month', 
              'credit_limit', 'total_revolving_bal', 'total_amt_change_q4_q1', 
              'total_trans_amt','total_count_change_q4_q1', 'avg_utilization_ratio']

numerical = ['dependent_count', 'months_on_book', 
             'total_relationship_count', 'months_inactive_12_month',
             'contacts_count_12_month', 'credit_limit', 'total_revolving_bal',
             'total_amt_change_q4_q1', 'total_trans_amt',
             'total_count_change_q4_q1', 'avg_utilization_ratio']

# Train test split

In [11]:
y = df[["attrition_flag"]]
x = df.drop("attrition_flag", axis=1)
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 16 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   gender                    10127 non-null  object 
 1   dependent_count           10127 non-null  int64  
 2   education_level           10127 non-null  object 
 3   marital_status            10127 non-null  object 
 4   income_category           10127 non-null  object 
 5   card_category             10127 non-null  object 
 6   months_on_book            10127 non-null  int64  
 7   total_relationship_count  10127 non-null  int64  
 8   months_inactive_12_month  10127 non-null  int64  
 9   contacts_count_12_month   10127 non-null  int64  
 10  credit_limit              10127 non-null  float64
 11  total_revolving_bal       10127 non-null  int64  
 12  total_amt_change_q4_q1    10127 non-null  float64
 13  total_trans_amt           10127 non-null  int64  
 14  total_

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 2021)

In [16]:
def impute_unknown(df_x):
    x = df_x.copy()

    marital_status_mode = x["marital_status"].mode()[0]
    x["marital_status"] = x["marital_status"].replace("Unknown", marital_status_mode)
    education_level_mode = x["education_level"].mode()[0]
    x["education_level"] = x["education_level"].replace("Unknown", education_level_mode)
    income_category_mode = x["income_category"].mode()[0]
    x["income_category"] = x["income_category"].replace("Unknown", income_category_mode)
    
    return x
    
x_train = impute_unknown(x_train)
x_test = impute_unknown(x_test)

# Preparing for modelling

In [17]:
def data_preprocessing(df_x, df_y):
    x = df_x.copy()
    y = df_y.copy()
    
    label_enc = LabelEncoder()
    y["attrition_flag"] = label_enc.fit_transform(y["attrition_flag"])
    x["gender"] = label_enc.fit_transform(x["gender"])
    
    onehot_enc = OneHotEncoder(handle_unknown='ignore', sparse=False)

    values = onehot_enc.fit_transform(x[["marital_status"]])
    labels = np.array(["divorced", "married", "single"]).ravel()
    marital_status_df = pd.DataFrame(values, columns=labels)

    x.reset_index(drop=True, inplace=True)
    marital_status_df.reset_index(drop=True, inplace=True)
    x = pd.concat([x, marital_status_df], axis=1)
    x.drop("marital_status", axis=1, inplace=True)
    
    edu_level_mapper = {"Doctorate": 1, "Post-Graduate": 2, "Graduate": 3, "College": 4, "High School": 5, "Uneducated": 6}
    x["education_level"] = x["education_level"].replace(edu_level_mapper)

    income_cat_mapper = {"$120K +": 1, "$80K - $120K":2, "$60K - $80K":3, "$40K - $60K": 4, "Less than $40K": 5}
    x["income_category"] = x["income_category"].replace(income_cat_mapper)

    card_cat_mapper = {"Platinum":1, "Gold":2, "Silver":3, "Blue": 4}
    x["card_category"] = x["card_category"].replace(card_cat_mapper)
    
    # Transformation and scaling
    skewed = ["credit_limit", "total_amt_change_q4_q1", "total_trans_amt", "total_count_change_q4_q1"]

    for skewed_col in skewed:
        x[skewed_col] = np.where(x[skewed_col] > 0 , np.log(x[skewed_col]), 0)
        # data_log[skewed_col] = np.log(data_log[skewed_col].mask(data_log[skewed_col] <=0)).fillna(0)

    scaler = MinMaxScaler()
    x[numerical] = scaler.fit_transform(x[numerical]) 
    
    return x, y

In [18]:
x_train, y_train = data_preprocessing(x_train, y_train)
x_test, y_test = data_preprocessing(x_test, y_test)

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [8]:
def run_variation_model(x_train, y_train, x_test, y_test, variation):
    if "pca" in variation:
        pca = PCA(0.9, random_state=2021)
        x_train = pca.fit_transform(x_train)
        x_test = pca.fit_transform(x_test)       
        
    
    
    print(f"-------------------------TEST SCORES for {variation}-----------------------")
    print(f"Recall: {recall_score(y_test, y_pred)}")
    print(f"Precision: {precision_score(y_test, y_pred)}")
    print(f"F2-Score: {fbeta_score(y_test, y_pred, beta=2)}")
    print(f"Accuracy score: {accuracy_score(y_test, y_pred)}")
    print(f"AUC Score: {roc_auc_score(y_test, y_pred)}")
    print()

#### Applying xgboost model

In [9]:
# # # https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn
xgb = XGBClassifier(use_label_encoder=False,
                    random_state=2021)

oversampler = SMOTE(random_state=2021)
x_train, y_train = oversampler.fit_resample(x_train, y_train)

xgb.fit(x_train, y_train, eval_metric='logloss')
    y_pred = xgb.predict(x_test)
# xgb.fit(x_train, y_train, eval_metric='logloss')
# y_pred = xgb.predict(x_test)
# cm = confusion_matrix(y_test, y_pred)
# print(cm)
# TN | FP
# FN| TP

-------------------------TEST SCORES for base-----------------------
Recall: 0.9782480893592005
Precision: 0.9635205558772437
F2-Score: 0.9752666744813034
Accuracy score: 0.9506416584402764
AUC Score: 0.8922009677565234

-------------------------TEST SCORES for smote-----------------------
Recall: 0.975896531452087
Precision: 0.9606481481481481
F2-Score: 0.9728082512892638
Accuracy score: 0.9461994076999013
AUC Score: 0.8833328811106589

-------------------------TEST SCORES for chi_square-----------------------
Recall: 0.9711934156378601
Precision: 0.9610238510762071
F2-Score: 0.9691423207790684
Accuracy score: 0.9427443237907206
AUC Score: 0.882519784742007

-------------------------TEST SCORES for pca-----------------------
Recall: 0.9735449735449735
Precision: 0.8474923234390993
F2-Score: 0.945421329070564
Accuracy score: 0.8307008884501481
AUC Score: 0.5283109483109483

-------------------------TEST SCORES for smote, chi_square-----------------------
Recall: 0.9512051734273956
Prec

In [10]:
# # https://towardsdatascience.com/doing-xgboost-hyper-parameter-tuning-the-smart-way-part-1-of-2-f6d255a45dde
# # {"learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
# #  "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
# #  "min_child_weight" : [ 1, 3, 5, 7 ], Minimum sum of instance weight(hessian) needed in a child.
# #  "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ], Minimum loss reduction required to make a further partition on a leaf node of the tree.
# #  "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ] } Subsample ratio of columns when constructing each tree.
# # GridSearch, Coordinate Descent

# model = XGBClassifier(n_jobs=-1, random_state=0)

# param_dist = {
#     "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
#     "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
#     'min_child_weight': [1, 3, 5, 7],
#     'gamma': [0.0, 0.1, 0.2 , 0.3, 0.4],
#     'colsample_bytree': [0.3, 0.4, 0.5 , 0.7],
# }

# grid_search = GridSearchCV(model, param_dist, n_jobs=-1, scoring="recall", error_score='raise')

# grid_search.fit(x_train, y_train)
# grid_search.best_params_

In [11]:
# gs_best_params = grid_search.best_estimator_
# print(gs_best_params)

# y_pred = gs_best_params.predict(x_test)
# false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_pred)

# print(classification_report(y_test, y_pred))

# print("-------------------------TEST SCORES-----------------------")
# print("AUC:", auc(false_positive_rate, true_positive_rate))
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print("Precision:", precision_score(y_test, y_pred))
# print("Recall:", recall_score(y_test, y_pred, average='macro'))
# print("f1_score:", f1_score(y_test, y_pred, average='macro'))