# Import libraries

In [4]:
# Packages
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
warnings.filterwarnings('ignore')

# Constants
alpha = 0.05
data_file = '../data/bank-train.csv'

# Data preprocessing utilities

In [5]:
from sklearn.preprocessing import LabelEncoder

job_labelenc = LabelEncoder()
marital_labelenc = LabelEncoder()
contact_labelenc = LabelEncoder()
poutcome_labelenc = LabelEncoder()

def preproc_education(x):
    if(x == 'unknown') : return 0
    elif(x == 'primary') : return 1
    elif(x == 'secondary') : return 2
    elif(x == 'tertiary') : return 3
    
def preproc_month(x):
    if(x == 'jan') : return 1
    elif(x == 'feb') : return 2
    elif(x == 'mar') : return 3
    elif(x == 'apr') : return 4
    elif(x == 'may') : return 5
    elif(x == 'jun') : return 6
    elif(x == 'jul') : return 7
    elif(x == 'aug') : return 8
    elif(x == 'sep') : return 9
    elif(x == 'oct') : return 10
    elif(x == 'nov') : return 11
    elif(x == 'dec') : return 12
    
def preproc_binary(x):
    if(x == 'no') : return 0
    elif(x == 'yes') : return 1

# XGBoost

### Search for the best parameters using GridSearchCV

Default validation for GridSearchCV is 5-fold cross validation, hence we will keep it at default state

#### Best parameters are scored based on accuracy

In [6]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, matthews_corrcoef
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split

# list down the parameters to search
max_depth = [2,3,4,5]
min_child_weight = [1,2,3]
learning_rate = [0.1,0.15,0.2]
eta = [0.2,0.3]


param_grid = dict(
    learning_rate=learning_rate,
    min_child_weight=min_child_weight,
    max_depth=max_depth,
    eta = eta
)

# Oversampled data should go in the below cell -> df

In [7]:
#prepare the train data for modelling

# Re-read
df = pd.read_csv(data_file)

# Rename last column|
df = df.rename(columns={'y' : 'subscription'})

# ALl preprocessing steps
df['job'] = job_labelenc.fit_transform(df['job'])
df['marital'] = marital_labelenc.fit_transform(df['marital'])
df['contact'] = contact_labelenc.fit_transform(df['contact'])
df['poutcome'] = poutcome_labelenc.fit_transform(df['poutcome'])
df['education'] = df['education'].apply(preproc_education)
df['month'] = df['month'].apply(preproc_month)
df['default'] = df['default'].apply(preproc_binary)
df['housing'] = df['housing'].apply(preproc_binary)
df['loan'] = df['loan'].apply(preproc_binary)


target_col = 'subscription'
feat_cols = [col for col in df.columns if col != 'subscription']

df['subscription'] = df['subscription'].replace({'yes': 1, 'no': 0})

y = df[target_col]
X = df.drop(columns = [target_col, "Unnamed: 0",'duration'])

In [8]:
#train the data
#scoring base on accuracy
xgb_model = XGBClassifier(random_state=1, verbosity=1, objective ="binary:logistic",
                         tree_method = "gpu_hist", device ="cuda")

# Start validation
grid_search = GridSearchCV(estimator=xgb_model,
                           param_grid=param_grid,
                           scoring='accuracy'
                           )

best_model = grid_search.fit(X, y)

#print the best parameters
print('Optimum parameters: ', best_model.best_params_)
print('Accuracy: ', best_model.score(X, y))

Optimum parameters:  {'eta': 0.2, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 2}
Accuracy:  0.9032570227825703


In [9]:
# Finalise our model using the best parameters found
xgb_model = XGBClassifier(random_state=1, verbosity=1, objective="binary:logistic", 
                                   learning_rate=0.1,max_depth=5,min_child_weight=2,
                                   eta=0.2,tree_method = "gpu_hist", device ="cuda")
xgb_model.fit(X,y)

### Test our prediction against the ground truth

In [10]:
#prepare test data 
test_file = '../data/bank-test.csv'

# Re-read
df_test = pd.read_csv(test_file)

# Rename last column|
df_test = df_test.rename(columns={'y' : 'subscription'})

# ALl preprocessing steps
df_test['job'] = job_labelenc.fit_transform(df_test['job'])
df_test['marital'] = marital_labelenc.fit_transform(df_test['marital'])
df_test['contact'] = contact_labelenc.fit_transform(df_test['contact'])
df_test['poutcome'] = poutcome_labelenc.fit_transform(df_test['poutcome'])
df_test['education'] = df_test['education'].apply(preproc_education)
df_test['month'] = df_test['month'].apply(preproc_month)
df_test['default'] = df_test['default'].apply(preproc_binary)
df_test['housing'] = df_test['housing'].apply(preproc_binary)
df_test['loan'] = df_test['loan'].apply(preproc_binary)

X_test = df_test.drop(columns = [target_col, "Unnamed: 0",'duration'])

In [11]:
# use our model to predict
pred = xgb_model.predict(X_test)
pred_dt = pd.DataFrame(pred)

In [12]:
# Convert ground truth values from "no" and "yes" to 0 and 1 (same format as our prediction)
ground_truth = df_test['subscription']
ground_truth_binary = [0 if val == "no" else 1 for val in ground_truth]

# Compute MCC
mcc_score = matthews_corrcoef(ground_truth_binary, pred_dt)
print("MCC Score:", mcc_score)

MCC Score: 0.3582787556836247


### Improving the model

In [13]:
#re-initialise the train data
df_train = df

y = df_train[target_col]

In [14]:
#check for the feature's importance
# we can drop those irrelevant ones and engineer more relevant ones
importances = xgb_model.feature_importances_
columns = X.columns
i = 0

while i < len(columns):
    print(f"The importance of feature {columns[i]} is {round(importances[i]*100, 2)} %.")
    i+=1

The importance of feature age is 3.83 %.
The importance of feature job is 1.23 %.
The importance of feature marital is 2.15 %.
The importance of feature education is 2.06 %.
The importance of feature default is 1.02 %.
The importance of feature balance is 1.91 %.
The importance of feature housing is 13.9 %.
The importance of feature loan is 6.8 %.
The importance of feature contact is 20.95 %.
The importance of feature day is 3.07 %.
The importance of feature month is 5.55 %.
The importance of feature campaign is 2.79 %.
The importance of feature pdays is 6.05 %.
The importance of feature previous is 2.12 %.
The importance of feature poutcome is 26.58 %.


In [15]:
#feature engineer
#total number of days in the year, instead of day and month

df_train['total days'] = df_train['month']*30 + df_train['day']

# lets drop the ones with lower importance
X_new = df_train.drop(columns = [target_col, "job","Unnamed: 0",'month','day','duration'])

# we note that there might be outliers
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()

# Scale the specific column to reduce impact of outliers

X_new['balance'] = scaler.fit_transform(X_new['balance'].values.reshape(-1, 1))
X_new['age'] = scaler.fit_transform(X_new['age'].values.reshape(-1, 1))

In [16]:
# list down the parameters to search
max_depth = [4,6,7]
min_child_weight = [2,3,4]
learning_rate = [0.06,0.08,0.1]
eta = [0.03,0.05,0.08]


param_grid = dict(
    learning_rate=learning_rate,
    min_child_weight=min_child_weight,
    max_depth=max_depth,
    eta = eta
)

In [17]:
#retrain the data
#scoring base on accuracy
xgb_model = XGBClassifier(random_state=1, verbosity=1, objective ="binary:logistic",
                         tree_method = "gpu_hist", device ="cuda")

# Start validation
grid_search = GridSearchCV(estimator=xgb_model,
                           param_grid=param_grid,
                           scoring='accuracy'
                           )

best_model = grid_search.fit(X_new, y)

#print the best parameters
print('Optimum parameters: ', best_model.best_params_)
print('Accuracy: ', best_model.score(X_new, y))

Optimum parameters:  {'eta': 0.03, 'learning_rate': 0.06, 'max_depth': 6, 'min_child_weight': 2}
Accuracy:  0.9042800265428003


In [18]:
# Finalise our model using the *new* best parameters found
new_xgb_model = XGBClassifier(random_state=1, verbosity=1, objective="binary:logistic", 
                                   learning_rate=0.06,max_depth=6,min_child_weight=2,
                                   eta=0.03,tree_method = "gpu_hist", device ="cuda")
new_xgb_model.fit(X_new,y)

In [19]:
#prepare test data again
test_file = '../data/bank-test.csv'

# Re-read
df_test = pd.read_csv(test_file)

# Rename last column|
df_test = df_test.rename(columns={'y' : 'subscription'})

# ALl preprocessing steps
df_test['job'] = job_labelenc.fit_transform(df_test['job'])
df_test['marital'] = marital_labelenc.fit_transform(df_test['marital'])
df_test['contact'] = contact_labelenc.fit_transform(df_test['contact'])
df_test['poutcome'] = poutcome_labelenc.fit_transform(df_test['poutcome'])
df_test['education'] = df_test['education'].apply(preproc_education)
df_test['month'] = df_test['month'].apply(preproc_month)
df_test['default'] = df_test['default'].apply(preproc_binary)
df_test['housing'] = df_test['housing'].apply(preproc_binary)
df_test['loan'] = df_test['loan'].apply(preproc_binary)

#feature engineer
df_test['total days'] = df_test['month']*30 + df_test['day']

X_test = df_test.drop(columns = [target_col, "Unnamed: 0","job",'month','day','duration'])

# we did scaling in this new training, so test must follow
X_test['balance'] = scaler.fit_transform(X_test['balance'].values.reshape(-1, 1))
X_test['age'] = scaler.fit_transform(X_test['age'].values.reshape(-1, 1))

In [20]:
# use our model to predict
pred_new = new_xgb_model.predict(X_test)
pred_dt = pd.DataFrame(pred_new)

In [21]:
# Convert ground truth values from "no" and "yes" to 0 and 1 (same format as our prediction)
ground_truth = df_test['subscription']
ground_truth_binary = [0 if val == "no" else 1 for val in ground_truth]

# Compute MCC
mcc_score = matthews_corrcoef(ground_truth_binary, pred_dt)
print("MCC Score:", mcc_score)

MCC Score: 0.34314749939868844


# Hyper-parameters tuning

In [22]:
%cd ..

D:\Repository\MH6151-project


In [23]:
!python .\modelling.py --model_name xgboost



--------------------------------------------------------------------------------------
Parameters set : {'booster': 'gbtree', 'max_depth': 5, 'eta': 0.2}
--------------------------------------------------------------------------------------
Cross valiation for xgboost
 -- Split #1, performance metrics mcc = 0.36, accuracy = 0.90, f1_score = 0.37
 -- Split #2, performance metrics mcc = 0.43, accuracy = 0.90, f1_score = 0.44
 -- Split #3, performance metrics mcc = 0.36, accuracy = 0.90, f1_score = 0.37
 -- Split #4, performance metrics mcc = 0.35, accuracy = 0.89, f1_score = 0.37
 -- Split #5, performance metrics mcc = 0.42, accuracy = 0.90, f1_score = 0.43
Performance metrics of model xgboost
 -- mcc : Mean=0.3845, STD=0.0327
 -- accuracy : Mean=0.8990, STD=0.0041
 -- f1_score : Mean=0.3951, STD=0.0330
--------------------------------------------------------------------------------------


--------------------------------------------------------------------------------------
Parameter

# Hyper-parameters tuning + oversampling

In [24]:
!python .\modelling.py --model_name xgboost --oversampling

Class distribution after resampling : Counter({0: 31969, 1: 15984})


--------------------------------------------------------------------------------------
Parameters set : {'booster': 'gbtree', 'max_depth': 5, 'eta': 0.2}
--------------------------------------------------------------------------------------
Cross valiation for xgboost
 -- Split #1, performance metrics mcc = 0.59, accuracy = 0.82, f1_score = 0.70
 -- Split #2, performance metrics mcc = 0.58, accuracy = 0.82, f1_score = 0.70
 -- Split #3, performance metrics mcc = 0.57, accuracy = 0.82, f1_score = 0.70
 -- Split #4, performance metrics mcc = 0.60, accuracy = 0.83, f1_score = 0.71
 -- Split #5, performance metrics mcc = 0.59, accuracy = 0.82, f1_score = 0.71
Performance metrics of model xgboost
 -- mcc : Mean=0.5859, STD=0.0075
 -- accuracy : Mean=0.8220, STD=0.0031
 -- f1_score : Mean=0.7036, STD=0.0060
--------------------------------------------------------------------------------------


----------------------------