In [None]:
pip install odfpy

In [None]:
pip install --upgrade xgboost

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from xgboost import XGBClassifier

In [None]:
# Load the bank-marketing spreadsheet (.ods) through the odf engine.
data = pd.read_excel(
    io="bank-additional-full(1).ods",
    engine="odf",
)

In [None]:
# One histogram per numeric column, 50 bins each, on a 20x15 inch canvas.
data.hist(figsize=(20, 15), bins=50)
plt.show()

In [None]:
# Separate the target variable 'y' from the feature columns:
# pop() returns the column and removes it from `data` in one step.
data_target = data.pop('y')

In [None]:
# Class balance of the target: number of rows per label in 'y'.
data_target.value_counts()

no     36548
yes     4640
Name: y, dtype: int64

In [None]:
# Copy the categorical columns into 'cat_data' so they can be encoded
# separately from the rest of the frame.
cat_data = data[[
    'Job',
    'Marital',
    'Education',
    'Default(credit)',
    'Housing(loan)',
    'Loan',
    'Contact',
    'Month',
    'day_of_week',
    'poutcome',
]].copy()

# sklearn's OrdinalEncoder, referred to as 'encoder' from here on.
encoder = OrdinalEncoder()

In [None]:
# Fit the encoder on the categorical columns, then transform them to
# ordinal codes; the result comes back as a NumPy array.
cat_data_encoded = encoder.fit(cat_data).transform(cat_data)
print(type(cat_data_encoded))

<class 'numpy.ndarray'>


In [None]:
# Categories learned by the encoder, one array per categorical column
# (in the column order of `cat_data`). Useful for mapping the ordinal
# codes back to their original labels later on.
encoder.categories_

In [None]:
# Wrap the encoded NumPy array in a DataFrame so it can be merged back
# into `data`, and preview the first rows.
cat_data_encoded = pd.DataFrame(data=cat_data_encoded)
cat_data_encoded.head(5)

In [None]:
# Restore the column names: the encoded frame has the same columns, in
# the same order, as the `cat_data` frame it was produced from.
cat_data_encoded.columns = cat_data.columns.tolist()
cat_data_encoded.head(5)

In [None]:
# Remove the raw categorical columns; their encoded versions are added
# back to `data` in a later cell.
data = data.drop(columns=list(cat_data.columns))

In [None]:
# Row count of the encoded frame (should match `data` before merging).
cat_data_encoded.shape[0]

In [None]:
# Row count of the remaining feature frame.
data.shape[0]

In [None]:
# Merge the encoded attributes back into the complete data set.
# Raw values are assigned (not the Series) so rows are matched by
# position rather than by index label.
for column_name, encoded_column in cat_data_encoded.items():
    data[column_name] = encoded_column.values

In [None]:
# Inspect the last 30 rows of the merged frame.
data.iloc[-30:]

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
data_train, data_test, target_train, target_test = train_test_split(
    data,
    data_target,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Number of rows in the training split.
data_train.shape[0]

32950

In [None]:
# First model using XGBoost
model_xg = XGBClassifier()
model_xg.fit(data_train, target_train)

In [None]:
prediction_xg = model_xg.predict(data_test)

In [None]:
# Share of test rows the XGBoost model labelled correctly.
accuracy_xg = accuracy_score(y_true=target_test, y_pred=prediction_xg)
accuracy_xg

0.9147851420247632

In [None]:
# Second model: a decision tree, seeded for reproducibility.
# fit() returns the estimator itself, so construction and training chain.
model_dt = DecisionTreeClassifier(random_state=42).fit(data_train, target_train)
model_dt


In [None]:
# Predict on the held-out rows and score the decision tree.
prediction_dt = model_dt.predict(data_test)
accuracy_dt = accuracy_score(y_true=target_test, y_pred=prediction_dt)
accuracy_dt

0.8909929594561787

In [None]:
# Third model: logistic regression, seeded for reproducibility.
# fit() returns the estimator itself, so construction and training chain.
model_lr = LogisticRegression(random_state=42).fit(data_train, target_train)
model_lr

In [None]:
# Predict on the held-out rows and score the logistic regression.
prediction_lr = model_lr.predict(data_test)
accuracy_lr = accuracy_score(y_true=target_test, y_pred=prediction_lr)
accuracy_lr

0.9090798737557659

In [None]:
# Confusion matrix on the test split for each of the three models.
con_matrix_xg = confusion_matrix(y_true=target_test, y_pred=prediction_xg)
con_matrix_dt = confusion_matrix(y_true=target_test, y_pred=prediction_dt)
con_matrix_lr = confusion_matrix(y_true=target_test, y_pred=prediction_lr)

In [None]:
# Flatten each 2x2 confusion matrix (computed in the previous cell) into
# its four cells: true negatives, false positives, false negatives,
# true positives.
tn_dt, fp_dt, fn_dt, tp_dt = con_matrix_dt.ravel()
tn_xg, fp_xg, fn_xg, tp_xg = con_matrix_xg.ravel()
tn_lr, fp_lr, fn_lr, tp_lr = con_matrix_lr.ravel()

In [None]:
# Confusion matrix of the XGBoost model on the test split.
con_matrix_xg

array([[7018,  285],
       [ 417,  518]])

In [None]:
# Confusion matrix of the decision tree model on the test split.
con_matrix_dt

array([[6857,  446],
       [ 452,  483]])

In [None]:
# Confusion matrix of the logistic regression model on the test split.
con_matrix_lr

array([[7106,  197],
       [ 552,  383]])

In [None]:
# Resampling: put the training features and their target side by side so
# rows can be filtered by class. Only the training split is used.
data_resample = pd.concat(
    [data_train, target_train],
    axis=1,
)

In [None]:
# Split the training rows by class label.
is_yes = data_resample['y'] == 'yes'
is_no = data_resample['y'] == 'no'
deposit_yes = data_resample[is_yes]
deposit_no = data_resample[is_no]

In [None]:
# Size of the minority ('yes') class in the training split.
deposit_yes['y'].value_counts()

yes    3705
Name: y, dtype: int64

In [None]:
# Upsample the minority class ('yes') with replacement until it matches
# the size of the majority class ('no'). `resample` (sklearn.utils) is
# imported with the other dependencies in the import cell at the top.
deposit_yes_upsampled = resample(
    deposit_yes,
    replace=True,               # sample with replacement
    n_samples=len(deposit_no),  # match number in majority class
    random_state=42,            # reproducible results
)

In [None]:
# Confirm both classes now have the same number of rows.
majority_counts = deposit_no['y'].value_counts()
minority_counts = deposit_yes_upsampled['y'].value_counts()
print(majority_counts, minority_counts)

no    29245
Name: y, dtype: int64 yes    29245
Name: y, dtype: int64


In [None]:
# Stack the majority rows and the upsampled minority rows into one
# balanced training frame.
balanced_parts = [deposit_no, deposit_yes_upsampled]
upsampled_final = pd.concat(balanced_parts)

In [None]:
# Class balance of the combined, upsampled training frame.
upsampled_final['y'].value_counts()

yes    29245
no     29245
Name: y, dtype: int64

In [None]:
# Separate the balanced frame into features and target again.
upsampled_x = upsampled_final.drop(columns='y')
upsampled_y = upsampled_final['y']

In [None]:
#  model using XGBoost on RESAMPLED DATA
model_xg_resample = XGBClassifier()
model_xg_resample.fit(upsampled_x, upsampled_y)

In [None]:
prediction_xg_resample = model_xg_resample.predict(data_test)

In [None]:
# Share of test rows the resample-trained XGBoost model labelled correctly.
accuracy_xg_resample = accuracy_score(
    y_true=target_test,
    y_pred=prediction_xg_resample,
)
accuracy_xg_resample

0.8820101966496723

In [None]:
# Flatten the 2x2 confusion matrix of the resample-trained model into
# true negatives, false positives, false negatives, true positives.
tn_xg_r, fp_xg_r, fn_xg_r, tp_xg_r = confusion_matrix(
    y_true=target_test,
    y_pred=prediction_xg_resample,
).ravel()

In [None]:
# Print the four confusion-matrix cells in order: TN, FP, FN, TP.
resample_cells = (tn_xg_r, fp_xg_r, fn_xg_r, tp_xg_r)
print(*resample_cells)

6484 819 153 782
