# Importing libraries

In [None]:
import time

import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

# Reading data

In [None]:
# Load the training split from the CSV file that sits next to the notebook
# and display it as the cell output.
train_csv_path = "Portugese Bank Data - TRAIN.csv"
train_data = pd.read_csv(train_csv_path)
train_data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,no,-333,yes,no,cellular,30,jul,329,5,-1,0,unknown,no
4517,57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,9,may,153,1,-1,0,unknown,no
4518,57,technician,married,secondary,no,295,no,no,cellular,19,aug,151,11,-1,0,unknown,no
4519,28,blue-collar,married,secondary,no,1137,no,no,cellular,6,feb,129,4,211,3,other,no


In [None]:
# Load the held-out testing split from the CSV file that sits next to the
# notebook and display it as the cell output.
test_csv_path = "Portugese Bank Data - TEST.csv"
test_data = pd.read_csv(test_csv_path)
test_data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


# Data Exploration

In [None]:
# Print the column names, non-null counts and dtypes of the training frame
# to check for missing values and to see which columns are categorical.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric
# training columns.
train_data.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.170095,1422.657819,15.915284,263.961292,2.79363,39.766645,0.542579
std,10.576211,3009.638142,8.247667,259.856633,3.109807,100.121124,1.693562
min,19.0,-3313.0,1.0,4.0,1.0,-1.0,0.0
25%,33.0,69.0,9.0,104.0,1.0,-1.0,0.0
50%,39.0,444.0,16.0,185.0,2.0,-1.0,0.0
75%,49.0,1480.0,21.0,329.0,3.0,-1.0,0.0
max,87.0,71188.0,31.0,3025.0,50.0,871.0,25.0


In [None]:
# class distribution of the training data
# (counts of each label in the target column `y`; used to judge class imbalance)
train_data['y'].value_counts()

no     4000
yes     521
Name: y, dtype: int64

# Data Preprocessing

In [None]:
# Separate the feature columns from the target column `y` for both the
# training and the testing frames. The source frames are left untouched.
target_col = 'y'

X_train = train_data.drop(columns=target_col)
y_train = train_data[target_col]

X_test = test_data.drop(columns=target_col)
y_test = test_data[target_col]

In [None]:
# One-hot encode the categorical (object-dtype) columns of the training features.
#
# The cell starts again from the untouched `train_data` frame instead of mutating
# `X_train` in place, so it can be re-run safely and always produces the same result.
X_train_raw = train_data.drop('y', axis=1)
categorical_columns = [col for col in X_train_raw.columns if X_train_raw[col].dtype == 'O']

# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old name was
# removed in 1.4; fall back to the old keyword only on releases that predate the rename.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Dense indicator columns, named by the encoder and aligned on the original row index.
Xcat = pd.DataFrame(
    ohe.fit_transform(X_train_raw[categorical_columns]),
    columns=ohe.get_feature_names_out(),
    index=X_train_raw.index,
)

# Numeric columns first, then the indicator columns (same layout as before).
X_train = pd.concat([X_train_raw.drop(columns=categorical_columns), Xcat], axis=1)
X_train



Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,30,1787,19,79,1,-1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,33,4789,11,220,1,339,4,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,35,1350,16,185,1,330,1,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,30,1476,3,199,4,-1,0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,59,0,5,226,1,-1,0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,-333,30,329,5,-1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4517,57,-3313,9,153,1,-1,0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4518,57,295,19,151,11,-1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4519,28,1137,6,129,4,211,3,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
# One-hot encode the testing features with the encoder that was fitted on the
# training data (`ohe`, using the training `categorical_columns`).
#
# Fitting a second encoder on the test set would derive the indicator columns from
# the test categories: the columns could differ in number or order from what the
# models were trained on, and test information would shape the preprocessing.
# Reusing the fitted encoder guarantees identical columns; categories unseen during
# training are encoded as all zeros because the encoder uses handle_unknown='ignore'.
X_test_raw = test_data.drop('y', axis=1)

Xcat = pd.DataFrame(
    ohe.transform(X_test_raw[categorical_columns]),
    columns=ohe.get_feature_names_out(),
    index=X_test_raw.index,
)

# Numeric columns first, then the indicator columns (same layout as the training frame).
X_test = pd.concat([X_test_raw.drop(columns=categorical_columns), Xcat], axis=1)
X_test



Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,2143,5,261,1,-1,0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,44,29,5,151,1,-1,0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,33,2,5,76,1,-1,0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,47,1506,5,92,1,-1,0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,33,1,5,198,1,-1,0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,825,17,977,3,-1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
45207,71,1729,17,456,2,-1,0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
45208,72,5715,17,1127,5,184,3,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
45209,57,668,17,508,4,-1,0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Balance the training set by oversampling the minority class with SMOTE.
# The seed is fixed so the synthetic rows are the same on every run.
#
# NOTE(review): SMOTE interpolates every column, so the synthetic rows carry
# fractional values in the one-hot indicator columns (visible in the displayed
# tail of the frame). A categorical-aware sampler such as SMOTENC applied to the
# un-encoded frame may be more appropriate - confirm before relying on these rows.
sm = SMOTE(random_state=19)

resampled_features, resampled_target = sm.fit_resample(X=X_train, y=y_train)
X_resampled, y_resampled = resampled_features, resampled_target
X_resampled

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,30,1787,19,79,1,-1,0,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,1.000000,0.0,0.000000,0.000000,0.000000,1.000000
1,33,4789,11,220,1,339,4,0.000000,0.000000,0.0,...,0.000000,0.0,1.000000,0.000000,0.000000,0.0,1.000000,0.000000,0.000000,0.000000
2,35,1350,16,185,1,330,1,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,1.000000,0.000000,0.000000,0.000000
3,30,1476,3,199,4,-1,0,0.000000,0.000000,0.0,...,1.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,1.000000
4,59,0,5,226,1,-1,0,0.000000,1.000000,0.0,...,0.000000,0.0,1.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,32,0,6,541,3,31,1,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.333202,0.000000,0.0,0.000000,0.000000,0.333202,0.666798
7996,30,239,20,411,1,-1,0,0.047253,0.952747,0.0,...,0.000000,0.0,0.952747,0.000000,0.047253,0.0,0.000000,0.000000,0.000000,1.000000
7997,49,3364,10,254,2,-1,0,0.000000,0.000000,0.0,...,0.470965,0.0,0.000000,0.529035,0.000000,0.0,0.000000,0.000000,0.000000,1.000000
7998,34,890,10,753,1,182,5,0.000000,0.000000,0.0,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.415762,0.584238,0.000000,0.000000


In [None]:
# Display the target labels returned by the SMOTE resampling step.
y_resampled

0        no
1        no
2        no
3        no
4        no
       ... 
7995    yes
7996    yes
7997    yes
7998    yes
7999    yes
Name: y, Length: 8000, dtype: object

In [None]:
# checking the class distribution after performing SMOTE oversampling
# (both labels should now have the same count, i.e. the dataset is balanced)
y_resampled.value_counts()

no     4000
yes    4000
Name: y, dtype: int64

# Models

## Decision Tree - no hyperparameter tuning

In [None]:
# Baseline decision tree with default hyperparameters.
# It is fitted on the SMOTE-balanced training set - the same data the other models
# in this notebook are trained on and the data the "Train Accuracy" below is
# computed against - and seeded so the learned tree is reproducible.
dt = DecisionTreeClassifier(random_state=20)
dt.fit(X_resampled, y_resampled)

In [None]:
# Evaluate the decision tree: predictions on the SMOTE-resampled training set
# and on the held-out test set.
train_pred = dt.predict(X_resampled)
test_pred = dt.predict(X_test)

dt_train_accuracy = accuracy_score(y_resampled, train_pred)
dt_test_accuracy = accuracy_score(y_test, test_pred)
dt_test_confusion = confusion_matrix(y_test, test_pred)
dt_test_report = classification_report(y_test, test_pred)

# Accuracy on both splits, then the test-set confusion matrix.
print("Train Accuracy:", dt_train_accuracy)
print("Test Accuracy:", dt_test_accuracy)
print("Confusion Matrix for Decision Tree:")
print(dt_test_confusion)

# Size of the fitted tree.
print("Max Depth", dt.get_depth())
print("Leaf", dt.get_n_leaves())

# Per-class precision / recall / F1 on the test set.
print('Printing the precision and recall, among other metrics')
print(dt_test_report)

Train Accuracy: 0.889
Test Accuracy: 0.8823295215766075
Confusion Matrix for Decision Tree:
[[37236  2686]
 [ 2634  2655]]
Max Depth 26
Leaf 380
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

          no       0.93      0.93      0.93     39922
         yes       0.50      0.50      0.50      5289

    accuracy                           0.88     45211
   macro avg       0.72      0.72      0.72     45211
weighted avg       0.88      0.88      0.88     45211



## Random Forest - no hyperparameter tuning

In [None]:
# Baseline random forest with default hyperparameters (seeded for
# reproducibility), trained on the SMOTE-resampled training set.
rf = RandomForestClassifier(random_state=20)
rf.fit(X=X_resampled, y=y_resampled)

In [None]:
# Evaluate the baseline random forest on the SMOTE-resampled training set and on
# the held-out test set.
train_pred_1 = rf.predict(X_resampled)
test_pred_1 = rf.predict(X_test)

# Model accuracy
print("Train Accuracy:", accuracy_score(y_resampled, train_pred_1))
print("Test Accuracy:", accuracy_score(y_test, test_pred_1))
# The label previously said "Decision Tree" although the matrix is the random forest's.
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test, test_pred_1))
# Per-class precision / recall / F1 on the test set.
print('Printing the precision and recall, among other metrics')
print(classification_report(y_test, test_pred_1))

Train Accuracy: 1.0
Test Accuracy: 0.9104863860564907
Confusion Matrix for Decision Tree:
[[39096   826]
 [ 3221  2068]]
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

          no       0.92      0.98      0.95     39922
         yes       0.71      0.39      0.51      5289

    accuracy                           0.91     45211
   macro avg       0.82      0.69      0.73     45211
weighted avg       0.90      0.91      0.90     45211



## Decision Tree - Random search hyperparameter tuning

In [None]:
# Hyperparameter tuning for the decision tree classifier.

# RANDOM SEARCH--------------------------------------------

# Wall-clock timer for the whole search + scoring (`time` is imported in the imports cell).
start_time = time.time()

print("RandomizedSearchCV-Decision tree")

# Search space sampled by the randomized search.
parameters = {
    'max_depth': range(5, 30, 5),
    'criterion': ['gini', 'entropy'],
    'max_leaf_nodes': range(10, 500, 10),
    'min_samples_split': range(100, 500, 10),
}

# 25 sampled configurations, 5-fold cross-validation each. The search is seeded so
# the sampled configurations - and therefore the selected parameters that later
# cells reuse through `grid_parm` - are the same on every run.
dt_random = RandomizedSearchCV(dt, parameters, n_iter=25, cv=5, random_state=20)

dt_random.fit(X_resampled, y_resampled)

# Best parameter combination found by the search.
grid_parm = dt_random.best_params_
print(grid_parm)

# Predictions of the refitted best estimator on both splits.
train_pred_2 = dt_random.predict(X_resampled)
test_pred_2 = dt_random.predict(X_test)

print("Train Accuracy:", dt_random.score(X_resampled, y_resampled))
print("Test Accuracy:", dt_random.score(X_test, y_test))

print("--- %s seconds ---" % (time.time() - start_time))



RandomizedSearchCV-Decision tree
{'min_samples_split': 240, 'max_leaf_nodes': 90, 'max_depth': 25, 'criterion': 'entropy'}
Train Accuracy: 0.919125
Test Accuracy: 0.8800291964344961
--- 13.53446912765503 seconds ---


## Random Forest - hyperparameters reused from the decision tree random search

In [None]:
# Random forest built with the hyperparameters selected by the decision-tree
# randomized search (`grid_parm`), seeded so the result is reproducible.
rfRand = RandomForestClassifier(random_state=20, **grid_parm)
rfRand.fit(X_resampled, y_resampled)

# Evaluate on the SMOTE-resampled training set and on the held-out test set.
rfRand_predict_train = rfRand.predict(X_resampled)
rfRand_predict = rfRand.predict(X_test)
print("Train Accuracy:", accuracy_score(y_resampled, rfRand_predict_train))
print("Test Accuracy:", accuracy_score(y_test, rfRand_predict))
# The label previously said "Decision Tree" although the matrix is the random forest's.
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test, rfRand_predict))
print('Printing the precision and recall, among other metrics')
print(classification_report(y_test, rfRand_predict))

# Cross-validate on the ORIGINAL training data with SMOTE applied inside each
# training fold. Cross-validating the already-oversampled frame lets synthetic
# points, which are interpolated from real ones, land in the validation folds and
# inflates the scores. Placing the sampler in an imblearn pipeline keeps every
# validation fold made of real, untouched rows.
rf_cv_pipeline = ImbPipeline(steps=[
    ("smote", SMOTE(random_state=19)),
    ("rf", RandomForestClassifier(random_state=20, **grid_parm)),
])
clf_cv_score = cross_val_score(rf_cv_pipeline, X_train, y_train, cv=5, scoring="balanced_accuracy")
print(clf_cv_score)

Train Accuracy: 0.952875
Test Accuracy: 0.9026121961469554
Confusion Matrix for Decision Tree:
[[38778  1144]
 [ 3259  2030]]
Printing the precision and recall, among other metrics
              precision    recall  f1-score   support

          no       0.92      0.97      0.95     39922
         yes       0.64      0.38      0.48      5289

    accuracy                           0.90     45211
   macro avg       0.78      0.68      0.71     45211
weighted avg       0.89      0.90      0.89     45211

[0.726875 0.974375 0.97875  0.97125  0.9775  ]
