In [170]:
#importing the necessary library

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import warnings

In [154]:
#ignore warnings
warnings.filterwarnings("ignore")

In [155]:
#loading the dataset into pandas dataframe object

df=pd.read_csv("bank.csv", sep=";")

In [156]:
# checking the first five row 
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [157]:
# checking the last five row 

df.tail()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41187,74,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,3,999,1,failure,-1.1,94.767,-50.8,1.028,4963.6,no


In [158]:
#getting more info about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [159]:
#checking the shape of the dataset
df.shape

(41188, 21)

In [160]:
#separating the target variable
Y=df.pop("y")

In [161]:
#selecting the column of the object datatype to perfrm onehotencoding on them
df_col=df.select_dtypes(include="object").columns

In [162]:
df_col

Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
       'month', 'day_of_week', 'poutcome'],
      dtype='object')

In [163]:
#using pd.get_dumming to perform the encoding
pro_df=pd.get_dummies(df,columns=df_col,prefix=df_col,drop_first=True)

In [164]:
pro_df.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
0,56,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,1,0,0,0,1,0
1,57,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,1,0,0,0,1,0
2,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,1,0,0,0,1,0
3,40,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,1,0,0,0,1,0
4,56,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0,...,1,0,0,0,1,0,0,0,1,0


In [165]:
#confirm is there is null value in the target variable

Y.isnull().sum()

0

In [166]:
#preprocessing the target variable by changing all "no" to 0 and all"yes" to 1
Y=Y.map({"no":0,"yes":1})
Y.head()

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int64

In [167]:
#spliting the data into train and test set
x_train,x_test,y_train,y_test=train_test_split(pro_df,Y,test_size=0.3,random_state=30)


In [168]:
#fiting logistic regression on the train set and using it to predict the test set.Also printing out the classification report

lr=LogisticRegression()
lr.fit(x_train,y_train)
lr_pred=lr.predict(x_test)
print(f"the classification report for logistic regression is\n\n {classification_report(lr_pred,y_test)}")

the classification report for logistic regression is

               precision    recall  f1-score   support

           0       0.97      0.93      0.95     11525
           1       0.40      0.67      0.50       832

    accuracy                           0.91     12357
   macro avg       0.69      0.80      0.72     12357
weighted avg       0.94      0.91      0.92     12357



In [171]:
#fiting decisiontreeclassifier on the train set and using it to predict the test set.Also printing out the classification report
dt=DecisionTreeClassifier(max_depth=6)
dt.fit(x_train,y_train)

dt_pred=dt.predict(x_test)

print(f"the classification report for decision tree regression is {classification_report(dt_pred,y_test)}")


the classification report for decision tree regression is               precision    recall  f1-score   support

           0       0.95      0.95      0.95     10973
           1       0.62      0.63      0.62      1384

    accuracy                           0.91     12357
   macro avg       0.79      0.79      0.79     12357
weighted avg       0.92      0.91      0.91     12357



In [172]:
#fiting randomforestclassifier on the train set and using it to predict the test set.Also printing out the classification report

rf=RandomForestClassifier(n_estimators=1000)


rf.fit(x_train,y_train)
rf_pred=rf.predict(x_test)

print(f"the classification report for random forest regression is {classification_report(rf_pred,y_test)}")

the classification report for random forest regression is               precision    recall  f1-score   support

           0       0.97      0.93      0.95     11358
           1       0.47      0.66      0.55       999

    accuracy                           0.91     12357
   macro avg       0.72      0.80      0.75     12357
weighted avg       0.93      0.91      0.92     12357



THE AIM IS TO CHECK WHICH MODEL WILL PERFORM BEST IN CLASSIFYING "1". EVEN THOUGH THERE IS IMBALANCE IN THE DATA SET,LOGISTICREGESSION IS STILL DOING BETTER THAN THE OTHER TWO WITH RECALL VALUE OF 0.G7