In [4]:
#Import necessary libraries
%matplotlib inline
import numpy as np

import pandas as pd
from matplotlib import pyplot as pyplot
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import zscore

#Load the dataset
colnames=['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'Target']
df=pd.read_csv('./bank-full.csv')

#Check the data
df.shape
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  Target     45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [5]:
#Check for missing values
df.isnull().values.any() # False

#Handle missing or irrelevant data
other = df.poutcome == 'other'
df1=df.drop(df[other].index, axis=0, inplace=False)
df1[['job', 'education']] = df1[['job', 'education']].replace(['unknown'], 'other')

#Transform categorical data
for feature in df.columns:
  if df1[feature].dtype=='object':
    df1[feature]=pd.Categorical(df1[feature]).codes

df1.info(), df1.describe()


<class 'pandas.core.frame.DataFrame'>
Index: 43371 entries, 0 to 45209
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   age        43371 non-null  int64
 1   job        43371 non-null  int8 
 2   marital    43371 non-null  int8 
 3   education  43371 non-null  int8 
 4   default    43371 non-null  int8 
 5   balance    43371 non-null  int64
 6   housing    43371 non-null  int8 
 7   loan       43371 non-null  int8 
 8   contact    43371 non-null  int8 
 9   day        43371 non-null  int64
 10  month      43371 non-null  int8 
 11  duration   43371 non-null  int64
 12  campaign   43371 non-null  int64
 13  pdays      43371 non-null  int64
 14  previous   43371 non-null  int64
 15  poutcome   43371 non-null  int8 
 16  Target     43371 non-null  int8 
dtypes: int64(7), int8(10)
memory usage: 3.1 MB


(None,
                 age           job       marital     education       default  \
 count  43371.000000  43371.000000  43371.000000  43371.000000  43371.000000   
 mean      40.986443      4.696825      1.165456      2.059164      0.018422   
 std       10.595454      3.661037      0.606564      0.779909      0.134475   
 min       18.000000      0.000000      0.000000      0.000000      0.000000   
 25%       33.000000      1.000000      1.000000      2.000000      0.000000   
 50%       39.000000      4.000000      1.000000      2.000000      0.000000   
 75%       48.000000      8.000000      2.000000      3.000000      0.000000   
 max       95.000000     11.000000      2.000000      3.000000      1.000000   
 
              balance       housing          loan       contact           day  \
 count   43371.000000  43371.000000  43371.000000  43371.000000  43371.000000   
 mean     1356.963063      0.551774      0.160914      0.662101     15.862904   
 std      3030.956348      0

In [7]:
# Split the data
X = df1.drop(['contact', 'Target'], axis=1)
y = df1['Target']

# Split data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=10)

In [8]:
#Scale the data
from sklearn import preprocessing

ZX=preprocessing.scale(X)
ZX_train, ZX_test, y_train, y_test = train_test_split(ZX, y, test_size=0.3, random_state = 10)

In [10]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier

# Random Forest Classifier
rfcl = RandomForestClassifier(criterion='entropy', class_weight={0: 0.5, 1: 0.5}, max_depth=5, min_samples_leaf=5)
rfcl =rfcl.fit(X_train, y_train)
test_pred_rf =rfcl.predict(X_test)
rfcl_score = rfcl.score(X_test, y_test)

# Adaboost Ensemble Algorithm
abcl = AdaBoostClassifier(n_estimators=20)
abcl = abcl.fit(X_train, y_train)
test_pred_ab = abcl.predict(X_test)
abcl_score = abcl.score(X_test, y_test)

#Bagging Classifier
bgcl = BaggingClassifier(n_estimators=20)
bgcl = bgcl.fit(X_train, y_train)
test_pred_bg = bgcl.predict(X_test)
bgcl_score = bgcl.score(X_test, y_test)

#Gradient Boost Classifier
gbcl = GradientBoostingClassifier(n_estimators=50, learning_rate=0.05)
gbcl = gbcl.fit(X_train, y_train)
test_pred_gb = gbcl.predict(X_test)
gbcl_score = gbcl.score(X_test, y_test)

#Print model accuracies
print("Random Forest model accuracy: {}".format(rfcl_score))
print("Adaboost Ensemble model accuracy: {}".format(abcl_score))
print("Bagging Classifier model accuracy:{}".format(bgcl_score))
print("Gradient Boost Classifier model accuracy:{}".format(gbcl_score))

Random Forest model accuracy: 0.8953273901014448
Adaboost Ensemble model accuracy: 0.8934829388256994
Bagging Classifier model accuracy:0.9004764832462342
Gradient Boost Classifier model accuracy:0.8992468490624039
