In [1]:
# Import libraries
## Basic libs
import pandas as pd
pd.set_option('display.max_columns', 500)
import numpy as np
import warnings
## Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Configure libraries
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize'] = (10, 10)
plt.style.use('seaborn')

In [2]:
# Load dataset
df_bank = pd.read_csv('bank-full.csv',sep=";")

# Drop 'duration' column
df_bank = df_bank.drop('duration', axis=1)

# print(df_bank.info())
print('Shape of dataframe:', df_bank.shape)
df_bank.head()

Shape of dataframe: (45211, 16)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,1,-1,0,unknown,no


In [3]:
df_bank['y'].value_counts()

no     39922
yes     5289
Name: y, dtype: int64

In [4]:
df_bank.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [5]:
df_bank.drop_duplicates(keep='first',inplace=True)
df_bank.reset_index(inplace=True)
all_cols = list(df_bank.columns)
try:
    all_cols.remove('duration')
except:
    pass
df_bank[df_bank.duplicated(all_cols)]

Unnamed: 0,index,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y


In [6]:
from sklearn.preprocessing import StandardScaler

# Copying original dataframe
df_bank_ready = df_bank.copy()

scaler = StandardScaler()
num_cols = ['age', 'balance', 'day', 'campaign', 'pdays', 'previous']
df_bank_ready[num_cols] = scaler.fit_transform(df_bank_ready[num_cols])

df_bank_ready.head()

Unnamed: 0,index,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y
0,0,1.606781,management,married,tertiary,no,0.256225,yes,no,unknown,-1.298232,may,-0.569343,-0.411538,-0.251988,unknown,no
1,1,0.288389,technician,single,secondary,no,-0.437991,yes,no,unknown,-1.298232,may,-0.569343,-0.411538,-0.251988,unknown,no
2,2,-0.747491,entrepreneur,married,secondary,no,-0.446858,yes,yes,unknown,-1.298232,may,-0.569343,-0.411538,-0.251988,unknown,no
3,3,0.570901,blue-collar,married,unknown,no,0.04704,yes,no,unknown,-1.298232,may,-0.569343,-0.411538,-0.251988,unknown,no
4,4,-0.747491,unknown,single,unknown,no,-0.447186,no,no,unknown,-1.298232,may,-0.569343,-0.411538,-0.251988,unknown,no


In [7]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)
cat_cols = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']

# Encode Categorical Data
df_encoded = pd.DataFrame(encoder.fit_transform(df_bank_ready[cat_cols]))
df_encoded.columns = encoder.get_feature_names(cat_cols)

# Replace Categotical Data with Encoded Data
df_bank_ready.drop(columns=cat_cols, inplace=True)
df_bank_ready = pd.concat([df_bank_ready,df_encoded], axis=1)

# Encode target value
df_bank_ready['y'] = df_bank_ready['y'].apply(lambda x: 1 if x == 'yes' else 0)


print('Shape of dataframe:', df_bank_ready.shape)
df_bank_ready.head()

Shape of dataframe: (45195, 52)


Unnamed: 0,index,age,balance,day,campaign,pdays,previous,y,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,default_no,default_yes,housing_no,housing_yes,loan_no,loan_yes,contact_cellular,contact_telephone,contact_unknown,month_apr,month_aug,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,0,1.606781,0.256225,-1.298232,-0.569343,-0.411538,-0.251988,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1,0.288389,-0.437991,-1.298232,-0.569343,-0.411538,-0.251988,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2,-0.747491,-0.446858,-1.298232,-0.569343,-0.411538,-0.251988,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3,0.570901,0.04704,-1.298232,-0.569343,-0.411538,-0.251988,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4,-0.747491,-0.447186,-1.298232,-0.569343,-0.411538,-0.251988,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [8]:
# Select Features
feature = df_bank_ready.drop(columns='y')

# Select Target
target = df_bank_ready['y']

# Set Training and Testing Data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(feature , target, 
                                                    shuffle = True, 
                                                    test_size=0.2, 
                                                    random_state=42)

# Show the Training and Testing Data
print('Shape of training feature:', X_train.shape)
print('Shape of testing feature:', X_test.shape)
print('Shape of training label:', y_train.shape)
print('Shape of training label:', y_test.shape)

Shape of training feature: (36156, 51)
Shape of testing feature: (9039, 51)
Shape of training label: (36156,)
Shape of training label: (9039,)


In [9]:
def evaluate_model(model, x_test, y_test):
    from sklearn import metrics

    # Predict Test Data 
    y_pred = model.predict(x_test)

    # Calculate accuracy, precision, recall, f1-score, and kappa score
    acc = metrics.accuracy_score(y_test, y_pred)
    prec = metrics.precision_score(y_test, y_pred)
    rec = metrics.recall_score(y_test, y_pred)
    f1 = metrics.f1_score(y_test, y_pred)
    kappa = metrics.cohen_kappa_score(y_test, y_pred)

    # Calculate area under curve (AUC)
    y_pred_proba = model.predict_proba(x_test)[::,1]
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
    auc = metrics.roc_auc_score(y_test, y_pred_proba)

    # Display confussion matrix
    cm = metrics.confusion_matrix(y_test, y_pred)

    return {'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'kappa': kappa, 
            'fpr': fpr, 'tpr': tpr, 'auc': auc, 'cm': cm}


def print_evaluation_metrics (model,X_test, y_test):
    # Evaluate Model
    dtc_eval = evaluate_model(model, X_test, y_test)

    # Print result
    print('Accuracy:', dtc_eval['acc'])
    print('Precision:', dtc_eval['prec'])
    print('Recall:', dtc_eval['rec'])
    print('F1 Score:', dtc_eval['f1'])
    print('Cohens Kappa Score:', dtc_eval['kappa'])
    print('Area Under Curve:', dtc_eval['auc'])
    print('Confusion Matrix:\n', dtc_eval['cm'])

In [10]:
from sklearn import tree

# Building Decision Tree model 
dtc = tree.DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)

print_evaluation_metrics(dtc,X_test,y_test)

Accuracy: 0.8349374930855183
Precision: 0.33527939949958296
Recall: 0.3664539653600729
F1 Score: 0.3501742160278745
Cohens Kappa Score: 0.2558497965396118
Area Under Curve: 0.6330507046644233
Confusion Matrix:
 [[7145  797]
 [ 695  402]]


In [11]:
from sklearn.ensemble import RandomForestClassifier

# Building Random Forest model 
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

print_evaluation_metrics(rf,X_test,y_test)

Accuracy: 0.8936829295276026
Precision: 0.6338582677165354
Recall: 0.2935278030993619
F1 Score: 0.4012461059190031
Cohens Kappa Score: 0.3514185685807176
Area Under Curve: 0.79465482083299
Confusion Matrix:
 [[7756  186]
 [ 775  322]]


In [12]:
from sklearn.naive_bayes import GaussianNB

# Building Naive Bayes model 
nb = GaussianNB()
nb.fit(X_train, y_train)

print_evaluation_metrics(nb, X_test, y_test)

Accuracy: 0.841243500387211
Precision: 0.36817472698907955
Recall: 0.43026435733819507
F1 Score: 0.3968053804119378
Cohens Kappa Score: 0.3060342228613627
Area Under Curve: 0.7591323558883032
Confusion Matrix:
 [[7132  810]
 [ 625  472]]


In [13]:
from sklearn.neighbors import KNeighborsClassifier

# Building KNN model 
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

print_evaluation_metrics(knn,X_test, y_test)

Accuracy: 0.8843898661356345
Precision: 0.54
Recall: 0.31996353691886964
F1 Score: 0.40183171150543795
Cohens Kappa Score: 0.3424480820893345
Area Under Curve: 0.7408477872965509
Confusion Matrix:
 [[7643  299]
 [ 746  351]]


In [14]:
from sklearn.linear_model import LogisticRegression

# Building KNN model
lr = LogisticRegression()
lr.fit(X_train, y_train)

print_evaluation_metrics(lr,X_test, y_test)

Accuracy: 0.8757605929859498
Precision: 0.37
Recall: 0.03372835004557885
F1 Score: 0.06182121971595656
Cohens Kappa Score: 0.04240320753700644
Area Under Curve: 0.7256503221739563
Confusion Matrix:
 [[7879   63]
 [1060   37]]


In [15]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'max_depth': [50, 80, 100],
    'max_features': [2, 3, 4],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 300, 500]
}

# Create a base model
rf_grids = RandomForestClassifier(random_state=42)

# Initiate the grid search model
grid_search = GridSearchCV(estimator=rf_grids, param_grid=param_grid, scoring='recall',
                           cv=5, n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

grid_search.best_params_

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END max_depth=50, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   1.8s
[CV] END max_depth=50, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   1.8s
[CV] END max_depth=50, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   1.8s
[CV] END max_depth=50, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   1.8s
[CV] END max_depth=50, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   1.9s
[CV] END max_depth=50, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=300; total time=   4.8s
[CV] END max_depth=50, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=300; total time=   4.9s
[CV] END max_depth=50, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=300; total time=   5.0s


{'max_depth': 50,
 'max_features': 4,
 'min_samples_leaf': 3,
 'min_samples_split': 12,
 'n_estimators': 100}

In [16]:
grid_search.best_score_

0.1975243285989401

In [23]:
best_grid = grid_search.best_estimator_

In [19]:
grid_search.best_index_

60

In [21]:
grid_search.classes_

array([0, 1])

In [24]:
print_evaluation_metrics(best_grid,X_test,y_test)

Accuracy: 0.8956743002544529
Precision: 0.7566666666666667
Recall: 0.20692798541476753
F1 Score: 0.3249821045096636
Cohens Kappa Score: 0.28786235566491314
Area Under Curve: 0.8082695944871052
Confusion Matrix:
 [[7869   73]
 [ 870  227]]
