In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.utils import class_weight 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE

In [3]:
data = pd.read_csv('term-deposit-marketing-2020.csv')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 14 columns):
age          40000 non-null int64
job          40000 non-null object
marital      40000 non-null object
education    40000 non-null object
default      40000 non-null object
balance      40000 non-null int64
housing      40000 non-null object
loan         40000 non-null object
contact      40000 non-null object
day          40000 non-null int64
month        40000 non-null object
duration     40000 non-null int64
campaign     40000 non-null int64
y            40000 non-null object
dtypes: int64(5), object(9)
memory usage: 4.3+ MB


In [5]:
#Display values of columns for understanding data
for column in data.columns:
    print(data[column].unique())

[58 44 33 47 35 28 42 43 41 29 53 57 51 45 60 56 32 25 40 39 52 46 36 49
 59 37 50 54 55 48 24 38 31 30 27 34 23 26 61 22 21 20 66 62 83 75 67 70
 65 68 64 69 72 71 19 76 85 63 90 82 73 74 78 80 94 79 77 86 95 81]
['management' 'technician' 'entrepreneur' 'blue-collar' 'unknown'
 'retired' 'admin' 'services' 'self-employed' 'unemployed' 'housemaid'
 'student']
['married' 'single' 'divorced']
['tertiary' 'secondary' 'unknown' 'primary']
['no' 'yes']
[  2143     29      2 ...   7222   3402 102127]
['yes' 'no']
['no' 'yes']
['unknown' 'cellular' 'telephone']
[ 5  6  7  8  9 12 13 14 15 16 19 20 21 23 26 27 28 29 30  2  3  4 11 17
 18 24 25  1 10 22 31]
['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'jan' 'feb' 'mar' 'apr']
[ 261  151   76 ... 1880 1460 2219]
[ 1  2  3  5  4  6  7  8  9 10 11 12 13 19 14 24 16 32 18 22 15 17 25 21
 43 51 63 41 26 28 55 50 38 23 20 29 31 37 30 46 27 58 33 35 34 36 39 44]
['no' 'yes']


In [6]:
#Check number of negative/positive values 
data.y.value_counts()

no     37104
yes     2896
Name: y, dtype: int64

In [7]:
#Convert string colum to numeric column for binary classification
data['is_subscribed'] = data['y'].apply(lambda x: 1 if x=='yes' else 0).copy()

In [8]:
#Check for null values in numeric columns
data[['age','balance','day','duration','campaign']].dropna()

Unnamed: 0,age,balance,day,duration,campaign
0,58,2143,5,261,1
1,44,29,5,151,1
2,33,2,5,76,1
3,47,1506,5,92,1
4,33,1,5,198,1
...,...,...,...,...,...
39995,53,395,3,107,1
39996,30,3340,3,238,3
39997,54,200,3,170,1
39998,34,1047,3,342,1


In [9]:
#Converting day type to string because it is a categorical value
data['day'] = data['day'].apply(lambda x : str(x))

In [10]:
numerical_columns = ['age','balance','duration','campaign']
categorical_columns = ['job','marital','education','contact','day','month','housing','default','loan']

In [11]:
#One-hot encoding
categorical_data = pd.get_dummies(data[categorical_columns],drop_first=True)

In [12]:
encoded_data = pd.concat([data[numerical_columns], categorical_data],axis=1)

In [13]:
encoded_data['is_subscribed'] = data['is_subscribed']

In [14]:
encoded_data

Unnamed: 0,age,balance,duration,campaign,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,housing_yes,default_yes,loan_yes,is_subscribed
0,58,2143,261,1,0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
1,44,29,151,1,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
2,33,2,76,1,0,1,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0
3,47,1506,92,1,1,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
4,33,1,198,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,53,395,107,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
39996,30,3340,238,3,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
39997,54,200,170,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,1
39998,34,1047,342,1,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0


## First Method Oversampling Imbalanced Data

In [15]:
#train data
x = encoded_data.loc[:, encoded_data.columns != 'is_subscribed']
#target data
y = encoded_data[['is_subscribed']]

In [17]:
#SMOTE method used for handling imbalanced data
sm = SMOTE(random_state=42)
x_res, y_res = sm.fit_sample(x, y)

In [18]:
#Split data
x_train, x_test,y_train,y_test = train_test_split(x_res,y_res,test_size=0.33,random_state=42)

In [19]:
def calculate_score(y_test,y_pred):
    cm = confusion_matrix(y_test,y_pred)
    auc = roc_auc_score(y_test,y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')
    print('Confusion matrix:')
    print(cm)
    print('AUC: %.3f' % auc)
    print('Accuracy: %.3f' % accuracy)
    print('Precision: %.3f' % precision)
    print('Recall: %.3f' % recall)
    print('F-Measure: %.3f' % f1)

Decision Tree

In [20]:
dtc = DecisionTreeClassifier(criterion = 'entropy')
dtc.fit(x_train,y_train)
y_pred = dtc.predict(x_test) 
cm = confusion_matrix(y_test,y_pred)
print('DTC')
calculate_score(y_test,y_pred)

DTC
Confusion matrix:
[[11487   743]
 [  548 11711]]
AUC: 0.947
Accuracy: 0.947
Precision: 0.940
Recall: 0.955
F-Measure: 0.948


In [39]:
#K-fold cross validation
score = cross_val_score(estimator =dtc , X = x_train , y=y_train , cv = 5)
print('---------------------')
print(score.mean())

---------------------
0.9775773195876288


KNN

In [21]:
knn = KNeighborsClassifier(n_neighbors=1, metric='minkowski')
knn.fit(x_train,y_train)
y_pred = knn.predict(x_test)
print('KNN')
calculate_score(y_test,y_pred)

  


KNN
Confusion matrix:
[[10217  2013]
 [  684 11575]]
AUC: 0.890
Accuracy: 0.890
Precision: 0.852
Recall: 0.944
F-Measure: 0.896


In [22]:
#K-fold cross validation
score = cross_val_score(estimator =knn , X = x_train , y=y_train , cv = 5)
print('---------------------')
print(score.mean())

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


---------------------
0.8827812274938995


Naive Bayes

In [23]:
gnb = GaussianNB()
gnb.fit(x_train, y_train)
y_pred = gnb.predict(x_test)
cm = confusion_matrix(y_test,y_pred)
print('GNB')
calculate_score(y_test,y_pred)

  return f(**kwargs)


GNB
Confusion matrix:
[[11877   353]
 [  925 11334]]
AUC: 0.948
Accuracy: 0.948
Precision: 0.970
Recall: 0.925
F-Measure: 0.947


In [24]:
#K-fold cross validation
score = cross_val_score(estimator =gnb , X = x_train , y=y_train , cv = 5)
print('---------------------')
print(score.mean())

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


---------------------
0.9459964496746499


  return f(**kwargs)


## Second Method Undersampling Imbalanced Data

In [25]:
subscribed_data = encoded_data[encoded_data['is_subscribed'] == 1]

In [26]:
not_subscribed_data = encoded_data[encoded_data['is_subscribed'] == 0][0:2896]

In [27]:
merge_df = pd.concat([subscribed_data,not_subscribed_data])

In [28]:
merge_df = merge_df.sample(n=len(merge_df),random_state=42).reset_index(drop=True)

In [29]:
#train data
x = merge_df.loc[:, merge_df.columns != 'is_subscribed']
#target data
y = merge_df[['is_subscribed']]

In [30]:
#Split data
x_train, x_test,y_train,y_test = train_test_split(x,y,test_size=0.33,random_state=42)

In [31]:
def calculate_score(y_test,y_pred):
    cm = confusion_matrix(y_test,y_pred)
    auc = roc_auc_score(y_test,y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')
    print('Confusion matrix:')
    print(cm)
    print('AUC: %.3f' % auc)
    print('Accuracy: %.3f' % accuracy)
    print('Precision: %.3f' % precision)
    print('Recall: %.3f' % recall)
    print('F-Measure: %.3f' % f1)

Decision Tree

In [32]:
dtc = DecisionTreeClassifier(criterion = 'entropy')
dtc.fit(x_train,y_train)
y_pred = dtc.predict(x_test) 
cm = confusion_matrix(y_test,y_pred)
print('DTC')
calculate_score(y_test,y_pred)

DTC
Confusion matrix:
[[959  14]
 [ 28 911]]
AUC: 0.978
Accuracy: 0.978
Precision: 0.985
Recall: 0.970
F-Measure: 0.977


In [33]:
score = cross_val_score(estimator =dtc , X = x_train , y=y_train , cv = 5)
print('---------------------')
print(score.mean())

---------------------
0.9775773195876288


KNN

In [34]:
knn = KNeighborsClassifier(n_neighbors=1, metric='minkowski')
knn.fit(x_train,y_train)
y_pred = knn.predict(x_test)
print('KNN')
calculate_score(y_test,y_pred)

  


KNN
Confusion matrix:
[[728 245]
 [258 681]]
AUC: 0.737
Accuracy: 0.737
Precision: 0.735
Recall: 0.725
F-Measure: 0.730


In [35]:
#K-fold cross validation
score = cross_val_score(estimator =knn , X = x_train , y=y_train , cv = 5)
print('---------------------')
print(score.mean())

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


---------------------
0.7175257731958763


  estimator.fit(X_train, y_train, **fit_params)


Naive Bayes

In [36]:
gnb = GaussianNB()
gnb.fit(x_train, y_train)
y_pred = gnb.predict(x_test)
cm = confusion_matrix(y_test,y_pred)
print('GNB')
calculate_score(y_test,y_pred)

  return f(**kwargs)


GNB
Confusion matrix:
[[973   0]
 [ 37 902]]
AUC: 0.980
Accuracy: 0.981
Precision: 1.000
Recall: 0.961
F-Measure: 0.980


In [37]:
#K-fold cross validation
score = cross_val_score(estimator =gnb , X = x_train , y=y_train , cv = 5)
print('---------------------')
print(score.mean())

---------------------
0.9876288659793815


  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
