# Prediction of term deposit subscriptions by bank customers

Using the [Bank Marketing Data Set](https://archive.ics.uci.edu/ml/datasets/Bank+Marketing) from the UCI Machine Learning Repository

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [2]:
df = pd.read_csv('bank-additional.csv', sep=';')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4119 entries, 0 to 4118
Data columns (total 21 columns):
age               4119 non-null int64
job               4119 non-null object
marital           4119 non-null object
education         4119 non-null object
default           4119 non-null object
housing           4119 non-null object
loan              4119 non-null object
contact           4119 non-null object
month             4119 non-null object
day_of_week       4119 non-null object
duration          4119 non-null int64
campaign          4119 non-null int64
pdays             4119 non-null int64
previous          4119 non-null int64
poutcome          4119 non-null object
emp.var.rate      4119 non-null float64
cons.price.idx    4119 non-null float64
cons.conf.idx     4119 non-null float64
euribor3m         4119 non-null float64
nr.employed       4119 non-null float64
y                 4119 non-null object
dtypes: float64(5), int64(5), object(11)
memory usage: 675.9+ KB


In [4]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30,blue-collar,married,basic.9y,no,yes,no,cellular,may,fri,...,2,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,no
1,39,services,single,high.school,no,no,no,telephone,may,fri,...,4,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no
2,25,services,married,high.school,no,yes,no,telephone,jun,wed,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1,no
3,38,services,married,basic.9y,no,unknown,unknown,telephone,jun,fri,...,3,999,0,nonexistent,1.4,94.465,-41.8,4.959,5228.1,no
4,47,admin.,married,university.degree,no,yes,no,cellular,nov,mon,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,no


In [5]:
df.describe(include='all')

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
count,4119.0,4119,4119,4119,4119,4119,4119,4119,4119,4119,...,4119.0,4119.0,4119.0,4119,4119.0,4119.0,4119.0,4119.0,4119.0,4119
unique,,12,4,8,3,3,3,2,10,5,...,,,,3,,,,,,2
top,,admin.,married,university.degree,no,yes,no,cellular,may,thu,...,,,,nonexistent,,,,,,no
freq,,1012,2509,1264,3315,2175,3349,2652,1378,860,...,,,,3523,,,,,,3668
mean,40.11362,,,,,,,,,,...,2.537266,960.42219,0.190337,,0.084972,93.579704,-40.499102,3.621356,5166.481695,
std,10.313362,,,,,,,,,,...,2.568159,191.922786,0.541788,,1.563114,0.579349,4.594578,1.733591,73.667904,
min,18.0,,,,,,,,,,...,1.0,0.0,0.0,,-3.4,92.201,-50.8,0.635,4963.6,
25%,32.0,,,,,,,,,,...,1.0,999.0,0.0,,-1.8,93.075,-42.7,1.334,5099.1,
50%,38.0,,,,,,,,,,...,2.0,999.0,0.0,,1.1,93.749,-41.8,4.857,5191.0,
75%,47.0,,,,,,,,,,...,3.0,999.0,0.0,,1.4,93.994,-36.4,4.961,5228.1,


In [6]:
df.drop('duration', axis=1, inplace=True)

In [7]:
df_with_dummies = pd.get_dummies(df, drop_first=True).astype(np.float32)

In [8]:
df_with_dummies.head()

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job_blue-collar,...,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success,y_yes
0,30.0,2.0,999.0,0.0,-1.8,92.892998,-46.200001,1.313,5099.100098,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,39.0,4.0,999.0,0.0,1.1,93.994003,-36.400002,4.855,5191.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,25.0,1.0,999.0,0.0,1.4,94.464996,-41.799999,4.962,5228.100098,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,38.0,3.0,999.0,0.0,1.4,94.464996,-41.799999,4.959,5228.100098,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,47.0,1.0,999.0,0.0,-0.1,93.199997,-42.0,4.191,5195.799805,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [9]:
X = df_with_dummies.iloc[:, :-1].values

In [10]:
y = df_with_dummies.iloc[:, -1].values

In [11]:
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=2, stratify=y)

In [13]:
model = SVC(gamma='auto')

In [14]:
model.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [15]:
y_pred = model.predict(X_test)

In [16]:
print(classification_report(y_pred, y_test))
print(accuracy_score(y_pred, y_test), '\n')

              precision    recall  f1-score   support

         0.0       0.99      0.91      0.95       800
         1.0       0.20      0.75      0.32        24

   micro avg       0.91      0.91      0.91       824
   macro avg       0.60      0.83      0.63       824
weighted avg       0.97      0.91      0.93       824

0.9053398058252428 

