# Machine Learning with SVM 

### Problem Statement 
Predict if a customer subscribes to a term deposits or not, when contacted by a marketing agent using SVM

In [3]:
# install libraries 
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
from sklearn import preprocessing, svm 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report

### Read the file 

In [4]:
# Read the dataset fro local drive
deposit = pd.read_csv(r"D:\new Data science class\project\7. svm\Predicting Term Deposit Subscription by a client\Dataset\bank-additional-full.csv", delimiter= ';')

In [5]:
# Primary data inspection 
deposit.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


### Data Fields 
* age -- Age of the client
* job -- Type of job (categorical: 'admin.','blue-collar', 'entrepreneur', 'housemaid', 'management', 'retired','self-employed', 'services', 'student', 'technician', 'unemployed', 'unknown')
* marital -- Marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)
* education -- (categorical: 'basic.4y', 'basic.6y' ,'basic.9y', 'high.school', 'illiterate', 'professional.course', 'university.degree','unknown')
* default -- has credit in default? (categorical: 'no','yes','unknown')
* housing -- has a housing loan? (categorical: 'no','yes','unknown')
* loan -- has a personal loan? (categorical: 'no','yes','unknown')
* contact -- contact communication type (categorical: 'cellular','telephone')
* month -- last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')
* day_of_week -- last contact day of the week (categorical: 'mon','tue','wed','thu','fri')
* duration -- last contact duration, in seconds
* campaign -- number of contacts performed during this campaign and for this client (numeric, includes last contact)
* pdays -- number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
* previous -- number of contacts performed before this campaign and for this client
* poutcome -- outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')
* emp.var.rate -- employment variation rate - quarterly indicator (numeric) cons.price.idx consumer price index - monthly indicator cons.conf.idx consumer confidence index - monthly indicator
* euribor3m -- euribor 3 month rate - daily indicator nr.employed number of employees - quarterly indicator 
* y(target variable)-- has the client subscribed a term deposit? (binary: 'yes','no')

### EDA

In [6]:
# Check the no of rows and no of columns in dataset 
deposit.shape

(41188, 21)

* This dataset contains 41,188 records 
* There are 21 fields in this dataset 

In [7]:
# Check all the columns in the dataset 
deposit.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [8]:
# Primary analysis 
deposit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

* There are no nulls in the dataset 
* This dataset contains combination of object and numeric datatypes

In [9]:
# check distribution of y variable
deposit.y.value_counts()

no     36548
yes     4640
Name: y, dtype: int64

* The y variable is form of yes and no, we need to convert it to 0 and 1 

In [10]:
# change yes to 0 and no to 1 in y variable 
deposit["y"]= deposit["y"].replace({'no':1,'yes':0})

In [11]:
deposit.y.value_counts()

1    36548
0     4640
Name: y, dtype: int64

* The y variable is not equally distributed 

### Pre- processing of data 

In [12]:
# split into numeric and factoral columns 

nc = deposit.select_dtypes(exclude = 'object').columns
fc = deposit.select_dtypes(include = 'object').columns

In [13]:
nc, fc

(Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
        'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
       dtype='object'),
 Index(['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
        'month', 'day_of_week', 'poutcome'],
       dtype='object'))

In [14]:
# check unique values in all columns 

for col in nc:
    n= len(deposit[col].unique());
    print("The no of unique values in {} are {}".format(col,n))

The no of unique values in age are 78
The no of unique values in duration are 1544
The no of unique values in campaign are 42
The no of unique values in pdays are 27
The no of unique values in previous are 8
The no of unique values in emp.var.rate are 10
The no of unique values in cons.price.idx are 26
The no of unique values in cons.conf.idx are 26
The no of unique values in euribor3m are 316
The no of unique values in nr.employed are 11
The no of unique values in y are 2


#### Convert Factoral columns into dummy variables 

In [15]:
# convert factoral columns into dummy variables 
deposit_new = deposit.copy()
for col in fc:
    dummy = pd.get_dummies(deposit_new[col], drop_first= True, prefix = col)
    deposit_new = deposit_new.join(dummy)

In [17]:
deposit_new.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,0,0,0,1,0,0,0,1,0
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,0,0,0,1,0,0,0,1,0
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,0,0,0,1,0,0,0,1,0
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,0,0,0,1,0,0,0,1,0
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,0,0,0,1,0,0,0,1,0


In [18]:
# drop old factoral columns 
deposit_new.drop(columns = fc, inplace = True)

#### Scaling the data 

In [19]:
# Scaling the data using standardScaler
nc = nc.drop("y") # dropping y variable for standardisation 

In [20]:
ss= preprocessing.StandardScaler()
deposit_new[nc] = ss.fit_transform(deposit_new[nc])

In [21]:
deposit_new[nc]

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,1.533034,0.010471,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680
1,1.628993,-0.421501,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680
2,-0.290186,-0.124520,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680
3,-0.002309,-0.413787,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680
4,1.533034,0.187888,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.712460,0.331680
...,...,...,...,...,...,...,...,...,...,...
41183,3.164336,0.292025,-0.565922,0.195414,-0.349494,-0.752343,2.058168,-2.224953,-1.495186,-2.815697
41184,0.573445,0.481012,-0.565922,0.195414,-0.349494,-0.752343,2.058168,-2.224953,-1.495186,-2.815697
41185,1.533034,-0.267225,-0.204909,0.195414,-0.349494,-0.752343,2.058168,-2.224953,-1.495186,-2.815697
41186,0.381527,0.708569,-0.565922,0.195414,-0.349494,-0.752343,2.058168,-2.224953,-1.495186,-2.815697


In [22]:
deposit_new.head()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,...,month_may,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success
0,1.533034,0.010471,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.71246,0.33168,...,1,0,0,0,1,0,0,0,1,0
1,1.628993,-0.421501,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.71246,0.33168,...,1,0,0,0,1,0,0,0,1,0
2,-0.290186,-0.12452,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.71246,0.33168,...,1,0,0,0,1,0,0,0,1,0
3,-0.002309,-0.413787,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.71246,0.33168,...,1,0,0,0,1,0,0,0,1,0
4,1.533034,0.187888,-0.565922,0.195414,-0.349494,0.648092,0.722722,0.886447,0.71246,0.33168,...,1,0,0,0,1,0,0,0,1,0


In [23]:
deposit_new.columns

Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'y',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_married', 'marital_single', 'marital_unknown',
       'education_basic.6y', 'education_basic.9y', 'education_high.school',
       'education_illiterate', 'education_professional.course',
       'education_university.degree', 'education_unknown', 'default_unknown',
       'default_yes', 'housing_unknown', 'housing_yes', 'loan_unknown',
       'loan_yes', 'contact_telephone', 'month_aug', 'month_dec', 'month_jul',
       'month_jun', 'month_mar', 'month_may', 'month_nov', 'month_oct',
       'month_sep', 'day_of_week_mon', 'day_of_week_thu', 'day_of_week_tue',
       'day_of_week_wed', 'poutcome_nonexistent', 'poutc

#### Defining dependent and independent variables

In [24]:
# defining dependent and independent variables 
X = deposit_new.drop('y',1)
Y = deposit_new['y']

#### Splitting data into train and test 

In [25]:
# train test split 
trainx,testx,trainy,testy = train_test_split(X,Y,test_size= 0.2)

In [26]:
trainx.shape

(32950, 53)

### Model 
We will first build a SVM model 

In [27]:
# using gridsearch for svm
lim = 5 
lov_c = np.logspace(-5,4,lim)
lov_g = np.random.random(lim)
params = [{'kernel':['linear'], 'C':lov_c ,
          'kernel':['sigmoid'],'C':lov_c, 'gamma':lov_g,
          'kernel':['poly'],'C':lov_c, 'gamma':lov_g,
          'kernel':['rbf'],'C':lov_c, 'gamma':lov_g}]

In [28]:
lov_g

array([0.26128744, 0.26042841, 0.79988867, 0.88677162, 0.6167061 ])

In [29]:
model = svm.SVC()
grid = GridSearchCV(model,param_grid = params, 
                   scoring= 'accuracy',n_jobs= -1, cv= 3).fit(trainx,trainy)

In [30]:
# best parameters from gridSearchCV
bp = grid.best_params_
bp

{'C': 0.31622776601683794, 'gamma': 0.26042841482468293, 'kernel': 'rbf'}

In [31]:
# build the SVM model using best parametrs from GridSearchCV
m1 = svm.SVC(kernel = bp['kernel'], C= bp['C'], gamma = bp['gamma']).fit(trainx,trainy)

In [37]:
# predictions 
p1= m1.predict(testx)

In [44]:
# confusion matrix and classification report 

# confusion matrix 
df = pd.DataFrame({'actual':testy,'predicted':p1})
print(pd.crosstab(df.actual,df.predicted, margins = True ))
print('\n')
print(classification_report(testy,p1))

predicted    0     1   All
actual                    
0          368   576   944
1          201  7093  7294
All        569  7669  8238


              precision    recall  f1-score   support

           0       0.65      0.39      0.49       944
           1       0.92      0.97      0.95      7294

    accuracy                           0.91      8238
   macro avg       0.79      0.68      0.72      8238
weighted avg       0.89      0.91      0.90      8238



* This model shows a overall accuracy of 91%