# iLykei Lecture Series 

# Machine Learning

# Support Vector Machines

# Workshop 3

## Y.Balasanov, L.Nazarov &copy; iLykei 2017-2022

This is Python version of Week 8 Workshop 3. Below is the list of packages used in the workshop.

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
from numpy.random import seed #normal,random,binomial,,choice
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.svm import SVC #, SVR, LinearSVC, LinearSVR
from sklearn.preprocessing import StandardScaler #LabelEncoder, 
from sklearn.model_selection import cross_val_score,GridSearchCV,train_test_split
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve 
from sklearn.metrics import accuracy_score, cohen_kappa_score
import string

# Spam data    

Spam data were collected by Hewlett-Packard Labs. The set contains 4601 emails of types "spam", "non-spam".   
Besides the type in the file there are 57 predictors indicating frequencies of words and characters from vocabulary.    
Help file explains the column meanings:    

The first 48 variables contain the frequency of the variable name (e.g., business) in the e-mail. If the variable name starts with num (e.g., num650) it indicates the frequency of the corresponding number (e.g., 650). The variables 49-54 indicate the frequency of the characters ‘;’, ‘(’, ‘[’, ‘!’, ‘\$’, and ‘\#’. The variables 55-57 contain the average, longest and total run-length of capital letters.    

Data are available through (UCI Machine Learning Repository)[https://archive.ics.uci.edu/ml/datasets/spambase].

In [2]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.names'
spam_names = pd.read_csv(url,sep=':',skiprows=32,header=None)[0]
# shorten names
spam_names = list(spam_names.str.replace('word_freq_','')) + ['type']
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data'
spam = pd.read_csv(url,names=spam_names)
print(list(spam))
spam.head()

['make', 'address', 'all', '3d', 'our', 'over', 'remove', 'internet', 'order', 'mail', 'receive', 'will', 'people', 'report', 'addresses', 'free', 'business', 'email', 'you', 'credit', 'your', 'font', '000', 'money', 'hp', 'hpl', 'george', '650', 'lab', 'labs', 'telnet', '857', 'data', '415', '85', 'technology', '1999', 'parts', 'pm', 'direct', 'cs', 'meeting', 'original', 'project', 're', 'edu', 'table', 'conference', 'char_freq_;', 'char_freq_(', 'char_freq_[', 'char_freq_!', 'char_freq_$', 'char_freq_#', 'capital_run_length_average', 'capital_run_length_longest', 'capital_run_length_total', 'type']


Unnamed: 0,make,address,all,3d,our,over,remove,internet,order,mail,...,char_freq_;,char_freq_(,char_freq_[,char_freq_!,char_freq_$,char_freq_#,capital_run_length_average,capital_run_length_longest,capital_run_length_total,type
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


Prepare train and test samples.

In [3]:
N = len(spam)
X,y = spam.drop('type',axis=1),spam['type']
spam_train,spam_test,y_train,y_test = train_test_split(spam, y, test_size =1/3, 
                                                       stratify=y,random_state=0)

# Fitting SVM with default kernel    

Tune SVM with radial kernel (default) like in workshop example with Galton's data.    
Use grid for `gamma = [1e-06,1e-05,1e-04,1e-03]` and `C = [10,100]`.

<span style="color:red">(Skipped Code)</span>

In [4]:
# Parameters grid to search
param_test = {'gamma':.1**np.arange(3,7), 'C': [10,100]}
grid = GridSearchCV(estimator = SVC(), param_grid = param_test, 
                    n_jobs=-1,cv=10)
grid.fit(spam_train[:300], y_train[:300])
# summarize results
for mean,param in zip(grid.cv_results_['mean_test_score'],grid.cv_results_['params']):
    print("%f with: " % mean,param)
bestGamma = grid.best_params_['gamma']
bestC = grid.best_params_['C']

0.743333 with:  {'C': 10, 'gamma': 0.0010000000000000002}
0.730000 with:  {'C': 10, 'gamma': 0.00010000000000000002}
0.683333 with:  {'C': 10, 'gamma': 1.0000000000000003e-05}
0.690000 with:  {'C': 10, 'gamma': 1.0000000000000004e-06}
0.786667 with:  {'C': 100, 'gamma': 0.0010000000000000002}
0.843333 with:  {'C': 100, 'gamma': 0.00010000000000000002}
0.726667 with:  {'C': 100, 'gamma': 1.0000000000000003e-05}
0.673333 with:  {'C': 100, 'gamma': 1.0000000000000004e-06}


Fit model with best parameters.

In [5]:
clf = SVC(C=grid.best_params_['C'],gamma=grid.best_params_['gamma'])
clf.fit(spam_train, y_train)
pred = clf.predict(spam_test)
print(confusion_matrix(y_test,pred))
print('accuracy',accuracy_score(y_test,pred))
print('kappa',cohen_kappa_score(y_test,pred))

[[909  21]
 [ 20 584]]
accuracy 0.9732724902216427
kappa 0.9440328324580028


# Experimenting with kernels     

Experiment with kernels. Try training and fitting model with linear, sigmoid, polynomial kernel of degree 2 and degree 3.   

<span style="color:red">(Skipped Code)</span>

In [6]:
param_test = {'gamma':.1**np.arange(3,7), 'C': [10,100]}
param_test['kernel'] = ["linear","sigmoid","poly"]
# "linear","sigmoid" and "poly" with degree = 3
grid = GridSearchCV(estimator = SVC(), param_grid = param_test, 
                    n_jobs=-1,cv=10)
grid.fit(spam_train[:300], y_train[:300])
# summarize results
for mean,param in zip(grid.cv_results_['mean_test_score'],grid.cv_results_['params']):
    print("%f with: " % mean,param)
bestGamma = grid.best_params_['gamma']
bestC = grid.best_params_['C']
bestKernel = grid.best_params_['kernel']
print('best params',grid.best_params_)

1.000000 with:  {'C': 10, 'gamma': 0.0010000000000000002, 'kernel': 'linear'}
0.350000 with:  {'C': 10, 'gamma': 0.0010000000000000002, 'kernel': 'sigmoid'}
0.950000 with:  {'C': 10, 'gamma': 0.0010000000000000002, 'kernel': 'poly'}
1.000000 with:  {'C': 10, 'gamma': 0.00010000000000000002, 'kernel': 'linear'}
0.443333 with:  {'C': 10, 'gamma': 0.00010000000000000002, 'kernel': 'sigmoid'}
0.833333 with:  {'C': 10, 'gamma': 0.00010000000000000002, 'kernel': 'poly'}
1.000000 with:  {'C': 10, 'gamma': 1.0000000000000003e-05, 'kernel': 'linear'}
0.590000 with:  {'C': 10, 'gamma': 1.0000000000000003e-05, 'kernel': 'sigmoid'}
0.706667 with:  {'C': 10, 'gamma': 1.0000000000000003e-05, 'kernel': 'poly'}
1.000000 with:  {'C': 10, 'gamma': 1.0000000000000004e-06, 'kernel': 'linear'}
0.706667 with:  {'C': 10, 'gamma': 1.0000000000000004e-06, 'kernel': 'sigmoid'}
0.683333 with:  {'C': 10, 'gamma': 1.0000000000000004e-06, 'kernel': 'poly'}
1.000000 with:  {'C': 100, 'gamma': 0.0010000000000000002, 

Run separately search with "poly" and degree = 2

In [7]:
param_test['kernel'] = ["poly"]
grid = GridSearchCV(estimator = SVC(degree = 2), param_grid = param_test, 
                    n_jobs=-1,cv=10)
grid.fit(spam_train[:300], y_train[:300])
# summarize results
for mean,param in zip(grid.cv_results_['mean_test_score'],grid.cv_results_['params']):
    print("%f with: " % mean,param)
print('best params',grid.best_params_)

0.963333 with:  {'C': 10, 'gamma': 0.0010000000000000002, 'kernel': 'poly'}
0.810000 with:  {'C': 10, 'gamma': 0.00010000000000000002, 'kernel': 'poly'}
0.700000 with:  {'C': 10, 'gamma': 1.0000000000000003e-05, 'kernel': 'poly'}
0.670000 with:  {'C': 10, 'gamma': 1.0000000000000004e-06, 'kernel': 'poly'}
0.983333 with:  {'C': 100, 'gamma': 0.0010000000000000002, 'kernel': 'poly'}
0.913333 with:  {'C': 100, 'gamma': 0.00010000000000000002, 'kernel': 'poly'}
0.763333 with:  {'C': 100, 'gamma': 1.0000000000000003e-05, 'kernel': 'poly'}
0.693333 with:  {'C': 100, 'gamma': 1.0000000000000004e-06, 'kernel': 'poly'}
best params {'C': 100, 'gamma': 0.0010000000000000002, 'kernel': 'poly'}


Select the best kernel and show the confusion table and class agreement of the selected model.  
Linear kernel showed the best performance.

In [8]:
clf = SVC(C=bestC, gamma=bestGamma, kernel=bestKernel)
clf.fit(spam_train, y_train)
pred = clf.predict(spam_test)
print('accuracy',accuracy_score(y_test,pred))
print('kappa',cohen_kappa_score(y_test,pred))
pd.DataFrame(confusion_matrix(y_test,pred),
    columns=['not spam','spam'],index=['pred not spam','pred spam'])

accuracy 0.9973924380704041
kappa 0.9945382040874458


Unnamed: 0,not spam,spam
pred not spam,928,2
pred spam,2,602
