# Linear Model 3 - Linear SVM

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from auxiliars import *
import pickle

## Data

In [2]:
# Load the full-feature dataset (the file name suggests standardized HTRU_2 features).
data = pd.read_csv("./data/stdHTRU_2.csv")

In [3]:
# Colour every sample by its label: red for class 1, blue for class 0.
col = data['class'].map({1: 'r', 0: 'b'})
# Pairwise scatter plots of all feature columns (the label column is dropped).
# The trailing semicolon stops the notebook from echoing the returned array
# of Axes objects, so only the figure is shown.
pd.plotting.scatter_matrix(data.drop(columns=['class']), c=col, figsize=(15, 15));

array([[<matplotlib.axes._subplots.AxesSubplot object at 0x1153c4b38>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x116875b70>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1168b6048>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1168e75f8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x116917ba8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x116957198>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x116985748>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x1169b6d30>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x1169b6d68>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x116a25898>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x116a57e48>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x116a92438>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x116ac59e8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x116af6f98>,
     

From the Scatter Matrix we can deduce that the Linear Kernel should be sufficient for the separation of classes.

Even so, we can observe that some pairs of features (see, for example, DM_mean-DM_stdev) are very closely related. In order to reduce the impact of this fact, let's train the SVM both with the normal (standardized) data and with data restricted to selected features.

We split a separate test set of relative size 20%:

In [4]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data.iloc[:, :8],      # the first eight columns are the predictors
    data['class'],         # target labels
    random_state=1234,
    test_size=0.2,
)

We will also analyze the performance of the method on the non-correlated standardized data:

In [5]:
# Load the reduced-feature dataset (the file name suggests standardized data with correlated features removed).
noCorrData = pd.read_csv("./data/noCorrStdHTRU_2.csv")

In [6]:
# Same 80/20 split and seed as for the full-feature data, applied to the reduced set.
X_train_NC, X_test_NC, y_train_NC, y_test_NC = train_test_split(
    noCorrData.iloc[:, :6],    # the first six columns are the predictors
    noCorrData['class'],       # target labels
    random_state=1234,
    test_size=0.2,
)

## Model Training

In order to train Linear SVM we are going to use the scikit-learn LinearSVC class, specialized in Linear SVM.

In [7]:
from sklearn.svm import LinearSVC

In [8]:
# Base estimator passed to both grid searches below; the seed is fixed and the
# iteration cap is set explicitly to 5000.
SVMClass = LinearSVC(random_state = 1234, max_iter = 5000)

LinearSVC allows us to tune the following hyperparameters:
- Regularization parameter C.
- Class weights:
    - Dict: Weights specified by class.
    - Balanced: Uses the values of target (y) to automatically adjust weights inversely proportional to class frequencies in the input data.

In order to tune the model hyperparameters and get a better idea of how the model performs on unseen data, we will use GridSearchCV.

In [9]:
from sklearn.model_selection import GridSearchCV

Values of the 10-Fold CV Grid to test:

In [10]:
# Search space: C over the powers of ten from 1e-3 to 1e3, crossed with
# uniform class weights versus automatically balanced ones.
grid = dict(
    C=[10**exponent for exponent in range(-3, 4)],
    class_weight=[{0: 1, 1: 1}, 'balanced'],
)

In [11]:
# Echo the grid to check the candidate values.
grid

{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
 'class_weight': [{0: 1, 1: 1}, 'balanced']}

Grid Search 10-Fold CV:

In [12]:
# 10-fold cross-validated search over `grid` for the full-feature data.
# No `scoring` is passed, so the estimator's default score is used.
# NOTE(review): the classes are imbalanced (331 positives out of 3580 test rows);
# presumably the default is accuracy - consider a recall/F1-based `scoring`.
gs10cv = GridSearchCV(SVMClass, param_grid = grid, cv = 10, n_jobs = -1)

### Normal Data Training

In [13]:
# Evaluate every grid combination with 10-fold CV on the training split.
gs10cv.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=5000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=1234, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'class_weight': [{0: 1, 1: 1}, 'balanced']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [14]:
# Tabulate the per-candidate CV results: timings, per-fold scores, mean/std and rank.
pd.DataFrame(gs10cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.067043,0.015112,0.005495,0.005022,0.001,"{0: 1, 1: 1}","{'C': 0.001, 'class_weight': {0: 1, 1: 1}}",0.972067,0.977654,0.973464,0.972067,0.968575,0.977654,0.977654,0.969972,0.976939,0.973445,0.973949,0.0032,6
1,0.129222,0.038246,0.005577,0.003026,0.001,balanced,"{'C': 0.001, 'class_weight': 'balanced'}",0.969274,0.974162,0.967877,0.969274,0.97067,0.973464,0.968575,0.96648,0.966457,0.966457,0.969269,0.002636,14
2,0.106115,0.05324,0.020416,0.04516,0.01,"{0: 1, 1: 1}","{'C': 0.01, 'class_weight': {0: 1, 1: 1}}",0.974162,0.97905,0.974162,0.976257,0.97067,0.97905,0.980447,0.973464,0.980433,0.975542,0.976324,0.003144,5
3,0.569645,0.281467,0.003367,0.000886,0.01,balanced,"{'C': 0.01, 'class_weight': 'balanced'}",0.974162,0.976955,0.97486,0.97067,0.972765,0.976257,0.972067,0.972765,0.972048,0.967855,0.97304,0.002546,10
4,0.724118,0.322835,0.004992,0.002068,0.1,"{0: 1, 1: 1}","{'C': 0.1, 'class_weight': {0: 1, 1: 1}}",0.976257,0.980447,0.975559,0.978352,0.975559,0.981844,0.980447,0.975559,0.979734,0.979036,0.978279,0.00226,3
5,3.026904,0.825948,0.008067,0.015337,0.1,balanced,"{'C': 0.1, 'class_weight': 'balanced'}",0.97486,0.976257,0.974162,0.972765,0.974162,0.973464,0.97067,0.972765,0.971349,0.97065,0.97311,0.001751,9
6,3.008573,0.900897,0.00785,0.007868,1.0,"{0: 1, 1: 1}","{'C': 1, 'class_weight': {0: 1, 1: 1}}",0.976955,0.981145,0.975559,0.97905,0.976257,0.982542,0.981145,0.976257,0.979734,0.979734,0.978838,0.002316,1
7,16.433442,2.413215,0.00726,0.003504,1.0,balanced,"{'C': 1, 'class_weight': 'balanced'}",0.974162,0.97486,0.972067,0.972067,0.974162,0.973464,0.97067,0.969972,0.969951,0.969252,0.972063,0.001931,11
8,6.205535,0.231656,0.003481,0.000471,10.0,"{0: 1, 1: 1}","{'C': 10, 'class_weight': {0: 1, 1: 1}}",0.976955,0.981145,0.975559,0.97905,0.976257,0.982542,0.981145,0.976955,0.979036,0.979734,0.978838,0.002231,2
9,14.519821,0.568743,0.003143,0.000644,10.0,balanced,"{'C': 10, 'class_weight': 'balanced'}",0.972765,0.97486,0.969972,0.972067,0.974162,0.973464,0.97067,0.969972,0.969951,0.969252,0.971714,0.001911,12


In [15]:
# Hyperparameter combination selected by the search.
gs10cv.best_params_

{'C': 1, 'class_weight': {0: 1, 1: 1}}

In [16]:
# Full cv_results_ row of the selected candidate.
pd.DataFrame(gs10cv.cv_results_).iloc[gs10cv.best_index_]

mean_fit_time                                        3.00857
std_fit_time                                        0.900897
mean_score_time                                   0.00784965
std_score_time                                    0.00786795
param_C                                                    1
param_class_weight                              {0: 1, 1: 1}
params                {'C': 1, 'class_weight': {0: 1, 1: 1}}
split0_test_score                                   0.976955
split1_test_score                                   0.981145
split2_test_score                                   0.975559
split3_test_score                                    0.97905
split4_test_score                                   0.976257
split5_test_score                                   0.982542
split6_test_score                                   0.981145
split7_test_score                                   0.976257
split8_test_score                                   0.979734
split9_test_score       

In [17]:
# Save model: persist the whole fitted GridSearchCV object.
# The context manager guarantees the file is flushed and closed, even if
# pickling raises; the previous bare open() left the handle open.
with open('./models/SVMClass_BestCV_STDData_pickle_file', 'wb') as SVMClassFile:
    pickle.dump(gs10cv, SVMClassFile)

### No-correlated Data Training

Grid Search 10-Fold CV:

In [18]:
# Separate 10-fold search, same estimator and grid, for the reduced-feature data.
gs10cv_nc = GridSearchCV(SVMClass, param_grid = grid, cv = 10, n_jobs = -1)

Training:

In [19]:
# Evaluate every grid combination with 10-fold CV on the reduced-feature training split.
gs10cv_nc.fit(X_train_NC, y_train_NC)



GridSearchCV(cv=10, error_score=nan,
             estimator=LinearSVC(C=1.0, class_weight=None, dual=True,
                                 fit_intercept=True, intercept_scaling=1,
                                 loss='squared_hinge', max_iter=5000,
                                 multi_class='ovr', penalty='l2',
                                 random_state=1234, tol=0.0001, verbose=0),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'class_weight': [{0: 1, 1: 1}, 'balanced']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [20]:
# Tabulate the per-candidate CV results for the reduced-feature search.
pd.DataFrame(gs10cv_nc.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.072511,0.016275,0.004814,0.001002,0.001,"{0: 1, 1: 1}","{'C': 0.001, 'class_weight': {0: 1, 1: 1}}",0.972765,0.976955,0.972067,0.972765,0.969274,0.977654,0.979749,0.969274,0.97624,0.973445,0.974019,0.003345,7
1,0.096511,0.018379,0.005374,0.004214,0.001,balanced,"{'C': 0.001, 'class_weight': 'balanced'}",0.969972,0.97486,0.967877,0.969972,0.97067,0.971369,0.968575,0.967179,0.968553,0.964361,0.969339,0.002645,14
2,0.174571,0.054502,0.006091,0.005424,0.01,"{0: 1, 1: 1}","{'C': 0.01, 'class_weight': {0: 1, 1: 1}}",0.974162,0.97905,0.974162,0.976955,0.97067,0.978352,0.980447,0.973464,0.980433,0.975542,0.976324,0.003097,6
3,0.345941,0.051665,0.004582,0.002179,0.01,balanced,"{'C': 0.01, 'class_weight': 'balanced'}",0.97486,0.976955,0.97486,0.969972,0.972765,0.976257,0.972765,0.972067,0.972048,0.967855,0.97304,0.002659,12
4,0.346532,0.057209,0.003759,0.001156,0.1,"{0: 1, 1: 1}","{'C': 0.1, 'class_weight': {0: 1, 1: 1}}",0.975559,0.979749,0.975559,0.976955,0.972765,0.980447,0.981145,0.973464,0.981831,0.979036,0.977651,0.003073,4
5,2.928549,0.289189,0.005061,0.002702,0.1,balanced,"{'C': 0.1, 'class_weight': 'balanced'}",0.97486,0.978352,0.97486,0.969972,0.972765,0.976257,0.974162,0.971369,0.972048,0.967855,0.97325,0.002935,9
6,3.288173,0.410353,0.004,0.001605,1.0,"{0: 1, 1: 1}","{'C': 1, 'class_weight': {0: 1, 1: 1}}",0.975559,0.979749,0.975559,0.976955,0.972765,0.980447,0.981145,0.974162,0.981831,0.979036,0.977721,0.002984,3
7,20.338766,3.271273,0.016011,0.014751,1.0,balanced,"{'C': 1, 'class_weight': 'balanced'}",0.974162,0.977654,0.97486,0.97067,0.972765,0.976257,0.974162,0.972067,0.972048,0.967855,0.97325,0.002675,9
8,7.889672,0.098631,0.00351,0.001085,10.0,"{0: 1, 1: 1}","{'C': 10, 'class_weight': {0: 1, 1: 1}}",0.975559,0.979749,0.975559,0.977654,0.972765,0.980447,0.981145,0.974162,0.981831,0.979734,0.97786,0.00301,2
9,17.747058,0.976429,0.004255,0.001413,10.0,balanced,"{'C': 10, 'class_weight': 'balanced'}",0.974162,0.977654,0.97486,0.969972,0.972765,0.976257,0.974162,0.971369,0.972048,0.967855,0.97311,0.002785,11


In [21]:
# Hyperparameter combination selected by the reduced-feature search.
gs10cv_nc.best_params_

{'C': 100, 'class_weight': {0: 1, 1: 1}}

In [22]:
# Full cv_results_ row of the selected candidate (reduced-feature search).
pd.DataFrame(gs10cv_nc.cv_results_).iloc[gs10cv_nc.best_index_]

mean_fit_time                                          15.1452
std_fit_time                                          0.359181
mean_score_time                                     0.00347054
std_score_time                                      0.00138645
param_C                                                    100
param_class_weight                                {0: 1, 1: 1}
params                {'C': 100, 'class_weight': {0: 1, 1: 1}}
split0_test_score                                     0.975559
split1_test_score                                     0.980447
split2_test_score                                     0.975559
split3_test_score                                     0.979749
split4_test_score                                     0.975559
split5_test_score                                     0.981844
split6_test_score                                     0.981844
split7_test_score                                     0.974162
split8_test_score                                     0

In [23]:
# Save model: persist the whole fitted GridSearchCV object for the reduced-feature data.
# The context manager guarantees the file is flushed and closed, even if
# pickling raises; the previous bare open() left the handle open.
with open('./models/SVMClass_BestCV_NCorrSTDData_pickle_file', 'wb') as SVMClassFileNC:
    pickle.dump(gs10cv_nc, SVMClassFileNC)

## Testing

### Normal Data Model Testing

In [24]:
# Predict labels for the held-out test set with the fitted grid search.
y_pred = gs10cv.predict(X_test)

In [25]:
# Per-class precision, recall and F1 on the held-out test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3249
           1       0.93      0.80      0.86       331

    accuracy                           0.98      3580
   macro avg       0.96      0.90      0.92      3580
weighted avg       0.98      0.98      0.98      3580



In [26]:
print ("Confusion Matrix:")
# `confusionMatrix` is not defined in this notebook - presumably it comes from
# the star-import of `auxiliars` (confirm). Rendered as a Real x Predicted table.
confusionMatrix(y_test, y_pred, classes = [0,1])

Confusion Matrix:


Predicted,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3229,20
1,65,266


In [27]:
print("Test Error:")
# Misclassification rate on the test set, in percent. Reuses the existing
# `y_pred` rather than running the model over X_test a second time.
(1-accuracy_score(y_test, y_pred))*100

Test Error:


2.3743016759776525

### No-correlated Data Model Testing

In [28]:
# Predict labels for the reduced-feature test set with the fitted grid search.
y_pred_NC = gs10cv_nc.predict(X_test_NC)

In [29]:
# Per-class precision, recall and F1 on the reduced-feature test set.
print(classification_report(y_test_NC, y_pred_NC))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3249
           1       0.93      0.82      0.87       331

    accuracy                           0.98      3580
   macro avg       0.95      0.90      0.93      3580
weighted avg       0.98      0.98      0.98      3580



In [30]:
print ("Confusion Matrix:")
# `confusionMatrix` is not defined in this notebook - presumably it comes from
# the star-import of `auxiliars` (confirm). Rendered as a Real x Predicted table.
confusionMatrix(y_test_NC, y_pred_NC, classes = [0,1])

Confusion Matrix:


Predicted,0,1
Real,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3228,21
1,61,270


In [31]:
print("Test Error:")
# Misclassification rate on the reduced-feature test set, in percent. Reuses the
# existing `y_pred_NC` rather than running the model over X_test_NC a second time.
(1-accuracy_score(y_test_NC, y_pred_NC))*100

Test Error:


2.2905027932960897