# Support Vector Classifier

In [1]:
import pandas as pd
from io import StringIO
import numpy as np
import pickle 
import numpy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn import svm

Load files

In [2]:
with open('pickles/features_train.pkl', 'rb') as f:
    features_train = pickle.load(f)

with open('pickles/labels_train.pickle', 'rb') as f:
    labels_train = pickle.load(f)

with open('pickles/features_test.pkl', 'rb') as f:
    features_test = pickle.load(f)

with open('pickles/labels_test.pkl', 'rb') as f:
    labels_test = pickle.load(f)


In [3]:
print('SVC parameters')
svc = svm.SVC()
print(svc.get_params())


SVC parameters
{'C': 1.0, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


# GridSearchCV
Create parameter grid

We explore only the linear kernel as it is considered to perform best on a high dimensional dataset such as text

In [12]:
param_grid = {
    'C' : [0.1, 1, 10, 100],
    'gamma' : [1, 0.1, 0.01],
    'degree' : [3, 4, 5], 
    'kernel' : ['linear']
}

grid_search = GridSearchCV(svc,
                        param_grid, 
                        scoring='accuracy', 
                        n_jobs=-1, 
                        cv=3, 
                        verbose=3)
grid_search.fit(features_train, labels_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   27.1s


KeyboardInterrupt: 

In [5]:
print(grid_search.best_estimator_)

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


In [6]:
best_svc = grid_search.best_estimator_
best_svc.fit(features_train, labels_train)

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [7]:
svc_pred = best_svc.predict(features_test)


In [8]:
print("The training accuracy is:")
print(accuracy_score(best_svc.predict(features_train), labels_train))


The training accuracy is:
0.9703264094955489


In [9]:
print("The test accuracy is:")
print(accuracy_score(svc_pred, labels_test))

The test accuracy is:
0.7014218009478673


In [10]:
print("classification report")
print(classification_report(svc_pred, labels_test))

classification report
              precision    recall  f1-score   support

           0       0.73      0.58      0.65        52
           1       0.60      0.63      0.61        43
           2       0.56      0.82      0.67        11
           3       0.84      0.84      0.84        32
           4       0.86      0.80      0.83        46
           5       0.78      0.94      0.85        31
           6       0.76      0.85      0.80        40
           7       0.74      0.55      0.63        51
           8       0.66      0.60      0.63        48
           9       0.53      0.53      0.53        38
          10       0.60      0.87      0.71        30

    accuracy                           0.70       422
   macro avg       0.70      0.73      0.70       422
weighted avg       0.71      0.70      0.70       422



Save the model

In [11]:
with open('pickles/best_svc.pkl', 'wb') as f:
    pickle.dump(best_svc, f)


This model appeaers to be overfitting and gives a f1 score of 0.70