# SVM Multi-class Classification

So far we have used a subset of the 20 Newsgroups data set with Naive Bayes and logistic regression. Now let's try a support vector machine (SVM).



In [1]:
from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to four newsgroups so the example stays small.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Training split, shuffled with a fixed seed so reruns see the same order.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

# Display the class names as the loader reports them (the order shown in the
# output is alphabetical, not the order listed in `categories`).
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

#### Support Vector Classifier



In [2]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss

# Two-stage text classifier: TF-IDF features feeding an SGD-trained model
# with hinge loss and an L2 penalty.
# The step names ('tfidf', 'svc') are what the "<step>__<param>" keys of a
# parameter grid refer to, so they must stay as they are.
tfidf_step = ('tfidf', TfidfVectorizer())
svc_step = (
    'svc',
    SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        max_iter=5,
        tol=None,
        random_state=42,
    ),
)
pipe1 = Pipeline([tfidf_step, svc_step])

# Fit on the raw documents; the pipeline applies the vectorizer before the
# classifier. Left as the last expression so the fitted pipeline is displayed.
pipe1.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [3]:
import numpy as np
from sklearn import metrics

# Evaluate on the held-out test split (same categories and seed as training).
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
y_true = twenty_test.target
pred = pipe1.predict(twenty_test.data)

# Per-class precision / recall / F1, labelled with the newsgroup names.
report = metrics.classification_report(
    y_true,
    pred,
    target_names=twenty_test.target_names,
)
print(report)

# Called as confusion_matrix(true labels, predictions).
conf_mat = metrics.confusion_matrix(y_true, pred)
print("Confusion matrix:\n", conf_mat)

# Fraction of test documents whose predicted label equals the true label.
accuracy = np.mean(pred == y_true)
print("\nOverall accuracy: ", accuracy)

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502

Confusion matrix:
 [[258  11  15  35]
 [  4 379   3   3]
 [  5  33 355   3]
 [  5  10   4 379]]

Overall accuracy:  0.912782956059


This is a little better than the logistic regression model. However, an SVM has several hyperparameters we can tune, so let's search over a few of them with a grid search.

In [5]:
from sklearn.model_selection import GridSearchCV
# Search space for the grid search. Keys use the "<step name>__<parameter>"
# form, so they address the 'tfidf' and 'svc' steps of the pipeline.
use_idf_options = (True, False)   # TF-IDF weighting on vs. off
alpha_options = (1e-2, 1e-3)      # candidate values for the classifier's alpha

parameters = {
    'tfidf__use_idf': use_idf_options,
    'svc__alpha': alpha_options,
}

In [6]:
import inspect

# 5-fold grid search over `parameters`, using all available cores (n_jobs=-1).
#
# `iid` was deprecated in scikit-learn 0.22 and removed in 0.24, so passing it
# unconditionally raises TypeError on current releases. Forward iid=False only
# when the installed GridSearchCV still accepts it: older versions then behave
# exactly as before, and newer versions already average fold scores the
# iid=False way by default.
search_kwargs = {'cv': 5, 'n_jobs': -1}
if 'iid' in inspect.signature(GridSearchCV).parameters:
    search_kwargs['iid'] = False

gs_clf = GridSearchCV(pipe1, parameters, **search_kwargs)

In [8]:
# Run the grid search on the training documents and labels. The result of
# fit() is bound back to `gs_clf`, which also keeps the cell from echoing it.
train_docs, train_labels = twenty_train.data, twenty_train.target
gs_clf = gs_clf.fit(train_docs, train_labels)

In [9]:
# Best score found by the grid search. Per the scikit-learn docs this is the
# mean cross-validated score of the best parameter combination, so it is
# measured on held-out folds of the training data, not on the test split.
gs_clf.best_score_

0.96499694503025402

In [10]:
# Show the winning value for every hyperparameter that was searched,
# listed in alphabetical order of the parameter key.
best_params = gs_clf.best_params_
for param_name in sorted(parameters):
    line = "%s: %r" % (param_name, best_params[param_name])
    print(line)


svc__alpha: 0.001
tfidf__use_idf: True
