In [1]:
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
twenty_all = fetch_20newsgroups(subset='all',
                                  categories=categories,
                                  shuffle=True,
                                  random_state=0,
                                  remove=('headers', 'footers', 'quotes'))

twenty_all.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [3]:
print(len(twenty_all.target))
print('First target is', twenty_all.target_names[twenty_all.target[0]])
#print('First document is', twenty_all.data[0])

3759
First target is alt.atheism


In [4]:
count_vectorizer = CountVectorizer()
# creating the term document matrix
X = count_vectorizer.fit_transform(twenty_all.data)
print(X.shape)
print(type(X))

(3759, 38181)
<class 'scipy.sparse.csr.csr_matrix'>


In [5]:
y = twenty_all.target

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)

In [7]:
# fitting the model 
#
model = LinearSVC(C = 1000)
model.fit(X_train, y_train)

LinearSVC(C=1000, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [8]:
predicted = model.predict(X_test)
print('Test set accuracy is', np.mean(predicted == y_test))

Test set accuracy is 0.765957446809


In [9]:
# classification report
print(classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.62      0.67      0.64       116
          1       0.87      0.77      0.82       155
          2       0.80      0.79      0.79       140
          3       0.77      0.81      0.79       153

avg / total       0.77      0.77      0.77       564



In [10]:
# confusion matrix
print(confusion_matrix(y_test, predicted))

[[ 78   7  11  20]
 [ 19 120   9   7]
 [ 12   7 110  11]
 [ 17   4   8 124]]


### Step 7: validation set

In [11]:
X_train2, X_validation, y_train2, y_validation = train_test_split(X_train, y_train, 
                                                                          test_size=0.15, random_state=0)

In [12]:
# new training set is smaller:
print('size of previous trainnig set: ', X_train.shape)
print('size of new trainnig set: ', X_train2.shape)

size of previous trainnig set:  (3195, 38181)
size of new trainnig set:  (2715, 38181)


In [16]:
trade_off = [1, 10, 100, 1000]


for point in trade_off:

    model = LinearSVC(C = point)
    model.fit(X_train2, y_train2)
    predicted = model.predict(X_validation)
    print('Validation set accuracy of model with C = {0} is'.format(point), np.mean(predicted == y_validation))
    #print(classification_report(y_test2, predicted))
    #print(confusion_matrix(y_test2, predicted))

Validation set accuracy of model with C = 1 is 0.825
Validation set accuracy of model with C = 10 is 0.814583333333
Validation set accuracy of model with C = 100 is 0.8125
Validation set accuracy of model with C = 1000 is 0.777083333333


In [14]:
# best model
model = LinearSVC(C = 1)
model.fit(X_train, y_train)
predicted = model.predict(X_test)
print('Test set accuracy is', np.mean(predicted == y_test))
# classification report
print('The classification report: \n', classification_report(y_test, predicted))
# confusion matrix
print('The confusion matrix for the best model: \n', confusion_matrix(y_test, predicted))

Test set accuracy is 0.794326241135
The classification report: 
              precision    recall  f1-score   support

          0       0.67      0.71      0.69       116
          1       0.88      0.86      0.87       155
          2       0.76      0.81      0.79       140
          3       0.85      0.77      0.81       153

avg / total       0.80      0.79      0.80       564

The confusion matrix for the best model: 
 [[ 82   3  14  17]
 [  8 134  13   0]
 [ 10  12 114   4]
 [ 22   4   9 118]]
