In [1]:
import numpy as np
from scipy.sparse import vstack

from sklearn.datasets import fetch_20newsgroups
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [2]:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
twenty_data = fetch_20newsgroups(subset='all',
                                 categories=categories,
                                 shuffle=True,
                                 remove=('headers', 'footers', 'quotes'),
                                 random_state=0)

twenty_data.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [3]:
print(len(twenty_data.target))
print("First target is", twenty_data.target_names[twenty_data.target[0]])
print("First document is", twenty_data.data[0])

3759
First target is alt.atheism
First document is 


But of course YOUR version of YOUR position has been included in the
Charley Challenges, so your claim above is a flat-out lie.  Further,
only last week you claimed that you "might not" answer the Challenges
because you were turned off by "included text".  So which is it, do
you want your context included in my articles or not?  Come to think
of it, this contradiction has the makings of a new entry in the next
Challenges post.

By the way, I've kept every bloody thing that you've written related
to this thread, and will be only too pleased to re-post any of it to
back my position.  You seem to have forgotten that you leave an
electronic paper trail on the net.



Now, now, let's not change the subject.  Wouldn't it be best to finish
up the thread in question before you begin new ones?


In [4]:
# Set up count vectorizer
count_vectorizer = CountVectorizer()
# First fit, then transform
# I.e. first count the number of words in each document,
# and then convert each document into a vector of word counts
X = count_vectorizer.fit_transform(twenty_data.data)

# 'shape' gives us the number of rows and number of columns
# of X
# In this case, the number of rows is the number of documents
# And the number of columns is the number of individual words
print(X.shape)
print(type(X))

(3759, 38181)
<class 'scipy.sparse.csr.csr_matrix'>


In [5]:
y = twenty_data.target

In [6]:
# X_train = training set inputs (each input is a term-document matrix)
# y_train = training set outputs (each output is a category like 'graphics', 'medicine', 'christianity')
# X_test = test set inputs (again, each input is a t-d matrix)
# y_test = test set outputs (again, a category)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)

In [7]:
# Say what model I want to use
model = LinearSVC(C=1000)

# Learn the relationship between the inputs and output
# The first parameter represents the inputs/features
# The second parameter represents the outputs/targets
model.fit(X_train, y_train)

LinearSVC(C=1000, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [8]:
# X_test contains the test set inputs
# These inputs are unseen by the model
predicted = model.predict(X_test)

# y_test contains the correct categories
# predicted contains our predicted categories
# Accuracy = Percentage of correct predictions
print("Test set accuracy is", np.mean(predicted == y_test))

Test set accuracy is 0.737588652482


In [9]:
print(classification_report(y_test,
                            predicted,
                            target_names=twenty_data.target_names))

                        precision    recall  f1-score   support

           alt.atheism       0.68      0.59      0.63       116
         comp.graphics       0.84      0.79      0.82       155
               sci.med       0.86      0.68      0.76       140
soc.religion.christian       0.62      0.85      0.72       153

           avg / total       0.76      0.74      0.74       564



In [10]:
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.15, random_state=0)

In [11]:
for C in (1, 10, 100, 1000):
    model = LinearSVC(C=1000)
    model.fit(X_train, y_train)
    predicted = model.predict(X_validation)
    print("Test set accuracy for C = {0} is {1}".format(C, np.mean(predicted == y_validation)))

Test set accuracy for C = 1 is 0.7729166666666667
Test set accuracy for C = 10 is 0.75625
Test set accuracy for C = 100 is 0.7645833333333333
Test set accuracy for C = 1000 is 0.75
