In [None]:
# This notebook is based on sklearn's tutorial 'Working with Text Data'

In [1]:
import sklearn

In [2]:
# The subset of newsgroups that the rest of the notebook works with
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [3]:
#Loading the 20 Newsgroups dataset
from sklearn.datasets import fetch_20newsgroups

In [4]:
# Load the training split for the selected categories only; the documents
# are shuffled with a fixed random_state
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [5]:
# fetch_20newsgroups puts the documents in the .data attribute, so its
# length is the number of training documents that were loaded
len(twenty_train.data)

2257

In [6]:
# Show the first three lines of the first text in the collection
first_doc_lines = twenty_train.data[0].split("\n")
print("\n".join(first_doc_lines[:3]))

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [7]:
# Extracting features from text data
# Make sure you read the part of the tutorial about the bag-of-words
# representation

In [8]:
# A vectorizer is used to extract features from each item in the dataset.
# fit_transform learns the vocabulary from the training documents and
# returns their docs x features count matrix in a single call.
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

(2257, 35788)

In [11]:
# CountVectorizer has extracted a docs x features matrix from the data;
# .shape is therefore (number of documents, number of features)
X_train_counts.shape

(2257, 35788)

In [12]:
# vocabulary_ associates each extracted word with its feature index, so the
# index of a specific word can be looked up like this
count_vect.vocabulary_.get('algorithm')

4690

In [13]:
# For many if not most applications, it is better to WEIGHT terms with
# respect to a document instead of simply COUNTING their occurrences.
# This can be done using one of the weighting methods: mutual information
# (MI), TF-IDF, information gain (IG), all of which are implemented in
# scikit-learn

In [14]:
# For instance, to compute document vector representations in which
# words are weighted using TF-IDF, you can use TfidfTransformer
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()

In [15]:
# Fit the transformer on the training counts and re-weight them in one call;
# the shape is displayed so it can be compared with the count matrix
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2257, 35788)

In [16]:
# Training a multinomial Naive Bayes (NB) classifier on the TF-IDF features,
# using the category index of each training document as its label
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [18]:
# Testing on a toy dataset
# The new documents go through the vectorizer and the transformer that were
# already fitted on the training data, hence transform() rather than
# fit_transform()
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [20]:
# Predict a category index for every toy document, then print each document
# next to the name of the category it was assigned
predicted = clf.predict(X_new_tfidf)
for text, label_idx in zip(docs_new, predicted):
    label_name = twenty_train.target_names[label_idx]
    print('%r => %s' % (text, label_name))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [21]:
# A Pipeline chains count extraction, weighting and classification into one
# object, so that all three steps are carried out in one go
from sklearn.pipeline import Pipeline
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [23]:
# Proper testing on the 20newsgroups test split, restricted to the same
# categories that were used for training
import numpy as np  # NOTE(review): not referenced anywhere in this cell
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data

# Train the pipeline on the raw training texts, then label the test texts
text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_test)

In [24]:
# Evaluation

In [25]:
# Using the metrics package: build the per-category report for the test
# predictions, labelled with the category names, and print it
from sklearn import metrics
report_text = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report_text)

                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

           avg / total       0.88      0.83      0.84      1502



In [26]:
# Confusion matrix: each row is a true category and each column a predicted
# category, both in the order of twenty_test.target_names
metrics.confusion_matrix(twenty_test.target, predicted)

array([[192,   2,   6, 119],
       [  2, 347,   4,  36],
       [  2,  11, 322,  61],
       [  2,   2,   1, 393]])

In [27]:
# Using a different learning algorithm: a linear SVM (SGDClassifier with hinge loss)

In [28]:
from sklearn.linear_model import SGDClassifier

# Linear classifier with hinge loss and an L2 penalty, trained by SGD.
# NOTE(review): tol=None presumably turns off tolerance-based early stopping
# so that max_iter passes are run - confirm against the SGDClassifier docs.
svm_clf = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)

# Same 'vect' and 'tfidf' steps as the Naive Bayes pipeline; only the final
# estimator differs. This rebinds text_clf to the new pipeline.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', svm_clf),
])

In [29]:
# Fit the current text_clf pipeline on the training texts and overwrite
# `predicted` with its predictions for the test documents
text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_test)

In [30]:
# Evaluating the results: per-category report for the current predictions,
# labelled with the category names
report_text = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report_text)

                        precision    recall  f1-score   support

           alt.atheism       0.95      0.81      0.87       319
         comp.graphics       0.88      0.97      0.92       389
               sci.med       0.94      0.90      0.92       396
soc.religion.christian       0.90      0.95      0.93       398

           avg / total       0.92      0.91      0.91      1502

