In [2]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

The 20 Newsgroups dataset is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups. A newsgroup is a forum on the Usenet service for the discussion of a particular topic. Usenet is a distributed discussion system, popular in the 1980s and early 1990s, that allowed people to post articles to various newsgroups. Each document in the dataset is a single post.

In [3]:
# Fetch the 20 Newsgroups corpus (subset='all') and unpack the post texts
# together with their class indices; the indices map into
# newsgroups.target_names.
newsgroups = fetch_20newsgroups(subset='all')
data, target = newsgroups.data, newsgroups.target

In [4]:
# Hold out 20% of the posts for testing and train on the remaining 80%.
# The fixed random_state keeps the split reproducible across runs.
data_train, data_test, target_train, target_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

## `CountVectorizer` ##
`CountVectorizer` is scikit-learn's implementation of a generic vectorization method. Different parameters can be added to tweak aspects of the process. The general steps used are:
1. Tokenization
2. Lowercasing
3. Removing Stop Words (optional; disabled by default and only applied when the `stop_words` parameter is set)
4. Building a Vocabulary (Finding all unique words in the corpus). This step determines n--the length of the resulting vectors.
5. Counting occurrences
6. Output one vector for each document in the corpus

`CountVectorizer` also allows an N-gram range to be specified, so that combinations of consecutive words are captured in the resulting vector. Using N-grams increases the dimensionality of the vector space and can make it prohibitively large.

In [5]:
# Bag-of-words model: the vocabulary is learned from the training posts only,
# and the held-out posts are then encoded against that same vocabulary.
bow_vectorizer = CountVectorizer(ngram_range=(1, 1))  # single tokens only (the default range)
bow_train = bow_vectorizer.fit_transform(raw_documents=data_train)
bow_test = bow_vectorizer.transform(raw_documents=data_test)

In [6]:
# 2-gram model: ngram_range=(2, 2) keeps only pairs of consecutive tokens,
# so single words do not appear as features. As with the bag-of-words model,
# the vocabulary is fitted on the training posts only.
ngram_vectorizer = CountVectorizer(ngram_range=(2, 2))
ngram_train = ngram_vectorizer.fit_transform(raw_documents=data_train)
ngram_test = ngram_vectorizer.transform(raw_documents=data_test)

## `TfidfTransformer` ##

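`TfidfTransformer` computes a normalized Term Frequency vector for each document, rescaling the raw counts so that long and short documents become comparable. By default each vector is scaled to unit L2 norm; with `norm='l1'` the counts are instead divided by the total number of words in the document. If the flag `use_idf=True` is set, Inverse Document Frequency is also applied. Inverse Document Frequency is the logarithmically scaled inverse of the fraction of documents that contain the word. IDF measures the importance of a word relative to the corpus: words that occur in many documents receive a lower weight.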

In [8]:
# Convert the raw bag-of-words counts into normalized term frequencies (TF).
# IDF weighting is switched off here, so only the normalization is applied.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(bow_train)
bow_train_tf = tf_transformer.transform(bow_train)

## Multinomial Naive Bayes Classifier ##

A Multinomial Naive Bayes Classifier is a Naive Bayes classifier (Bayes' theorem combined with the "naive" assumption that all features are conditionally independent of each other given the class) which assumes a multinomial distribution for the data. This is a good assumption to make for text data, where the features are counts of words.

In [9]:
# Fit a Multinomial Naive Bayes model on the TF-weighted training matrix and
# use it to guess the newsgroup of two hand-written example posts.
classifier = MultinomialNB()
classifier.fit(bow_train_tf, target_train)

# New documents must pass through the same fitted vectorizer and TF
# transformer that produced the training features.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new = bow_vectorizer.transform(docs_new)
X_new_tf = tf_transformer.transform(X_new)
predicted = classifier.predict(X_new_tf)

# Predictions are class indices; look up the readable newsgroup name.
for text, label_idx in zip(docs_new, predicted):
    label = newsgroups.target_names[label_idx]
    print('%r => %s' % (text, label))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.sys.ibm.pc.hardware


In [10]:
# Chain the TF-IDF weighting and the Naive Bayes classifier into a single
# estimator. Vectorization is not part of the pipeline: it is fitted on and
# applied to count matrices that were produced beforehand.
text_classifier = Pipeline(
    steps=[
        ('tf', TfidfTransformer(use_idf=True)),
        ('classifier', MultinomialNB()),
    ]
)

## Performance Evaluation ##
Now we can evaluate the performance of different forms of vectorization and compare them to one another on a couple of classifiers. With a MultinomialNB classifier, we achieve slightly better results by using bigram vectorization.

In [11]:
# Refit the same pipeline on each representation in turn and report accuracy
# (the fraction of test posts whose predicted class matches the true class).
# After this cell, text_classifier and predicted reflect the 2-grams run.
for banner, train_matrix, test_matrix in (
    ('Accuracy achieved for Bag-of-Words model is ', bow_train, bow_test),
    ('Accuracy achieved for 2-grams model is ', ngram_train, ngram_test),
):
    text_classifier.fit(train_matrix, target_train)
    predicted = text_classifier.predict(test_matrix)
    print(banner, np.mean(predicted == target_test))

Accuracy achieved for Bag-of-Words model is  0.7652519893899205
Accuracy achieved for 2-grams model is  0.8010610079575596


## Support Vector Machine ##

SVM finds the best boundary to separate data points into classes. It is a good classifier for text but has high computational requirements. 

In [12]:
# Rebuild the pipeline around a linear Support Vector Machine: SGDClassifier
# with hinge loss and an L2 penalty, trained by stochastic gradient descent.
# tol=None together with max_iter=5 is expected to run exactly five passes
# over the data with no early stopping (see the SGDClassifier documentation).
# This pipeline uses plain term frequencies (use_idf=False).
text_classifier = Pipeline(
    steps=[
        ('tf', TfidfTransformer(use_idf=False)),
        (
            'classifier',
            SGDClassifier(
                loss='hinge',
                penalty='l2',
                alpha=1e-3,
                random_state=42,
                max_iter=5,
                tol=None,
            ),
        ),
    ]
)

In [13]:
# Repeat the evaluation with the SVM pipeline: refit on each representation
# and report the share of test posts classified correctly.
# After this cell, text_classifier and predicted reflect the 2-grams run.
for banner, train_matrix, test_matrix in (
    ('Accuracy achieved for Bag-of-Words model is ', bow_train, bow_test),
    ('Accuracy achieved for 2-grams model is ', ngram_train, ngram_test),
):
    text_classifier.fit(train_matrix, target_train)
    predicted = text_classifier.predict(test_matrix)
    print(banner, np.mean(predicted == target_test))

Accuracy achieved for Bag-of-Words model is  0.8236074270557029
Accuracy achieved for 2-grams model is  0.8697612732095491
