In [49]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier

In [50]:
# Fetch the 20 Newsgroups corpus (subset='all') and keep handles on
# the post texts and their integer category indices
newsgroups = fetch_20newsgroups(subset='all')
data, target = newsgroups.data, newsgroups.target

In [23]:
# Preview the first post: its first three lines, then its category name
first_post_lines = newsgroups.data[0].split("\n")
print("\n".join(first_post_lines[:3]))

# target holds an index into target_names, so look the name up for display
first_post_category = newsgroups.target_names[newsgroups.target[0]]
print("\n", first_post_category)

From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>
Subject: Pens fans reactions
Organization: Post Office, Carnegie Mellon, Pittsburgh, PA

 rec.sport.hockey


In [24]:
# Hold out 20% of the posts for testing; the fixed random_state keeps
# the split identical from run to run
data_train, data_test, target_train, target_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Bag of Words model: fit the vectorizer on the training posts only, then
# encode both splits with that same fitted vectorizer
bow_vectorizer = CountVectorizer()
bow_train = bow_vectorizer.fit_transform(data_train)
bow_test = bow_vectorizer.transform(data_test)
# Print the training matrix shape so that re-running the cell reproduces
# the shape output recorded beneath it
print(bow_train.shape)

(15076, 149303)


In [46]:
# N-grams model: same counting approach as Bag of Words, but the features
# are two-token sequences (bigrams) instead of single tokens
ngram_vectorizer = CountVectorizer(
    ngram_range=(2, 2),  # (min_n, max_n): bigrams only
)
ngram_train = ngram_vectorizer.fit_transform(data_train)
ngram_test = ngram_vectorizer.transform(data_test)

In [18]:
# Convert the Bag-of-Words counts to a term-frequency (TF) representation;
# use_idf=False switches off inverse-document-frequency weighting
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(bow_train)
bow_train_tf = tf_transformer.transform(bow_train)
print(bow_train_tf.shape)

(15076, 149303)


In [31]:
# Fit a Multinomial Naive Bayes model on the TF features, then use it to
# predict the newsgroup of two unseen snippets
classifier = MultinomialNB()
classifier.fit(bow_train_tf, target_train)

docs_new = ['God is love', 'OpenGL on the GPU is fast']

# New text has to go through the already-fitted vectorizer and TF transformer
# so that it lands in the same feature space the classifier was trained on
X_new = bow_vectorizer.transform(docs_new)
X_new_tf = tf_transformer.transform(X_new)
predicted = classifier.predict(X_new_tf)

# Predictions are category indices; map them to readable newsgroup names
predicted_names = [newsgroups.target_names[index] for index in predicted]
for doc, name in zip(docs_new, predicted_names):
    print('%r => %s' % (doc, name))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.sys.ibm.pc.hardware


In [44]:
# Create a Pipeline that chains TF weighting and Multinomial Naive Bayes,
# so both steps are fitted and applied through a single estimator
text_classifier = Pipeline(
    steps=[
        ('tf', TfidfTransformer(use_idf=False)),
        ('classifier', MultinomialNB()),
    ]
)

In [45]:
# Evaluate performance: refit the pipeline on each feature representation
# and report the fraction of held-out posts that were classified correctly
for message, train_features, test_features in (
    ('Accuracy achieved for Bag-of-Words model is ', bow_train, bow_test),
    ('Accuracy achieved for 2-grams model is ', ngram_train, ngram_test),
):
    text_classifier.fit(train_features, target_train)
    predicted = text_classifier.predict(test_features)
    print(message, np.mean(predicted == target_test))

Accuracy achieved for Bag-of-Words model is  0.7652519893899205
Accuracy achieved for 2-grams model is  0.8074270557029177


In [47]:
# Swap the Pipeline's final step for a linear Support Vector Machine:
# SGDClassifier with hinge loss and an L2 penalty
text_classifier = Pipeline(
    steps=[
        ('tf', TfidfTransformer(use_idf=False)),
        (
            'classifier',
            SGDClassifier(
                loss='hinge',
                penalty='l2',
                alpha=1e-3,
                random_state=42,
                max_iter=5,
                tol=None,
            ),
        ),
    ]
)

In [48]:
# Evaluate performance with the new classifier: refit on each feature
# representation and report the fraction of correct test predictions
for message, train_features, test_features in (
    ('Accuracy achieved for Bag-of-Words model is ', bow_train, bow_test),
    ('Accuracy achieved for 2-grams model is ', ngram_train, ngram_test),
):
    text_classifier.fit(train_features, target_train)
    predicted = text_classifier.predict(test_features)
    print(message, np.mean(predicted == target_test))

Accuracy achieved for Bag-of-Words model is  0.8236074270557029
Accuracy achieved for 2-grams model is  0.8697612732095491


In [None]:
# Parameter grid search to see which parameters are ideal for this specific task.
#
# GridSearchCV addresses the parameters of a Pipeline as
# '<step name>__<parameter name>'. Bare keys such as 'n', 'tf_use_idf' or
# 'alpha' do not name any estimator parameter and are rejected as invalid.
#
# The n-gram range is a CountVectorizer setting, so the searched pipeline has
# to contain the vectorizer itself and must be fitted on the raw training text
# (data_train), not on the precomputed bow_train / ngram_train matrices.
grid_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tf', TfidfTransformer()),
    ('classifier', SGDClassifier(loss='hinge', penalty='l2',
                                 random_state=42,
                                 max_iter=5, tol=None)),
])

# Candidate values; every combination is tried by the grid search
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tf__use_idf': (True, False),
    'classifier__alpha': (1e-2, 1e-3),
}

In [None]:
# Grid search classifier