In [1]:
#The 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents,
#partitioned (nearly) evenly across 20 different newsgroups. To the best of our knowledge,
#it was originally collected by Ken Lang, probably for his paper 
#“Newsweeder: Learning to filter netnews,” though he does not explicitly mention this
#collection.
#The 20 newsgroups collection has become a popular data set for experiments
#in text applications of machine learning techniques, such as text classification 
#and text clustering.

In [2]:
# Newsgroup labels used for this experiment (a four-class subset of the corpus)
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [3]:
import sklearn
from sklearn.datasets import fetch_20newsgroups

# Load the training split from the local copy of the corpus, keeping only the
# sub-folders named in `categories`. File contents are read and decoded as
# UTF-8 (undecodable bytes are replaced rather than raising), and the
# documents are shuffled with a fixed seed.
# NOTE(review): confirm UTF-8 is the right codec for these files.
dataset = sklearn.datasets.load_files(
    "../text_analytics/data/twenty_newsgroups/20news-bydate-train/",
    categories=categories,
    load_content=True,
    shuffle=True,
    encoding='UTF-8',
    decode_error='replace',
    random_state=0
)

In [4]:
# You can explore the data a bit: the label names, the first three lines of
# the first training document, and the label assigned to that document.
# print() is called as a function on every line so the cell also runs on
# Python 3; with a single argument the output is the same on Python 2.
print(dataset.target_names)
print("\n".join(dataset.data[0].split("\n")[:3]))
print(dataset.target_names[dataset.target[0]])


['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
From: dpc47852@uxa.cso.uiuc.edu (Daniel Paul Checkman)
Subject: Re: Is MSG sensitivity superstition?
Article-I.D.: news.C5wI4F.Dt
sci.med


In [5]:
# Tokenisation: fit a CountVectorizer (default settings) on the training
# documents and convert them to a matrix of token counts in one call.
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(dataset.data)


In [61]:
# How many times does GPU appear?
# How many words in total does your dictionary have?
# Which words are the most common?

In [6]:
from sklearn.feature_extraction.text import TfidfTransformer
# Feature representation
# TF: same transformer class with the IDF re-weighting switched off
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

# TF-IDF: default transformer, fitted and applied to the counts in one call
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)


In [7]:
# Classifier: multinomial naive Bayes trained on the TF-IDF features
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
# fit() hands back the fitted estimator, so re-binding keeps `clf` unchanged
clf = clf.fit(X_train_tfidf, dataset.target)

In [9]:
# Classify two unseen snippets with the objects fitted in the earlier cells
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
]

# Only transform() here: the vocabulary and IDF weights come from training
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Each prediction is used as an index into the list of newsgroup names
for doc, category in zip(docs_new, predicted):
    label = dataset.target_names[category]
    print('%r => %s' % (doc, label))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [11]:
from sklearn.pipeline import Pipeline
# Chain the three stages used above (token counts -> TF-IDF -> naive Bayes)
# into a single estimator that works directly on raw text.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [12]:
import numpy as np

# Load the held-out test split with the same categories and decoding options
# as the training split (only the directory and the shuffle seed differ).
dataset_test = sklearn.datasets.load_files(
    "../text_analytics/data/twenty_newsgroups/20news-bydate-test/",
    categories=categories,
    load_content=True,
    shuffle=True,
    encoding='UTF-8',
    decode_error='replace',
    random_state=42
)
docs_test = dataset_test.data

# Fit the whole pipeline on the raw training documents, then label the test set
text_clf = text_clf.fit(dataset.data, dataset.target)
predicted = text_clf.predict(docs_test)

# Accuracy: fraction of test documents whose predicted label equals the true one
is_correct = predicted == dataset_test.target
np.mean(is_correct)

0.83488681757656458

In [None]:
#What happens if you change the classifier?
#Can you change the vectorizer?
#What happens if you exclude stop words or limit the size of the dictionary?
#Try with other classes.