In [1]:
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix

In [8]:
# The four newsgroups this notebook works with.
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

# Settings shared by both subsets: same categories, shuffled with a fixed seed
# so repeated runs return the documents in the same order.
fetch_options = dict(categories=categories, shuffle=True, random_state=0)

twenty_train = fetch_20newsgroups(subset='train', **fetch_options)
twenty_test = fetch_20newsgroups(subset='test', **fetch_options)

# Show the label names of the training subset as the cell output.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [5]:
# Quick look at the training subset: number of labels, then the label name
# and raw text of the first sample.
first_label_index = twenty_train.target[0]
first_document = twenty_train.data[0]

print(len(twenty_train.target))
print('First target is', twenty_train.target_names[first_label_index])
print('First document is', first_document)

2257
First target is soc.religion.christian
First document is From: reedr@cgsvax.claremont.edu
Subject: Re: DID HE REALLY RISE???
Organization: The Claremont Graduate School
Lines: 29

In article <Apr.9.01.11.16.1993.16937@athos.rutgers.edu>, emery@tc.fluke.COM (John Emery) writes:
> The one single historic event that has had the biggest impact on the
> world over the centuries is the resurrection of Jesus.  At the same
> time, it is one of the most hotly contested topics....
> 
> Did Jesus Christ really rise from the dead?  Since the eyewitnesses
> are no longer living, we have only their written accounts. ...
> ...  Because of the magnitude of significance
> involved here, either the resurrection is the greatest event in the
> history of man or the greatest deception played on man.
> [massive amounts of data deleted]

John, 

While I will not take the time to rebut you point by point, I will suggest
three current works which I think will be helpful in your quest to answer
this questi

In [7]:
# List the attribute names of each dataset object, one listing per line
# (training subset first, test subset second).
print(dir(twenty_train), dir(twenty_test), sep='\n')

['DESCR', 'data', 'description', 'filenames', 'target', 'target_names']
['DESCR', 'data', 'description', 'filenames', 'target', 'target_names']


In [13]:
# Bag-of-words features: fit the vectorizer on the training documents and
# build their document-term count matrix in the same call.
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(twenty_train.data)

# Report the matrix dimensions and its concrete type.
print(X_train_counts.shape, type(X_train_counts), sep='\n')

(2257, 35788)
<class 'scipy.sparse.csr.csr_matrix'>


In [14]:
# Train a multinomial Naive Bayes classifier on the training count matrix,
# with the newsgroup indices as labels.
model = MultinomialNB().fit(X_train_counts, twenty_train.target)

# Echo the fitted estimator as the cell output.
model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [17]:
# Spot-check the fitted classifier on a few hand-written snippets.
#
# Each snippet is converted to counts with the already-fitted vectorizer
# (transform only, so the vocabulary is not re-learned), and every predicted
# label index is mapped back to its newsgroup name for printing.
docs_new = ['Reading the Bible',
            'OpenGL vs DirectX',
            'Diabetes and glucose',
            'Humanists in the Republican Party',
            'UK in London for vote',]

X_new_counts = count_vectorizer.transform(docs_new)

predicted = model.predict(X_new_counts)

# One "<snippet> => <newsgroup>" line per snippet, in input order.
for doc, category in zip(docs_new, predicted):
    print('{0} => {1}'.format(doc, twenty_train.target_names[category]))

Reading the Bible => soc.religion.christian
OpenGL vs DirectX => comp.graphics
Diabetes and glucose => sci.med
Humanists in the Republican Party => alt.atheism
UK in London for vote => alt.atheism


In [18]:
# Score the classifier on the test subset: vectorize with the already-fitted
# vectorizer, predict, and report the fraction of labels predicted correctly.
#
# NOTE: X_new_counts and predicted are reassigned here, replacing the values
# computed for the hand-written snippets; the classification report and
# confusion matrix cells read this test-set `predicted`.
docs_test = twenty_test.data
X_new_counts = count_vectorizer.transform(docs_test)
predicted = model.predict(X_new_counts)

is_correct = predicted == twenty_test.target
print('Test set accuracy is', np.mean(is_correct))

Test set accuracy is 0.934087882823


In [21]:
# Per-class precision, recall, F1 and support on the test subset, with rows
# labelled by newsgroup name.
report = classification_report(y_true=twenty_test.target,
                               y_pred=predicted,
                               target_names=twenty_test.target_names)
print(report)

                        precision    recall  f1-score   support

           alt.atheism       0.92      0.90      0.91       319
         comp.graphics       0.95      0.95      0.95       389
               sci.med       0.96      0.91      0.93       396
soc.religion.christian       0.91      0.97      0.94       398

           avg / total       0.93      0.93      0.93      1502



In [22]:
# Confusion matrix on the test subset: true labels first, predictions second.
print(confusion_matrix(y_true=twenty_test.target, y_pred=predicted))

[[288   4   3  24]
 [  8 370   8   3]
 [ 12  13 360  11]
 [  5   4   4 385]]
