#### Import packages

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Work with a four-newsgroup subset of the 20 Newsgroups corpus.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Training split is shuffled with a fixed seed; the test split keeps its
# original order.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=False,
    random_state=42,
)

In [3]:
# Learn the vocabulary from the training posts, then turn those same posts
# into a document-term count matrix.
count_vect = CountVectorizer()
count_vect.fit(twenty_train.data)
X_train_counts = count_vect.transform(twenty_train.data)

In [4]:
# Term-frequency-only weighting (IDF disabled), fitted on the raw counts.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)

# Apply the fitted transformer to the training counts.
X_train_tf = tf_transformer.transform(X_train_counts)

In [5]:
# Full TF-IDF weighting (default settings): fit on the training counts,
# then re-weight those counts. This is the representation the classifier
# below is trained on.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)

In [6]:
# Train a multinomial Naive Bayes classifier on the TF-IDF features, using
# the integer newsgroup labels of the training split as targets.
clf = MultinomialNB()
clf.fit(X_train_tfidf, twenty_train.target)


In [7]:
# Sanity check on two hand-written sentences.
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
]

# Reuse the already-fitted vectorizer and TF-IDF transformer (transform only,
# no re-fitting) so the features line up with what the classifier was
# trained on.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# `predicted` holds integer class indices; map them back to newsgroup names.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [8]:
X_test_counts = count_vect.transform(twenty_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

predicted2 = clf.predict(X_test_tfidf)

In [9]:
for doc, category in zip(X_test_tfidf, predicted2):
     print('%r => %s' % (doc, twenty_train.target_names[category]))

<1x35788 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format> => comp.graphics
<1x35788 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format> => soc.religion.christian
<1x35788 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format> => soc.religion.christian
<1x35788 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format> => comp.graphics
<1x35788 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format> => soc.religion.christian


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Ground-truth labels and model predictions for the held-out split.
# Both are defined here so this cell runs on a fresh kernel: the predictions
# are recomputed from the raw test posts through the fitted
# vectorizer -> TF-IDF -> classifier chain.
y_test = twenty_test.target
y_predict = clf.predict(
    tfidf_transformer.transform(count_vect.transform(twenty_test.data))
)

print('accuracy : %f' % accuracy_score(y_test, y_predict))

# Rows are true classes, columns are predicted classes, both ordered by
# class index (see twenty_test.target_names for the names).
cnf_matrix = confusion_matrix(y_test, y_predict)
print('Confusion matrix:')
print(cnf_matrix)