### Import required libraries

In [1]:
import numpy as np

In [2]:
import re

### Load input data

In [3]:
# Restrict the corpus to the four discussion ("talk.*") newsgroups used below.
categories = [
    'talk.politics.guns',
    'talk.politics.misc',
    'talk.politics.mideast',
    'talk.religion.misc',
]

In [4]:
from sklearn.datasets import fetch_20newsgroups

In [5]:
# Load the training split for the selected categories; the fixed seed keeps the
# shuffled document order reproducible across runs.
data_trace = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [6]:
# Names of the loaded categories; the integer labels in `data_trace.target`
# are used as indices into this list.
data_trace.target_names

['talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [7]:
# Number of training documents that were loaded.
len(data_trace.data)

1952

In [8]:
# Number of source file paths; expected to match len(data_trace.data).
len(data_trace.filenames)

1952

In [9]:
# Look up the human-readable category of the first training document.
first_label = data_trace.target[0]
print(data_trace.target_names[first_label])

talk.religion.misc


In [10]:
# Raw text of the first document (headers and body), displayed as a string repr
# so newlines and tabs appear as escape sequences.
data_trace.data[0]

"From: psyrobtw@ubvmsb.cc.buffalo.edu (Robert Weiss)\nSubject: 6 Apr 93   God's Promise in John 16:24\nOrganization: University at Buffalo\nLines: 8\nNews-Software: VAX/VMS VNEWS 1.41\nNntp-Posting-Host: ubvmsb.cc.buffalo.edu\n\n\n\n\tHitherto have ye asked nothing\n\tin my name:\n\task, and ye shall receive,\n\tthat your joy may be full.\n\n\tJohn 16:24\n"

In [None]:
# Print the first document so its newlines and tabs render as real whitespace.
print(data_trace.data[0])

In [None]:
# Integer category labels of the first ten training documents.
data_trace.target[:10]

In [None]:
# Show the category name of each of the first 20 training documents.
# The loop body is indented normally (no pasted "..." REPL prompt), so the cell
# is valid Python even outside IPython's prompt-stripping input transformer.
for t in data_trace.target[:20]:
    print(data_trace.target_names[t])

### The input documents are raw text. To turn them into features, we need to:

> **1. Tokenize the strings to extract individual words. We can use white spaces and punctuation marks as token separators.**
<br>
> **2. Count the number of occurrences of tokens in each document.**
<br>
> **3. Convert counts to probabilities. This involves normalizing and possibly weighting.**



In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training documents and build the
# document-term count matrix in one call.
vectorizer = CountVectorizer()
train_counts = vectorizer.fit_transform(raw_documents=data_trace.data)

# (number of documents, number of distinct tokens)
train_counts.shape

In [None]:
# Mapping from each token to its column index in the count matrix.
dictionary = vectorizer.vocabulary_

# The vocabulary holds one entry per distinct token in the corpus, so show its
# size and a few (token, column index) pairs instead of dumping the whole dict
# into the notebook output.
len(dictionary), list(dictionary.items())[:10]

In [None]:
# Column index assigned to the token 'algorithm'; `.get` returns None when the
# token is not in the fitted vocabulary.
vectorizer.vocabulary_.get('algorithm')

**We now have the token counts. Next, we fit a multinomial naive Bayes classifier, which estimates per-class word probabilities from these counts.**

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Build the classifier, then fit it on the training counts and their labels.
# The trailing semicolon keeps the cell silent, as the original assignment was.
clf = MultinomialNB()
clf.fit(train_counts, data_trace.target);

In [None]:
# Two unseen sentences to classify.
# NOTE(review): the second sentence is about graphics, which is not among the
# selected categories, so it has no genuinely correct label here.
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
]

# Reuse the already-fitted vocabulary: transform only, never re-fit on new text.
new_counts = vectorizer.transform(docs_new)

In [None]:
# Predicted integer category label for each sentence in docs_new.
predicted = clf.predict(new_counts)

In [None]:
# Print the predicted category name for the first and second new sentence.
for position in (0, 1):
    print(data_trace.target_names[predicted[position]])

**Apply the classifier to the test data**

In [None]:
# Load the held-out test split with the same categories and seed as training.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data

# Vectorize with the vocabulary learned on the training set, then classify.
test_count = vectorizer.transform(docs_test)
predicted = clf.predict(test_count)

# Accuracy: fraction of test documents whose predicted label matches the truth.
accuracy = np.mean(predicted == twenty_test.target)
accuracy

**Obtain performance metrics**

In [None]:
from sklearn import metrics

# Per-category precision, recall and F1 on the test split.
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)

In [None]:
from sklearn import metrics

# Confusion matrix on the test split: rows are true categories, columns are
# predicted categories, both in the order of twenty_test.target_names.
metrics.confusion_matrix(
    y_true=twenty_test.target,
    y_pred=predicted,
)