# Document Classifier with Multinomial Naive Bayes

> Multinomial Naive Bayes is suited to discrete features such as word counts. It is useful for classifying reports, emails, medical information, and many other kinds of text with recurring word patterns.


      



In [None]:
# Import libraries
import numpy as np
from sklearn.datasets import fetch_20newsgroups # the dataset 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer # generates weights for the words
from sklearn.pipeline import Pipeline # allocates the model in a pipeline
from sklearn.model_selection import GridSearchCV # optimizes hyperparameters
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
# Bare name: the notebook displays the function's repr, confirming the import resolved
fetch_20newsgroups

<function sklearn.datasets._twenty_newsgroups.fetch_20newsgroups>

In [None]:
# Categories to load.
# Only 4 of the 20 available newsgroups are used in this example,
# which keeps the classification process fast.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [None]:
# Training split, restricted to the selected categories.
# A fixed random_state makes the shuffle order reproducible.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [None]:
# Class names; the integer labels in twenty_train.target index into this list
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [None]:
len(twenty_train.data) # number of training documents

2257

In [None]:
# Preview the first 10 lines of the first training document
first_doc_lines = twenty_train.data[0].split("\n")
print("\n".join(first_doc_lines[:10]))

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton
Organization: The City University
Lines: 14

Does anyone know of a good way (standard PC application/PD utility) to
convert tif/img/tga files into LaserJet III format.  We would also like to
do the same, converting to HPGL (HP plotter) files.



In [None]:
# Class name of the first training document
first_label = twenty_train.target[0]
print(twenty_train.target_names[first_label])

comp.graphics


In [None]:
# Labels are stored as an array of integer class indices rather than strings
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2])

In [None]:
# Class names of the first 10 training records, one per line
first_ten_names = [twenty_train.target_names[label] for label in twenty_train.target[:10]]
print("\n".join(first_ten_names))

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


## Bag of Words



In [None]:
# Tokenizing: learn the vocabulary from the training documents and build the
# document-term count matrix in one step.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
# The stray vocabulary_.get(u'algorithm') lookup was removed: as a mid-cell
# expression its value was neither displayed nor stored, so it had no effect.
# Shape is (number of documents, vocabulary size).
X_train_counts.shape


(2257, 35788)

In [None]:
# Term frequencies only: use_idf = False turns off the inverse-document-frequency weighting.
# Done in two explicit steps: fit() on the counts, then transform() them.
tf_transformer = TfidfTransformer(use_idf = False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(2257, 35788)

In [None]:
# Full TF-IDF weighting this time (use_idf is left at its default), so the values
# differ from a TF-only transform even though the matrix shape is the same.
# fit_transform() fits and transforms in a single call.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2257, 35788)

In [None]:
# Train a Multinomial Naive Bayes classifier on the TF-IDF features
clf = MultinomialNB()
clf = clf.fit(X_train_tfidf, twenty_train.target)  # fit() returns the fitted estimator

In [None]:
# Classify a new document. It must go through the already-fitted vectorizer and
# TF-IDF transformer (transform only, no re-fitting) before reaching the classifier.
docs_new = ['Antibiotics']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Show each document next to the name of its predicted class
for doc, class_index in zip(docs_new, predicted):
    class_name = twenty_train.target_names[class_index]
    print('%r => %s' % (doc, class_name))

'Antibiotics' => sci.med


In [None]:
# Pipeline: a composite classifier that chains
# vectorizer => TF-IDF transform => classifier
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(pipeline_steps)

In [None]:
# Fit the whole pipeline on the raw training documents and their labels
text_clf = text_clf.fit(X=twenty_train.data, y=twenty_train.target)

In [None]:
# Accuracy of the pipeline on the test split (same categories as training)
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)

# Fraction of test documents whose predicted label equals the true label
is_correct = predicted == twenty_test.target
np.mean(is_correct)

0.8348868175765646

In [None]:
# Per-class precision, recall and F1 on the test split
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)


                        precision    recall  f1-score   support

           alt.atheism       0.97      0.60      0.74       319
         comp.graphics       0.96      0.89      0.92       389
               sci.med       0.97      0.81      0.88       396
soc.religion.christian       0.65      0.99      0.78       398

              accuracy                           0.83      1502
             macro avg       0.89      0.82      0.83      1502
          weighted avg       0.88      0.83      0.84      1502



In [None]:
# Confusion matrix on the test split (rows: true classes, columns: predicted classes)
metrics.confusion_matrix(y_true=twenty_test.target, y_pred=predicted)


array([[192,   2,   6, 119],
       [  2, 347,   4,  36],
       [  2,  11, 322,  61],
       [  2,   2,   1, 393]])

In [None]:
# Hyperparameter grid for GridSearchCV.
# Keys use the "<pipeline step>__<parameter>" form so each value reaches the right step.
parameters = {
    # unigrams only vs. unigrams + bigrams
    'vect__ngram_range': [(1, 1), (1, 2)],
    # with and without inverse-document-frequency weighting
    'tfidf__use_idf': (True, False),
    # candidate values for the classifier's alpha
    'clf__alpha': (1e-2, 1e-3),
}

In [None]:
# Grid search over the pipeline's hyperparameters; n_jobs=-1 uses all available cores
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    n_jobs=-1,
)

In [None]:
# Run the search on the first 400 training documents only
# (presumably to keep the search fast - the full training set is not used here)
subset_docs = twenty_train.data[:400]
subset_labels = twenty_train.target[:400]
gs_clf = gs_clf.fit(subset_docs, subset_labels)

In [None]:
# Test: each list element is classified as a separate one-word document;
# only the prediction for the first one ('Lord') is displayed.
sample_docs = ['Lord', 'see', 'priest', 'intel']
sample_predictions = gs_clf.predict(sample_docs)
twenty_train.target_names[sample_predictions[0]]

'soc.religion.christian'

In [None]:
# Score - did it improve with tuning?
# NOTE(review): best_score_ is the mean cross-validated score of the best parameter
# combination, computed on the 400-document training subset - it is not directly
# comparable to the test-set accuracy reported earlier.
gs_clf.best_score_

0.9349999999999999

In [None]:
# Best value found for each searched hyperparameter, in alphabetical order
for param_name in sorted(parameters):
    best_value = gs_clf.best_params_[param_name]
    print("%s: %r" % (param_name, best_value))

clf__alpha: 0.01
tfidf__use_idf: True
vect__ngram_range: (1, 2)
