### Bernoulli Naive Bayes Classifier to Predict Categories (Machine Learning vs. Business Software)

In [1]:
pwd

'/home/jovyan/ipynb'

In [2]:
from os import chdir
chdir('/home/jovyan/')

In [3]:
import library.db_helper as db
import library.functions as fy

In [4]:
import pandas as pd
import numpy as np

#### Setup a train/test/split

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
text_query = '''
SELECT text
FROM page
'''
X = db.query_to_dataframe(text_query)
X.shape

(2449, 1)

In [7]:
X.values.ravel().shape

(2449,)

In [8]:
catid_query = '''
SELECT category_cid
FROM category_page
'''
y = db.query_to_dataframe(catid_query)
y.shape

(2449, 1)

In [9]:
y.values.ravel().shape

(2449,)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X.values.ravel(), y.values.ravel())

#### Build a Pipeline for a Bernoulli NB Classifier

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

In [12]:
bnnb_pipeline = Pipeline([
    ('cvt', CountVectorizer(min_df=3, ngram_range=(1,2))),
    ('svd', TruncatedSVD(n_components=300)),
    ('clf', BernoulliNB())
])

In [13]:
bnnb_pipeline.fit(X_train, y_train)

Pipeline(steps=[('cvt', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_...te=None, tol=0.0)), ('clf', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))])

In [14]:
bnnb_pipeline.predict(X_train)

array([1, 1, 0, ..., 1, 1, 1])

In [15]:
bnnb_pipeline.score(X_test, y_test)

0.87275693311582381

In [16]:
proba_df = pd.DataFrame(bnnb_pipeline.predict_proba(X_train))\
                    .rename(columns={0:'Business Software', 1:'Machine Learning'})
proba_df.head()

Unnamed: 0,Business Software,Machine Learning
0,0.000386,0.999614
1,6.2e-05,0.999938
2,0.961748,0.038252
3,0.999525,0.000475
4,0.356613,0.643387


#### Test my Bernoulli NB Classifier Model on New Text

In [17]:
def clean_url (url):
    url_list = url.split('/')
    return url_list[-1]

In [18]:
salesops_url = 'https://en.wikipedia.org/wiki/Sales_operations'

In [19]:
salesops_article_name = clean_url(salesops_url)
salesops_article_name

'Sales_operations'

In [20]:
salesops_text = fy.beautify_html_article(salesops_article_name)

In [21]:
salesops_test = np.array([salesops_text])

In [22]:
bnnb_pipeline.predict(salesops_test), bnnb_pipeline.predict_proba(salesops_test)

(array([0]), array([[ 0.96483325,  0.03516675]]))

#### Generate a Classification Report on my Bernoulli NB Model

In [23]:
predicted = bnnb_pipeline.predict(X_test)

In [24]:
from sklearn.metrics import classification_report

In [25]:
print(classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.91      0.90      0.90       407
          1       0.81      0.82      0.81       206

avg / total       0.87      0.87      0.87       613

