### GaussianNB Classifier to Predict Categories (Machine Learning vs. Business Software)

In [1]:
pwd

'/home/jovyan/ipynb'

In [2]:
from os import chdir
chdir('/home/jovyan/')

In [3]:
import library.db_helper as db
import library.functions as fy

In [4]:
import pandas as pd
import numpy as np

#### Setup a train/test/split

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
text_query = '''
SELECT text
FROM page
'''
X = db.query_to_dataframe(text_query)
X.shape

(2449, 1)

In [7]:
X.values.ravel().shape

(2449,)

In [8]:
catid_query = '''
SELECT category_cid
FROM category_page
'''
y = db.query_to_dataframe(catid_query)
y.shape

(2449, 1)

In [9]:
y.values.ravel().shape

(2449,)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X.values.ravel(), y.values.ravel())

#### Build a Pipeline for a Bernoulli NB Classifier

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import GaussianNB

In [12]:
gnb_pipeline = Pipeline([
    ('cvt', CountVectorizer(min_df=3, ngram_range=(1,2))),
    ('svd', TruncatedSVD(n_components=300)),
    ('clf', GaussianNB())
])

In [14]:
gnb_pipeline.fit(X_train, y_train)

Pipeline(steps=[('cvt', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_..., n_components=300, n_iter=5,
       random_state=None, tol=0.0)), ('clf', GaussianNB(priors=None))])

In [15]:
gnb_pipeline.predict(X_train)

array([0, 0, 0, ..., 1, 0, 0])

In [25]:
gnb_pipeline.score(X_test, y_test)

0.66721044045677003

In [26]:
proba_df = pd.DataFrame(gnb_pipeline.predict_proba(X_train))\
                    .rename(columns={0:'Business Software', 1:'Machine Learning'})
proba_df.head()

Unnamed: 0,Business Software,Machine Learning
0,1.0,9.317267999999999e-36
1,1.0,5.479033e-45
2,1.0,1.917095e-46
3,1.0,1.62642e-37
4,1.0,5.649429e-49


#### Test my Gaussian NB Classifier Model on New Text

In [27]:
def clean_url (url):
    url_list = url.split('/')
    return url_list[-1]

In [28]:
salesops_url = 'https://en.wikipedia.org/wiki/Sales_operations'

In [29]:
salesops_article_name = clean_url(salesops_url)
salesops_article_name

'Sales_operations'

In [30]:
salesops_text = fy.beautify_html_article(salesops_article_name)

In [31]:
salesops_test = np.array([salesops_text])

In [32]:
gnb_pipeline.predict(salesops_test), gnb_pipeline.predict_proba(salesops_test)

(array([0]), array([[  1.00000000e+00,   2.98182059e-39]]))

#### Generate a Classification Report on my Gaussian NB Model

In [33]:
predicted = gnb_pipeline.predict(X_test)

In [34]:
from sklearn.metrics import classification_report

In [35]:
print(classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.67      0.94      0.78       391
          1       0.65      0.18      0.28       222

avg / total       0.66      0.67      0.60       613

