In [14]:
import pandas as pd
import json
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')
import pickle

In [15]:
with open('../data/100_articles.json', 'r') as f:
    file = json.load(f)

In [16]:
df = pd.DataFrame(file['webscrape'])

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 3 columns):
isCycling    99 non-null object
isRacing     99 non-null object
text         99 non-null object
dtypes: object(3)
memory usage: 2.4+ KB


In [18]:
df[(df['isCycling']=='Y')&(df['isRacing']=='N')]

Unnamed: 0,isCycling,isRacing,text
49,Y,N,"Biking events, from Bike New York to moonlight..."
51,Y,N,"No sideswiping taxis, no fume-belching buses, ..."
85,Y,N,To the Editor: I'm heartened to read that Mayo...
92,Y,N,For decades the bicycle industry has drawn its...
96,Y,N,To the Editor: You report that some New Yorker...


In [19]:
df['c_label'] = [1 if x == 'Y' else 0 for x in df['isCycling']]
df['r_label'] = [1 if x == 'Y' else 0 for x in df['isRacing'] ]

In [20]:
df['label']  = (df['c_label'] + df['r_label']) // 2

In [21]:
df[(df['isCycling']=='Y')&(df['isRacing']=='N')]

Unnamed: 0,isCycling,isRacing,text,c_label,r_label,label
49,Y,N,"Biking events, from Bike New York to moonlight...",1,0,0
51,Y,N,"No sideswiping taxis, no fume-belching buses, ...",1,0,0
85,Y,N,To the Editor: I'm heartened to read that Mayo...,1,0,0
92,Y,N,For decades the bicycle industry has drawn its...,1,0,0
96,Y,N,To the Editor: You report that some New Yorker...,1,0,0


In [22]:
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [23]:
vectorizer = TfidfVectorizer()

In [24]:
matrix = vectorizer.fit_transform(X_train)

In [27]:
matrix

<69x6862 sparse matrix of type '<class 'numpy.float64'>'
	with 23359 stored elements in Compressed Sparse Row format>

In [28]:
mnb = MultinomialNB()
bnb = BernoulliNB()

In [66]:
# cv_results = cross_validate(clf, matrix, y_train, return_train_score=True)

In [67]:
# cv_results

{'fit_time': array([0.00264978, 0.00269008, 0.00276208]),
 'score_time': array([0.00113893, 0.00110984, 0.00072694]),
 'test_score': array([0.70833333, 0.73913043, 0.72727273]),
 'train_score': array([0.84444444, 0.91304348, 0.87234043])}

In [29]:
parameters = {
    'alpha': [0.0, 0.3, 0.7, 1.0],
    'fit_prior': [True, False]
}

In [30]:
m_clf = GridSearchCV(mnb, parameters, cv=5)
b_clf = GridSearchCV(bnb, parameters, cv=5)

In [31]:
m_clf.fit(matrix, y_train)
b_clf.fit(matrix, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [0.0, 0.3, 0.7, 1.0], 'fit_prior': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

## Multinomial NB CV Results:

In [32]:
m_clf.cv_results_

{'mean_fit_time': array([0.00729327, 0.00159898, 0.00176759, 0.00162859, 0.0015667 ,
        0.0016139 , 0.00152812, 0.00159817]),
 'std_fit_time': array([1.04464287e-02, 4.45391432e-05, 2.43657508e-04, 2.20950842e-04,
        1.28846959e-04, 1.26207671e-04, 6.87539988e-05, 7.24789971e-05]),
 'mean_score_time': array([0.00062041, 0.00037689, 0.00040059, 0.00036087, 0.00036173,
        0.000354  , 0.00041475, 0.0003746 ]),
 'std_score_time': array([3.70049304e-04, 1.01515932e-05, 4.72649785e-05, 1.72748040e-05,
        1.78934057e-05, 2.60040249e-06, 1.31678355e-04, 3.63581376e-05]),
 'param_alpha': masked_array(data=[0.0, 0.0, 0.3, 0.3, 0.7, 0.7, 1.0, 1.0],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_fit_prior': masked_array(data=[True, False, True, False, True, False, True, False],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
            

In [33]:
m_clf.grid_scores_

[mean: 0.85507, std: 0.07172, params: {'alpha': 0.0, 'fit_prior': True},
 mean: 0.85507, std: 0.07172, params: {'alpha': 0.0, 'fit_prior': False},
 mean: 0.69565, std: 0.03768, params: {'alpha': 0.3, 'fit_prior': True},
 mean: 0.71014, std: 0.06765, params: {'alpha': 0.3, 'fit_prior': False},
 mean: 0.68116, std: 0.01256, params: {'alpha': 0.7, 'fit_prior': True},
 mean: 0.69565, std: 0.03768, params: {'alpha': 0.7, 'fit_prior': False},
 mean: 0.68116, std: 0.01256, params: {'alpha': 1.0, 'fit_prior': True},
 mean: 0.68116, std: 0.01256, params: {'alpha': 1.0, 'fit_prior': False}]

## Bernoulli NB CV Results

In [34]:
b_clf.cv_results_

{'mean_fit_time': array([0.00194736, 0.00164409, 0.00154309, 0.00154443, 0.00222793,
        0.00183153, 0.00164442, 0.0016171 ]),
 'std_fit_time': array([1.67411095e-04, 4.74126742e-05, 3.45377601e-05, 4.39167799e-05,
        2.90724710e-04, 1.67695365e-04, 2.50298179e-05, 4.18098369e-05]),
 'mean_score_time': array([0.00186968, 0.00054607, 0.00052748, 0.0004982 , 0.0009541 ,
        0.00055232, 0.00054879, 0.00054922]),
 'std_score_time': array([2.51799793e-03, 3.22553464e-05, 3.72377765e-05, 3.72604839e-06,
        3.07716263e-04, 1.65226656e-05, 2.23643236e-05, 2.45678054e-05]),
 'param_alpha': masked_array(data=[0.0, 0.0, 0.3, 0.3, 0.7, 0.7, 1.0, 1.0],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_fit_prior': masked_array(data=[True, False, True, False, True, False, True, False],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
            

In [35]:
b_clf.grid_scores_

[mean: 0.86957, std: 0.09023, params: {'alpha': 0.0, 'fit_prior': True},
 mean: 0.86957, std: 0.09023, params: {'alpha': 0.0, 'fit_prior': False},
 mean: 0.94203, std: 0.05716, params: {'alpha': 0.3, 'fit_prior': True},
 mean: 0.94203, std: 0.05716, params: {'alpha': 0.3, 'fit_prior': False},
 mean: 0.94203, std: 0.07064, params: {'alpha': 0.7, 'fit_prior': True},
 mean: 0.94203, std: 0.07064, params: {'alpha': 0.7, 'fit_prior': False},
 mean: 0.76812, std: 0.07980, params: {'alpha': 1.0, 'fit_prior': True},
 mean: 0.76812, std: 0.07980, params: {'alpha': 1.0, 'fit_prior': False}]

## Bernoulli Scores Better: New Grid Search with Binarize Parameter

In [36]:
parameters = {
    'alpha': [0.0, 0.3, 0.7, 1.0],
    'fit_prior': [True, False],
    'binarize': [0.0, 1.0, 5.0, 10.0]
}
b_clf = GridSearchCV(bnb, parameters, cv=5)
b_clf.fit(matrix, y_train)
b_clf.grid_scores_

[mean: 0.86957, std: 0.09023, params: {'alpha': 0.0, 'binarize': 0.0, 'fit_prior': True},
 mean: 0.86957, std: 0.09023, params: {'alpha': 0.0, 'binarize': 0.0, 'fit_prior': False},
 mean: 0.68116, std: 0.01256, params: {'alpha': 0.0, 'binarize': 1.0, 'fit_prior': True},
 mean: 0.68116, std: 0.01256, params: {'alpha': 0.0, 'binarize': 1.0, 'fit_prior': False},
 mean: 0.68116, std: 0.01256, params: {'alpha': 0.0, 'binarize': 5.0, 'fit_prior': True},
 mean: 0.68116, std: 0.01256, params: {'alpha': 0.0, 'binarize': 5.0, 'fit_prior': False},
 mean: 0.68116, std: 0.01256, params: {'alpha': 0.0, 'binarize': 10.0, 'fit_prior': True},
 mean: 0.68116, std: 0.01256, params: {'alpha': 0.0, 'binarize': 10.0, 'fit_prior': False},
 mean: 0.94203, std: 0.05716, params: {'alpha': 0.3, 'binarize': 0.0, 'fit_prior': True},
 mean: 0.94203, std: 0.05716, params: {'alpha': 0.3, 'binarize': 0.0, 'fit_prior': False},
 mean: 0.68116, std: 0.01256, params: {'alpha': 0.3, 'binarize': 1.0, 'fit_prior': True},
 me

In [37]:
model = b_clf.best_estimator_

In [38]:
test_matrix = vectorizer.transform(X_test)
predictions = model.predict(test_matrix)
probas = model.predict_proba(test_matrix)

In [39]:
print(confusion_matrix(y_test, predictions))

[[10  0]
 [ 2 18]]


In [40]:
print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

          0       0.83      1.00      0.91        10
          1       1.00      0.90      0.95        20

avg / total       0.94      0.93      0.93        30



In [41]:
print(accuracy_score(y_test, predictions))

0.9333333333333333


In [42]:
print(confusion_matrix(y_test, probas[:,1:] > 0.00001))

[[10  0]
 [ 0 20]]


In [43]:
model.classes_

array([0, 1])

## Retrain on all data for best model

In [44]:
b_clf.best_estimator_

BernoulliNB(alpha=0.3, binarize=0.0, class_prior=None, fit_prior=True)

In [45]:
v = TfidfVectorizer()
final_matrix = v.fit_transform(df['text'])

In [46]:
final_matrix

<99x8305 sparse matrix of type '<class 'numpy.float64'>'
	with 31834 stored elements in Compressed Sparse Row format>

In [47]:
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(v, f)

In [48]:
final_model = BernoulliNB(alpha=0.3, binarize=0.0, class_prior=None, fit_prior=True)
final_model.fit(final_matrix, df['label'])

BernoulliNB(alpha=0.3, binarize=0.0, class_prior=None, fit_prior=True)

In [49]:
cross_validate(final_model, final_matrix, df['label'], cv=5)

{'fit_time': array([0.00372672, 0.00273514, 0.00274014, 0.00219679, 0.00240397]),
 'score_time': array([0.00104427, 0.00080895, 0.00082588, 0.00067616, 0.00089407]),
 'test_score': array([0.95238095, 0.85714286, 1.        , 1.        , 0.94736842]),
 'train_score': array([1.        , 0.98717949, 1.        , 1.        , 1.        ])}

## Save Model

In [50]:
with open('bnb_classifier.pkl', 'wb') as f:
    pickle.dump(final_model, f)