In [88]:
import pandas as pd
import json
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

In [2]:
with open('../data/100_articles.json', 'r') as f:
    file = json.load(f)

In [38]:
df = pd.DataFrame(file['webscrape'])

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 3 columns):
isCycling    99 non-null object
isRacing     99 non-null object
text         99 non-null object
dtypes: object(3)
memory usage: 2.4+ KB


In [47]:
df[(df['isCycling']=='Y')&(df['isRacing']=='N')]

Unnamed: 0,isCycling,isRacing,text,c_label,r_label,label
49,Y,N,"Biking events, from Bike New York to moonlight...",1,0,0
51,Y,N,"No sideswiping taxis, no fume-belching buses, ...",1,0,0
85,Y,N,To the Editor: I'm heartened to read that Mayo...,1,0,0
92,Y,N,For decades the bicycle industry has drawn its...,1,0,0
96,Y,N,To the Editor: You report that some New Yorker...,1,0,0


In [40]:
df['c_label'] = [1 if x == 'Y' else 0 for x in df['isCycling']]
df['r_label'] = [1 if x == 'Y' else 0 for x in df['isRacing'] ]

In [46]:
df['label']  = (df['c_label'] + df['r_label']) // 2

In [49]:
df[(df['isCycling']=='Y')&(df['isRacing']=='N')]

Unnamed: 0,isCycling,isRacing,text,c_label,r_label,label
49,Y,N,"Biking events, from Bike New York to moonlight...",1,0,0
51,Y,N,"No sideswiping taxis, no fume-belching buses, ...",1,0,0
85,Y,N,To the Editor: I'm heartened to read that Mayo...,1,0,0
92,Y,N,For decades the bicycle industry has drawn its...,1,0,0
96,Y,N,To the Editor: You report that some New Yorker...,1,0,0


In [50]:
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [54]:
vectorizer = TfidfVectorizer()

In [55]:
matrix = vectorizer.fit_transform(X_train)

In [56]:
matrix

<69x6862 sparse matrix of type '<class 'numpy.float64'>'
	with 23359 stored elements in Compressed Sparse Row format>

In [72]:
mnb = MultinomialNB()

In [66]:
# cv_results = cross_validate(clf, matrix, y_train, return_train_score=True)

In [67]:
# cv_results

{'fit_time': array([0.00264978, 0.00269008, 0.00276208]),
 'score_time': array([0.00113893, 0.00110984, 0.00072694]),
 'test_score': array([0.70833333, 0.73913043, 0.72727273]),
 'train_score': array([0.84444444, 0.91304348, 0.87234043])}

In [71]:
parameters = {
    'alpha': [0.0, 0.3, 0.7, 1.0],
    'fit_prior': [True, False]
}

In [73]:
clf = GridSearchCV(mnb, parameters, cv=5)

In [76]:
clf.fit(matrix, y_train)

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


GridSearchCV(cv=5, error_score='raise',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [0.0, 0.3, 0.7, 1.0], 'fit_prior': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [77]:
clf.grid_scores_

[mean: 0.85507, std: 0.07172, params: {'alpha': 0.0, 'fit_prior': True},
 mean: 0.85507, std: 0.07172, params: {'alpha': 0.0, 'fit_prior': False},
 mean: 0.69565, std: 0.03768, params: {'alpha': 0.3, 'fit_prior': True},
 mean: 0.71014, std: 0.06765, params: {'alpha': 0.3, 'fit_prior': False},
 mean: 0.68116, std: 0.01256, params: {'alpha': 0.7, 'fit_prior': True},
 mean: 0.69565, std: 0.03768, params: {'alpha': 0.7, 'fit_prior': False},
 mean: 0.68116, std: 0.01256, params: {'alpha': 1.0, 'fit_prior': True},
 mean: 0.68116, std: 0.01256, params: {'alpha': 1.0, 'fit_prior': False}]

In [78]:
clf.cv_results_



{'mean_fit_time': array([0.00334802, 0.00300503, 0.00182881, 0.00181293, 0.00161963,
        0.00158157, 0.00168147, 0.00161762]),
 'std_fit_time': array([8.31133795e-04, 9.07774962e-04, 1.83101532e-04, 3.00281097e-04,
        1.17103906e-04, 6.28869577e-05, 2.15256388e-04, 7.44697522e-05]),
 'mean_score_time': array([0.00066032, 0.00063577, 0.00043941, 0.00044284, 0.00035653,
        0.00040178, 0.00039964, 0.0003654 ]),
 'std_score_time': array([1.28935269e-04, 1.11284599e-04, 9.00790702e-05, 1.60737456e-04,
        7.63326785e-06, 6.15630566e-05, 7.11741397e-05, 2.09035316e-05]),
 'param_alpha': masked_array(data=[0.0, 0.0, 0.3, 0.3, 0.7, 0.7, 1.0, 1.0],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_fit_prior': masked_array(data=[True, False, True, False, True, False, True, False],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
            

In [80]:
model = clf.best_estimator_

In [82]:
test_matrix = vectorizer.transform(X_test)
predictions = model.predict(test_matrix)

In [85]:
print(confusion_matrix(y_test, predictions))

[[ 8  2]
 [ 0 20]]


In [87]:
print(classification_report(y_test, predictions))

             precision    recall  f1-score   support

          0       1.00      0.80      0.89        10
          1       0.91      1.00      0.95        20

avg / total       0.94      0.93      0.93        30



In [90]:
print(accuracy_score(y_test, predictions))

0.9333333333333333


In [91]:
# retrain on all data for best model
clf.best_estimator_

MultinomialNB(alpha=0.0, class_prior=None, fit_prior=True)

In [92]:
v = TfidfVectorizer()
final_matrix = v.fit_transform(df['text'])

In [97]:
final_model = MultinomialNB(alpha=0.0)
final_model.fit(final_matrix, df['label'])

  'setting alpha = %.1e' % _ALPHA_MIN)


MultinomialNB(alpha=0.0, class_prior=None, fit_prior=True)

In [99]:
cross_validate(final_model, final_matrix, df['label'], cv=5)

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


{'fit_time': array([0.00432897, 0.00435877, 0.00276804, 0.00305605, 0.00269079]),
 'score_time': array([0.00072408, 0.00070715, 0.00077915, 0.00063205, 0.00089598]),
 'test_score': array([0.9047619 , 0.80952381, 1.        , 1.        , 0.89473684]),
 'train_score': array([1., 1., 1., 1., 1.])}