In [91]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
import sklearn.model_selection as model_selection
from sklearn.pipeline import Pipeline
from combine_strings import *
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [153]:
def append_empty_string_class(xvec, yvec, fraction=0.25, label="Unknown"):
    """Pad a text/label dataset in place with empty-string examples.

    Appends ``int(len(xvec) * fraction)`` empty strings to ``xvec`` and the
    same number of ``label`` entries to ``yvec``, so a classifier trained on
    the result sees an explicit class for inputs that carry no text.

    Parameters
    ----------
    xvec : list of str
        Training texts; mutated in place.
    yvec : list
        Labels parallel to ``xvec``; mutated in place.
    fraction : float, optional
        Number of padding examples as a fraction of the current length of
        ``xvec``. The default (0.25) reproduces the original ``len(xvec) / 4``.
    label : object, optional
        Label attached to every padding example. Defaults to ``"Unknown"``.

    Returns
    -------
    None
        Both lists are modified in place.

    Raises
    ------
    ValueError
        If ``fraction`` is negative.
    """
    if fraction < 0:
        raise ValueError("fraction must be non-negative")
    # The count is taken from xvec *before* any padding is added, so the
    # padding class ends up as fraction / (1 + fraction) of the final dataset.
    num_to_add = int(len(xvec) * fraction)
    xvec.extend([""] * num_to_add)
    yvec.extend([label] * num_to_add)
    

In [154]:
# Level passed both to the title reader and to the corpus generators below.
# NOTE(review): what a "level" means is defined by the helpers, not here — confirm.
target_level = 2
# read_levels / read_titles are not defined in this notebook; presumably they
# come from the star-import of combine_strings — TODO confirm.
# The string arguments look like dataset or file names; verify against the helper module.
level_codes = read_levels('all_codes')
title_records = read_titles('all_examples', target_level)

In [155]:
# "Verbose" corpus: built from both the level codes and the title records,
# then padded in place with empty-string examples labelled "Unknown".
(verbose_x, verbose_y) = generate_combined(level_codes, title_records, target_level)
append_empty_string_class(verbose_x, verbose_y)

# "Simpler" corpus: built from the title records only, padded the same way.
# NOTE(review): whether the two corpora share length/ordering is not visible here.
(simpler_x, simpler_y) = generate_uncombined_text_for_target_level(title_records, target_level)
append_empty_string_class(simpler_x, simpler_y)

In [156]:
# Fixed seed so the 80/20 partitions (and every score computed from them) are
# reproducible across kernel restarts; without random_state each run shuffles
# differently and the reported accuracies cannot be regenerated.
SPLIT_SEED = 42

# Hold out 20% of the "simpler" (titles-only) corpus.
strain_x, stest_x, strain_y, stest_y = model_selection.train_test_split(
    simpler_x, simpler_y, test_size=0.20, random_state=SPLIT_SEED)

# Hold out 20% of the "verbose" (codes + titles) corpus.
# NOTE(review): the two corpora are split independently of each other; keep that
# in mind when a model trained on one is evaluated on the other's test set.
vtrain_x, vtest_x, vtrain_y, vtest_y = model_selection.train_test_split(
    verbose_x, verbose_y, test_size=0.20, random_state=SPLIT_SEED)

In [157]:
#support vector version
# parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
#               'tfidf__use_idf': (True, False),
#               'clf__alpha': (1e-2, 1e-3)}

# vclf_pipe = Pipeline([('vect', CountVectorizer(stop_words='english')),
#                  ('tfidf', TfidfTransformer()),
#                  ('clf', SGDClassifier(loss='hinge', penalty='l2',
#                                        alpha=1e-3, max_iter=5, tol=None))])

# sclf_pipe = Pipeline([('vect', CountVectorizer(stop_words='english')),
#                  ('tfidf', TfidfTransformer()),
#                  ('clf', SGDClassifier(loss='hinge', penalty='l2',
#                                        alpha=1e-3, max_iter=5, tol=None))])
# Naive Bayes version
def _nb_param_grid():
    """Return a fresh copy of the hyper-parameter grid searched for each pipeline."""
    return dict(
        vect__ngram_range=[(1, 1), (1, 2)],
        tfidf__use_idf=(True, False),
        clf__alpha=(1e-2, 1e-3),
    )


def _nb_text_pipeline():
    """Return a new, unfitted counts -> tf-idf -> multinomial NB pipeline."""
    steps = [
        ('vect', CountVectorizer(stop_words='english')),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB(alpha=1e-3)),
    ]
    return Pipeline(steps)


# One independent grid and pipeline per corpus ("s" = simpler, "v" = verbose),
# so the two searches never share estimator state.
sparameters = _nb_param_grid()
vparameters = _nb_param_grid()

vclf_pipe = _nb_text_pipeline()
sclf_pipe = _nb_text_pipeline()

# n_jobs=-1 lets the search use every available core.
vclf = GridSearchCV(vclf_pipe, vparameters, n_jobs=-1)
sclf = GridSearchCV(sclf_pipe, sparameters, n_jobs=-1)

In [158]:
# Run the grid search for the "simpler" pipeline on its own training split.
sfit = sclf.fit(strain_x, strain_y)

# Run the grid search for the "verbose" pipeline on its own training split.
# sfit / vfit hold whatever fit() returns and are used for all predictions below.
vfit = vclf.fit(vtrain_x, vtrain_y)

In [159]:
# Each fitted search predicts labels for the held-out split of its own corpus.
stest_pred = sfit.predict(stest_x)
vtest_pred = vfit.predict(vtest_x)

In [160]:
# Element-wise hit/miss masks: True where the prediction matches the true label.
stest_valid = np.asarray(stest_y) == stest_pred
vtest_valid = np.asarray(vtest_y) == vtest_pred
# Accuracy in percent: simpler model first, then verbose model.
print(100 * np.mean(stest_valid))
print(100 * np.mean(vtest_valid))


86.1317863398
100.0


In [161]:
def _show_metrics(y_true, y_pred):
    """Print the per-class report followed by the confusion matrix."""
    print(metrics.classification_report(y_true, y_pred))
    print(metrics.confusion_matrix(y_true, y_pred))


# Simpler model on its own test split, then verbose model on its own test split.
_show_metrics(stest_y, stest_pred)
_show_metrics(vtest_y, vtest_pred)

             precision    recall  f1-score   support

         00       0.86      0.81      0.83        80
         01       0.87      0.89      0.88       139
         02       0.89      0.83      0.86        41
         03       0.88      0.65      0.75        23
         04       0.78      0.88      0.83       144
         05       0.88      0.96      0.92        51
         06       0.85      0.81      0.83        68
         07       0.93      0.91      0.92        56
         08       0.95      0.84      0.89        49
         09       0.91      0.71      0.79        41
         11       0.73      0.78      0.76        91
         12       0.79      0.77      0.78       173
         13       0.92      0.74      0.82        31
         14       0.86      0.73      0.79       116
         15       0.89      0.73      0.80        64
         21       0.90      0.75      0.82       235
         22       0.83      0.85      0.84       399
         30       0.93      0.78      0.85   

In [162]:
# Cross-evaluation: the model trained on the verbose corpus labels the
# held-out split of the simpler corpus.
# NOTE(review): stest_x was split separately from vtrain_x, so titles in this
# test set may also underlie rows the verbose model was trained on — confirm
# before reading this score as true held-out accuracy.
sv_pred = vfit.predict(stest_x)
otest_valid = np.asarray(stest_y) == sv_pred
print(100 * np.mean(otest_valid))

95.2167250438


In [163]:
# Per-class report, then confusion matrix, for the verbose model's
# predictions on the simpler corpus' test split.
print(
    metrics.classification_report(stest_y, sv_pred),
    metrics.confusion_matrix(stest_y, sv_pred),
    sep="\n",
)

             precision    recall  f1-score   support

         00       0.97      0.95      0.96        80
         01       0.95      0.96      0.96       139
         02       1.00      1.00      1.00        41
         03       1.00      0.96      0.98        23
         04       0.93      0.96      0.94       144
         05       0.98      1.00      0.99        51
         06       1.00      0.91      0.95        68
         07       0.96      0.96      0.96        56
         08       0.97      0.78      0.86        49
         09       0.97      0.93      0.95        41
         11       0.96      0.88      0.92        91
         12       0.97      0.96      0.97       173
         13       0.97      1.00      0.98        31
         14       0.95      0.91      0.93       116
         15       0.97      0.91      0.94        64
         21       0.97      0.87      0.92       235
         22       0.96      0.95      0.95       399
         30       0.95      0.97      0.96   

In [164]:
# Sanity probe: a title that should not belong to any real class.
nonsense_title_string = ['catastrophe disaster artist']
# Hard label and the full per-class probability row from the verbose model.
pred = vfit.predict(nonsense_title_string)
pred_proba = vfit.predict_proba(nonsense_title_string)
print(pred, pred_proba, sep="\n")

['Unknown']
[[ 0.06028302  0.00324995  0.00276899  0.00283419  0.05180329  0.00311126
   0.00291337  0.00298462  0.00285104  0.00286346  0.00321274  0.0032042
   0.03558223  0.00356242  0.00316098  0.00290025  0.00227501  0.00354784
   0.00342612  0.0033479   0.00347612  0.00308284  0.00240989  0.00318473
   0.00276195  0.00277908  0.1018577   0.13118433  0.00297339  0.00354146
   0.00305893  0.03986519  0.00292642  0.02164344  0.00236911  0.00214855
   0.00340979  0.00283703  0.00334435  0.00257072  0.00305565  0.00346371
   0.00219282  0.0013782   0.00259214  0.00307423  0.44293536]]


In [165]:
# Largest class probability in the single row returned for the nonsense title,
# i.e. how confident the model is in its top prediction.
max(pred_proba[0])

0.44293536415512852