In [None]:
import itertools
import numpy as np
from io import StringIO

from matplotlib import pyplot as plt
from sklearn.datasets import fetch_20newsgroups
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn import metrics

### Training a simple Bayesian classification pipeline

We will train a very simple classifier on a small subset of the newsgroups data. For more background, the scikit-learn documentation has a tutorial on [working with text data](http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html), and its [overview of Pipelines and pipeline optimization](http://scikit-learn.org/stable/modules/pipeline.html) may be instructive as well.

In [None]:
# Work with a small subset of the newsgroups rather than all of them.
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

# Training split, restricted to the categories above; the fixed random_state
# makes the shuffle reproducible.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [None]:
# Text -> features: token counts (with document-frequency cut-offs of
# min_df=2 and max_df=0.9) re-weighted by TF-IDF.
vectorizer = Pipeline(steps=[
    ('vect', CountVectorizer(min_df=2, max_df=.9)),
    ('tfidf', TfidfTransformer()),
])

# Full model: the feature pipeline above feeding a multinomial naive Bayes
# classifier.
text_clf = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('clf', MultinomialNB()),
])

In [None]:
# IPython help syntax: the trailing `?` displays the docstring of MultinomialNB.
MultinomialNB?

In [None]:
# Fit the whole pipeline (vectorizer + classifier) on the training texts and labels.
text_clf.fit(twenty_train.data, twenty_train.target) 

In [None]:
# Inspect the named steps that make up the pipeline.
text_clf.steps

In [None]:
# Show the parameters of the pipeline and its steps.
text_clf.get_params()

### How does our model do on the test data?

In [None]:
# Held-out test split for the same categories, shuffled with the same seed.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [None]:
# Predict a label for every document in the test split.
predicted = text_clf.predict(twenty_test.data)

In [None]:
# Display the raw predictions; these are numeric class labels, not names.
predicted

In [None]:
# Human-readable category names that come with the test split.
twenty_test.target_names

In [None]:
# Accuracy: the fraction of test documents whose predicted label equals the true label.
(predicted == twenty_test.target).mean()

### We can also take a look at the confusion matrix

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Plot a confusion matrix as an annotated heat map on the current figure.

    See full source and example:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn along the "True label" axis and
        columns along the "Predicted label" axis.
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool, default False
        If True, every row is divided by its row sum before plotting, so the
        heat map, colorbar and cell annotations all show per-row fractions.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Besides drawing, a one-line message stating whether normalization was
    applied is printed.
    """
    cm = np.asarray(cm)

    # Normalize *before* drawing so that the image, the colorbar and the
    # per-cell text (and its colour threshold) all refer to the same values.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells above half the maximum get white text for contrast against the
    # darker background; the rest get black text.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Normalized fractions are rounded to two decimals to stay legible;
        # un-normalized values are shown exactly as given.
        label = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, label,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()

In [None]:
# Cross-tabulate the true test labels against the model's predictions.
cm = metrics.confusion_matrix(y_true=twenty_test.target, y_pred=predicted)

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Plot the un-normalized confusion matrix, using the category names as tick labels.
plot_confusion_matrix(cm, classes=twenty_train.target_names)

In [None]:
# Classify one ad-hoc sentence. The pipeline takes an iterable of documents,
# so the text is wrapped in a one-element list; a file-like object would be
# iterated line by line and a multi-line text would be split into several
# documents.
text_clf.predict(["The world is a wonderful place."])

In [None]:
# Human-readable category names that come with the training split.
twenty_train.target_names

In [None]:
# Class labels known to the fitted pipeline; the exercise below pairs them with target_names.
text_clf.classes_

### Your turn

Write a function that takes in a model, target names, and text, and outputs the human-readable target name. If you have time, give it a short docstring describing what it does.

Hint: Take a look at the cell above for the prediction and think about how you can map the number that you get back to the target_names above. Maybe a dictionary would work? You can also use zip to zip two lists together (i.e. classes_ and target_names...)

In [None]:
# Exercise: define `predict_text` (taking the model, the target names and a text) in this cell.
# To load the solution file instead, uncomment the %load line below and run the cell.
#%load ../solutions/predict_function.py


In [None]:
# Try out the exercise function; this fails with a NameError if `predict_text`
# has not been defined yet.
predict_text(text_clf, twenty_train.target_names, 
             "I love GPUs.")

### Saving your model for reuse

In [None]:
# joblib is used below to persist the fitted model. Prefer the standalone
# package: the copy exposed as `sklearn.externals.joblib` was deprecated and
# later removed from scikit-learn, so it is only a fallback for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
import json
import os

In [None]:
# Persist the fitted pipeline under ../data so it can be reused later.
joblib.dump(
    text_clf,
    os.path.join('..', 'data', '20_newsgroups_bayesian_model.pkl'),
)

In [None]:
# Shell escape: list ../data to confirm the model file was written.
!ls ../data

In [None]:
# Save a lookup from class label to category name alongside the model.
# Labels are converted with str() because JSON object keys must be strings.
with open(os.path.join('..', 'data', '20_newsgroups_prediction_dict.json'), 'w') as predict_dict:
    json.dump(
        {str(label): name
         for label, name in zip(text_clf.classes_, twenty_train.target_names)},
        predict_dict,
    )

In [None]:
# Shell escape: print the saved JSON mapping to check its contents.
!cat ../data/20_newsgroups_prediction_dict.json