In [None]:
import pandas as pd
import numpy as np
import itertools
import re

from ast import literal_eval
from collections import Counter
from matplotlib import pyplot as plt
from sklearn import metrics

from nltk.stem import PorterStemmer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

%matplotlib inline

In [None]:
df = pd.read_csv('../data/lobsters_full_2017_cleaned.csv')

In [None]:
df.head()

In [None]:
# 'Unnamed: 0' is the name pandas gives a CSV column whose header is empty
# (presumably the index written by an earlier to_csv -- TODO confirm).
# The guard keeps this cell re-runnable: once the column has been promoted to
# the index, calling set_index on it again would raise KeyError.
if 'Unnamed: 0' in df.columns:
    df = df.set_index('Unnamed: 0')

### Evaluating Tags to Use

In [None]:
df.tags.values

In [None]:
literal_eval(df.tags.values[0])

In [None]:
# Each `tags` cell holds the string form of a Python literal; parse every one
# back into a real object so the tags can be counted individually.
tag_list = list(map(literal_eval, df.tags.values))

In [None]:
# Flatten the per-row tag collections into one stream and tally each tag.
counter = Counter(itertools.chain.from_iterable(tag_list))
counter.most_common(10)

In [None]:
n = 10
most_common = counter.most_common(n)

# most_common() can return fewer than n entries, so the bar positions are
# sized from the result itself; pairing range(n) with a shorter list of widths
# makes barh fail with a shape mismatch.
tag_names = [tag for tag, _ in most_common]
tag_counts = [count for _, count in most_common]

plt.barh(range(len(most_common)), tag_counts, tick_label=tag_names)
plt.xlabel('Occurrences')
plt.title('Most common tags');

In [None]:
my_categories = ['security', 'web', 'hardware', 'culture']

In [None]:
itertools.combinations?

In [None]:
# For every unordered pair of candidate categories, report how many rows are
# flagged with both of them.
for first_cat, second_cat in itertools.combinations(my_categories, 2):
    has_both = (df[first_cat] == 1) & (df[second_cat] == 1)
    overlap = df[has_both].shape[0]
    print(first_cat, second_cat, overlap)

In [None]:
sum(df['security'])

In [None]:
sum(df['security'].fillna(0))

In [None]:
sum(df['web'].fillna(0))

In [None]:
sum(df['hardware'].fillna(0))

In [None]:
# overlap between security and web (for web group)
252 / 2263

In [None]:
# overlap between security and hardware (for hardware group)
174 / 1537

### Your Turn

- evaluate a few other categories that might be interesting to you
- choose a final `my_categories` list 

### Preprocessing Text

In [None]:
df[my_categories].sum(axis=1) == 1

In [None]:
# Keep only the rows whose chosen category flags sum to exactly 1, and only
# the category flags plus the two text columns.
columns = my_categories + ['title', 'description']
tag_data = df.loc[df[my_categories].sum(axis=1) == 1, columns]

In [None]:
tag_data.head()

In [None]:
# Every distinct tag counted above (a live view of the Counter's keys);
# clean_text consults it to decide which one-character tokens to keep.
all_tags = counter.keys()
# Single stemmer instance shared by every tokenize() call.
stemmer = PorterStemmer()

def clean_text(sentence):
    """Lower-case ``sentence`` and return its informative word tokens.

    Tokens are maximal runs of word characters. A token is discarded when it
    is an English stop word, or when it is a single character that is not
    itself one of the known tags in ``all_tags``.

    Returns a list of strings in their original order.
    """
    # Raw string: in a plain literal the backslash-w pair is an invalid escape
    # sequence, which newer Python versions warn about at compile time.
    words = re.findall(r"\w+", sentence.lower())
    return [word for word in words
            if word not in ENGLISH_STOP_WORDS
            and (len(word) > 1 or word in all_tags)]


def remove_urls(text):
    """Return ``text`` with every ``http://`` / ``https://`` URL removed.

    A URL is the scheme followed by all characters up to the next whitespace.
    The scheme separator is required so that ordinary words that merely start
    with "http" (``http2``, ``httpd``, ``https``) are left in place.
    Matching is case-sensitive; ``tokenize`` lower-cases its input first.
    """
    return re.sub(r'https?://\S+', '', text)

In [None]:
def tokenize(text):
    """Convert raw text into a list of stemmed tokens.

    The text is lower-cased, passed through ``remove_urls`` and
    ``clean_text``, and every surviving word is reduced to its stem with the
    shared ``stemmer``.
    """
    without_urls = remove_urls(text.lower())
    return list(map(stemmer.stem, clean_text(without_urls)))

In [None]:
# A missing title or description would be NaN (a float), which str.join
# rejects with TypeError. Blank such values for this sample row so the cell
# works whether or not tag_data has already been NaN-filled.
sample_row = tag_data.iloc[0][['title', 'description']].fillna('')
test_text = ' '.join(sample_row)
test_text

In [None]:
# Blank out missing text only. A frame-wide fillna('') would also write empty
# strings into the category flag columns (which appear to contain NaN, given
# the fillna(0) used when summing them earlier), mixing strings into numeric
# data.
tag_data = tag_data.fillna({'title': '', 'description': ''})

In [None]:
tokenize(test_text)

In [None]:
tag_data['full_text'] = tag_data['title'] + ' ' + tag_data['description']

In [None]:
tag_data.head()

In [None]:
tokenize(tag_data.iloc[3]['full_text'])

### Train Test Split

In [None]:
# (class_index, tag_name) pairs, numbered in the order of my_categories.
labels = list(enumerate(my_categories))

In [None]:
labels

In [None]:
def label_row(row, labels=labels):
    """Return the class index of the first category flagged on ``row``.

    ``labels`` is an iterable of ``(class_index, tag_name)`` pairs; they are
    checked in order and the index of the first pair whose ``row[tag_name]``
    equals 1 is returned. ``None`` is returned when no pair matches.
    """
    matching = (label for label, tag_name in labels if row[tag_name] == 1)
    return next(matching, None)

In [None]:
tag_data['tag_class'] = tag_data.apply(label_row, axis=1)

In [None]:
tag_data.head()

In [None]:
y = tag_data.tag_class
# Series.as_matrix() was deprecated and later removed from pandas (gone as of
# pandas 1.0); .values returns the same NumPy array on old and new versions.
X = tag_data['full_text'].values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

In [None]:
print("training data: ", X_train.shape[0])
print("testing data: ", X_test.shape[0])


### Create Vectorizers

In [None]:
count_vectorizer = CountVectorizer(tokenizer=tokenize,
                                  min_df=5, max_df=.9)

In [None]:
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize,
                                  min_df=5, max_df=.9)

### Create a Bayesian model

In [None]:
bayes_model = Pipeline([('vectorizer', count_vectorizer), 
                        ('clf', MultinomialNB())])

bayes_model.fit(X_train, y_train)

### Test Performance

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print a one-line banner and plot the confusion matrix as a heat map.

    Adapted from the scikit-learn example:
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    Parameters
    ----------
    cm : 2-D NumPy array whose rows are plotted on the "True label" axis and
        whose columns are plotted on the "Predicted label" axis.
    classes : sequence of tick labels, one per row/column of ``cm``.
    normalize : when True, every row is divided by its sum before plotting,
        so each cell shows a fraction of that row instead of a raw count.
    title : title of the plot.
    cmap : matplotlib colormap used for the heat map.

    The figure is drawn on the current matplotlib axes; nothing is returned.
    """
    # Normalise BEFORE drawing. Doing it after imshow() left the colours and
    # colour bar showing raw counts while the cell annotations showed
    # fractions.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Integer counts are printed as-is; fractional values are rounded to two
    # decimals so the annotations stay inside their cells.
    fmt = 'd' if cm.dtype.kind in 'iu' else '.2f'
    # Cells darker than the midpoint get white text so they remain legible.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
predicted = bayes_model.predict(X_test)

In [None]:
np.mean(predicted == y_test)

In [None]:
cm = metrics.confusion_matrix(y_test, predicted)
plot_confusion_matrix(cm, my_categories)

In [None]:
# Company name spelled correctly: the misspelled "Nvidea" can never match a
# term in the fitted vocabulary, so it contributed nothing to this demo.
bayes_model.predict_proba(
    ["Brand new GPUs from Nvidia."])

### Try an SVM

In [None]:
svm_model = Pipeline([('vectorizer', tfidf_vectorizer), 
                      ('clf', LinearSVC())])

svm_model.fit(X_train, y_train)

In [None]:
svm_predicted = svm_model.predict(X_test)

In [None]:
np.mean(svm_predicted == y_test)

In [None]:
svm_cm = metrics.confusion_matrix(y_test, svm_predicted)
plot_confusion_matrix(svm_cm, my_categories)

### Your Turn

- Try at least one more model combination (e.g. count with LinearSVC, tfidf with MultinomialNB, or try the PassiveAggressiveClassifier, which is imported as well!)
- Test the accuracy using np.mean
- Plot the confusion matrix
- Share what you found in our discussion and on Slack!
- Bonus: Read https://machinelearningmastery.com/tactics-to-combat-imbalanced-classes-in-your-machine-learning-dataset/ and try implementing a suggestion to better train our models!

## Parameter tuning

In [None]:
svm_model.steps

In [None]:
svm_model.steps[1][1]

In [None]:
LinearSVC?

In [None]:
train_vectors = tfidf_vectorizer.transform(X_train)

In [None]:
np.logspace?

In [None]:
Cs = np.logspace(-6, 6, 20)
Cs

In [None]:
# Cross-validated search over C and the loss function for the pipeline's
# classifier step, run on the already-vectorized training texts.
param_val = GridSearchCV(
    svm_model.named_steps['clf'],
    param_grid={'C': Cs, 'loss': ['hinge', 'squared_hinge']},
    n_jobs=4,
)
param_val.fit(train_vectors, y_train)

In [None]:
param_val.best_score_

In [None]:
param_val.best_estimator_.C

In [None]:
param_val.best_estimator_.loss

In [None]:
svm_model.steps[1][1].get_params()

In [None]:
# The grid search tuned C and loss together, so apply the whole winning
# combination. Copying only C would pair it with whatever loss the classifier
# already had, which is not necessarily the setting the best score came from.
svm_model.steps[1][1].set_params(**param_val.best_params_)

In [None]:
svm_model.fit(X_train, y_train)

In [None]:
predicted = svm_model.predict(X_test)

In [None]:
np.mean(predicted == y_test)

### Your Turn

- Try GridSearchCV with the model you built. Did you find a new alpha or constant to use?
- If so, update your model parameters and train and test again
- Did you see any improvement? Why or why not?

## Save your model and training data

In [None]:
# The joblib copy bundled as sklearn.externals.joblib has been removed from
# scikit-learn; prefer the standalone package and fall back to the bundled one
# only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE: both pipelines were built with tokenizer=tokenize, and pickle stores
# functions by reference, so code that loads these files must have a
# compatible `tokenize` importable under the same name.
joblib.dump(bayes_model,
            '../data/lobsters_tag_classification_bayes.pkl')
joblib.dump(svm_model,
            '../data/lobsters_tag_classification_svm.pkl')

In [None]:
tag_data.to_csv('../data/lobsters_tag_training_data.csv')

In [None]:
import json

# Context managers guarantee each file is flushed and closed before anything
# else (such as the shell `cat` in the next cell) reads it.
# Class indices are written as string keys mapping to tag names.
with open('../data/lobsters_tag_classes.json', 'w') as classes_file:
    json.dump({str(k): v for k, v in enumerate(my_categories)},
              classes_file)

with open('../data/lobsters_tag_list.json', 'w') as tags_file:
    json.dump(list(all_tags), tags_file)

In [None]:
!cat ../data/lobsters_tag_classes.json

### SVM with predict_proba

In [None]:
# Wrap a fresh LinearSVC in a probability calibrator so the pipeline can
# report class probabilities. Note that this rebinds `svm_model` and refits
# the shared tfidf_vectorizer instance.
svm = LinearSVC()
clf = CalibratedClassifierCV(svm)

svm_model = Pipeline(steps=[
    ('vectorizer', tfidf_vectorizer),
    ('clf', clf),
])
svm_model.fit(X_train, y_train)

joblib.dump(svm_model, '../data/lobsters_tag_classification_svm_proba.pkl')