# Random Acts of Pizza

I will use Kaggle's Random Acts of Pizza classification task for my project.

https://www.kaggle.com/c/random-acts-of-pizza

Let's start by importing some libraries and loading the data into an ndarray.

In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn import cross_validation

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *

In [2]:
# Load the data
import urllib
import tarfile
import pandas as pd

# Where the dataset lives and which archive member holds the JSON records.
dataset_url = "http://cs.stanford.edu/~althoff/raop-dataset/pizza_request_dataset.tar.gz"
archive_path = "pizza.tar.gz"
json_member = "pizza_request_dataset/pizza_request_dataset.json"

# Position of the label column in the array built below. The code treats this
# column as the label; NOTE(review): the index depends on the column order
# pandas produces for this JSON -- confirm when upgrading pandas.
label_col = 23

# download the tarball
tf = urllib.URLopener()
tf.retrieve(dataset_url, archive_path)

# Pull the JSON payload out of the tarball. The member is looked up directly:
# a missing member raises KeyError here instead of leaving json_data undefined
# and failing later with a NameError. The archive is always closed.
tar = tarfile.open(archive_path, "r:gz")
try:
    f = tar.extractfile(json_member)
    if f is None:
        # extractfile returns None for members that are not regular files/links
        raise IOError("%s is not a regular file inside %s" % (json_member, archive_path))
    json_data = f.read()
finally:
    tar.close()

# convert data to a pandas dataframe, then to a plain ndarray
# (print(pizza_df[:0]) at this point lists the column names)
pizza_df = pd.read_json(json_data)
pizza_df = np.asarray(pizza_df)

# shuffle the rows reproducibly
np.random.seed(0)
shuffle = np.random.permutation(np.arange(pizza_df.shape[0]))
pizza_df = pizza_df[shuffle]

# extract dev / test / train data and labels:
# rows 0-499 -> dev, rows 500-999 -> test, the remainder -> train.
# The label column is removed from the feature arrays.
dev_data = np.delete(pizza_df[:500], label_col, axis=1)
dev_labels = list(pizza_df[:500, label_col])

test_data = np.delete(pizza_df[500:1000], label_col, axis=1)
test_labels = list(pizza_df[500:1000, label_col])

train_data = np.delete(pizza_df[1000:], label_col, axis=1)
train_labels = list(pizza_df[1000:, label_col])

## Establishing a baseline

Let's establish a baseline using BernoulliNB, MultinomialNB, LogisticRegression and KNeighborsClassifier. We will use just the post text (corresponding to column 7 in pizza_df) as the input and the "requester_received_pizza" outcome (True, False) as the label.

In [3]:

# define a helper function to perform the analysis 
def perform_analysis(train_data, train_labels, dev_data, dev_features,
                     vectorizer=None, clf=None,
                     gsc_params=None):
    """Vectorize text, fit a classifier, and print dev metrics, CV scores and a grid search.

    All results are reported with ``print``; the function returns ``None``.

    Parameters
    ----------
    train_data
        Raw training documents; passed to ``vectorizer.fit_transform``.
    train_labels
        Labels for ``train_data``.
    dev_data
        Raw dev documents; passed to ``vectorizer.transform``.
    dev_features
        Despite its name, this holds the dev-set *labels* that the dev
        predictions are scored against. The name is kept so existing calls
        keep working.
    vectorizer
        Object with ``fit_transform``/``transform``. A fresh
        ``CountVectorizer()`` is created per call when omitted.
    clf
        Classifier to fit. A fresh ``BernoulliNB()`` is created per call
        when omitted.
    gsc_params
        Parameter grid handed to ``GridSearchCV``. Defaults to an empty grid.
    """
    # Build defaults per call: instances created in the signature would be
    # shared (and re-fitted) across every call that relies on them.
    if vectorizer is None:
        vectorizer = CountVectorizer()
    if clf is None:
        clf = BernoulliNB()
    if gsc_params is None:
        gsc_params = {}

    # Score against the labels that were passed in, not a same-named global.
    dev_labels = dev_features

    train_data_features = vectorizer.fit_transform(train_data)
    dev_data_features = vectorizer.transform(dev_data)

    print("RESULTS FOR Default %s : -------------------------------" % (clf))
    clf.fit(train_data_features, train_labels)

    # Predict once and reuse the result for all three metrics.
    dev_predictions = clf.predict(dev_data_features)
    # NOTE(review): roc_auc_score receives hard class predictions here rather
    # than probabilities or decision scores. Kept as in the original so the
    # printed numbers stay comparable; consider predict_proba for a true AUC.
    print("f1_score: %s\naccuracy_score: %s\nroc_auc_score: %s\n"
          % (metrics.f1_score(dev_labels, dev_predictions),
             metrics.accuracy_score(dev_labels, dev_predictions),
             metrics.roc_auc_score(dev_labels, dev_predictions)))

    print("Calculating Cross Vaidation Scores: ")
    scores = cross_validation.cross_val_score(clf, train_data_features, train_labels,
                                              cv=10, scoring='f1_weighted')
    print("Scores: %s\n" % (scores))

    # Search for the best estimator over the supplied grid (on the training data).
    print("STARTING GRID SEARCH...")
    gsc = GridSearchCV(clf, gsc_params, n_jobs=-1)
    gsc.fit(train_data_features, train_labels)
    print("Best estimator: %s\nBest alpha: %s\nBest score: %s\nScorer function: %s\n"
          % (gsc.best_estimator_, gsc.best_params_, gsc.best_score_, gsc.scorer_))

In [4]:
# BernoulliNB baseline on the post text (column 7), once per vectorizer:
# raw token counts first, then TF-IDF weights.
bernoulli_runs = [
    ("BernoulliNB using CountVectorizer",
     CountVectorizer()),
    ("BernoulliNB using TfidfVectorizer",
     TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')),
]

for banner, text_vectorizer in bernoulli_runs:
    print(banner)
    # fresh classifier and alpha grid (0.00 .. 0.99) for every run
    perform_analysis(
        train_data[:, 7], train_labels, dev_data[:, 7], dev_labels,
        vectorizer=text_vectorizer,
        clf=BernoulliNB(),
        gsc_params={'alpha': np.arange(0, 1, 0.01)}
    )

BernoulliNB using CountVectorizer
RESULTS FOR Default BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True) : -------------------------------
f1_score: 0.305555555556
accuracy_score: 0.7
roc_auc_score: 0.554981518817

Calculating Cross Vaidation Scores: 
Scores: [ 0.68923924  0.69364719  0.68049658  0.67924009  0.69507729  0.69611298
  0.67960775  0.66109026  0.68012268  0.67949566]

STARTING GRID SEARCH...
Best estimator: BernoulliNB(alpha=0.0, binarize=0.0, class_prior=None, fit_prior=True)
Best alpha: {'alpha': 0.0}
Best score: 0.737315350032
Scorer function: <function _passthrough_scorer at 0x000000000835A898>

BernoulliNB using TfidfVectorizer
RESULTS FOR Default BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True) : -------------------------------
f1_score: 0.212290502793
accuracy_score: 0.718
roc_auc_score: 0.531207997312

Calculating Cross Vaidation Scores: 
Scores: [ 0.6948353   0.69313005  0.70509014  0.69313005  0.70155229  0.67149621
  0.6

In [5]:
# MultinomialNB baseline on the post text (column 7), once per vectorizer:
# raw token counts first, then TF-IDF weights.
multinomial_runs = [
    ("MultinomialNB using CountVectorizer",
     CountVectorizer()),
    ("MultinomialNB using TfidfVectorizer",
     TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')),
]

for banner, text_vectorizer in multinomial_runs:
    print(banner)
    # fresh classifier and alpha grid (0.00 .. 0.99) for every run
    perform_analysis(
        train_data[:, 7], train_labels, dev_data[:, 7], dev_labels,
        vectorizer=text_vectorizer,
        clf=MultinomialNB(),
        gsc_params={'alpha': np.arange(0, 1, 0.01)}
    )

MultinomialNB using CountVectorizer
RESULTS FOR Default MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) : -------------------------------
f1_score: 0.0915032679739
accuracy_score: 0.722
roc_auc_score: 0.503150201613

Calculating Cross Vaidation Scores: 
Scores: [ 0.69658503  0.67541664  0.67977306  0.67645383  0.69491832  0.6602577
  0.6839763   0.63693178  0.64502397  0.6718563 ]

STARTING GRID SEARCH...
Best estimator: MultinomialNB(alpha=0.98999999999999999, class_prior=None, fit_prior=True)
Best alpha: {'alpha': 0.98999999999999999}
Best score: 0.731749090131
Scorer function: <function _passthrough_scorer at 0x000000000835A898>

MultinomialNB using TfidfVectorizer
RESULTS FOR Default MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) : -------------------------------
f1_score: 0.0
accuracy_score: 0.744
roc_auc_score: 0.5

Calculating Cross Vaidation Scores: 
Scores: [ 0.64573692  0.64468864  0.64573692  0.64573692  0.64790979  0.64790979
  0.64790979  0.64719664 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Best estimator: MultinomialNB(alpha=0.91000000000000003, class_prior=None, fit_prior=True)
Best alpha: {'alpha': 0.91000000000000003}
Best score: 0.75294369514
Scorer function: <function _passthrough_scorer at 0x000000000835A898>



In [6]:
# LogisticRegression baseline on the post text (column 7), once per vectorizer.
# Both vectorizers use their default settings here.
logistic_runs = [
    ("LogisticRegression using CountVectorizer", CountVectorizer()),
    ("LogisticRegression using TfidfVectorizer", TfidfVectorizer()),
]

for banner, text_vectorizer in logistic_runs:
    print(banner)
    # fresh classifier and C grid (0.01 .. 0.99) for every run
    perform_analysis(
        train_data[:, 7], train_labels, dev_data[:, 7], dev_labels,
        vectorizer=text_vectorizer,
        clf=LogisticRegression(),
        gsc_params={'C': np.arange(0.01, 1, 0.01)}
    )

LogisticRegression using CountVectorizer
RESULTS FOR Default LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) : -------------------------------
f1_score: 0.209523809524
accuracy_score: 0.668
roc_auc_score: 0.50529233871

Calculating Cross Vaidation Scores: 
Scores: [ 0.71056113  0.70374342  0.68475426  0.70022931  0.66436359  0.68405106
  0.69450078  0.657613    0.68159917  0.70261076]

STARTING GRID SEARCH...
Best estimator: LogisticRegression(C=0.02, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Best alpha: {'C': 0.02}
Best score: 0.754656390495
Scorer function: <function _passthrou

In [7]:
# KNeighborsClassifier baseline on the post text (column 7), once per vectorizer.
# Both vectorizers use their default settings here.
kneighbors_runs = [
    ("KNeighborsClassifier using CountVectorizer", CountVectorizer()),
    ("KNeighborsClassifier using TfidfVectorizer", TfidfVectorizer()),
]

for banner, text_vectorizer in kneighbors_runs:
    print(banner)
    # fresh classifier and neighbourhood sizes 1 .. 99 for every run
    perform_analysis(
        train_data[:, 7], train_labels, dev_data[:, 7], dev_labels,
        vectorizer=text_vectorizer,
        clf=KNeighborsClassifier(),
        gsc_params={'n_neighbors': np.arange(1, 100, 1)}
    )

KNeighborsClassifier using CountVectorizer
RESULTS FOR Default KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform') : -------------------------------
f1_score: 0.103896103896
accuracy_score: 0.724
roc_auc_score: 0.507056451613

Calculating Cross Vaidation Scores: 
Scores: [ 0.66880342  0.65150464  0.67415834  0.66070829  0.65334379  0.66829823
  0.64546451  0.65698709  0.66896721  0.64462871]

STARTING GRID SEARCH...
Best estimator: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=52, p=2,
           weights='uniform')
Best alpha: {'n_neighbors': 52}
Best score: 0.75315778206
Scorer function: <function _passthrough_scorer at 0x000000000835A898>

KNeighborsClassifier using TfidfVectorizer
RESULTS FOR Default KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_para

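As we see above, using default options with KNeighborsClassifier, BernoulliNB, MultinomialNB and LogisticRegression yielded accuracy better than random guessing. All four classifiers scored between approximately 70% and 75%. Note, however, that the MultinomialNB/TF-IDF run reaches 74.4% accuracy with an F1 score of 0 and a ROC AUC of 0.5, i.e. by never predicting a pizza, so accuracy alone overstates how much the models have learned. We can use this as a baseline and look to improve the score.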

## A better baseline?

I suppose 75% accuracy is a good baseline to start with. Let's look to improve this in the next steps. I will try a pipeline as the first attempt.

### Using a Pipeline (chaining classifiers)

Let's see if a chained transformation via CountVectorizer and TfidfTransformer, followed by LogisticRegression, improves the results.


In [8]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier

# Pipelines chain transformers with a final estimator; they will come in
# handy for ensembles later on.
# **** pipeline and parameter grid adapted from scikit-learn's documentation on pipelines ****
pipeline_steps = [
    ('vect', CountVectorizer()),      # raw text -> token counts
    ('tfidf', TfidfTransformer()),    # counts -> (optionally idf-weighted) frequencies
    ('clf', LogisticRegression()),    # final classifier
]
pipeline = Pipeline(pipeline_steps)

# Grid keys follow the "<step name>__<parameter>" convention.
# Options from the documentation example that are switched off here:
#   vect__max_features: (None, 5000, 10000, 50000)
#   tfidf__norm: ('l1', 'l2')
#   clf__alpha: (0.00001, 0.000001)
#   clf__penalty: ('l2', 'elasticnet')
#   clf__n_iter: (10, 50, 80)
parameters = dict(
    vect__max_df=(0.5, 0.75, 1.0),
    vect__ngram_range=((1, 1), (1, 2), (2, 2)),  # unigrams or bigrams
    tfidf__use_idf=(True, False),
)
# **** end of code adapted from scikit-learn's documentation ****

# Exhaustive search over the grid, fitted on the post text (column 7).
gsc = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)
gsc.fit(train_data[:, 7], train_labels)

search_report = "Best estimator: %s\nBest alpha: %s\nBest score: %s\nScorer function: %s\n"
print(search_report % (gsc.best_estimator_, gsc.best_params_, gsc.best_score_, gsc.scorer_))

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.8s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:   32.7s finished


Best estimator: Pipeline(steps=[('vect', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=0.5, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        st...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])
Best alpha: {'vect__ngram_range': (1, 1), 'tfidf__use_idf': True, 'vect__max_df': 0.5}
Best score: 0.756797259687
Scorer function: <function _passthrough_scorer at 0x000000000835A898>



###  Randomized trees and Ensembles
Let's first look at the results of randomized trees and then use an ensemble (the VotingClassifier). Let's also convert user's subreddits to a "bag of words" and run that through randomized trees.

In [9]:
# Extremely Randomized Trees on the hashed post text
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.ensemble import ExtraTreesClassifier

# Labels are re-read from the shuffled array: column 23, rows 1000+ for train
# and rows 0-499 for dev.
train_labels = list(pizza_df[1000:, 23])
dev_labels = list(pizza_df[:500, 23])

# Hash the post text (column 7) into 1000 l2-normalised features.
vectorizer = HashingVectorizer(n_features=1000, analyzer='word', norm='l2', stop_words='english')
train_data_features = vectorizer.fit_transform(train_data[:, 7])
dev_data_features = vectorizer.transform(dev_data[:, 7])

clf = ExtraTreesClassifier(
    n_estimators=250,
    criterion='entropy',
    max_depth=None,
    min_samples_split=2,
    random_state=0
)
clf.fit(train_data_features, train_labels)

# One prediction pass feeds both metrics; accuracy_score on the predictions
# is the value clf.score reports for a classifier.
dev_predictions = clf.predict(dev_data_features)
print("ExtraTreesClassifier score: %s" % (metrics.accuracy_score(dev_labels, dev_predictions)))

print("ExtraTreesClassifier roc_auc_score: %s" % (metrics.roc_auc_score(dev_labels, dev_predictions)))

ExtraTreesClassifier score: 0.744
ExtraTreesClassifier roc_auc_score: 0.5


In [10]:
# Extremely Randomized Trees
# With subreddit membership as a bag of words
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.ensemble import ExtraTreesClassifier

# Labels are re-read from the shuffled array: column 23, rows 1000+ for train
# and rows 0-499 for dev.
train_labels = list(pizza_df[1000:, 23])
dev_labels = list(pizza_df[:500, 23])

# Column 23 of the label-stripped feature arrays holds a sequence of strings
# per row; join each one into a single space-separated "document".
td = [' '.join(names) for names in train_data[:, 23]]
dd = [' '.join(names) for names in dev_data[:, 23]]

# Hash those documents into 1000 l2-normalised features.
vectorizer = HashingVectorizer(n_features=1000, analyzer='word', norm='l2', stop_words='english')
train_data_features = vectorizer.fit_transform(td)
dev_data_features = vectorizer.transform(dd)

clf = ExtraTreesClassifier(
    n_estimators=250,
    criterion='entropy',
    max_depth=None,
    min_samples_split=2,
    random_state=0
)
clf.fit(train_data_features, train_labels)

# One prediction pass feeds both metrics; accuracy_score on the predictions
# is the value clf.score reports for a classifier.
dev_predictions = clf.predict(dev_data_features)
print("ExtraTreesClassifier score: %s" % (metrics.accuracy_score(dev_labels, dev_predictions)))
print("ExtraTreesClassifier roc_auc_score: %s" % (metrics.roc_auc_score(dev_labels, dev_predictions)))

ExtraTreesClassifier score: 0.722
ExtraTreesClassifier roc_auc_score: 0.485215053763


In [13]:
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB

# Rebuild the dev/train splits from the shuffled array: column 23 is the
# label and is removed from the feature arrays.
dev_data = np.delete(pizza_df[:500], 23, axis=1)
dev_labels = list(pizza_df[:500, 23])
train_data = np.delete(pizza_df[1000:], 23, axis=1)
train_labels = list(pizza_df[1000:, 23])

# Bag-of-words counts of the post text (column 7).
vectorizer = CountVectorizer()
train_data_features = vectorizer.fit_transform(train_data[:, 7])
dev_data_features = vectorizer.transform(dev_data[:, 7])

# The four base models that get a vote.
clf1 = KNeighborsClassifier()
clf2 = BernoulliNB()
clf3 = LogisticRegression()
clf4 = ExtraTreesClassifier(
    n_estimators=250,
    criterion='entropy',
    max_depth=None,
    min_samples_split=2,
    random_state=0
)

# voting='hard': the ensemble predicts the majority class label.
voters = [('knn', clf1), ('bnb', clf2), ('lr', clf3), ('etc', clf4)]
vclf1 = VotingClassifier(estimators=voters, voting='hard')
vclf1.fit(train_data_features, train_labels)

# One prediction pass feeds both metrics.
dev_predictions = vclf1.predict(dev_data_features)
print("VotingClassifier accuracy score: %s" % (metrics.accuracy_score(dev_labels, dev_predictions)))
print("VotingClassifier roc_auc_score: %s" % (metrics.roc_auc_score(dev_labels, dev_predictions)))

VotingClassifier accuracy score: 0.748
VotingClassifier roc_auc_score: 0.512936827957


I'm surprised to see that the ExtraTreesClassifier returns 72.2% accuracy for the subreddit membership which is marginally lower than the accuracy from the post itself (74%).
However, the overall accuracy is similar to the results of the individual classifiers used earlier. It appears that a bag-of-words approach alone is not going to cut it.

Let's look to improve upon this by first using a FeatureUnion on post words and subreddits (bag of words, yet again). After that I will look to expand the feature space.

## Feature Unions to improve the baseline


... to be continued