In [None]:
import pandas as pd
import json
import numpy as np
import re
from nltk import ngrams
from itertools import combinations

from sklearn.feature_extraction.text import *
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.svm import LinearSVC

%matplotlib inline

import matplotlib.pyplot as plt
plt.style.use("ggplot")

In [None]:
# The twenty cuisine labels that appear in the training data.
cats = [
    'brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
    'french', 'greek', 'indian', 'irish', 'italian',
    'jamaican', 'japanese', 'korean', 'mexican', 'moroccan',
    'russian', 'southern_us', 'spanish', 'thai', 'vietnamese',
]

# Labelled recipes: each item carries an 'ingredients' list and a 'cuisine'.
with open('train.json') as data_file:
    data = json.load(data_file)

# One comma-separated string per recipe, plus its cuisine label.
X = [', '.join(item['ingredients']) for item in data]
y = [item['cuisine'] for item in data]

# Unlabelled recipes to predict. These must come from the test file, not from
# train.json again, otherwise every "test" prediction is made on training
# recipes. NOTE(review): assumes the file sits next to train.json and is
# named test.json -- confirm the path.
with open('test.json') as data_file:
    test_data = json.load(data_file)

# Same string encoding as the training recipes, keyed by recipe id.
X_test = [', '.join(item['ingredients']) for item in test_data]
ID_test = [item['id'] for item in test_data]

# Hold out part of the labelled data as a dev set; the fixed seed keeps the
# split reproducible between runs.
X_train, X_dev, y_train, y_dev = train_test_split(X, y, random_state=2)


# 
# Feature Engineering

Many other features can likely be engineered from the contents of the recipes to help us classify them.  We'll look at the number of ingredients, whether any canned or boxed goods are used, and whether specific brands appear.

In [None]:
# Convert to Dataframe
# The three splits are stacked into one frame so every engineered feature is
# computed identically for all of them; the "source" column records which
# split each row came from so they can be separated again afterwards.
df_train = pd.DataFrame({"recipe": X_train, "label": y_train, "ID": None, "source": "train"})
df_dev = pd.DataFrame({"recipe": X_dev, "label": y_dev, "ID": None, "source": "dev"})
df_test = pd.DataFrame({"recipe": X_test, "label": None, "ID": ID_test, "source": "test"})
df = pd.concat([df_train, df_dev, df_test])

# Recipes are stored as ", "-joined ingredient strings, so splitting on the
# same separator recovers the ingredient count.
df["num_ingredients"] = df["recipe"].apply(lambda r: len(r.split(", ")))

# 1 if the recipe mentions "can", "cans" or "canned" as a whole word.
# A plain substring test would also fire on "canola", "pecan", "american", ...
df["canned"] = df["recipe"].apply(
    lambda r: int(re.search(r"\bcan(?:s|ned)?\b", r.lower()) is not None)
)
# TODO: add a "box" indicator for boxed goods (not implemented yet).

# 1 if any known brand name occurs in the recipe. Stored as int, like
# "canned", so all engineered columns share a numeric dtype.
brands = ["kraft"]
df["brands"] = df["recipe"].apply(
    lambda r: int(any(brand in r.lower() for brand in brands))
)


def get_average_ingredient_length(recipe):
    """Return the mean length, in characters, of the ingredients in a recipe.

    Parameters
    ----------
    recipe : str
        Ingredient names joined with ", " (comma followed by a space).

    Returns
    -------
    float
        Average number of characters per ingredient name. An empty string
        counts as one zero-length ingredient and therefore yields 0.0.
    """
    # Build a real list: on Python 3 ``map`` is lazy and ``np.mean`` cannot
    # average a map object.
    lengths = [len(ingredient) for ingredient in recipe.split(", ")]
    return np.mean(lengths)
df["ingredient_length"] = df["recipe"].apply(get_average_ingredient_length)


# Single-argument print(...) calls behave the same on Python 2 and Python 3,
# whereas the bare print statement is a syntax error on Python 3.
print(df.head())
print("\n\n")

# Columns produced by the feature engineering above; reused later to select
# the hand-made features.
engineered_columns = ["canned", "num_ingredients", "ingredient_length", "brands"]

# Per-cuisine mean and standard deviation of each engineered feature,
# computed on the training rows only.
train_rows = df["source"] == "train"
print(df.loc[train_rows, engineered_columns + ["label"]].groupby("label").agg(["mean", "std"]))

# Split the combined frame back into its three parts, now carrying the
# engineered columns.
df_train = df.loc[df["source"] == "train"]
df_dev = df.loc[df["source"] == "dev"]
df_test = df.loc[df["source"] == "test"]


In [None]:
# Feature Engineering Attempts

# ItemSelector copied from official scikit-learn examples
# http://scikit-learn.org/stable/auto_examples/hetero_feature_union.html#sphx-glr-auto-examples-hetero-feature-union-py
class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select the subset of data at a given key.

    The data is expected to be stored in a 2D data structure, where the first
    index is over features and the second is over samples.  i.e.

    >> len(data[key]) == n_samples

    Please note that this is the opposite convention to scikit-learn feature
    matrices (where the first index corresponds to sample).

    ItemSelector only requires that the collection implement getitem
    (data[key]).  Examples include: a dict of lists, 2D numpy array, Pandas
    DataFrame, numpy record array, etc.

    >> data = {'a': [1, 5, 2, 5, 2, 8],
               'b': [9, 4, 1, 4, 1, 3]}
    >> ds = ItemSelector(key='a')
    >> data['a'] == ds.transform(data)

    For a pandas DataFrame the shape of the result follows the key: a single
    column label gives a 1-D Series, while a list of labels gives a 2-D
    DataFrame.  Steps that iterate over documents (e.g. text vectorizers)
    need the 1-D form, so pass ``key="recipe"`` rather than
    ``key=["recipe"]`` for them.

    ItemSelector is not designed to handle data grouped by sample.  (e.g. a
    list of dicts).  If your data is structured this way, consider a
    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.

    Parameters
    ----------
    key : hashable or list of column labels, required
        The key corresponding to the desired value in a mappable, or a list
        of column labels when the data is a DataFrame.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        """Nothing to learn; present only to satisfy the transformer API."""
        return self

    def transform(self, data_dict):
        """Return the entry (or columns) of ``data_dict`` stored under ``key``."""
        # Label-based column selection for pandas objects.
        if hasattr(data_dict, "loc"):
            return data_dict.loc[:, self.key]
        # Plain getitem for the other containers the class promises to
        # support (dict of lists, numpy arrays, record arrays, ...).
        return data_dict[self.key]

class DenseTransformer(BaseEstimator, TransformerMixin):
    """Transforms a sparse matrix into a dense array.

    Meant as a pipeline step placed after a vectorizer and before steps that
    cannot consume sparse input.  Inherits ``BaseEstimator`` like the other
    transformers in this notebook so it exposes ``get_params``/``set_params``.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense array.

        ``toarray()`` is used instead of ``todense()`` because the latter
        returns the legacy ``numpy.matrix`` type rather than a plain ndarray.
        """
        return X.toarray()

    def fit_transform(self, X, y=None, **fit_params):
        """Fit (a no-op) and densify ``X`` in one call."""
        self.fit(X, y, **fit_params)
        return self.transform(X)

class Printer(BaseEstimator, TransformerMixin):
    """Debugging pipeline step: prints what flows through and passes it on.

    Every method returns its input untouched, so the step can be dropped
    between any two pipeline stages without changing the result.
    """

    def fit(self, x, y=None):
        """Announce the fit call; there is nothing to learn."""
        print("fiting self!")
        return self

    def transform(self, data_dict):
        """Print the shape and contents of the data, then return it as is."""
        print(data_dict.shape)
        # str.format keeps the output identical on Python 2 and Python 3
        # (a two-argument print(...) would print a tuple on Python 2).
        print("transforming {}".format(data_dict))
        return data_dict

    def fit_transform(self, data_dict, labels=None):
        """Print the shape of the data, then return it as is.

        The data must not be wrapped in ``list(...)``: that turns a DataFrame
        into its column labels and a sparse matrix into a list of single-row
        matrices, which breaks whatever step comes next.
        """
        print("doing print transform")
        print(data_dict.shape)
        return data_dict

# Bag-of-words over the recipe strings, using the default preprocessor and
# tokenizer (a custom preprocessor/tokenizer could be plugged in here).
vectorizer = CountVectorizer()

model = LogisticRegression()
printer = Printer()

# Baseline: recipe text only, with debug printers around the vectorizer.
# The selector key is the scalar "recipe" (not ["recipe"]) so the vectorizer
# receives a 1-D sequence of recipe strings. A one-column DataFrame would be
# iterated by column label, i.e. as the single document "recipe".
# The pipeline expects a DataFrame as input, e.g.:
#     pipe_base.fit(df_train, df_train["label"])
pipe_base = Pipeline([
    ("selector", ItemSelector(key="recipe")),
    ("printer", printer),
    ("vectorizer", vectorizer),
    ("printer2", printer),
    ("model", model),
])

# Combined features: bag-of-words from the recipe text side by side with the
# hand-engineered columns.
pipe = Pipeline([
    ("union", FeatureUnion([
        ("Ingredients", Pipeline([
            ("selector", ItemSelector(key="recipe")),
            ("vectorizer", vectorizer),
            # A ("to_dense", DenseTransformer()) step could be appended here
            # if a downstream step needs dense input.
        ])),
        # No transformations on other features, just select them.
        ("selector", ItemSelector(key=engineered_columns)),
    ])),
])
