# What's Cooking?
## W207 Final Project
## James Beck, Samir Datta, Chris Hipple  

The goal of this competition is to correctly classfiy the cuisine of a recipe given its ingredients. There are 20 different cuisine types from around the world to classify.

Kaggle link: https://www.kaggle.com/c/whats-cooking

In [43]:
import pandas as pd
import json
import numpy as np
import re
from nltk import ngrams
from itertools import combinations

from sklearn.feature_extraction.text import *
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.svm import LinearSVC

%matplotlib inline

import matplotlib.pyplot as plt
plt.style.use("ggplot")

In [3]:
cats = ['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino', 'french', 'greek', 'indian', 'irish', 'italian', 'jamaican','japanese', 'korean', 'mexican', 'moroccan', 'russian', 'southern_us', 'spanish', 'thai', 'vietnamese']

with open('train.json') as data_file:    
    data = json.load(data_file)

X = []
y = []
for item in data:
    X.append(', '.join(item['ingredients']))
    y.append(item['cuisine'])    

with open('train.json') as data_file:    
    data = json.load(data_file)

X_test = []
ID_test = []
for item in data:
    X_test.append(', '.join(item['ingredients']))
    ID_test.append(item['id'])    



X_train, X_dev, y_train, y_dev = train_test_split(X, y, random_state=2)

In [6]:
print("Number of recipes in training data: "+str(len(X_train)))
print("Number of recipes in development data: "+str(len(X_dev)))

Number of recipes in training data: 29830
Number of recipes in development data: 9944


# First attempt

Our first attempt to classify the recipes was to use the "bag of words" approach. We used the count vectorizer to create a sparse matrix of every word in the recipes, and fit a logistic regression model on the training data. We used this model to predict the development data.

In [8]:
cv = CountVectorizer()
tf_X_train = cv.fit_transform(X_train)
tf_X_dev = cv.transform(X_dev)

lr = LogisticRegression()
lr.fit(tf_X_train, y_train)
predictions = lr.predict(tf_X_dev)

print(classification_report(y_dev, predictions))
print("f1=score: "+str(metrics.f1_score(y_dev, predictions, average='weighted')))
print("Accuracy: "+str(np.mean(predictions==y_dev)))

              precision    recall  f1-score   support

   brazilian       0.76      0.55      0.63       121
     british       0.59      0.33      0.43       206
cajun_creole       0.78      0.70      0.74       376
     chinese       0.79      0.85      0.82       670
    filipino       0.71      0.54      0.61       190
      french       0.59      0.64      0.61       636
       greek       0.76      0.70      0.73       258
      indian       0.85      0.89      0.87       758
       irish       0.67      0.47      0.55       175
     italian       0.80      0.90      0.85      1963
    jamaican       0.81      0.70      0.75       123
    japanese       0.82      0.69      0.75       342
      korean       0.84      0.74      0.79       221
     mexican       0.91      0.92      0.91      1668
    moroccan       0.81      0.78      0.80       215
     russian       0.66      0.40      0.50       133
 southern_us       0.69      0.77      0.73      1056
     spanish       0.63    

Our simple approach to classification gave us an f1-score of .775 and an overall accuracy of .781.

# Exploratory Data Analysis

In [13]:
print("Number of unique ingredients: "+str(len(cv.vocabulary_)))

Number of unique ingredients: 2849


In [15]:
print("Most important ingredients for each cuisine:\n")

largestWeightedWords = []
largestWeightedIndeces = []
cv_featurenames = cv.get_feature_names()

for cat in range(20):
    print(cats[cat])
    weightIndeces = np.argsort(abs(lr.coef_[cat]))[-5:]
    for index in weightIndeces:
        weight = lr.coef_[cat][index]
        
        print(cv_featurenames[index] + " " + str(weight))
    print('\n')

Most important ingredients for each cuisine:

brazilian
curry -2.29897902984
açai 2.4226027732
tapioca 2.61786032914
manioc 3.04912543585
cachaca 5.66428087204


british
worcestershire 2.27248086882
marmite 2.67139865846
mincemeat 2.70626914232
haddock 2.72879032515
stilton 4.81098136838


cajun_creole
mortadella 1.81332465096
jambalaya 1.81889122476
salami 1.99265358265
creole 3.20484073611
cajun 3.68107968744


chinese
mein 2.14816618602
kimchi -2.3302530577
mirin -2.65507624154
mandarin 2.77164923395
szechwan 2.81858166975


filipino
dogs 2.34387283812
basil -2.35298612833
glutinous 2.35926921964
lumpia 2.83518421257
calamansi 3.45331717906


french
swiss 2.37677521465
niçoise 2.41997322075
crepes 2.47666412423
gruyère 2.48880633927
gruyere 2.86732227335


greek
tahini 2.69162180603
ouzo 2.84030054373
phyllo 3.14357148945
greek 3.33927999564
feta 4.26967378331


indian
cardamom 2.38706614407
yoghurt 2.45940191191
masala 2.51177177124
curry 2.91925879494
tandoori 3.78903804841


iris

In [28]:
#Create list of most common ingedients based off simple text parser
ingredient_freq = []
for featurename in cv_featurenames:
    i = 0
    for recipe in X_train:
        if featurename in recipe:
            i +=1
    ingredient_freq.append((featurename, i))

In [37]:
ingredients_sorted_by_freq = sorted(ingredient_freq, key=lambda tup: tup[1], reverse=True)
ingredients_sorted_by_freq[0:50]

[(u'in', 22462),
 (u'on', 22015),
 (u'ic', 21720),
 (u'la', 19673),
 (u'ro', 18848),
 (u'salt', 18421),
 (u'an', 18202),
 (u'oil', 15985),
 (u'lo', 15483),
 (u'pepper', 15270),
 (u'garlic', 13600),
 (u'st', 13226),
 (u'onion', 13159),
 (u'or', 13064),
 (u'to', 12805),
 (u'mi', 11928),
 (u'ice', 11366),
 (u'el', 10789),
 (u'fresh', 10203),
 (u'round', 9822),
 (u'ground', 9763),
 (u'de', 9648),
 (u'au', 8960),
 (u'red', 8875),
 (u'onions', 8762),
 (u'oliv', 8417),
 (u'olive', 8416),
 (u'sugar', 8412),
 (u'mo', 8377),
 (u'sauc', 7699),
 (u'sauce', 7671),
 (u'it', 7660),
 (u'black', 7614),
 (u'tom', 7491),
 (u'tomato', 7294),
 (u'water', 7135),
 (u'chee', 6967),
 (u'chees', 6963),
 (u'chicken', 6942),
 (u'butt', 6920),
 (u'cheese', 6890),
 (u'egg', 6890),
 (u'butter', 6739),
 (u'no', 6688),
 (u'all', 6557),
 (u'cho', 6523),
 (u'flour', 6238),
 (u'tomatoes', 6197),
 (u'gin', 6077),
 (u'green', 6069)]

# Bigrams

Our next step was to have the vectorizer detect word pairs in addition to single words. 

In [38]:
cv_bigrams = CountVectorizer(ngram_range=(1,2))
tf_X_train_bigrams = cv_bigrams.fit_transform(X_train)
tf_X_dev_bigrams = cv_bigrams.transform(X_dev)

lr_bigrams = LogisticRegression()
lr_bigrams.fit(tf_X_train_bigrams, y_train)
predictions = lr_bigrams.predict(tf_X_dev_bigrams)

print("f1=score: "+str(metrics.f1_score(y_dev, predictions, average='weighted')))
print("Accuracy: "+str(np.mean(predictions==y_dev)))

f1=score: 0.77820171114
Accuracy: 0.78268302494


In [11]:
largestWeightedWords = []
largestWeightedIndeces = []
cv_bigram_featurenames = cv_bigrams.get_feature_names()


for cat in range(20):
    print(cats[cat])
    weightIndeces = np.argsort(abs(lr_bigrams.coef_[cat]))[-5:]
    for index in weightIndeces:
        weight = lr_bigrams.coef_[cat][index]
        
        print(cv_bigram_featurenames[index] + " " + str(weight))
    print('\n')

brazilian
manioc 1.81684210974
manioc flour 1.81684210974
black beans 1.8963307854
tapioca flour 1.94343426701
cachaca 4.42654489565


british
stilton cheese 1.58576911257
jam 1.60918448539
mincemeat 1.70117801631
marmite 2.14242286153
stilton 3.28159490246


cajun_creole
oil powdered 1.39534041956
powder dried 1.50187637936
cajun seasoning 1.67942575972
creole 2.32921833546
cajun 2.3873676481


chinese
szechwan peppercorns 1.58492318786
kimchi -1.62362458022
mandarin 1.80026540523
sake -1.89346466114
mirin -2.62748695135


filipino
tilapia 1.38693410006
mirin -1.42974680294
basil -1.49834913221
calamansi 1.94418668915
lumpia 1.97017120327


french
snails 1.60195331392
grits -1.75811624417
duck 1.76781095857
pasta -1.77045743012
cognac 1.99113545665


greek
phyllo 2.25904621531
tahini 2.27620076599
feta cheese 2.56303692104
feta 2.63273494929
greek 2.91312357351


indian
curds 1.95553708379
tandoori 2.20226763204
masala 2.23408928644
curry 2.41600137731
yoghurt 2.5569324714


irish
bri

# Custom tokenizer and preprocessor

Our next step was to attempt to build a bustom tokenizer and preprocessor to remove some noise and keep only the most important and informative features.

In [41]:
#Our custom preprocessor removes features that are uninformative - 
#like numbers, unnecessary spaces, and words that are two characters long or less.
def custom_preprocessor(ingredients):
    result = []
    for ingredient in ingredients.split(', '):
        temp = ingredient.lower()
        
        #remove numbers
        temp = re.sub(r'\d+|&', '', temp)
        #remove unnecessary spaces
        temp = re.sub(r' +', ' ', temp)
        #remove any words that are two characters or less
        temp = ' '.join(word for word in temp.split() if len(word)>2)
        
        result.append("".join(temp))
    
    return ", ".join(result)

#our custom_tokenizer retuns every combination of every word
#in the ingredinet list.
#this increases the number of features by a lot, but also improves
#accuracy.
#
#the logic behind this tokenizer is that two ingreidents may not be
#informative on their own, but if seen together they may help to predict
#a certain cuisine.


def custom_tokenizer(string):
    result = []
    
    #overall note: the point of sorting the ingredients before adding
    #them to the list is to prevent duplicates that are just flipped
    #like "unsalted butter" and "butter unsalted"
    
    #create an empty list where we're going to put the ngrams
    #where n = 1 so we can later create combinations of those
    single_grams = []
    
    
    for ingredient in string.split(', '):
        for n in range(1,len(ingredient.split())+1):
            grams = ngrams(ingredient.split(' '), n)
            for gram in grams:
                #if the length of the ngram we're looking at is 1,
                #add it to our single grams list.
                if n == 1:
                    single_grams.append(gram[0])
                result.append(" ".join(sorted(list(gram))))
    
    #finally add every combination of the n = 1 ngrams
    #so from ['unsalted butter', 'baking powder']
    #we should be adding: 'butter unsalted', 'baking powder',
    #'baking butter', 'baking unsalted', 'butter powder', 'powder unsalted'
    for combo in combinations(single_grams, 2):
        result.append(' '.join(sorted(list(combo))))
    
    #return the unique elements of this list
    #since there will be plenty of duplicates
    return list(set(result))

In [44]:
model = LogisticRegression(penalty="l2")
vectorizer = CountVectorizer(preprocessor = custom_preprocessor,
                             tokenizer = custom_tokenizer,
                             ngram_range = (0,2))
                             


pipe = Pipeline([("vectorize", vectorizer), ("model", model)])
pipe.fit(X_train, y_train)
preds = pipe.predict(X_dev)

print(metrics.f1_score(y_dev, preds, average='weighted'))
print(pipe.score(X_dev, y_dev))

0.795135616269
0.800583266291
