In [138]:
from __future__ import print_function
import numpy as np
import pandas as pd
%matplotlib inline

In [139]:
# Load training data: one row per recipe with `cuisine`, `id` and `ingredients` columns
train = pd.read_json('../yummly/train.json')
train.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


## Data Preprocessing - Exploration

In [140]:
# (rows, columns) of the training frame
train.shape

(39774, 3)

In [141]:
# Count missing values per column
train.isnull().sum()

cuisine        0
id             0
ingredients    0
dtype: int64

In [142]:
# Column dtypes; `ingredients` is held in a generic object column
train.dtypes

cuisine        object
id              int64
ingredients    object
dtype: object

In [143]:
# Inspect the ingredients of the recipe at index label 0
train.loc[0,'ingredients']

[u'romaine lettuce',
 u'black olives',
 u'grape tomatoes',
 u'garlic',
 u'pepper',
 u'purple onion',
 u'seasoning',
 u'garbanzo beans',
 u'feta cheese crumbles']

In [144]:
# Check the Python type stored in an `ingredients` cell (row label 0)
type(train.loc[0,'ingredients'])

list

In [145]:
# Number of distinct cuisine labels
train.cuisine.nunique()

20

In [146]:
# Recipes per cuisine (non-null count of every remaining column within each group)
train.groupby('cuisine').count()

Unnamed: 0_level_0,id,ingredients
cuisine,Unnamed: 1_level_1,Unnamed: 2_level_1
brazilian,467,467
british,804,804
cajun_creole,1546,1546
chinese,2673,2673
filipino,755,755
french,2646,2646
greek,1175,1175
indian,3003,3003
irish,667,667
italian,7838,7838


### *Further Data Preprocessing - Cleaning/Exploration/Feature selection:*
(Suggestions)

* Convert all letters into lowercase
* Strip unicode
* Strip punctuation such as semicolons and commas
* Strip parentheses and the strings they enclose
* Do food descriptors add value to prediction or not?
* Do brand names of ingredients add value to prediction or not?
* Remove common ingredients such as salt
* Remove 10 least frequently occurring ingredients in each cuisine
* Tf-idf

## Feature Engineering 

In [147]:
# Recipe size: how many ingredients each recipe lists
train['num_ingredients'] = train.ingredients.map(len)

In [148]:
# Preview the frame, now including the `num_ingredients` column
train.head()

Unnamed: 0,cuisine,id,ingredients,num_ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",9
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",11
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12
3,indian,22213,"[water, vegetable oil, wheat, salt]",4
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",20


In [149]:
# Summary statistics of recipe size, one row per cuisine.
# NOTE(review): unstack() presumably pivots a MultiIndexed describe() result into the table
# shown in the output; newer pandas may already return that table from describe() — confirm
# against the installed version.
train.groupby('cuisine').num_ingredients.describe().unstack()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
cuisine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
brazilian,467,9.520343,5.555139,2,5,9,13,59
british,804,9.708955,4.165011,2,7,9,12,30
cajun_creole,1546,12.617076,4.611601,2,9,12,16,31
chinese,2673,11.982791,4.042125,2,9,12,14,38
filipino,755,10.0,3.855135,2,7,10,12,38
french,2646,9.817838,4.144744,1,7,9,12,31
greek,1175,10.182128,3.729461,1,7,10,12,27
indian,3003,12.705961,5.016806,1,9,12,16,49
irish,667,9.29985,3.700505,2,7,9,12,27
italian,7838,9.909033,3.806708,1,7,10,12,65


In [150]:
# Average number of characters per ingredient name within each recipe
train['ingredient_length'] = train.ingredients.apply(
    lambda names: np.mean([len(name) for name in names])
)

In [151]:
# Preview the frame, now including the `ingredient_length` column
train.head()

Unnamed: 0,cuisine,id,ingredients,num_ingredients,ingredient_length
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",9,12.0
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",11,10.090909
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12,10.333333
3,indian,22213,"[water, vegetable oil, wheat, salt]",4,6.75
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",20,10.1


In [152]:
# Summary statistics of the mean ingredient-name length, one row per cuisine.
# NOTE(review): unstack() presumably pivots a MultiIndexed describe() result into the table
# shown in the output; newer pandas may already return that table from describe() — confirm
# against the installed version.
train.groupby('cuisine').ingredient_length.describe().unstack()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
cuisine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
brazilian,467,10.565328,2.541263,4.666667,8.857143,10.5,12.249226,19.25
british,804,10.746523,2.177004,4.2,9.267045,10.875,12.25,19.666667
cajun_creole,1546,12.022298,2.063972,4.857143,10.714286,11.933333,13.1,22.666667
chinese,2673,11.517859,1.948698,4.5,10.230769,11.4,12.666667,20.8
filipino,755,9.813842,1.975612,4.0,8.422619,9.8,11.0,16.0
french,2646,11.909192,2.127827,4.5,10.5,11.9,13.25,27.5
greek,1175,12.117176,2.298987,5.5,10.654762,12.0,13.375,28.5
indian,3003,10.842267,2.004081,4.0,9.490741,10.666667,12.125,24.0
irish,667,10.950398,2.259638,4.0,9.4,10.875,12.416667,18.666667
italian,7838,12.814348,2.42328,4.5,11.285714,12.692308,14.153846,31.4


### *Further Feature Engineering:*
(Suggestions)

* Stemming
* Lemmatization    
* Bigrams
* Build Similarity groups 

## Training Model

In [153]:
# Turn each ingredient list into a single string so it can be fed to a text vectorizer.
# The result is the list's printed form (brackets, quotes and `u` prefixes included);
# the quote-based token pattern used further down depends on that form.
train['ingredients_str'] = train.ingredients.astype(str)

In [154]:
# Check the stringified ingredients of the recipe at index label 0
train.loc[0,'ingredients_str']

"[u'romaine lettuce', u'black olives', u'grape tomatoes', u'garlic', u'pepper', u'purple onion', u'seasoning', u'garbanzo beans', u'feta cheese crumbles']"

In [155]:
# Features: the stringified ingredient lists; target: the cuisine label
X = train.ingredients_str
y = train.cuisine

#### Count Vectorizer

In [156]:
from sklearn.feature_extraction.text import CountVectorizer
# Start from the default settings; the repr in the output lists them
vect = CountVectorizer()
vect

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [157]:
# Learn the vocabulary and build the document-term matrix (one row per recipe, one column per token)
X_dtm = vect.fit_transform(X)
X_dtm.shape

(39774, 3028)

In [158]:
# Peek at the first 100 vocabulary entries.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()
print(list(feature_names[:100]))

[u'00', u'10', u'100', u'14', u'15', u'25', u'33', u'40', u'43', u'95', u'96', u'abalone', u'abbamele', u'absinthe', u'abura', u'acai', u'accent', u'accompaniment', u'achiote', u'acid', u'acini', u'ackee', u'acorn', u'acting', u'activ', u'active', u'added', u'adobo', u'adzuki', u'agar', u'agave', u'age', u'aged', u'ahi', u'aioli', u'ajinomoto', u'ajwain', u'aka', u'alaskan', u'albacore', u'alcohol', u'ale', u'aleppo', u'alexia', u'alfalfa', u'alfredo', u'all', u'allspice', u'almond', u'almondmilk', u'almonds', u'aloe', u'alphabet', u'alum', u'amaranth', u'amarena', u'amaretti', u'amaretto', u'amba', u'amber', u'amberjack', u'amchur', u'america', u'american', u'aminos', u'ammonium', u'amontillado', u'ampalaya', u'an', u'anaheim', u'anasazi', u'ancho', u'anchovies', u'anchovy', u'and', u'andouille', u'anejo', u'angel', u'anglaise', u'angled', u'angostura', u'angus', u'anise', u'anisette', u'anjou', u'annatto', u'any', u'aonori', u'apple', u'apples', u'applesauce', u'applewood', u'apricot

In [159]:
# Tokenize on whole ingredient names instead of single words: the pattern captures any run of
# lowercase letters and spaces enclosed in single quotes in the stringified list.
# Names containing other characters (digits, hyphens, ...) do not match this pattern.
vect = CountVectorizer(token_pattern=r"'([a-z ]+)'")
X_dtm = vect.fit_transform(X)
X_dtm.shape

(39774, 6250)

In [160]:
# Peek at the first 100 vocabulary entries of the re-fitted vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()
print(list(feature_names[:100]))

[u'a taste of thai rice noodles', u'abalone', u'abbamele', u'absinthe', u'abura age', u'acai juice', u'accent', u'accent seasoning', u'accompaniment', u'achiote', u'achiote paste', u'achiote powder', u'acini di pepe', u'ackee', u'acorn squash', u'active dry yeast', u'adobo', u'adobo all purpose seasoning', u'adobo sauce', u'adobo seasoning', u'adobo style seasoning', u'adzuki beans', u'agar', u'agar agar flakes', u'agave nectar', u'agave tequila', u'aged balsamic vinegar', u'aged cheddar cheese', u'aged gouda', u'aged manchego cheese', u'ahi', u'ahi tuna steaks', u'aioli', u'ajinomoto', u'ajwain', u'aka miso', u'alaskan king crab legs', u'alaskan king salmon', u'albacore', u'albacore tuna in water', u'alcohol', u'ale', u'aleppo', u'aleppo pepper', u'alexia waffle fries', u'alfalfa sprouts', u'alfredo sauce', u'alfredo sauce mix', u'all beef hot dogs', u'all potato purpos', u'all purpose seasoning', u'all purpose unbleached flour', u'allspice', u'allspice berries', u'almond butter', u'a

#### Multinomial Naive Bayes

In [161]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier with default settings.
# NOTE(review): `nb` is not referenced again in the visible cells; the pipeline built
# later creates its own MultinomialNB instance.
nb = MultinomialNB()

#### *Algorithms to compare:*
(Suggestions)

* Logistic Regression
* Random forests
* Deep Neural Networks

#### Building Pipeline

In [162]:
from sklearn.pipeline import make_pipeline
# Chain vectorizer and classifier so that both are fitted together on whatever data the
# pipeline receives. The hyper-parameter grid defined later addresses the two steps as
# `countvectorizer` and `multinomialnb`.
pipe = make_pipeline(CountVectorizer(), MultinomialNB())

#### Cross Validation

In [163]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# prefer its replacement and fall back only on older installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# Mean accuracy of the whole pipeline over 5 cross-validation folds
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()

0.72326159492833919

#### Optimizing Hyper Parameters 

In [164]:
# sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20;
# prefer its replacement and fall back only on older installations.
try:
    from sklearn.model_selection import RandomizedSearchCV
except ImportError:
    from sklearn.grid_search import RandomizedSearchCV

In [None]:
import scipy as sp
# `import scipy` alone is not guaranteed to load the `stats` submodule, so import it
# explicitly instead of relying on another library having loaded it already.
import scipy.stats

# Search space for the randomized search; keys follow the `<step name>__<parameter>` convention
param_grid = {}
# Plain word tokens vs. whole quoted ingredient names
param_grid['countvectorizer__token_pattern'] = [r"\b\w\w+\b", r"'([a-z ]+)'"]
param_grid['countvectorizer__min_df'] = [1, 2, 3, 4]
# Smoothing strength drawn from a continuous uniform distribution on [0, 1]
param_grid['multinomialnb__alpha'] = sp.stats.uniform(scale=1)
# Seed NumPy's global RNG so that draws relying on it are repeatable
np.random.seed(1)

In [None]:
# Try 10 parameter combinations sampled from `param_grid`, scoring each by 5-fold CV accuracy;
# `%time` reports how long the whole search takes.
rand = RandomizedSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_iter=10, random_state=1)
%time rand.fit(X, y)

In [None]:
# Per-candidate cross-validation results.
# `grid_scores_` was replaced by `cv_results_` in scikit-learn 0.18 and removed in 0.20,
# so prefer the new attribute and fall back only on older installations.
pd.DataFrame(rand.cv_results_) if hasattr(rand, 'cv_results_') else rand.grid_scores_

In [None]:
# Best cross-validated score found by the search and the parameters that produced it
print(rand.best_score_)
print(rand.best_params_)

### Predicting on Test Data

In [None]:
# Load the test recipes to predict on
test = pd.read_json('../yummly/test.json')

In [None]:
# Apply the same stringification that was used for the training ingredients
test['ingredients_str'] = test.ingredients.astype(str)

In [None]:
# Test features, in the same format as the training `X`
X_test = test.ingredients_str

In [None]:
# Predict a cuisine for every test recipe with the fitted search object
# NOTE(review): presumably this delegates to the best pipeline refit on all training data
# (the search above does not override the refit setting) — confirm.
pred_class_rand = rand.predict(X_test)
pred_class_rand

In [None]:
# Predicted class probabilities for the test recipes, and the size of that array
pred_prob_rand = rand.predict_proba(X_test)
pred_prob_rand.shape

#### Kaggle submission

In [None]:
# Write the submission file: one row per test recipe, indexed by `id`, with the predicted cuisine
(
    pd.DataFrame({'id': test.id, 'cuisine': pred_class_rand})
    .set_index('id')
    .to_csv('sub1.csv')
)