In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
# Load training data
train = pd.read_json('input/train.json')
train.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


## Data Preprocessing - Exploration

In [3]:
train.shape

(39774, 3)

In [4]:
train.isnull().sum()

cuisine        0
id             0
ingredients    0
dtype: int64

In [5]:
train.dtypes

cuisine        object
id              int64
ingredients    object
dtype: object

In [6]:
train.loc[0,'ingredients']

[u'romaine lettuce',
 u'black olives',
 u'grape tomatoes',
 u'garlic',
 u'pepper',
 u'purple onion',
 u'seasoning',
 u'garbanzo beans',
 u'feta cheese crumbles']

In [7]:
type(train.loc[0,'ingredients'])

list

In [8]:
train.cuisine.nunique()

20

In [9]:
train.groupby('cuisine').count()

Unnamed: 0_level_0,id,ingredients
cuisine,Unnamed: 1_level_1,Unnamed: 2_level_1
brazilian,467,467
british,804,804
cajun_creole,1546,1546
chinese,2673,2673
filipino,755,755
french,2646,2646
greek,1175,1175
indian,3003,3003
irish,667,667
italian,7838,7838


### *Further Data Preprocessing - Cleaning/Exploration/Feature selection:*
(Suggestions)

* Convert all letters into lowercase
* Strip unicode
* Strip punctuation such as semicolons and commas
* Strip parantheses and the strings they enclose
* Do food descriptors add value to prediction or not
* Do brand names of ingredients add value to prediction or not
* Remove common ingredients such as salt
* Remove 10 least frequently occuring ingredients in each cuisine
* Tf-idf

## Feature Engineering 

In [10]:
train['num_ingredients'] = train.ingredients.apply(len)

In [11]:
train.head()

Unnamed: 0,cuisine,id,ingredients,num_ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",9
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",11
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12
3,indian,22213,"[water, vegetable oil, wheat, salt]",4
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",20


In [12]:
train.groupby('cuisine').num_ingredients.describe().unstack()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
cuisine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
brazilian,467.0,9.520343,5.555139,2.0,5.0,9.0,13.0,59.0
british,804.0,9.708955,4.165011,2.0,7.0,9.0,12.0,30.0
cajun_creole,1546.0,12.617076,4.611601,2.0,9.0,12.0,16.0,31.0
chinese,2673.0,11.982791,4.042125,2.0,9.0,12.0,14.0,38.0
filipino,755.0,10.0,3.855135,2.0,7.0,10.0,12.0,38.0
french,2646.0,9.817838,4.144744,1.0,7.0,9.0,12.0,31.0
greek,1175.0,10.182128,3.729461,1.0,7.0,10.0,12.0,27.0
indian,3003.0,12.705961,5.016806,1.0,9.0,12.0,16.0,49.0
irish,667.0,9.29985,3.700505,2.0,7.0,9.0,12.0,27.0
italian,7838.0,9.909033,3.806708,1.0,7.0,10.0,12.0,65.0


In [13]:
train['ingredient_length'] = train.ingredients.apply(lambda x:np.mean([len(item) for item in x]))

In [14]:
train. head()

Unnamed: 0,cuisine,id,ingredients,num_ingredients,ingredient_length
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",9,12.0
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",11,10.090909
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12,10.333333
3,indian,22213,"[water, vegetable oil, wheat, salt]",4,6.75
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",20,10.1


In [15]:
train.groupby('cuisine').ingredient_length.describe().unstack()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
cuisine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
brazilian,467.0,10.565328,2.541263,4.666667,8.857143,10.5,12.249226,19.25
british,804.0,10.746523,2.177004,4.2,9.267045,10.875,12.25,19.666667
cajun_creole,1546.0,12.022298,2.063972,4.857143,10.714286,11.933333,13.1,22.666667
chinese,2673.0,11.517859,1.948698,4.5,10.230769,11.4,12.666667,20.8
filipino,755.0,9.813842,1.975612,4.0,8.422619,9.8,11.0,16.0
french,2646.0,11.909192,2.127827,4.5,10.5,11.9,13.25,27.5
greek,1175.0,12.117176,2.298987,5.5,10.654762,12.0,13.375,28.5
indian,3003.0,10.842267,2.004081,4.0,9.490741,10.666667,12.125,24.0
irish,667.0,10.950398,2.259638,4.0,9.4,10.875,12.416667,18.666667
italian,7838.0,12.814348,2.42328,4.5,11.285714,12.692308,14.153846,31.4


### *Further Feature Engineering:*
(Suggestions)

* Stemming
* Lemmatization    
* Bigrams
* Build Similarity groups 

## Training Model

In [16]:
train['ingredients_str'] = train.ingredients.astype(str)

In [17]:
train.loc[0,'ingredients_str']

"[u'romaine lettuce', u'black olives', u'grape tomatoes', u'garlic', u'pepper', u'purple onion', u'seasoning', u'garbanzo beans', u'feta cheese crumbles']"

**Cleaning the data**

In [18]:
import re
import nltk
from nltk.stem import WordNetLemmatizer

In [19]:
wnl = WordNetLemmatizer()
def clean_row(ings):
    ing1 = []

    for ing in ings:
        words = re.sub('[^A-Za-z]', ' ', ing).lower().split()
#         print (words)

        wls = " ".join([str(wnl.lemmatize(word)) for word in words])
#         print (wls)
        ing1.append(wls)
#     print (ing1)
    return ing1
        

In [20]:
a=train['ingredients'][45]
clean_row(a)

['egg',
 'mandarin orange',
 'water',
 'orange liqueur',
 'yellow cake mix',
 'frosting',
 'vegetable oil',
 'white sugar']

In [21]:
train['ingredients_string'] = train.ingredients.apply(clean_row)
train.head()

Unnamed: 0,cuisine,id,ingredients,num_ingredients,ingredient_length,ingredients_str,ingredients_string
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",9,12.0,"[u'romaine lettuce', u'black olives', u'grape ...","[romaine lettuce, black olive, grape tomato, g..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",11,10.090909,"[u'plain flour', u'ground pepper', u'salt', u'...","[plain flour, ground pepper, salt, tomato, gro..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12,10.333333,"[u'eggs', u'pepper', u'salt', u'mayonaise', u'...","[egg, pepper, salt, mayonaise, cooking oil, gr..."
3,indian,22213,"[water, vegetable oil, wheat, salt]",4,6.75,"[u'water', u'vegetable oil', u'wheat', u'salt']","[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",20,10.1,"[u'black pepper', u'shallots', u'cornflour', u...","[black pepper, shallot, cornflour, cayenne pep..."


In [29]:
train['ingredients_str'] = train.ingredients_string.astype(str)

In [30]:
train.loc[4,'ingredients_str']

"['black pepper', 'shallot', 'cornflour', 'cayenne pepper', 'onion', 'garlic paste', 'milk', 'butter', 'salt', 'lemon juice', 'water', 'chili powder', 'passata', 'oil', 'ground cumin', 'boneless chicken skinless thigh', 'garam masala', 'double cream', 'natural yogurt', 'bay leaf']"

In [31]:
train.loc[4,'ingredients_string']

['black pepper',
 'shallot',
 'cornflour',
 'cayenne pepper',
 'onion',
 'garlic paste',
 'milk',
 'butter',
 'salt',
 'lemon juice',
 'water',
 'chili powder',
 'passata',
 'oil',
 'ground cumin',
 'boneless chicken skinless thigh',
 'garam masala',
 'double cream',
 'natural yogurt',
 'bay leaf']

In [32]:
X = train.ingredients_str
y = train.cuisine

#### Count Vectorizer

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
# vect = CountVectorizer()
# vect

In [34]:
# X_dtm = vect.fit_transform(X)
# X_dtm.shape

In [35]:
# print(vect.get_feature_names()[500:600])

In [36]:
vect = CountVectorizer(token_pattern='([a-z ]+)')
X_dtm = vect.fit_transform(X)
X_dtm.shape

(39774, 6678)

In [37]:
print(vect.get_feature_names()[4000:4100])

[u'mountain dew soda', u'mousse', u'mozzarella ball', u'mozzarella cheese', u'mozzarella string cheese', u'mr dash', u'mr dash seasoning mix', u'msg', u'muenster', u'muenster cheese', u'muesli', u'muffin', u'muffin mix', u'mulato chile', u'mullet', u'multi grain penne pasta', u'multigrain cereal', u'mung bean', u'mung bean noodle', u'mung bean sprout', u'mung bean vermicelli', u'muscadet', u'muscadine grape', u'muscat', u'muscovado sugar', u'muscovy', u'mushroom', u'mushroom broth', u'mushroom cap', u'mushroom powder', u'mushroom soup', u'mushroom soy sauce', u'mussel', u'mussel well scrubbed', u'mustard', u'mustard green', u'mustard oil', u'mustard powder', u'mustard sauce', u'mustard seed', u'mutton', u'myzithra', u'naan', u'nacho cheese tortilla chip', u'nacho chip', u'nakano seasoned rice vinegar', u'nam pla', u'nam prik', u'nama shoyu', u'napa cabbage', u'napa cabbage leaf', u'nappa cabbage', u'nashi', u'natto', u'natural low fat yogurt', u'natural peanut butter', u'natural pistac

In [38]:
vect

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='([a-z ]+)', tokenizer=None,
        vocabulary=None)

#### Multinomail Naive Bayes

In [39]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

#### *Algorithms to compare:*
(suggestions)

* Logistic Regression
* Random forests
* Deep Neural Networks

#### Building Pipeline

In [40]:
from sklearn.pipeline import make_pipeline
# pipe = make_pipeline(CountVectorizer(), MultinomialNB())

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer
pipe = make_pipeline(TfidfVectorizer(), MultinomialNB())

#### Cross Validation

In [60]:
from sklearn.cross_validation import cross_val_score
cross_val_score(pipe, X, y, cv=10, scoring='accuracy').mean()

0.67994100867600604

#### Optimizing Hyper Parameters 

In [43]:
from sklearn.grid_search import RandomizedSearchCV

In [44]:
import scipy as sp
param_grid = {}

In [45]:
# param_grid['countvectorizer__token_pattern'] = ['\\b\\w\\w+\\b' , '([a-z ]+)']
# # param_grid['countvectorizer__token_pattern'] = ['\b\w\w+\b', '([a-z ]+)']
# param_grid['countvectorizer__min_df'] = [1, 2, 3, 4]
# param_grid['countvectorizer__min_df'] = [1, 2, 3, 4]
# param_grid['multinomialnb__alpha'] = sp.stats.uniform(scale=1)
# np.random.seed(1)

In [69]:
param_grid = {}
param_grid['tfidfvectorizer__token_pattern'] = ['\\b\\w\\w+\\b', '([a-z ]+)']
# param_grid['tfidfvectorizer__min_df'] = [0.5]
param_grid['tfidfvectorizer__stop_words'] = [{'english'}]
param_grid['multinomialnb__alpha'] = sp.stats.uniform(scale=1)
np.random.seed(1)

In [70]:
rand = RandomizedSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_iter=10, random_state=1)
%time rand.fit(X, y)

CPU times: user 1min 11s, sys: 1 s, total: 1min 12s
Wall time: 1min 12s


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smoo...   vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'tfidfvectorizer__token_pattern': ['\\b\\w\\w+\\b', '([a-z ]+)'], 'tfidfvectorizer__stop_words': [set(['english'])], 'multinomialnb__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x118c25990>},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          scoring='accuracy', verbose=0)

In [71]:
rand.grid_scores_

[mean: 0.68185, std: 0.00433, params: {'tfidfvectorizer__token_pattern': '([a-z ]+)', 'multinomialnb__alpha': 0.417022004702574, 'tfidfvectorizer__stop_words': set(['english'])},
 mean: 0.65025, std: 0.00531, params: {'tfidfvectorizer__token_pattern': '([a-z ]+)', 'multinomialnb__alpha': 0.7203244934421581, 'tfidfvectorizer__stop_words': set(['english'])},
 mean: 0.72545, std: 0.00340, params: {'tfidfvectorizer__token_pattern': '\\b\\w\\w+\\b', 'multinomialnb__alpha': 0.00011437481734488664, 'tfidfvectorizer__stop_words': set(['english'])},
 mean: 0.72019, std: 0.00427, params: {'tfidfvectorizer__token_pattern': '\\b\\w\\w+\\b', 'multinomialnb__alpha': 0.30233257263183977, 'tfidfvectorizer__stop_words': set(['english'])},
 mean: 0.72721, std: 0.00515, params: {'tfidfvectorizer__token_pattern': '([a-z ]+)', 'multinomialnb__alpha': 0.14675589081711304, 'tfidfvectorizer__stop_words': set(['english'])},
 mean: 0.73566, std: 0.00503, params: {'tfidfvectorizer__token_pattern': '([a-z ]+)', '

In [72]:
print(rand.best_score_)
print(rand.best_params_)

0.735656458993
{'tfidfvectorizer__token_pattern': '([a-z ]+)', 'multinomialnb__alpha': 0.0923385947687978, 'tfidfvectorizer__stop_words': set(['english'])}


In [82]:
from sklearn.svm import SVC

In [100]:
pipe = make_pipeline(CountVectorizer(token_pattern='([a-z ]+)'), SVC(kernel='linear'))
cross_val_score(pipe, X, y, cv=10, scoring='accuracy').mean()

KeyboardInterrupt: 

In [87]:
SVC()


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [94]:
param_grid = {}
param_grid['countvectorizer__token_pattern'] = ['([a-z ]+)']
# param_grid['countvectorizer__min_df'] = [1,2,3,4]
param_grid['countvectorizer__stop_words'] = [{'english'}]
param_grid['svc__kernel'] = ['rbf']
C_range = np.logspace(-2, 10, 5)
gamma_range = np.logspace(-9, 3, 5)
param_grid['svc__C'] = C_range
param_grid['svc__gamma'] = gamma_range
np.random.seed(1)
print (C_range)
print (gamma_range)

[  1.00000000e-02   1.00000000e+01   1.00000000e+04   1.00000000e+07
   1.00000000e+10]
[  1.00000000e-09   1.00000000e-06   1.00000000e-03   1.00000000e+00
   1.00000000e+03]


In [95]:
from sklearn.grid_search import GridSearchCV
gscv = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
%time gscv.fit(X, y)

KeyboardInterrupt: 

In [None]:
gsvc.grid_scores_

In [None]:
print(gscv.best_score_)
print(gscv.best_params_)

In [103]:
from sklearn.svm import LinearSVC
pipe = make_pipeline(CountVectorizer(token_pattern='([a-z ]+)', stop_words={'english'}, ngram_range=(1,2)), \
                     LinearSVC(C= 5,loss='squared_hinge',penalty='l2', tol=0.01))
pipe

Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=set([... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.01,
     verbose=0))])

In [104]:
cross_val_score(pipe, X, y, cv=10, scoring='accuracy').mean()

0.71818201064743703

In [112]:
pipe = make_pipeline(CountVectorizer(token_pattern='([a-z ]+)', stop_words={'english'}), \
                     LinearSVC( tol=0.01))
param_grid = {}
param_grid['countvectorizer__ngram_range'] = [(1,1), (1,2)]
# param_grid['countvectorizer__min_df'] = [1,2,3,4]
#param_grid['countvectorizer__stop_words'] = [{'english'}]

param_grid['linearsvc__C'] = [1,10,100]
#param_grid['linearsvc__penalty'] = ['l1','l2']

In [113]:
gscv = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
%time gscv.fit(X, y)

CPU times: user 10min 15s, sys: 1.54 s, total: 10min 16s
Wall time: 10min 16s


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=set([... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.01,
     verbose=0))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'linearsvc__C': [1, 10, 100], 'countvectorizer__ngram_range': [(1, 1), (1, 2)]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [115]:
gscv.grid_scores_

[mean: 0.76771, std: 0.00330, params: {'linearsvc__C': 1, 'countvectorizer__ngram_range': (1, 1)},
 mean: 0.73543, std: 0.00695, params: {'linearsvc__C': 10, 'countvectorizer__ngram_range': (1, 1)},
 mean: 0.70539, std: 0.01474, params: {'linearsvc__C': 100, 'countvectorizer__ngram_range': (1, 1)},
 mean: 0.73928, std: 0.00671, params: {'linearsvc__C': 1, 'countvectorizer__ngram_range': (1, 2)},
 mean: 0.70317, std: 0.00808, params: {'linearsvc__C': 10, 'countvectorizer__ngram_range': (1, 2)},
 mean: 0.69010, std: 0.00664, params: {'linearsvc__C': 100, 'countvectorizer__ngram_range': (1, 2)}]

In [116]:
print(gscv.best_score_)
print(gscv.best_params_)

0.767712576055
{'linearsvc__C': 1, 'countvectorizer__ngram_range': (1, 1)}


In [None]:
LinearSVC.

### Predicting on Test Data

In [126]:
test = pd.read_json('input/test.json')

In [127]:
test['ingredients_str'] = test.ingredients.apply(clean_row).astype(str)

In [128]:
X_test = test.ingredients_str

In [129]:
pred_class_gscv = gscv.predict(X_test)
pred_class_gscv

array([u'irish', u'southern_us', u'italian', ..., u'italian',
       u'southern_us', u'mexican'], dtype=object)

In [132]:
# pred_prob_rand = rand.predict_proba(X_test)
# pred_prob_rand.shape

#### Kaggle submission

In [131]:
pd.DataFrame({'id':test.id, 'cuisine':pred_class_gscv}).set_index('id').to_csv('sub6.csv')

In [None]:
#With count vectorizer and first version of lemmatizer: 0.73471 on leaderboard

In [None]:
#With count vectorizer and 2nd version of lemmatizer: 0.75543 on leaderboard

In [None]:
#With tf-idf vectorizer and 2nd version of lemmatizer: 0.73854 on leaderboard

In [None]:
#With count vectorizer and 2nd version of lemmatizer and linearSVC: 0.77263 on leaderboard