In [1]:
import json

import numpy as np
import pandas as pd


# Load the train and test recipe sets from JSON files in ../_data.
# Each set is a list of recipe dicts with an 'id' and an 'ingredients' list;
# the train sample shown below also carries the 'cuisine' label.
with open('../_data/train.json', 'r') as f:
    train = json.load(f)
with open('../_data/test.json', 'r') as f:
    test = json.load(f)

In [2]:
len(train), len(test)

(39774, 9944)

In [3]:
train[0]

{'cuisine': 'greek',
 'id': 10259,
 'ingredients': ['romaine lettuce',
  'black olives',
  'grape tomatoes',
  'garlic',
  'pepper',
  'purple onion',
  'seasoning',
  'garbanzo beans',
  'feta cheese crumbles']}

In [4]:
import re
from collections import Counter

In [5]:
# Anything that is not a word character, whitespace, '-' or '_' counts as "special".
SPEC_CHARS = re.compile(r'[^\w\s\-_]')

# One entry per ingredient (train and test) that contains at least one special
# character; each entry is the list of special characters found in it.
chars = [
    SPEC_CHARS.findall(ingredient)
    for recipe in train + test
    for ingredient in recipe['ingredients']
    if SPEC_CHARS.search(ingredient)
]

# Frequency of every special character across all ingredients.
Counter(ch for found in chars for ch in found)

Counter({'!': 34,
         '%': 394,
         '&': 479,
         "'": 240,
         '(': 55,
         ')': 55,
         ',': 814,
         '.': 57,
         '/': 2,
         '®': 244,
         '’': 8,
         '€': 1,
         '™': 79})

### Clean rules
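* remove `'`, `’`, any parenthesised text containing `oz` (e.g. `(10 oz.)`), and any remaining `(` or `)`
* replace `&` with `and`
* replace everything else (`!`, `,`, `.`, `/`, `-`, `™`, `®`, `€`) with a space `' '`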
* keep `%`

In [6]:
# Patterns implementing the clean rules listed above.
# A parenthesised chunk that mentions "oz" is dropped whole. [^()]* keeps the match
# inside ONE pair of parentheses, so the text between two separate groups
# (e.g. "(2 oz) tomato (diced)") is not swallowed. Apostrophes and any remaining
# parentheses are removed individually.
SPEC_REMOVE = re.compile(r"\([^()]*oz[^()]*\)|['’()]")
SPEC_AND = re.compile(r'&')
# Everything that is not a word character, whitespace or '%' becomes a space.
SPEC_ELSE = re.compile(r'[^\w\s%]')


def clean_ingr(ingr):
    """Normalise one raw ingredient string.

    Steps, in order: delete apostrophes, parentheses and parenthesised "oz"
    quantities; replace '&' with 'and'; replace every other special character
    except '%' with a space; collapse runs of whitespace and trim the ends.

    Args:
        ingr: the raw ingredient text.

    Returns:
        The cleaned ingredient text. Case is left unchanged.
    """
    ingr = SPEC_REMOVE.sub('', ingr)
    ingr = SPEC_AND.sub('and', ingr)
    ingr = SPEC_ELSE.sub(' ', ingr)
    # split()/join collapses the double spaces left behind by the substitutions.
    return ' '.join(ingr.split())

## Pipeline using TFIDF (bag of ingredients) and SVC

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [8]:
def itself(x):
    """Identity function: hand back the argument unchanged.

    Passed as both `tokenizer` and `preprocessor` to the "ingrs" TfidfVectorizer
    so that a recipe's ingredient list is used as its token list as-is.
    """
    return x

In [9]:
# Re-definition of the cleaning helpers from the "Clean rules" cell; being later
# in the notebook, this is the definition in effect for the cells below.
# A parenthesised chunk that mentions "oz" is dropped whole. [^()]* keeps the match
# inside ONE pair of parentheses, so the text between two separate groups
# (e.g. "(2 oz) tomato (diced)") is not swallowed. Apostrophes and any remaining
# parentheses are removed individually.
SPEC_REMOVE = re.compile(r"\([^()]*oz[^()]*\)|['’()]")
SPEC_AND = re.compile(r'&')
# Everything that is not a word character, whitespace or '%' becomes a space.
SPEC_ELSE = re.compile(r'[^\w\s%]')


def clean_ingr(ingr):
    """Normalise one raw ingredient string.

    Steps, in order: delete apostrophes, parentheses and parenthesised "oz"
    quantities; replace '&' with 'and'; replace every other special character
    except '%' with a space; collapse runs of whitespace and trim the ends.

    Args:
        ingr: the raw ingredient text.

    Returns:
        The cleaned ingredient text. Case is left unchanged.
    """
    ingr = SPEC_REMOVE.sub('', ingr)
    ingr = SPEC_AND.sub('and', ingr)
    ingr = SPEC_ELSE.sub(' ', ingr)
    # split()/join collapses the double spaces left behind by the substitutions.
    return ' '.join(ingr.split())

In [10]:
def get_ingrs(given):
    """Return the cleaned, lower-cased ingredient list of every recipe in `given`.

    Each recipe's 'ingredients' entries are passed through clean_ingr and then
    lower-cased. The result has one inner list per recipe, in the input order.
    """
    cleaned = []
    for recipe in given:
        cleaned.append([clean_ingr(raw).lower() for raw in recipe['ingredients']])
    return cleaned

def get_labels(given):
    """Return the 'cuisine' value of every recipe in `given`, in input order."""
    labels = []
    for recipe in given:
        labels.append(recipe['cuisine'])
    return labels

In [11]:
train_ingrs = get_ingrs(train)

In [12]:
def combine_words(ilist):
    """Join a recipe's ingredient strings into one space-separated string.

    Used as the `preprocessor` of the word-level TfidfVectorizer, which then
    splits the combined text into words itself.
    """
    separator = ' '
    return separator.join(ilist)

In [13]:
from sklearn.pipeline import FeatureUnion
from sklearn.svm import LinearSVC

In [14]:
# Two TF-IDF views of each recipe, concatenated by the FeatureUnion and fed to a
# linear SVM:
#   "ingrs" - each cleaned ingredient string is one token (`itself` is used as both
#             preprocessor and tokenizer, so the ingredient list passes through as-is);
#   "words" - the ingredients are joined into one string by `combine_words` and
#             tokenised into words by the vectorizer, with English stop words removed.
# NOTE(review): per the scikit-learn docs a custom `preprocessor` overrides the
# strip_accents/lowercase stage, so strip_accents='unicode' probably has no effect
# in either vectorizer - confirm.
ingr_word = Pipeline(steps=[
    ('union', FeatureUnion(transformer_list=[
        ("ingrs", TfidfVectorizer(preprocessor=itself,
                                  tokenizer=itself,
                                  strip_accents='unicode')),
        ("words", TfidfVectorizer(preprocessor=combine_words,
                                  stop_words='english',
                                  strip_accents='unicode')),
    ])),
    ("linear svc", LinearSVC(C=10**0.1, loss='hinge')),
])

In [32]:
# Word-level model only: ingredients are joined into one string by `combine_words`,
# turned into TF-IDF features over word unigrams and bigrams, then classified by a
# linear SVM.
# NOTE(review): per the scikit-learn docs a custom `preprocessor` overrides the
# strip_accents/lowercase stage, so strip_accents='unicode' probably has no effect
# here - confirm.
word = Pipeline(steps=[
    ("words", TfidfVectorizer(preprocessor=combine_words,
                              ngram_range=(1, 2),
                              strip_accents='unicode')),
    ("linear svc", LinearSVC(C=10**0.1, loss='hinge')),
])

In [28]:
train_ingrs = get_ingrs(train)
train_labels = get_labels(train)

# Using stop words
## Both

In [17]:
%%time
scores = cross_val_score(ingr_word, train_ingrs, train_labels, cv=5, n_jobs=-1, verbose=10)

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ....................... , score=0.7893413775766717, total=  12.5s
[CV] ....................... , score=0.7852298417483045, total=  13.1s


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   13.6s remaining:   20.4s


[CV] ....................... , score=0.7856873349264244, total=  12.7s


[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   14.0s remaining:    9.3s


[CV] ....................... , score=0.7977853277966528, total=  13.1s
[CV] ....................... , score=0.7931641115858256, total=  14.0s
CPU times: user 1.12 s, sys: 108 ms, total: 1.23 s
Wall time: 15 s


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.7s finished


In [18]:
scores

array([ 0.78522984,  0.79316411,  0.78934138,  0.78568733,  0.79778533])

In [19]:
scores.mean()

0.7902415987267758

## Word only

In [33]:
%%time
scores = cross_val_score(word, train_ingrs, train_labels, cv=5, n_jobs=-1, verbose=10)

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ....................... , score=0.7937924101533048, total=  11.8s
[CV] ....................... , score=0.7901281085154483, total=  12.4s
[CV] ....................... , score=0.7914936454007802, total=  11.7s
[CV] ....................... , score=0.7806565211923029, total=  12.0s


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   12.9s remaining:   19.4s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   13.0s remaining:    8.7s


[CV] ....................... , score=0.7859477124183006, total=  12.5s
CPU times: user 1.09 s, sys: 156 ms, total: 1.24 s
Wall time: 13.7 s


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.4s finished


In [34]:
scores

array([ 0.79012811,  0.79379241,  0.78594771,  0.78065652,  0.79149365])

In [35]:
scores.mean()

0.78840367953602741

## Grid search

In [36]:
# Base estimator for the grid search: the same two-view TF-IDF union + linear SVM
# as `ingr_word`, but with no stop-word filtering on the "words" vectorizer.
ingr_word1 = Pipeline(steps=[
    ('union', FeatureUnion(transformer_list=[
        ("ingrs", TfidfVectorizer(preprocessor=itself,
                                  tokenizer=itself,
                                  strip_accents='unicode')),
        ("words", TfidfVectorizer(preprocessor=combine_words,
                                  strip_accents='unicode')),
    ])),
    ("linear svc", LinearSVC(C=10**0.1, loss='hinge')),
])

In [38]:
ingr_word1.get_params()

{'linear svc': LinearSVC(C=1.2589254117941673, class_weight=None, dual=True,
      fit_intercept=True, intercept_scaling=1, loss='hinge', max_iter=1000,
      multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
      verbose=0),
 'linear svc__C': 1.2589254117941673,
 'linear svc__class_weight': None,
 'linear svc__dual': True,
 'linear svc__fit_intercept': True,
 'linear svc__intercept_scaling': 1,
 'linear svc__loss': 'hinge',
 'linear svc__max_iter': 1000,
 'linear svc__multi_class': 'ovr',
 'linear svc__penalty': 'l2',
 'linear svc__random_state': None,
 'linear svc__tol': 0.0001,
 'linear svc__verbose': 0,
 'memory': None,
 'steps': [('union', FeatureUnion(n_jobs=1,
          transformer_list=[('ingrs', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), norm='l2',
           prepr

In [40]:
from sklearn.model_selection import GridSearchCV

In [41]:
# Search grid for the whole-ingredient ("ingrs") vectorizer inside ingr_word1:
# n-gram ranges (1, 1) .. (1, 4), each with and without English stop words,
# i.e. 4 x 2 = 8 candidates.
param_test1 = {
    'union__ingrs__ngram_range': [(1, upper) for upper in range(1, 5)],
    'union__ingrs__stop_words': [None, 'english'],
}

# 3-fold cross-validated accuracy for every candidate, run on 4 workers.
# NOTE(review): `iid` was deprecated and later removed from GridSearchCV in newer
# scikit-learn releases; drop it when upgrading - confirm against the installed version.
gsearch1 = GridSearchCV(
    ingr_word1,
    param_test1,
    cv=3,
    scoring='accuracy',
    iid=False,
    n_jobs=4,
    verbose=50,
)

In [43]:
from time import strftime

def print_time():
    """Print the current local time as YYMMDD-HHMMSS; returns None."""
    stamp = strftime('%y%m%d-%H%M%S')
    print(stamp)

In [44]:
%%time
print_time()

gsearch1.fit(train_ingrs, train_labels)

print_time()

180406-185625
Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] union__ingrs__ngram_range=(1, 1), union__ingrs__stop_words=None .
[CV] union__ingrs__ngram_range=(1, 1), union__ingrs__stop_words=None .
[CV] union__ingrs__ngram_range=(1, 1), union__ingrs__stop_words=None .
[CV] union__ingrs__ngram_range=(1, 1), union__ingrs__stop_words=english 
[CV]  union__ingrs__ngram_range=(1, 1), union__ingrs__stop_words=None, score=0.7845991402066521, total=   8.4s
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:   10.4s
[CV] union__ingrs__ngram_range=(1, 1), union__ingrs__stop_words=english 
[CV]  union__ingrs__ngram_range=(1, 1), union__ingrs__stop_words=None, score=0.7879816029555907, total=   9.3s
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:   11.2s
[CV] union__ingrs__ngram_range=(1, 1), union__ingrs__stop_words=english 
[CV]  union__ingrs__ngram_range=(1, 1), union__ingrs__stop_words=None, score=0.7905221853305161, total=   9.2s
[Parallel(n_jobs=4)]: Done   3 tasks   

In [46]:
gsearch1.best_params_

{'union__ingrs__ngram_range': (1, 4), 'union__ingrs__stop_words': None}

In [47]:
gsearch1.best_score_

0.79534408705787352

## Predicting on the test data (vectorizer fitted on all train+test ingredients)

In [14]:
%%time
# Fit the two-view TF-IDF encoder on the cleaned ingredients of train AND test
# together, so that the vocabulary and IDF weights cover both sets.
# NOTE(review): the grid search tuned `union__ingrs__ngram_range`, i.e. the "ingrs"
# vectorizer, but ngram_range=(1, 4) is applied to the "words" vectorizer here -
# confirm which one was intended.
dvec_all = FeatureUnion(transformer_list=[
    ("ingrs", TfidfVectorizer(preprocessor=itself,
                              tokenizer=itself,
                              strip_accents='unicode')),
    ("words", TfidfVectorizer(preprocessor=combine_words,
                              ngram_range=(1, 4),
                              strip_accents='unicode')),
]).fit(get_ingrs(train + test))

CPU times: user 9.31 s, sys: 156 ms, total: 9.47 s
Wall time: 9.46 s


In [18]:
test_bag = dvec_all.transform(get_ingrs(test))

In [19]:
dvec_all.transform(train_ingrs)

<39774x907529 sparse matrix of type '<class 'numpy.float64'>'
	with 3404836 stored elements in Compressed Sparse Row format>

In [50]:
svc_linear =  LinearSVC(loss='hinge', C=10**0.1)

In [51]:
%%time
svc_linear = svc_linear.fit(dvec_all.transform(train_ingrs), train_labels)

CPU times: user 26 s, sys: 135 ms, total: 26.2 s
Wall time: 26.2 s


In [52]:
svc_linear.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'],
      dtype='<U12')

In [53]:
test_preds = svc_linear.predict(dvec_all.transform(get_ingrs(test)))

In [54]:
test_preds.shape

(9944,)

In [55]:
test_ids = [r['id'] for r in test]

In [56]:
# Submission table: one row per test recipe with its id and predicted cuisine.
# Built column-wise from a dict so each column keeps its own dtype; the previous
# list-of-lists + transpose() construction forced both columns to object dtype.
df_test = pd.DataFrame(
    {'id': test_ids, 'cuisine': list(test_preds)},
    columns=['id', 'cuisine'],
)

In [57]:
df_test.to_csv('../_data/180406_ingr_14ngram_linearsvc_gridc.csv', index=False)

### Kaggle score: 0.79605
### Leaderboard rank: 270/1388
![Kaggle](../_images/180406_14ngram_ingr_linearsvc.png)
![Kaggle](../_images/180406_14ngram_ingr_linearsvc_standing.png)

## Possible improvements
* Grid search on hyperparameters (kernel type, etc.)