In [1]:
import numpy as np
import json


with open('../_data/train.json', 'r') as f:
    train = json.load(f)
with open('../_data/test.json', 'r') as f:
    test = json.load(f)

In [2]:
len(train), len(test)

(39774, 9944)

In [3]:
train[0]

{'cuisine': 'greek',
 'id': 10259,
 'ingredients': ['romaine lettuce',
  'black olives',
  'grape tomatoes',
  'garlic',
  'pepper',
  'purple onion',
  'seasoning',
  'garbanzo beans',
  'feta cheese crumbles']}

In [4]:
import re
from collections import Counter

In [5]:
SPEC_CHARS = re.compile(r'[^\w\s\-_]')

chars = [re.findall(SPEC_CHARS, x)\
 for ilist in [r['ingredients'] for r in train+test] for x in ilist if re.search(SPEC_CHARS, x)]

Counter([x for charlist in chars for x in charlist])

Counter({'!': 34,
         '%': 394,
         '&': 479,
         "'": 240,
         '(': 55,
         ')': 55,
         ',': 814,
         '.': 57,
         '/': 2,
         '®': 244,
         '’': 8,
         '€': 1,
         '™': 79})

### Clean rules
* remove `'`, `’`, `( oz*)`, `(`, `)`
* replace `&` with `and`
* replace all else (`™`, `®`, `.`, `€`) with `' '`
* keep `%`

In [6]:
SPEC_REMOVE = re.compile(r'(\'|\’|\(.*oz.*\)|(\()|(\)))')
SPEC_AND = re.compile(r'\&')
SPEC_ELSE = re.compile(r'[^\w\s\%_]')

def clean_ingr(ingr):
    ingr = re.sub(SPEC_REMOVE, '', ingr)
    ingr = re.sub(SPEC_AND, 'and', ingr)
    ingr = re.sub(SPEC_ELSE, ' ', ingr)
    return ' '.join(ingr.split())

## Pipeline using TFIDF (bag of ingredients) and SVC

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [8]:
def itself(x):
    return x

In [9]:
SPEC_REMOVE = re.compile(r'(\'|\’|\(.*oz.*\)|(\()|(\)))')
SPEC_AND = re.compile(r'\&')
SPEC_ELSE = re.compile(r'[^\w\s\%_]')

def clean_ingr(ingr):
    ingr = re.sub(SPEC_REMOVE, '', ingr)
    ingr = re.sub(SPEC_AND, 'and', ingr)
    ingr = re.sub(SPEC_ELSE, ' ', ingr)
    return ' '.join(ingr.split())

In [10]:
def get_ingrs(given):
    ingrs = [[clean_ingr(i).lower() for i in recipe['ingredients']] for recipe in given]
    return ingrs

def get_labels(given):
    return [r['cuisine'] for r in given]

In [11]:
train_ingrs = get_ingrs(train)

In [64]:
def combine_words(ilist):
    return ' '.join(ilist)

def combine_words_slash(ilist):
    return '/'.join(ilist)

In [50]:
train_ingrs

[['romaine lettuce',
  'black olives',
  'grape tomatoes',
  'garlic',
  'pepper',
  'purple onion',
  'seasoning',
  'garbanzo beans',
  'feta cheese crumbles'],
 ['plain flour',
  'ground pepper',
  'salt',
  'tomatoes',
  'ground black pepper',
  'thyme',
  'eggs',
  'green tomatoes',
  'yellow corn meal',
  'milk',
  'vegetable oil'],
 ['eggs',
  'pepper',
  'salt',
  'mayonaise',
  'cooking oil',
  'green chilies',
  'grilled chicken breasts',
  'garlic powder',
  'yellow onion',
  'soy sauce',
  'butter',
  'chicken livers'],
 ['water', 'vegetable oil', 'wheat', 'salt'],
 ['black pepper',
  'shallots',
  'cornflour',
  'cayenne pepper',
  'onions',
  'garlic paste',
  'milk',
  'butter',
  'salt',
  'lemon juice',
  'water',
  'chili powder',
  'passata',
  'oil',
  'ground cumin',
  'boneless chicken skinless thigh',
  'garam masala',
  'double cream',
  'natural yogurt',
  'bay leaf'],
 ['plain flour',
  'sugar',
  'butter',
  'eggs',
  'fresh ginger root',
  'salt',
  'ground 

In [71]:
tfidf = TfidfVectorizer(strip_accents='unicode', preprocessor=combine_words_slash, analyzer='char', ngram_range=(3, 3))

In [72]:
%%time
tfidf.fit(train_ingrs)

CPU times: user 2.48 s, sys: 96 ms, total: 2.58 s
Wall time: 2.58 s


TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(3, 3), norm='l2',
        preprocessor=<function combine_words_slash at 0x7f0012f47488>,
        smooth_idf=True, stop_words=None, strip_accents='unicode',
        sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, use_idf=True, vocabulary=None)

In [73]:
len(tfidf.vocabulary_)

4935

In [74]:
tfidf.vocabulary_

{'rom': 3811,
 'oma': 3271,
 'mai': 2751,
 'ain': 583,
 'ine': 2175,
 'ne ': 2964,
 'e l': 1271,
 ' le': 96,
 'let': 2571,
 'ett': 1528,
 'ttu': 4294,
 'tuc': 4298,
 'uce': 4386,
 'ce/': 986,
 'e/b': 1291,
 '/bl': 227,
 'bla': 865,
 'lac': 2521,
 'ack': 524,
 'ck ': 1021,
 'k o': 2339,
 ' ol': 117,
 'oli': 3262,
 'liv': 2603,
 'ive': 2266,
 'ves': 4579,
 'es/': 1503,
 's/g': 3927,
 '/gr': 277,
 'gra': 1815,
 'rap': 3660,
 'ape': 664,
 'pe ': 3486,
 'e t': 1279,
 ' to': 159,
 'tom': 4266,
 'mat': 2759,
 'ato': 722,
 'toe': 4261,
 'oes': 3209,
 '/ga': 270,
 'gar': 1741,
 'arl': 685,
 'rli': 3773,
 'lic': 2589,
 'ic/': 2078,
 'c/p': 950,
 '/pe': 347,
 'pep': 3499,
 'epp': 1474,
 'ppe': 3552,
 'per': 3500,
 'er/': 1479,
 'r/p': 3634,
 '/pu': 354,
 'pur': 3577,
 'urp': 4501,
 'rpl': 3827,
 'ple': 3528,
 'le ': 2553,
 'e o': 1274,
 ' on': 118,
 'oni': 3290,
 'nio': 3018,
 'ion': 2196,
 'on/': 3281,
 'n/s': 2909,
 '/se': 366,
 'sea': 3980,
 'eas': 1326,
 'aso': 706,
 'son': 4051,
 'nin': 3017

In [32]:
sorted(counts.items(), key=lambda x: x[1])

[('bark', 1),
 ('taiwanese', 1),
 ('membrillo', 1),
 ('blueberri', 1),
 ('guacamol', 1),
 ('muscavado', 1),
 ('liquorice', 1),
 ('beverages', 1),
 ('arame', 1),
 ('romana', 1),
 ('hurst', 1),
 ('harvest', 1),
 ('trimmed', 1),
 ('abbamele', 1),
 ('sablefish', 1),
 ('fiber', 1),
 ('lillet', 1),
 ('xuxu', 1),
 ('neapolitan', 1),
 ('poppyseeds', 1),
 ('usukuchi', 1),
 ('st', 1),
 ('germain', 1),
 ('pekoe', 1),
 ('sobrasada', 1),
 ('ti', 1),
 ('parslei', 1),
 ('argo', 1),
 ('mantou', 1),
 ('pekin', 1),
 ('stolichnaya', 1),
 ('shiromiso', 1),
 ('fiddlehead', 1),
 ('ferns', 1),
 ('poi', 1),
 ('skippy', 1),
 ('hoi', 1),
 ('pod', 1),
 ('mixers', 1),
 ('superior', 1),
 ('margherita', 1),
 ('beaters', 1),
 ('nian', 1),
 ('gao', 1),
 ('milkfat', 1),
 ('qua', 1),
 ('burro', 1),
 ('ocean', 1),
 ('caponata', 1),
 ('submarine', 1),
 ('calabrese', 1),
 ('pâte', 1),
 ('brisée', 1),
 ('gravlax', 1),
 ('bream', 1),
 ('genoise', 1),
 ('haricot', 1),
 ('belacan', 1),
 ('saba', 1),
 ('masur', 1),
 ('fowl', 1

In [25]:
from sklearn.pipeline import FeatureUnion

In [75]:
word_ingr = Pipeline([
    ('union', FeatureUnion([
        ("ingrs", TfidfVectorizer(strip_accents='unicode',
                                  tokenizer=itself,
                                  preprocessor=itself)),
        ("words", TfidfVectorizer(strip_accents='unicode',
                                  preprocessor=combine_words)),
        ("ngram", TfidfVectorizer(strip_accents='unicode',
                                  preprocessor=combine_words_slash,
                                  analyzer='char',
                                  ngram_range=(3, 3))),
        ])),
    ("linear svc", SVC(kernel="linear", C=10**0.1))
])

In [76]:
%%time
scores = cross_val_score(word_ingr, get_ingrs(train), get_labels(train), cv=5, n_jobs=-1, verbose=10)

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ....................... , score=0.7914048755968837, total=16.1min
[CV] ....................... , score=0.7861090178347149, total=16.2min


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 16.2min remaining: 24.3min


[CV] ....................... , score=0.7800276694755377, total=16.8min


[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 16.8min remaining: 11.2min


[CV] ....................... , score=0.7888386123680241, total=16.9min
[CV] ....................... , score=0.7925003145841198, total=17.0min
CPU times: user 3.4 s, sys: 337 ms, total: 3.74 s
Wall time: 17min 1s


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 17.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 17.0min finished


In [77]:
scores

array([ 0.78610902,  0.79140488,  0.78883861,  0.78002767,  0.79250031])

#### Cross-validation accuracy

In [78]:
scores.mean()

0.7877760979718561

### Only ngrams

In [84]:
word_ingr = Pipeline([
    ("ngram", TfidfVectorizer(strip_accents='unicode',
                                  preprocessor=combine_words_slash,
                                  analyzer='char',
                                  ngram_range=(3, 3))),
    ("linear svc", SVC(kernel="linear", C=10**0.1))
])

In [85]:
%%time
scores = cross_val_score(word_ingr, get_ingrs(train), get_labels(train), cv=5, n_jobs=-1, verbose=50)

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ........................ , score=0.782051282051282, total=13.1min
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 13.1min
[CV] ....................... , score=0.7766318702050057, total=13.4min
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 13.5min remaining: 20.2min
[CV] ........................ , score=0.786460299484082, total=13.5min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 13.5min remaining:  9.0min
[CV] ....................... , score=0.7858758482030661, total=13.8min
[CV] ....................... , score=0.7778196433057021, total=13.8min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 13.8min rema

In [86]:
scores

array([ 0.77781964,  0.78587585,  0.78205128,  0.77663187,  0.7864603 ])

#### Cross-validation accuracy

In [87]:
# C=10**0.1 ~ 1.25
scores.mean()

0.78176778864982754

## Fitting to test data (after encoding all of train+test ingredients)

In [88]:
%%time
dvec_all = FeatureUnion([
        ("ingrs", TfidfVectorizer(strip_accents='unicode',
                                  tokenizer=itself,
                                  preprocessor=itself)),
        ("words", TfidfVectorizer(strip_accents='unicode',
                                  preprocessor=combine_words)),
        ("ngram", TfidfVectorizer(strip_accents='unicode',
                                  preprocessor=combine_words_slash,
                                  analyzer='char',
                                  ngram_range=(3, 3))),
        ]).fit(get_ingrs(train+test))

CPU times: user 6.16 s, sys: 104 ms, total: 6.26 s
Wall time: 6.26 s


In [89]:
test_bag = dvec_all.transform(get_ingrs(test))

In [90]:
svc_linear = SVC(kernel='linear', C=10**0.1)

In [92]:
%%time
svc_linear = svc_linear.fit(dvec_all.transform(get_ingrs(train)), get_labels(train))

CPU times: user 14min 3s, sys: 376 ms, total: 14min 3s
Wall time: 14min 4s


In [93]:
svc_linear.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'],
      dtype='<U12')

In [94]:
svc_linear.get_params()

{'C': 1.2589254117941673,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [95]:
test_preds = svc_linear.predict(dvec_all.transform(get_ingrs(test)))

In [96]:
test_preds.shape

(9944,)

In [97]:
test_ids = [r['id'] for r in test]

In [98]:
df_test = pd.DataFrame([test_ids, list(test_preds)]).transpose()
df_test.columns = ['id', 'cuisine']

In [99]:
df_test.to_csv('../_data/180406_clean_TFIDF_ingr_word_3gram_SVM_gridC.csv', index=False)

## Results
Accuracy 0.78921  
Rank 468

![kaggle image](../_images/180406_ingr_words.png)
![kaggle image](../_images/180406_ingr_words_standing.png)

## Possible improvements
* Grid search on hyperparameters (kernel type, etc.)