In [1]:
import json

import numpy as np
import pandas as pd


# Read the train and test recipe lists from JSON.
# The encoding is given explicitly because the ingredient strings contain non-ASCII
# characters (e.g. ®, ™, ’) and open() otherwise uses the platform's locale encoding.
with open('../_data/train.json', 'r', encoding='utf-8') as f:
    train = json.load(f)
with open('../_data/test.json', 'r', encoding='utf-8') as f:
    test = json.load(f)

In [2]:
len(train), len(test)

(39774, 9944)

In [3]:
train[0]

{'cuisine': 'greek',
 'id': 10259,
 'ingredients': ['romaine lettuce',
  'black olives',
  'grape tomatoes',
  'garlic',
  'pepper',
  'purple onion',
  'seasoning',
  'garbanzo beans',
  'feta cheese crumbles']}

In [4]:
import re
from collections import Counter

In [5]:
# Anything that is not a word character, whitespace, "-" or "_" counts as a special character.
SPEC_CHARS = re.compile(r'[^\w\s\-_]')

# One list of matched characters per ingredient string (train and test together);
# ingredients without any special character are left out.
chars = [
    found
    for found in (
        SPEC_CHARS.findall(ingr)
        for recipe in train + test
        for ingr in recipe['ingredients']
    )
    if found
]

# How often each special character occurs over all ingredient strings.
Counter(ch for found in chars for ch in found)

Counter({'!': 34,
         '%': 394,
         '&': 479,
         "'": 240,
         '(': 55,
         ')': 55,
         ',': 814,
         '.': 57,
         '/': 2,
         '®': 244,
         '’': 8,
         '€': 1,
         '™': 79})

### Clean rules
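* remove `'`, `’`, parenthesised size notes that contain `oz` (e.g. `(10 oz.)`), and any remaining `(`, `)`
* replace `&` with `and`
* replace everything else (`™`, `®`, `.`, `,`, `!`, `/`, `€`, and also `-`) with a space `' '`
* keep `%`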

In [6]:
# Deleted outright: straight/curly apostrophes, a parenthesised size note containing "oz"
# (e.g. "(10 oz.)"), and any parenthesis that is left over.
# The size-note pattern is confined to one pair of parentheses ([^()]*): a greedy ".*" would
# run from the first "(" to the last ")" and delete every word in between.
SPEC_REMOVE = re.compile(r"['’]|\([^()]*oz[^()]*\)|[()]")
SPEC_AND = re.compile(r'&')
# Every character that is not a word character, whitespace or "%" is turned into a space.
SPEC_ELSE = re.compile(r'[^\w\s%]')

def clean_ingr(ingr):
    """Normalise one ingredient string.

    Steps, in order: delete the SPEC_REMOVE matches, replace "&" with "and",
    replace every other special character (except "%") with a space, then
    collapse runs of whitespace and strip the ends.

    Parameters
    ----------
    ingr : str
        Raw ingredient text.

    Returns
    -------
    str
        The cleaned text; case is left unchanged.
    """
    ingr = SPEC_REMOVE.sub('', ingr)
    ingr = SPEC_AND.sub('and', ingr)
    ingr = SPEC_ELSE.sub(' ', ingr)
    return ' '.join(ingr.split())

## Pipeline using TFIDF (bag of ingredients) and SVC

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [8]:
def itself(x):
    """Identity function: hand back the argument untouched.

    Passed to TfidfVectorizer as tokenizer/preprocessor so that an input that is
    already a list of ingredient strings is used as-is.
    """
    return x

In [9]:
# NOTE: this cell repeats the cleaning definitions from the earlier "Clean rules" cell and,
# being later, is the version that is in effect for the pipeline below.
#
# Deleted outright: straight/curly apostrophes, a parenthesised size note containing "oz"
# (e.g. "(10 oz.)"), and any parenthesis that is left over.
# The size-note pattern is confined to one pair of parentheses ([^()]*): a greedy ".*" would
# run from the first "(" to the last ")" and delete every word in between.
SPEC_REMOVE = re.compile(r"['’]|\([^()]*oz[^()]*\)|[()]")
SPEC_AND = re.compile(r'&')
# Every character that is not a word character, whitespace or "%" is turned into a space.
SPEC_ELSE = re.compile(r'[^\w\s%]')

def clean_ingr(ingr):
    """Normalise one ingredient string.

    Steps, in order: delete the SPEC_REMOVE matches, replace "&" with "and",
    replace every other special character (except "%") with a space, then
    collapse runs of whitespace and strip the ends.

    Parameters
    ----------
    ingr : str
        Raw ingredient text.

    Returns
    -------
    str
        The cleaned text; case is left unchanged.
    """
    ingr = SPEC_REMOVE.sub('', ingr)
    ingr = SPEC_AND.sub('and', ingr)
    ingr = SPEC_ELSE.sub(' ', ingr)
    return ' '.join(ingr.split())

In [10]:
def get_ingrs(given):
    """Return, for each recipe in `given`, its ingredients cleaned and lower-cased.

    `given` is an iterable of recipe dicts with an 'ingredients' list; the result
    is a list of lists of strings in the same order.
    """
    cleaned = []
    for recipe in given:
        cleaned.append([clean_ingr(name).lower() for name in recipe['ingredients']])
    return cleaned

def get_labels(given):
    """Return the 'cuisine' value of every recipe dict in `given`, in order."""
    labels = []
    for recipe in given:
        labels.append(recipe['cuisine'])
    return labels

In [11]:
# Cleaned, lower-cased ingredient lists for every training recipe.
# NOTE(review): the later cells call get_ingrs(train) again rather than reading this variable.
train_ingrs = get_ingrs(train)

In [12]:
def combine_words(ilist):
    """Join a recipe's ingredient strings into one space-separated string."""
    separator = ' '
    return separator.join(ilist)

In [13]:
from sklearn.pipeline import FeatureUnion
from sklearn.multiclass import OneVsRestClassifier

In [14]:
# Two TF-IDF views of each recipe, concatenated and fed to an RBF-kernel SVC:
#   "ingrs" - tokenizer and preprocessor are both the identity, so every ingredient string in
#             the recipe's list is used as one token;
#   "words" - the ingredient list is first joined into a single string by combine_words and
#             then tokenized by the vectorizer itself.
# NOTE(review): scikit-learn documents `preprocessor` as overriding the strip_accents/lowercase
# stage, so strip_accents='unicode' may have no effect here - confirm.
word_ingr = Pipeline(steps=[
    ("union", FeatureUnion(transformer_list=[
        ("ingrs", TfidfVectorizer(tokenizer=itself,
                                  preprocessor=itself,
                                  strip_accents='unicode')),
        ("words", TfidfVectorizer(preprocessor=combine_words,
                                  strip_accents='unicode')),
    ])),
    ("svc", SVC(C=100, gamma=1, kernel="rbf")),
])

# The whole pipeline (vectorizers included) is the base estimator of the one-vs-rest wrapper.
ovr = OneVsRestClassifier(estimator=word_ingr)

In [15]:
%%time
# 5-fold cross-validation of the one-vs-rest pipeline; folds run in parallel (n_jobs=-1).
# The log recorded below shows each fold taking roughly 155-158 minutes.
scores = cross_val_score(ovr, get_ingrs(train), get_labels(train), cv=5, n_jobs=-1, verbose=50)

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ....................... , score=0.804678656772733, total=155.2min
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 155.2min
[CV] ...................... , score=0.8125942684766214, total=157.0min
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 157.0min remaining: 235.5min
[CV] ........................ , score=0.80997236875157, total=157.3min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 157.3min remaining: 104.9min
[CV] ...................... , score=0.8141437020259217, total=157.8min
[CV] ...................... , score=0.8179190751445087, total=158.2min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 158.2mi

In [16]:
scores

array([ 0.80997237,  0.81791908,  0.81259427,  0.80467866,  0.8141437 ])

#### Cross-validation accuracy

In [17]:
scores.mean()

0.81186161423427095

### Only words

In [46]:
# Words-only variant: a single TF-IDF over the joined ingredient text, then a linear SVC.
# This rebinds the name `word_ingr` that the earlier union pipeline used.
word_ingr = Pipeline(steps=[
    ("words", TfidfVectorizer(preprocessor=combine_words,
                              strip_accents='unicode')),
    ("linear svc", SVC(C=10**0, kernel="linear")),
])

In [47]:
%%time
# 5-fold cross-validation of the words-only linear-SVC pipeline; folds run in parallel (n_jobs=-1).
# Overwrites the `scores` array from the earlier union-pipeline run.
scores = cross_val_score(word_ingr, get_ingrs(train), get_labels(train), cv=5, n_jobs=-1)

CPU times: user 2.9 s, sys: 164 ms, total: 3.07 s
Wall time: 2min 52s


In [48]:
scores

array([ 0.78447626,  0.79228449,  0.78657617,  0.78065652,  0.79099031])

#### Cross-validation accuracy

In [49]:
# C=1
scores.mean()

0.78699675135350899

In [45]:
# C=10**0.1 ~ 1.25
# NOTE(review): this cell's execution count (45) is lower than that of the pipeline cell above
# (46), so the value below presumably comes from an earlier run with C=10**0.1 whose
# definition is no longer in the notebook - it will not reproduce on Restart & Run All.
scores.mean()

0.78868098685056176

## Predicting on test data (vectorizers fitted on all train+test ingredients)

In [19]:
%%time
# The same two TF-IDF views as in the cross-validated pipeline, but fitted on the
# ingredients of train and test together.
dvec_all = FeatureUnion(
    transformer_list=[
        ("ingrs", TfidfVectorizer(tokenizer=itself,
                                  preprocessor=itself,
                                  strip_accents='unicode')),
        ("words", TfidfVectorizer(preprocessor=combine_words,
                                  strip_accents='unicode')),
    ],
)
dvec_all = dvec_all.fit(get_ingrs(train + test))

CPU times: user 3.41 s, sys: 6.75 ms, total: 3.42 s
Wall time: 3.42 s


In [20]:
# TF-IDF feature matrix for the test recipes, produced by the union fitted on train+test.
test_bag = dvec_all.transform(get_ingrs(test))

In [26]:
# RBF-kernel SVC with the same C and gamma as the cross-validated union pipeline,
# wrapped one-vs-rest with n_jobs=4. Rebinds `ovr`, which previously wrapped the full pipeline.
svc_rbf = SVC(kernel="rbf", C=100, gamma=1)
ovr = OneVsRestClassifier(svc_rbf, n_jobs=4)

In [23]:
from time import strftime

def print_time():
    """Print the current time as a compact ``yymmdd-HHMMSS`` stamp; returns None."""
    stamp = strftime('%y%m%d-%H%M%S')
    print(stamp)

In [None]:
%%time
# Stamp the time before and after the fit so the duration is visible in the cell output.
# NOTE(review): the saved output shows only the first stamp and the cell has no execution
# count, so the fit had apparently not finished when the notebook was saved.
print_time()

# Vectorize the training recipes with the train+test-fitted union, then fit the one-vs-rest SVC.
ovr = ovr.fit(dvec_all.transform(get_ingrs(train)), get_labels(train))

print_time()

180406-182743


In [35]:
ovr.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'],
      dtype='<U12')

In [36]:
ovr.get_params()

{'C': 1.2589254117941673,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [37]:
# Predict with the fitted one-vs-rest wrapper. `svc_rbf` is only the template handed to
# OneVsRestClassifier, which fits its own per-class copies and leaves the template unfitted,
# so calling svc_rbf.predict here does not use the model trained above.
# `test_bag` is the already-transformed test matrix from the earlier cell.
test_preds = ovr.predict(test_bag)

In [38]:
test_preds.shape

(9944,)

In [39]:
# Recipe ids in test-set order, to be paired row by row with the predictions.
test_ids = [r['id'] for r in test]

In [40]:
# Submission table: one row per test recipe with its id and predicted cuisine.
# Built column-wise so `id` keeps an integer dtype (the row-list + transpose form turns both
# columns into object dtype); the column order is id, cuisine.
df_test = pd.DataFrame({'id': test_ids, 'cuisine': list(test_preds)})

In [41]:
# Write the submission CSV (columns id, cuisine) without the DataFrame index.
df_test.to_csv('../_data/180406_clean_TFIDF_ingr_word_SVM_gridC.csv', index=False)

## Results
Kaggle submission accuracy: 0.78921  
Leaderboard rank: 468

![kaggle image](../_images/180406_ingr_words.png)
![kaggle image](../_images/180406_ingr_words_standing.png)

## Possible improvements
* Grid search on hyperparameters (kernel type, etc.)