In [1]:
import json

import numpy as np
import pandas as pd


# Load the train and test recipe records from their JSON dumps.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that
# non-ASCII ingredient characters decode identically on every platform
# instead of depending on the locale default.
with open('../_data/train.json', 'r', encoding='utf-8') as f:
    train = json.load(f)
with open('../_data/test.json', 'r', encoding='utf-8') as f:
    test = json.load(f)

In [2]:
# Number of top-level records loaded from each JSON file.
len(train), len(test)

(39774, 9944)

In [3]:
# Peek at the first training record to see which fields it carries.
train[0]

{'cuisine': 'greek',
 'id': 10259,
 'ingredients': ['romaine lettuce',
  'black olives',
  'grape tomatoes',
  'garlic',
  'pepper',
  'purple onion',
  'seasoning',
  'garbanzo beans',
  'feta cheese crumbles']}

In [4]:
import re
from collections import Counter

In [14]:
# "Special" here means anything that is not a word character, whitespace,
# a hyphen or an underscore.
SPEC_CHARS = re.compile(r'[^\w\s\-_]')

# One list of special characters per ingredient that contains at least one,
# gathered over the train and test recipes together.
chars = [
    SPEC_CHARS.findall(name)
    for recipe in train + test
    for name in recipe['ingredients']
    if SPEC_CHARS.search(name)
]

# Tally how often each special character occurs overall.
Counter(symbol for found in chars for symbol in found)

Counter({'!': 34,
         '%': 394,
         '&': 479,
         "'": 240,
         '(': 55,
         ')': 55,
         ',': 814,
         '.': 57,
         '/': 2,
         '®': 244,
         '’': 8,
         '€': 1,
         '™': 79})

In [123]:
# Collect every ingredient (train and test) that contains a character outside
# word characters, whitespace, '%' and '_'.
# The pattern is compiled in this cell so the cell runs on a fresh kernel: the
# previous version searched with SPEC_ELSE, which is only defined in a later
# cell, while the pattern it compiled here was never used.
SPEC_CHARS = re.compile(r'[^\w\s%_]')

chars = [
    ingredient
    for recipe in train + test
    for ingredient in recipe['ingredients']
    if SPEC_CHARS.search(ingredient)
]

In [124]:
# Display the selected ingredients.
# NOTE(review): this prints the whole list; a slice such as chars[:20] would
# keep the saved output short.
chars

['extra-virgin olive oil',
 'extra-virgin olive oil',
 'all-purpose flour',
 'broiler-fryer chicken',
 'low-fat mayonnaise',
 'all-purpose flour',
 'all-purpose flour',
 'chinese five-spice powder',
 'black-eyed peas',
 'extra-virgin olive oil',
 'all-purpose flour',
 'all-purpose flour',
 'extra-virgin olive oil',
 'all-purpose flour',
 'all-purpose flour',
 'part-skim mozzarella cheese',
 'low-fat sour cream',
 '1% low-fat milk',
 'demi-glace',
 'all-purpose flour',
 'all-purpose flour',
 'all-purpose flour',
 'part-skim mozzarella cheese',
 'english muffins, split and toasted',
 'all-purpose flour',
 'all-purpose flour',
 'chinese five-spice powder',
 'all-purpose flour',
 'all-purpose flour',
 '2% reduced-fat milk',
 'long-grain rice',
 'extra-virgin olive oil',
 'extra-virgin olive oil',
 'low-fat natural yogurt',
 'bone-in chicken breasts',
 'long-grain rice',
 'short-grain rice',
 'extra-virgin olive oil',
 'plain whole-milk yogurt',
 'all-purpose flour',
 'extra-lean ground bee

In [121]:
# Removed outright: straight and curly apostrophes, parenthesised notes that
# mention "oz" (e.g. "(10 oz.)"), and any parentheses left over after that.
# [^()]* keeps the "oz" match inside ONE pair of parentheses; a greedy `.*`
# would run from the first "(" to the last ")" and delete everything between.
SPEC_REMOVE = re.compile(r"\([^()]*oz[^()]*\)|['’()]")
# "&" is spelled out as "and".
SPEC_AND = re.compile(r'&')
# Anything that is not a word character, whitespace, "%" or "_" becomes a space.
SPEC_ELSE = re.compile(r'[^\w\s%_]')


def clean_ingr(ingr):
    """Normalise a single ingredient string.

    Steps, in order: drop apostrophes, "(... oz ...)" notes and stray
    parentheses; replace "&" with "and"; replace every remaining special
    character except "%" with a space; collapse whitespace.

    Parameters
    ----------
    ingr : str
        Raw ingredient text.

    Returns
    -------
    str
        Cleaned text with single spaces and no leading/trailing whitespace.
    """
    ingr = SPEC_REMOVE.sub('', ingr)
    ingr = SPEC_AND.sub('and', ingr)
    ingr = SPEC_ELSE.sub(' ', ingr)
    # split() + join collapses runs of whitespace and trims both ends.
    return ' '.join(ingr.split())

In [130]:
# Clean every selected ingredient and keep only the results that still hold a
# character other than word characters and whitespace.
[cleaned for cleaned in map(clean_ingr, chars) if re.search(r'[^\w\s]', cleaned)]

['1% low fat milk',
 '2% reduced fat milk',
 '2% reduced fat milk',
 '1% low fat cottage cheese',
 '1% low fat milk',
 '1% low fat milk',
 '1% low fat milk',
 '1% low fat milk',
 '1% low fat milk',
 '2% reduced fat milk',
 '1% low fat milk',
 '1% low fat buttermilk',
 '2% reduced fat milk',
 '1% low fat milk',
 '2% reduced fat milk',
 '1% low fat milk',
 '1% low fat milk',
 '1% low fat milk',
 '1% low fat milk',
 '2% reduced fat milk',
 '1% low fat milk',
 '2% reduced fat milk',
 '1% low fat milk',
 '2% reduced fat milk',
 '1% low fat milk',
 '1% low fat milk',
 '1% low fat milk',
 '1% low fat cottage cheese',
 '1% low fat milk',
 '1% low fat milk',
 '1% low fat milk',
 '2% reduced fat milk',
 '1% low fat milk',
 '1% low fat milk',
 '2% reduced fat milk',
 '1% low fat milk',
 '1% low fat milk',
 '1% low fat milk',
 '1% low fat milk',
 '1% low fat milk',
 '1% low fat milk',
 '1% low fat cottage cheese',
 '1% low fat milk',
 '1% low fat milk',
 '2% reduced fat milk',
 '2% reduced fat mil

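* remove `'`, `’`, parenthesised notes containing `oz` (e.g. `(10 oz.)`), and any remaining `(` or `)`
* replace `&` with `and`
* replace every other character that is not a word character, whitespace or `%` (e.g. `™`, `®`, `.`, `,`, `!`, `/`, `-`, `€`) with `' '`
* keep `%`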

In [121]:
# NOTE(review): this cell repeats the cleaning definitions from an earlier
# cell; consider keeping a single copy.
#
# Removed outright: straight and curly apostrophes, parenthesised notes that
# mention "oz" (e.g. "(10 oz.)"), and any parentheses left over after that.
# [^()]* keeps the "oz" match inside ONE pair of parentheses; a greedy `.*`
# would run from the first "(" to the last ")" and delete everything between.
SPEC_REMOVE = re.compile(r"\([^()]*oz[^()]*\)|['’()]")
# "&" is spelled out as "and".
SPEC_AND = re.compile(r'&')
# Anything that is not a word character, whitespace, "%" or "_" becomes a space.
SPEC_ELSE = re.compile(r'[^\w\s%_]')


def clean_ingr(ingr):
    """Normalise a single ingredient string.

    Steps, in order: drop apostrophes, "(... oz ...)" notes and stray
    parentheses; replace "&" with "and"; replace every remaining special
    character except "%" with a space; collapse whitespace.

    Parameters
    ----------
    ingr : str
        Raw ingredient text.

    Returns
    -------
    str
        Cleaned text with single spaces and no leading/trailing whitespace.
    """
    ingr = SPEC_REMOVE.sub('', ingr)
    ingr = SPEC_AND.sub('and', ingr)
    ingr = SPEC_ELSE.sub(' ', ingr)
    # split() + join collapses runs of whitespace and trims both ends.
    return ' '.join(ingr.split())

## Pipeline using TFIDF (bag of ingredients) and SVC

In [54]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [59]:
def itself(x):
    """Return ``x`` unchanged.

    Passed below as both ``tokenizer`` and ``preprocessor`` of
    ``TfidfVectorizer`` so that each document is used as-is.
    """
    return x

In [60]:
# Two-step pipeline: TF-IDF over the ingredients of each recipe, then a
# linear-kernel SVC. The identity tokenizer/preprocessor hand each document
# to the vectorizer untouched.
# NOTE(review): the first step is named "count_vectorizer" although it is a
# TfidfVectorizer.
svc = Pipeline([
    ("count_vectorizer", TfidfVectorizer(tokenizer=itself,
                                         preprocessor=itself,
                                        )),
    ("linear svc", SVC(kernel="linear"))])

In [27]:
# Matches any character that is NOT a word character, whitespace, '-', '&'
# or '_'; used by the clean_ingr defined below.
SPEC_CHARS = re.compile(r'[^\w\s\-\&_]')

In [21]:
def clean_ingr(ingr):
    """Replace every SPEC_CHARS match in ``ingr`` with a space and normalise whitespace.

    Returns the text with single spaces between tokens and no leading or
    trailing whitespace. This definition rebinds the name ``clean_ingr``
    used by the earlier cells.
    """
    spaced = re.sub(SPEC_CHARS, ' ', ingr)
    tokens = spaced.split()
    return ' '.join(tokens)

In [26]:
# Quick manual check of clean_ingr on a string with several special characters.
# NOTE(review): the SPEC_CHARS pattern compiled above does not match '&', so a
# fresh top-to-bottom run would keep '&&' here; the saved output appears to
# come from an earlier pattern — confirm by re-running.
clean_ingr('aaa$@% 2358&& aaa')

'aaa 2358 aaa'

In [61]:
def get_ingr_dict(given):
    """Encode each recipe as a ``{ingredient: 1}`` dict of lower-cased names.

    Parameters
    ----------
    given : iterable of dict
        Recipe records; each must have an ``'ingredients'`` list of strings.

    Returns
    -------
    list of dict
        One dict per recipe, in input order. Ingredients repeated within a
        recipe (after lower-casing) collapse into a single key.
    """
    # Lower-case every recipe first, then build the dicts, one per recipe.
    lowered_recipes = [
        [name.lower() for name in recipe['ingredients']]
        for recipe in given
    ]
    encoded = []
    for names in lowered_recipes:
        encoded.append(dict.fromkeys(names, 1))
    return encoded

def get_labels(given):
    """Return the ``'cuisine'`` value of every recipe in ``given``, in order."""
    labels = []
    for recipe in given:
        labels.append(recipe['cuisine'])
    return labels

In [62]:
%%time
# 5-fold cross-validation of the whole pipeline on the training recipes,
# using all available cores (n_jobs=-1).
scores = cross_val_score(svc, get_ingr_dict(train), get_labels(train), cv=5, n_jobs=-1)

CPU times: user 1.1 s, sys: 176 ms, total: 1.27 s
Wall time: 2min 20s


In [63]:
# Per-fold scores returned by cross_val_score.
scores

array([ 0.77982919,  0.78348831,  0.77916038,  0.77474532,  0.78633447])

#### Cross-validation accuracy

In [64]:
# Average of the five fold scores.
scores.mean()

0.78071153305702945

## Predicting on test data (vectorizer fitted on all train+test ingredients)

In [65]:
%%time
# Fit the TF-IDF vectorizer on the ingredients of train AND test together.
# NOTE(review): this lets the test recipes influence the vocabulary and IDF
# weights, unlike the cross-validated pipeline above, where the vectorizer is
# a pipeline step refit inside each fold — confirm this is intended.
dvec_all = TfidfVectorizer(tokenizer=itself,
                           preprocessor=itself,
                          ).fit(get_ingr_dict(train+test))

CPU times: user 462 ms, sys: 8.01 ms, total: 470 ms
Wall time: 469 ms


In [66]:
# TF-IDF matrix for the test recipes.
# NOTE(review): test_bag is not referenced again in the visible cells; the
# prediction cell recomputes the same transform.
test_bag = dvec_all.transform(get_ingr_dict(test))

In [67]:
# Stand-alone linear-kernel SVC; all other hyperparameters are left at their defaults.
svc_linear = SVC(kernel='linear')

In [72]:
%%time
# Train on the full training set, vectorized with the train+test vocabulary.
svc_linear = svc_linear.fit(dvec_all.transform(get_ingr_dict(train)), get_labels(train))

CPU times: user 1min 40s, sys: 496 ms, total: 1min 41s
Wall time: 1min 41s


In [73]:
# Class labels seen by the fitted classifier.
svc_linear.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'],
      dtype='<U12')

In [74]:
# Hyperparameters of the fitted classifier, recorded for reference.
svc_linear.get_params()

{'C': 1.0,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [75]:
# Predict a cuisine for every test recipe, vectorized with the same fitted vectorizer.
test_preds = svc_linear.predict(dvec_all.transform(get_ingr_dict(test)))

In [76]:
# Sanity check: one prediction per test recipe.
test_preds.shape

(9944,)

In [77]:
# Recipe ids in the same order as the test records (and hence the predictions).
test_ids = [r['id'] for r in test]

In [79]:
# Submission table: one row per test recipe, pairing its id with the predicted
# cuisine. Building it from a column mapping keeps `id` as an integer column,
# whereas a list-of-lists plus transpose forces every column to object dtype.
# `columns=` pins the column order regardless of the pandas version.
df_test = pd.DataFrame(
    {'id': test_ids, 'cuisine': test_preds},
    columns=['id', 'cuisine'],
)

In [80]:
# Write the submission file without the DataFrame index column.
df_test.to_csv('../_data/180401_basic_SVM_TFIDF.csv', index=False)

## Results
Accuracy 0.78499  
Rank 612

![kaggle image](../_images/180401_bow_svm_tfidf.png)
![kaggle image](../_images/180401_bow_svm_tfidf_standing.png)

## Possible improvements
* Grid search on hyperparameters (kernel type, etc.)