In [1]:
import json

import numpy as np
import pandas as pd


# Load the raw recipe records (lists of dicts).
# The ingredient names contain non-ASCII characters (e.g. ®, ™, accented
# letters), so the encoding is pinned to UTF-8 rather than left to the
# platform default, which differs between operating systems.
with open('../_data/train.json', 'r', encoding='utf-8') as train_file:
    train = json.load(train_file)
with open('../_data/test.json', 'r', encoding='utf-8') as test_file:
    test = json.load(test_file)

In [2]:
# Number of recipes in the training and test sets.
len(train), len(test)

(39774, 9944)

In [3]:
# Look at one training record to see which keys a recipe carries.
train[0]

{'cuisine': 'greek',
 'id': 10259,
 'ingredients': ['romaine lettuce',
  'black olives',
  'grape tomatoes',
  'garlic',
  'pepper',
  'purple onion',
  'seasoning',
  'garbanzo beans',
  'feta cheese crumbles']}

In [4]:
import re
from collections import Counter

In [5]:
# Any character that is not a word character, whitespace, hyphen or underscore.
SPEC_CHARS = re.compile(r'[^\w\s\-_]')

# One list of special characters per ingredient name that contains at least one;
# findall() returns [] for clean names, which the trailing filter drops.
chars = [
    found
    for recipe in train + test
    for found in map(SPEC_CHARS.findall, recipe['ingredients'])
    if found
]

# How often each special character occurs across train and test.
Counter(ch for found in chars for ch in found)

Counter({'!': 34,
         '%': 394,
         '&': 479,
         "'": 240,
         '(': 55,
         ')': 55,
         ',': 814,
         '.': 57,
         '/': 2,
         '®': 244,
         '’': 8,
         '€': 1,
         '™': 79})

### Clean rules
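* remove `'`, `’`, parenthesised ounce sizes such as `(10 oz.)`, and any remaining `(` or `)`
* replace `&` with `and`
* replace everything else (`™`, `®`, `.`, `,`, `!`, `/`, `€`, and also hyphens) with a space `' '`
* keep `%`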

In [6]:
# Spans deleted outright: apostrophes (straight and curly), a parenthesised
# ounce size such as "(10 oz.)", and any remaining lone parenthesis.
# The size alternative is confined to ONE pair of parentheses ([^()]*) so it
# cannot swallow the text between two separate groups, and "oz" must not be
# preceded by a letter so that words like "frozen" are not taken for ounces.
SPEC_REMOVE = re.compile(r"['’]|\([^()]*(?<![A-Za-z])oz[^()]*\)|[()]")
SPEC_AND = re.compile(r'\&')
# Everything that is not a word character, whitespace or "%" becomes a space.
SPEC_ELSE = re.compile(r'[^\w\s\%_]')

def clean_ingr(ingr):
    """Normalise one ingredient name.

    Steps, in order:
      1. delete apostrophes, parenthesised ounce sizes and stray parentheses;
      2. replace ``&`` with ``and``;
      3. replace every other special character except ``%`` with a space;
      4. collapse runs of whitespace and strip both ends.

    Case is left untouched.

    Parameters
    ----------
    ingr : str
        Raw ingredient name.

    Returns
    -------
    str
        The cleaned name (possibly empty).

    Examples
    --------
    >>> clean_ingr("(10 oz.) tomatoes (diced)")
    'tomatoes diced'
    >>> clean_ingr("Kraft® Mac & Cheese")
    'Kraft Mac and Cheese'
    """
    ingr = SPEC_REMOVE.sub('', ingr)
    ingr = SPEC_AND.sub('and', ingr)
    ingr = SPEC_ELSE.sub(' ', ingr)
    # split()/join() squeezes the spaces introduced by the substitutions above.
    return ' '.join(ingr.split())

## Pipeline using TFIDF (bag of ingredients) and SVC

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [8]:
def itself(x):
    """Identity function: hand the argument back unchanged.

    This notebook passes it as both ``tokenizer`` and ``preprocessor`` to
    ``TfidfVectorizer``, where each document is already a list of ingredient
    strings.
    """
    return x

In [9]:
# Spans deleted outright: apostrophes (straight and curly), a parenthesised
# ounce size such as "(10 oz.)", and any remaining lone parenthesis.
# The size alternative is confined to ONE pair of parentheses ([^()]*) so it
# cannot swallow the text between two separate groups, and "oz" must not be
# preceded by a letter so that words like "frozen" are not taken for ounces.
SPEC_REMOVE = re.compile(r"['’]|\([^()]*(?<![A-Za-z])oz[^()]*\)|[()]")
SPEC_AND = re.compile(r'\&')
# Everything that is not a word character, whitespace or "%" becomes a space.
SPEC_ELSE = re.compile(r'[^\w\s\%_]')

def clean_ingr(ingr):
    """Normalise one ingredient name.

    Steps, in order:
      1. delete apostrophes, parenthesised ounce sizes and stray parentheses;
      2. replace ``&`` with ``and``;
      3. replace every other special character except ``%`` with a space;
      4. collapse runs of whitespace and strip both ends.

    Case is left untouched.

    Parameters
    ----------
    ingr : str
        Raw ingredient name.

    Returns
    -------
    str
        The cleaned name (possibly empty).

    Examples
    --------
    >>> clean_ingr("(10 oz.) tomatoes (diced)")
    'tomatoes diced'
    >>> clean_ingr("Kraft® Mac & Cheese")
    'Kraft Mac and Cheese'
    """
    ingr = SPEC_REMOVE.sub('', ingr)
    ingr = SPEC_AND.sub('and', ingr)
    ingr = SPEC_ELSE.sub(' ', ingr)
    # split()/join() squeezes the spaces introduced by the substitutions above.
    return ' '.join(ingr.split())

In [11]:
def get_ingrs(given):
    """Return the cleaned, lower-cased ingredient names of every recipe.

    ``given`` is an iterable of recipe dicts with an ``'ingredients'`` list.
    The result holds one inner list per recipe, in the same order, with each
    name passed through ``clean_ingr`` and then ``str.lower``.
    """
    cleaned = []
    for recipe in given:
        cleaned.append([clean_ingr(name).lower() for name in recipe['ingredients']])
    return cleaned

def get_labels(given):
    """Collect the ``'cuisine'`` value of each recipe in ``given``, in order."""
    return [recipe['cuisine'] for recipe in given]

In [12]:
# Cleaned, lower-cased ingredient lists for the training recipes.
train_ingrs = get_ingrs(train)

In [38]:
# Word-level TF-IDF with default tokenisation; strip_accents='unicode' asks the
# vectorizer to normalise accented characters before tokenising.
tfidf = TfidfVectorizer(strip_accents='unicode')

In [39]:
# Each recipe becomes a single space-joined string of its ingredient names.
tfidf.fit(list(map(' '.join, train_ingrs)))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [30]:
# Frequency of every whitespace-separated word across all training ingredients.
counts = Counter(
    word
    for recipe in train_ingrs
    for ingredient in recipe
    for word in ingredient.split()
)

In [32]:
# Show only the 60 rarest words: printing the whole vocabulary floods the cell
# output. sorted() is stable, so ties keep their insertion order.
sorted(counts.items(), key=lambda pair: pair[1])[:60]

[('bark', 1),
 ('taiwanese', 1),
 ('membrillo', 1),
 ('blueberri', 1),
 ('guacamol', 1),
 ('muscavado', 1),
 ('liquorice', 1),
 ('beverages', 1),
 ('arame', 1),
 ('romana', 1),
 ('hurst', 1),
 ('harvest', 1),
 ('trimmed', 1),
 ('abbamele', 1),
 ('sablefish', 1),
 ('fiber', 1),
 ('lillet', 1),
 ('xuxu', 1),
 ('neapolitan', 1),
 ('poppyseeds', 1),
 ('usukuchi', 1),
 ('st', 1),
 ('germain', 1),
 ('pekoe', 1),
 ('sobrasada', 1),
 ('ti', 1),
 ('parslei', 1),
 ('argo', 1),
 ('mantou', 1),
 ('pekin', 1),
 ('stolichnaya', 1),
 ('shiromiso', 1),
 ('fiddlehead', 1),
 ('ferns', 1),
 ('poi', 1),
 ('skippy', 1),
 ('hoi', 1),
 ('pod', 1),
 ('mixers', 1),
 ('superior', 1),
 ('margherita', 1),
 ('beaters', 1),
 ('nian', 1),
 ('gao', 1),
 ('milkfat', 1),
 ('qua', 1),
 ('burro', 1),
 ('ocean', 1),
 ('caponata', 1),
 ('submarine', 1),
 ('calabrese', 1),
 ('pâte', 1),
 ('brisée', 1),
 ('gravlax', 1),
 ('bream', 1),
 ('genoise', 1),
 ('haricot', 1),
 ('belacan', 1),
 ('saba', 1),
 ('masur', 1),
 ('fowl', 1

In [164]:
# TF-IDF over whole ingredient names (the identity tokenizer/preprocessor keeps
# each recipe's list of ingredients as its tokens), followed by a linear SVM.
svc = Pipeline(steps=[
    ("count_vectorizer", TfidfVectorizer(preprocessor=itself, tokenizer=itself)),
    ("linear svc", SVC(C=10**0.1, kernel="linear")),
])

In [165]:
def get_ingrs(given):
    """Return the cleaned, lower-cased ingredient names of every recipe.

    ``given`` is an iterable of recipe dicts with an ``'ingredients'`` list.
    The result holds one inner list per recipe, in the same order, with each
    name passed through ``clean_ingr`` and then ``str.lower``.
    """
    cleaned = []
    for recipe in given:
        cleaned.append([clean_ingr(name).lower() for name in recipe['ingredients']])
    return cleaned

def get_labels(given):
    """Collect the ``'cuisine'`` value of each recipe in ``given``, in order."""
    return [recipe['cuisine'] for recipe in given]

In [166]:
%%time
scores = cross_val_score(svc, get_ingrs(train), get_labels(train), cv=5, n_jobs=-1)

CPU times: user 2.76 s, sys: 197 ms, total: 2.95 s
Wall time: 2min 30s


In [167]:
# Per-fold scores from the 5-fold cross-validation.
scores

array([ 0.77982919,  0.78587585,  0.78054299,  0.77461954,  0.78708947])

#### Cross-validation accuracy

In [168]:
# Average score over the folds.
scores.mean()

0.78159140714190012

## Predicting on test data (TF-IDF fitted on all train+test ingredients)

In [170]:
%%time
dvec_all = TfidfVectorizer(tokenizer=itself,
                           preprocessor=itself,
                          ).fit(get_ingrs(train+test))

CPU times: user 2.64 s, sys: 7.65 ms, total: 2.65 s
Wall time: 2.65 s


In [172]:
# TF-IDF matrix of the cleaned test recipes.
test_bag = dvec_all.transform(get_ingrs(test))

In [174]:
# Same kernel and C as the classifier step of the cross-validated pipeline.
svc_linear = SVC(C=10**0.1, kernel='linear')

In [176]:
%%time
svc_linear = svc_linear.fit(dvec_all.transform(get_ingrs(train)), get_labels(train))

CPU times: user 1min 48s, sys: 556 ms, total: 1min 48s
Wall time: 1min 48s


In [177]:
# Cuisine labels known to the fitted classifier.
svc_linear.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'],
      dtype='<U12')

In [178]:
# Full hyperparameter set of the classifier, for the record.
svc_linear.get_params()

{'C': 1.2589254117941673,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [179]:
# Reuse the already-computed TF-IDF matrix of the test recipes (`test_bag`)
# instead of cleaning and transforming the test set a second time.
test_preds = svc_linear.predict(test_bag)

In [180]:
# Sanity check: there should be one prediction per test recipe.
test_preds.shape

(9944,)

In [181]:
# Recipe ids in the same order as the predictions.
test_ids = [recipe['id'] for recipe in test]

In [182]:
# Submission table: one row per test recipe. Building it from named columns
# keeps `id` as an integer column rather than the object dtype produced by
# transposing a list of rows. `columns=` pins the column order explicitly.
df_test = pd.DataFrame(
    {'id': test_ids, 'cuisine': list(test_preds)},
    columns=['id', 'cuisine'],
)

In [183]:
# Write the submission file without the DataFrame index column.
df_test.to_csv(
    path_or_buf='../_data/180405_clean_TFIDF_SVM_gridC.csv',
    index=False,
)

## Results
Accuracy 0.78650  
Rank 572

![kaggle image](../_images/180405_clean_tfidf_svm_c.png)
![kaggle image](../_images/180405_clean_tfidf_svm_c_ranking.png)

## Possible improvements
* Grid search on hyperparameters (kernel type, etc.)