In [2]:
import json

import numpy as np
import pandas as pd


# Read both splits explicitly as UTF-8: the ingredient strings contain
# non-ASCII characters, and the platform default encoding is not guaranteed
# to decode them correctly.
with open('../_data/train.json', 'r', encoding='utf-8') as f:
    train = json.load(f)
with open('../_data/test.json', 'r', encoding='utf-8') as f:
    test = json.load(f)

In [3]:
len(train), len(test)

(39774, 9944)

In [4]:
train[0]

{'cuisine': 'greek',
 'id': 10259,
 'ingredients': ['romaine lettuce',
  'black olives',
  'grape tomatoes',
  'garlic',
  'pepper',
  'purple onion',
  'seasoning',
  'garbanzo beans',
  'feta cheese crumbles']}

In [5]:
import re
from collections import Counter

In [6]:
# Matches any character that is not a word character, whitespace, hyphen or underscore.
SPEC_CHARS = re.compile(r'[^\w\s\-_]')

# One list of matched characters per ingredient string (train and test together);
# ingredients without any match yield an empty list and are filtered out.
chars = list(filter(None, (
    SPEC_CHARS.findall(name)
    for recipe in train + test
    for name in recipe['ingredients']
)))

# Frequency of each special character across all ingredients.
Counter(ch for found in chars for ch in found)

Counter({'!': 34,
         '%': 394,
         '&': 479,
         "'": 240,
         '(': 55,
         ')': 55,
         ',': 814,
         '.': 57,
         '/': 2,
         '®': 244,
         '’': 8,
         '€': 1,
         '™': 79})

### Clean rules
* remove `'`, `’`, parenthesised `oz` amounts (`( oz*)`), and any remaining `(` or `)`
* replace `&` with `and`
* replace everything else (`™`, `®`, `.`, `,`, `!`, `/`, `€`, and also `-`) with a space `' '`
* keep `%`

In [7]:
# Characters/spans that are deleted outright.
# A parenthesised span containing "oz" is dropped whole; the match is confined
# to a single pair of parentheses so that an ingredient with more than one
# parenthetical is not swallowed from its first "(" to its last ")".
# Apostrophes (straight or curly) and any remaining parentheses are dropped too.
SPEC_REMOVE = re.compile(r"\([^()]*oz[^()]*\)|['’()]")
SPEC_AND = re.compile(r'&')
# Everything that is not a word character, whitespace or "%".
SPEC_ELSE = re.compile(r'[^\w\s%]')


def clean_ingr(ingr):
    """Normalise the special characters in one ingredient string.

    Steps, in order: delete everything matched by ``SPEC_REMOVE``, replace
    ``&`` with ``and``, replace every other character matched by
    ``SPEC_ELSE`` with a space, then collapse runs of whitespace.

    Parameters
    ----------
    ingr : str
        Raw ingredient text.

    Returns
    -------
    str
        Cleaned text with single spaces and no leading/trailing whitespace.
        Case is left unchanged.
    """
    ingr = SPEC_REMOVE.sub('', ingr)
    ingr = SPEC_AND.sub('and', ingr)
    ingr = SPEC_ELSE.sub(' ', ingr)
    # split()/join collapses the whitespace introduced by the substitutions.
    return ' '.join(ingr.split())

## Pipeline using TFIDF (bag of ingredients) and SVC

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [9]:
def itself(x):
    """Identity function: return the argument unchanged.

    Passed as ``tokenizer``/``preprocessor`` to ``TfidfVectorizer`` in this
    notebook so that the corresponding stage leaves its input as it is.
    """
    return x

In [10]:
# NOTE: this cell repeats the cleaning helpers defined in an earlier cell; being
# the later definition, it is the one in effect for the rest of the notebook.

# Characters/spans that are deleted outright.
# A parenthesised span containing "oz" is dropped whole; the match is confined
# to a single pair of parentheses so that an ingredient with more than one
# parenthetical is not swallowed from its first "(" to its last ")".
# Apostrophes (straight or curly) and any remaining parentheses are dropped too.
SPEC_REMOVE = re.compile(r"\([^()]*oz[^()]*\)|['’()]")
SPEC_AND = re.compile(r'&')
# Everything that is not a word character, whitespace or "%".
SPEC_ELSE = re.compile(r'[^\w\s%]')


def clean_ingr(ingr):
    """Normalise the special characters in one ingredient string.

    Steps, in order: delete everything matched by ``SPEC_REMOVE``, replace
    ``&`` with ``and``, replace every other character matched by
    ``SPEC_ELSE`` with a space, then collapse runs of whitespace.

    Parameters
    ----------
    ingr : str
        Raw ingredient text.

    Returns
    -------
    str
        Cleaned text with single spaces and no leading/trailing whitespace.
        Case is left unchanged.
    """
    ingr = SPEC_REMOVE.sub('', ingr)
    ingr = SPEC_AND.sub('and', ingr)
    ingr = SPEC_ELSE.sub(' ', ingr)
    # split()/join collapses the whitespace introduced by the substitutions.
    return ' '.join(ingr.split())

In [11]:
def get_ingrs(given):
    """Return the cleaned, lower-cased ingredient lists of ``given``.

    ``given`` is an iterable of recipe dicts with an ``'ingredients'`` list.
    The result holds one list per recipe, in input order, where each
    ingredient has been passed through ``clean_ingr`` and then lower-cased.
    """
    cleaned = []
    for recipe in given:
        cleaned.append([clean_ingr(name).lower() for name in recipe['ingredients']])
    return cleaned

def get_labels(given):
    """Return the ``'cuisine'`` value of every recipe in ``given``, in input order."""
    labels = []
    for recipe in given:
        labels.append(recipe['cuisine'])
    return labels

In [12]:
train_ingrs = get_ingrs(train)

In [13]:
def combine_words(ilist):
    """Join a list of ingredient strings into one space-separated string."""
    separator = ' '
    return separator.join(ilist)

In [21]:
# NOTE(review): `tfidf` is not created in any cell visible in this notebook, and
# the recorded repr below names a `preproc_1` preprocessor that is not defined
# here either. Presumably both came from a cell that was later removed —
# confirm and restore it, otherwise this cell fails on a fresh kernel.
tfidf.fit(train_ingrs)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function preproc_1 at 0x7f0016705a60>,
        smooth_idf=True, stop_words=None, strip_accents='unicode',
        sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, use_idf=True, vocabulary=None)

In [22]:
tfidf.vocabulary_

{'romaine': 2291,
 'lettuce': 1522,
 'black': 242,
 'olives': 1864,
 'grape': 1162,
 'tomatoes': 2766,
 'garlic': 1086,
 'pepper': 1993,
 'purple': 2177,
 'onion': 1868,
 'seasoning': 2399,
 'garbanzo': 1080,
 'beans': 180,
 'feta': 951,
 'cheese': 516,
 'crumbles': 732,
 'plain': 2076,
 'flour': 1006,
 'ground': 1197,
 'salt': 2345,
 'thyme': 2743,
 'eggs': 892,
 'green': 1183,
 'yellow': 2979,
 'corn': 670,
 'meal': 1662,
 'milk': 1703,
 'vegetable': 2873,
 'oil': 1856,
 'mayonaise': 1656,
 'cooking': 665,
 'chilies': 544,
 'grilled': 1190,
 'chicken': 530,
 'breasts': 325,
 'powder': 2129,
 'soy': 2541,
 'sauce': 2370,
 'butter': 369,
 'livers': 1548,
 'water': 2918,
 'wheat': 2929,
 'shallots': 2424,
 'cornflour': 675,
 'cayenne': 482,
 'onions': 1869,
 'paste': 1956,
 'lemon': 1513,
 'juice': 1381,
 'chili': 543,
 'passata': 1952,
 'cumin': 757,
 'boneless': 278,
 'skinless': 2487,
 'thigh': 2735,
 'garam': 1079,
 'masala': 1641,
 'double': 843,
 'cream': 702,
 'natural': 1794,
 '

In [30]:
# Frequency of every whitespace-separated word across all training ingredients.
counts = Counter(
    word
    for recipe in train_ingrs
    for ingredient in recipe
    for word in ingredient.split()
)

In [32]:
sorted(counts.items(), key=lambda x: x[1])

[('bark', 1),
 ('taiwanese', 1),
 ('membrillo', 1),
 ('blueberri', 1),
 ('guacamol', 1),
 ('muscavado', 1),
 ('liquorice', 1),
 ('beverages', 1),
 ('arame', 1),
 ('romana', 1),
 ('hurst', 1),
 ('harvest', 1),
 ('trimmed', 1),
 ('abbamele', 1),
 ('sablefish', 1),
 ('fiber', 1),
 ('lillet', 1),
 ('xuxu', 1),
 ('neapolitan', 1),
 ('poppyseeds', 1),
 ('usukuchi', 1),
 ('st', 1),
 ('germain', 1),
 ('pekoe', 1),
 ('sobrasada', 1),
 ('ti', 1),
 ('parslei', 1),
 ('argo', 1),
 ('mantou', 1),
 ('pekin', 1),
 ('stolichnaya', 1),
 ('shiromiso', 1),
 ('fiddlehead', 1),
 ('ferns', 1),
 ('poi', 1),
 ('skippy', 1),
 ('hoi', 1),
 ('pod', 1),
 ('mixers', 1),
 ('superior', 1),
 ('margherita', 1),
 ('beaters', 1),
 ('nian', 1),
 ('gao', 1),
 ('milkfat', 1),
 ('qua', 1),
 ('burro', 1),
 ('ocean', 1),
 ('caponata', 1),
 ('submarine', 1),
 ('calabrese', 1),
 ('pâte', 1),
 ('brisée', 1),
 ('gravlax', 1),
 ('bream', 1),
 ('genoise', 1),
 ('haricot', 1),
 ('belacan', 1),
 ('saba', 1),
 ('masur', 1),
 ('fowl', 1

In [15]:
from sklearn.pipeline import FeatureUnion
from sklearn.svm import LinearSVC

In [24]:
# Two TF-IDF views of each recipe, concatenated and fed to a linear SVM:
#   "ingrs": each whole ingredient string is one token (tokenizer/preprocessor
#            are the identity, so the ingredient list is used as given);
#   "words": the ingredient list is joined into one string and split into words
#            by the vectorizer's default token pattern.
# NOTE(review): per the scikit-learn docs a custom `preprocessor` overrides the
# built-in strip_accents/lowercase stage, so `strip_accents='unicode'` likely
# has no effect in either vectorizer — confirm.
word_ingr = Pipeline(steps=[
    ('union', FeatureUnion(transformer_list=[
        ("ingrs", TfidfVectorizer(preprocessor=itself,
                                  tokenizer=itself,
                                  strip_accents='unicode')),
        ("words", TfidfVectorizer(preprocessor=combine_words,
                                  strip_accents='unicode')),
    ])),
    ("linear svc", LinearSVC(C=10**0.1, loss='hinge')),
])

In [25]:
%%time
scores = cross_val_score(word_ingr, get_ingrs(train), get_labels(train), cv=5, n_jobs=-1, verbose=10)

[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV]  ................................................................
[CV] ........................ , score=0.788964303670186, total=   8.4s
[CV] ....................... , score=0.7937924101533048, total=   8.8s
[CV] ....................... , score=0.7866114041698066, total=   9.0s


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    9.2s remaining:   13.8s
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:    9.3s remaining:    6.2s


[CV] ....................... , score=0.7849327128663062, total=   9.5s
[CV] ....................... , score=0.7976594941487354, total=  10.1s
CPU times: user 2.72 s, sys: 121 ms, total: 2.85 s
Wall time: 13.1 s


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   10.9s finished


In [26]:
scores

array([ 0.7866114 ,  0.79379241,  0.7889643 ,  0.78493271,  0.79765949])

#### Cross-validation accuracy

In [27]:
scores.mean()

0.79039206500166781

### Only words

In [46]:
# Words-only variant: a single word-level TF-IDF followed by a linear-kernel SVC.
# This rebinds `word_ingr`, replacing the union pipeline defined earlier.
word_ingr = Pipeline(steps=[
    ("words", TfidfVectorizer(preprocessor=combine_words,
                              strip_accents='unicode')),
    ("linear svc", SVC(C=10**0, kernel="linear")),
])

In [47]:
%%time
scores = cross_val_score(word_ingr, get_ingrs(train), get_labels(train), cv=5, n_jobs=-1)

CPU times: user 2.9 s, sys: 164 ms, total: 3.07 s
Wall time: 2min 52s


In [48]:
scores

array([ 0.78447626,  0.79228449,  0.78657617,  0.78065652,  0.79099031])

#### Cross-validation accuracy

In [49]:
# C=1
scores.mean()

0.78699675135350899

In [45]:
# C=10**0.1 ~ 1.25
scores.mean()

0.78868098685056176

## Fitting to test data (after encoding all of train+test ingredients)

In [28]:
%%time
dvec_all = FeatureUnion([
        ("ingrs", TfidfVectorizer(strip_accents='unicode',
                                  tokenizer=itself,
                                  preprocessor=itself)),
        ("words", TfidfVectorizer(strip_accents='unicode',
                                  preprocessor=combine_words)),
        ]).fit(get_ingrs(train+test))

CPU times: user 3.19 s, sys: 19.9 ms, total: 3.21 s
Wall time: 3.2 s


In [29]:
test_bag = dvec_all.transform(get_ingrs(test))

In [30]:
# Final classifier: same loss and C as the LinearSVC step of the union pipeline.
svc_linear = LinearSVC(C=10**0.1, loss='hinge')

In [31]:
%%time
svc_linear = svc_linear.fit(dvec_all.transform(get_ingrs(train)), get_labels(train))

CPU times: user 7.48 s, sys: 55.8 ms, total: 7.54 s
Wall time: 7.54 s


In [32]:
svc_linear.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'],
      dtype='<U12')

In [33]:
svc_linear.get_params()

{'C': 1.2589254117941673,
 'class_weight': None,
 'dual': True,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'loss': 'hinge',
 'max_iter': 1000,
 'multi_class': 'ovr',
 'penalty': 'l2',
 'random_state': None,
 'tol': 0.0001,
 'verbose': 0}

In [34]:
# Reuse `test_bag` — the test-set features already computed earlier with
# `dvec_all.transform(get_ingrs(test))` — instead of cleaning and transforming
# the test recipes a second time.
test_preds = svc_linear.predict(test_bag)

In [35]:
test_preds.shape

(9944,)

In [36]:
# Recipe ids of the test set, in the same order as the predictions.
test_ids = [recipe['id'] for recipe in test]

In [37]:
# Build the frame column-wise so `id` keeps its integer dtype (stacking the two
# lists as rows and transposing produces object-dtype columns). The column
# order is given explicitly so it does not depend on dict ordering.
df_test = pd.DataFrame(
    {'id': test_ids, 'cuisine': list(test_preds)},
    columns=['id', 'cuisine'],
)

In [38]:
# Save the id/cuisine table as CSV; index=False leaves out the row index column.
df_test.to_csv('../_data/180406_clean_TFIDF_ingr_word_linearSVM_gridC.csv', index=False)

## Results
Accuracy 0.78972  
Rank 419

![kaggle image](../_images/180406_linearsvc.png)
![kaggle image](../_images/180406_linearsvc_standing.png)

## Possible improvements
* Grid search on hyperparameters (kernel type, etc.)