In [1]:
import json

import numpy as np
import pandas as pd


# Read both JSON files as UTF-8 explicitly so that loading does not depend on
# the platform's default text encoding.
with open('../_data/train.json', 'r', encoding='utf-8') as f:
    train = json.load(f)
with open('../_data/test.json', 'r', encoding='utf-8') as f:
    test = json.load(f)

In [2]:
# Number of records in the training and test sets.
len(train), len(test)

(39774, 9944)

In [3]:
# Peek at the first training record to see which fields it carries.
train[0]

{'cuisine': 'greek',
 'id': 10259,
 'ingredients': ['romaine lettuce',
  'black olives',
  'grape tomatoes',
  'garlic',
  'pepper',
  'purple onion',
  'seasoning',
  'garbanzo beans',
  'feta cheese crumbles']}

## Pipeline using TfidfVectorizer (bag of ingredients) and SVC

In [54]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [59]:
def itself(x):
    """Identity function: return the argument unchanged.

    Passed to ``TfidfVectorizer`` as both ``tokenizer`` and ``preprocessor``
    so the vectorizer consumes each recipe's ingredient collection as-is.
    """
    return x

In [60]:
# Two-step pipeline: TF-IDF over whole ingredient names (`itself` is used as
# preprocessor and tokenizer), followed by a linear-kernel SVC.
svc = Pipeline(steps=[
    (
        "count_vectorizer",
        TfidfVectorizer(preprocessor=itself, tokenizer=itself),
    ),
    ("linear svc", SVC(kernel="linear")),
])

In [61]:
def get_ingr_dict(given):
    """Turn each recipe into a ``{ingredient: 1}`` dict of lower-cased names.

    Parameters
    ----------
    given : iterable of dict
        Recipes; each one must have an ``'ingredients'`` entry holding an
        iterable of strings.

    Returns
    -------
    list of dict
        One dict per recipe, in input order. Keys are the lower-cased
        ingredient names (duplicates collapse to a single key), values are 1.
    """
    return [
        dict.fromkeys((name.lower() for name in recipe['ingredients']), 1)
        for recipe in given
    ]

def get_labels(given):
    """Collect the ``'cuisine'`` value of every recipe, in input order.

    Parameters
    ----------
    given : iterable of dict
        Recipes; each one must have a ``'cuisine'`` entry.

    Returns
    -------
    list
        The cuisine labels, one per recipe.
    """
    labels = []
    for recipe in given:
        labels.append(recipe['cuisine'])
    return labels

In [62]:
%%time
# 5-fold cross-validation of the full pipeline on the training recipes,
# using all available cores.
scores = cross_val_score(
    estimator=svc,
    X=get_ingr_dict(train),
    y=get_labels(train),
    cv=5,
    n_jobs=-1,
)

CPU times: user 1.1 s, sys: 176 ms, total: 1.27 s
Wall time: 2min 20s


In [63]:
# Score obtained on each of the 5 cross-validation folds.
scores

array([ 0.77982919,  0.78348831,  0.77916038,  0.77474532,  0.78633447])

#### Cross-validation accuracy

In [64]:
# Average of the per-fold scores.
scores.mean()

0.78071153305702945

## Predicting on test data (after encoding all of train+test ingredients)

In [65]:
%%time
# Fit the TF-IDF vectorizer on the train and test recipes combined, so
# ingredients that appear only in the test set are part of the vocabulary too.
dvec_all = TfidfVectorizer(preprocessor=itself, tokenizer=itself)
dvec_all = dvec_all.fit(get_ingr_dict(train + test))

CPU times: user 462 ms, sys: 8.01 ms, total: 470 ms
Wall time: 469 ms


In [66]:
# TF-IDF feature matrix for the test recipes, using the train+test vocabulary.
test_bag = dvec_all.transform(get_ingr_dict(test))

In [67]:
# Linear-kernel SVC; only the kernel is set explicitly here.
svc_linear = SVC(kernel='linear')

In [72]:
%%time
# Train the classifier on the TF-IDF features of all training recipes.
svc_linear = svc_linear.fit(
    X=dvec_all.transform(get_ingr_dict(train)),
    y=get_labels(train),
)

CPU times: user 1min 40s, sys: 496 ms, total: 1min 41s
Wall time: 1min 41s


In [73]:
# Class labels the fitted classifier can predict.
svc_linear.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'],
      dtype='<U12')

In [74]:
# Hyperparameters of the classifier, recorded for reference.
svc_linear.get_params()

{'C': 1.0,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [75]:
# Predict from `test_bag`, the test TF-IDF matrix already computed with
# `dvec_all`, instead of vectorizing the test recipes a second time.
test_preds = svc_linear.predict(test_bag)

In [76]:
# Sanity check: there should be one prediction per test recipe.
test_preds.shape

(9944,)

In [77]:
# Recipe ids in the same order as `test`, so they line up with `test_preds`.
test_ids = [r['id'] for r in test]

In [79]:
# Submission table with one row per test recipe. Building it from named
# columns keeps the ids numeric; stacking the two lists as rows and
# transposing would make every column object dtype.
df_test = pd.DataFrame(
    {'id': test_ids, 'cuisine': test_preds},
    columns=['id', 'cuisine'],
)

In [80]:
# Write the submission CSV; index=False leaves out the DataFrame's row index.
df_test.to_csv('../_data/180401_basic_SVM_TFIDF.csv', index=False)

## Results
Accuracy 0.78499  
Rank 612

![kaggle image](../_images/180401_bow_svm_tfidf.png)
![kaggle image](../_images/180401_bow_svm_tfidf_standing.png)

## Possible improvements
* Grid search on hyperparameters (kernel type, etc.)