In [1]:
import json

import numpy as np
import pandas as pd


# Load the train and test recipe records.
# JSON text is conventionally UTF-8, so the encoding is pinned here instead of
# being left to the platform default (which is not UTF-8 on every system).
with open('../_data/train.json', 'r', encoding='utf-8') as f:
    train = json.load(f)
with open('../_data/test.json', 'r', encoding='utf-8') as f:
    test = json.load(f)

In [2]:
# Number of records in the training and test sets.
len(train), len(test)

(39774, 9944)

In [3]:
# Inspect the first training record to see which fields a record carries.
train[0]

{'cuisine': 'greek',
 'id': 10259,
 'ingredients': ['romaine lettuce',
  'black olives',
  'grape tomatoes',
  'garlic',
  'pepper',
  'purple onion',
  'seasoning',
  'garbanzo beans',
  'feta cheese crumbles']}

## Pipeline using DictVectorizer (bag of ingredients) and SVC

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [6]:
# Bag-of-ingredients encoding followed by a linear-kernel SVM.
# The step names are kept verbatim: they are the keys under which the
# pipeline exposes each step.
svc = Pipeline(
    steps=[
        ("count_vectorizer", DictVectorizer()),
        ("linear svc", SVC(kernel="linear")),
    ]
)

In [7]:
def get_ingr_dict(given):
    """Encode each recipe as a ``{ingredient: 1}`` presence dict.

    Ingredient names are lower-cased first, so names that differ only by
    case collapse into a single key.

    Parameters
    ----------
    given : iterable of dict
        Recipe records, each with an ``'ingredients'`` list of strings.

    Returns
    -------
    list of dict
        One dict per recipe, in input order, mapping every lower-cased
        ingredient name to ``1``.
    """
    bags = []
    for recipe in given:
        names = (name.lower() for name in recipe['ingredients'])
        bags.append(dict.fromkeys(names, 1))
    return bags

def get_labels(given):
    """Collect the ``'cuisine'`` value of every recipe, in input order.

    Parameters
    ----------
    given : iterable of dict
        Recipe records, each with a ``'cuisine'`` key.

    Returns
    -------
    list
        The cuisine of each recipe, aligned with the order of ``given``.
    """
    labels = []
    for recipe in given:
        labels.append(recipe['cuisine'])
    return labels

In [14]:
%%time
# 5-fold cross-validation of the vectorizer + SVC pipeline on the training set.
scores = cross_val_score(
    estimator=svc,
    X=get_ingr_dict(train),
    y=get_labels(train),
    cv=5,
    n_jobs=-1,
)

CPU times: user 1.1 s, sys: 160 ms, total: 1.26 s
Wall time: 1min 59s


In [15]:
# Per-fold scores from the cross_val_score run (cv=5).
scores

array([ 0.75270033,  0.75923599,  0.75339367,  0.75009433,  0.76330691])

#### Cross-validation accuracy

In [53]:
# Average of the per-fold scores.
scores.mean()

0.75574624333524398

## Predicting on test data (after encoding all of train+test ingredients)

In [19]:
%%time
# Fit the vectorizer on the ingredient dicts of train and test combined, so
# ingredients that appear only in the test set also get a feature column.
dvec_all = DictVectorizer().fit(get_ingr_dict(train+test))

CPU times: user 442 ms, sys: 7.98 ms, total: 450 ms
Wall time: 448 ms


In [24]:
# Encode the test recipes with the vectorizer fitted on train+test ingredients.
test_bag = dvec_all.transform(get_ingr_dict(test))

In [25]:
# Fresh linear-kernel SVC, separate from the cross-validated `svc` pipeline.
svc_linear = SVC(kernel='linear')

In [29]:
%%time
# Train the classifier on the training recipes, encoded with the same
# train+test vocabulary (dvec_all) that is used for the test set.
svc_linear = svc_linear.fit(dvec_all.transform(get_ingr_dict(train)), get_labels(train))

CPU times: user 1min 28s, sys: 364 ms, total: 1min 29s
Wall time: 1min 29s


In [31]:
# Cuisine labels known to the fitted classifier.
svc_linear.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'],
      dtype='<U12')

In [33]:
# Hyperparameters of the classifier; only `kernel` was passed explicitly when
# it was constructed, so the remaining values are the library defaults.
svc_linear.get_params()

{'C': 1.0,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [34]:
# Predict a cuisine for every test recipe. The encoded test matrix already
# exists as `test_bag`, so reuse it rather than running the identical
# transform a second time.
test_preds = svc_linear.predict(test_bag)

In [36]:
# Sanity check: there should be one prediction per test recipe.
test_preds.shape

(9944,)

In [38]:
# Ids of the test recipes, in the same order as the rows that were predicted.
test_ids = [r['id'] for r in test]

In [39]:
# Preview only the first few ids; printing the whole list floods the notebook
# without adding information.
test_ids[:5]

[18009, 28583, 41580, 29752, 35687]

In [50]:
# Assemble the id/cuisine table column by column. Building it from a list of
# two lists and transposing would push both columns through a single
# mixed-type array, leaving `id` as generic objects instead of integers.
# `columns=` pins the column order explicitly.
df_test = pd.DataFrame(
    {'id': test_ids, 'cuisine': test_preds},
    columns=['id', 'cuisine'],
)

In [52]:
# Write the id/cuisine predictions to CSV; index=False omits the DataFrame
# row index from the file.
df_test.to_csv('../_data/180401_basic_SVM.csv', index=False)

## Results
Accuracy 0.76146  
Rank 867

![kaggle image](../_images/180401_bow_svm.png)
![kaggle image](../_images/180401_bow_svm_standing.png)

## Possible improvements
* Grid search on hyperparameters (kernel type, etc.)
* Use TF-IDF weighting (reduces the importance of ubiquitous ingredients such as salt)