In [1]:
import numpy as np
import json


with open('../_data/train.json', 'r') as f:
    train = json.load(f)
with open('../_data/test.json', 'r') as f:
    test = json.load(f)

In [2]:
len(train), len(test)

(39774, 9944)

In [3]:
train[0]

{'cuisine': 'greek',
 'id': 10259,
 'ingredients': ['romaine lettuce',
  'black olives',
  'grape tomatoes',
  'garlic',
  'pepper',
  'purple onion',
  'seasoning',
  'garbanzo beans',
  'feta cheese crumbles']}

## Pipeline using DictVectorizer (bag of ingredients) and SVC

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [5]:
def itself(x):
    return x

In [6]:
svc = Pipeline([
    ("tfidf_vec", TfidfVectorizer(tokenizer=itself,
                                         preprocessor=itself,
                                        )),
    ("svc", SVC(kernel="linear"))])

In [7]:
def get_ingrs(given):
    ingrs = [[i.lower() for i in recipe['ingredients']] for recipe in given]
    return ingrs

def get_labels(given):
    return [r['cuisine'] for r in given]

In [41]:
%%time
scores = cross_val_score(svc, get_ingrs(train), get_labels(train), cv=5, n_jobs=-1)

CPU times: user 983 ms, sys: 172 ms, total: 1.16 s
Wall time: 2min 30s


In [42]:
scores

array([ 0.77982919,  0.78336265,  0.77916038,  0.77487109,  0.78633447])

#### Cross-validation accuracy

In [43]:
scores.mean()

0.78071155518300084

## Fitting to test data (after encoding all of train+test ingredients)

In [9]:
%%time
dvec_all = TfidfVectorizer(tokenizer=itself,
                           preprocessor=itself,
                          ).fit(get_ingrs(train+test))

CPU times: user 398 ms, sys: 11.7 ms, total: 410 ms
Wall time: 409 ms


In [11]:
test_bag = dvec_all.transform(get_ingrs(test))

In [13]:
svc_linear = SVC(kernel='linear')

In [14]:
%%time
svc_linear = svc_linear.fit(dvec_all.transform(get_ingr_dict(train)), get_labels(train))

CPU times: user 1min 46s, sys: 420 ms, total: 1min 47s
Wall time: 1min 47s


In [15]:
svc_linear.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'],
      dtype='<U12')

In [16]:
svc_linear.get_params()

{'C': 1.0,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [17]:
test_preds = svc_linear.predict(dvec_all.transform(get_ingr_dict(test)))

In [18]:
test_preds.shape

(9944,)

In [19]:
test_ids = [r['id'] for r in test]

In [20]:
df_test = pd.DataFrame([test_ids, list(test_preds)]).transpose()
df_test.columns = ['id', 'cuisine']

In [21]:
df_test.to_csv('../_data/180401_basic_SVM_TFIDF.csv', index=False)

## Results
Accuracy 0.78499  
Rank 612

![kaggle image](../_images/180401_bow_svm_tfidf.png)
![kaggle image](../_images/180401_bow_svm_tfidf_standing.png)

In [34]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split



In [35]:
train_ingr = get_ingrs(train)
train_labels = get_labels(train)

In [36]:
len(train_ingr), len(train_labels)

(39774, 39774)

In [37]:
X_train, X_test, y_train, y_test = train_test_split(train_ingr,
                                                    train_labels,
                                                    test_size=0.2,
                                                   )

In [38]:
def itself(x):
    return x

In [39]:
svc = Pipeline([
    ("tfidf_vec", TfidfVectorizer(tokenizer=itself,
                                         preprocessor=itself,
                                        )),
    ("svc", SVC(kernel="linear", C=10**0.1))])

In [55]:
svc.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function it...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [57]:
y_pred = svc.predict(X_test)

In [70]:
y_pred[:10], y_test[:10]

(array(['indian', 'thai', 'italian', 'southern_us', 'vietnamese',
        'filipino', 'mexican', 'jamaican', 'british', 'italian'],
       dtype='<U12'),
 ['indian',
  'thai',
  'italian',
  'british',
  'vietnamese',
  'jamaican',
  'mexican',
  'jamaican',
  'russian',
  'italian'])

In [61]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

   brazilian       0.73      0.58      0.65        90
     british       0.69      0.39      0.50       172
cajun_creole       0.80      0.67      0.73       341
     chinese       0.79      0.89      0.84       521
    filipino       0.74      0.60      0.66       135
      french       0.59      0.64      0.62       555
       greek       0.84      0.68      0.75       216
      indian       0.86      0.89      0.88       606
       irish       0.66      0.46      0.55       136
     italian       0.78      0.89      0.83      1579
    jamaican       0.82      0.59      0.69       106
    japanese       0.86      0.66      0.75       294
      korean       0.84      0.68      0.75       152
     mexican       0.88      0.92      0.90      1280
    moroccan       0.83      0.78      0.80       165
     russian       0.68      0.37      0.48        92
 southern_us       0.71      0.83      0.77       855
     spanish       0.69    

In [68]:
pd.DataFrame(confusion_matrix(y_test, y_pred), columns=[x[:3] for x in svc.classes_], index=svc.classes_)

Unnamed: 0,bra,bri,caj,chi,fil,fre,gre,ind,iri,ita,jam,jap,kor,mex,mor,rus,sou,spa,tha,vie
brazilian,52,0,0,0,0,2,0,3,0,6,0,0,0,14,0,0,11,0,2,0
british,0,67,0,0,1,32,0,8,11,13,1,0,1,3,0,0,31,3,1,0
cajun_creole,1,0,230,0,0,19,0,1,0,24,0,0,0,12,0,0,52,2,0,0
chinese,2,1,0,464,3,2,0,0,1,5,0,10,9,6,0,0,7,0,9,2
filipino,3,1,1,10,81,4,0,1,1,8,0,3,0,7,0,0,9,0,1,5
french,0,3,3,0,1,356,0,2,6,124,1,1,0,5,2,2,44,5,0,0
greek,0,0,1,1,0,5,147,1,0,44,0,0,0,5,4,1,4,3,0,0
indian,3,1,0,2,2,1,5,539,0,8,2,2,0,16,12,2,6,1,3,1
irish,0,10,0,1,0,19,1,0,63,7,2,0,0,2,3,4,24,0,0,0
italian,2,2,7,2,0,72,16,5,2,1405,2,0,0,21,2,2,28,11,0,0


## Possible improvements
* Grid search on hyperparameters (kernel type, etc.)

In [8]:
from sklearn.model_selection import GridSearchCV

In [9]:
svc = Pipeline([
    ("tfidf_vec", TfidfVectorizer(tokenizer=itself,
                                  preprocessor=itself,
                                  strip_accents='unicode',
                                 )),
    ("svc", SVC(kernel="linear"))])

In [10]:
svc.named_steps

{'svc': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
   max_iter=-1, probability=False, random_state=None, shrinking=True,
   tol=0.001, verbose=False),
 'tfidf_vec': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 1), norm='l2',
         preprocessor=<function itself at 0x7f47de744e18>, smooth_idf=True,
         stop_words=None, strip_accents='unicode', sublinear_tf=False,
         token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=<function itself at 0x7f47de744e18>, use_idf=True,
         vocabulary=None)}

In [11]:
svc.get_params()

{'memory': None,
 'steps': [('tfidf_vec',
   TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
           dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
           lowercase=True, max_df=1.0, max_features=None, min_df=1,
           ngram_range=(1, 1), norm='l2',
           preprocessor=<function itself at 0x7f47de744e18>, smooth_idf=True,
           stop_words=None, strip_accents='unicode', sublinear_tf=False,
           token_pattern='(?u)\\b\\w\\w+\\b',
           tokenizer=<function itself at 0x7f47de744e18>, use_idf=True,
           vocabulary=None)),
  ('svc', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
     decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
     max_iter=-1, probability=False, random_state=None, shrinking=True,
     tol=0.001, verbose=False))],
 'svc': SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
   decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
   max_iter

# based on C=1e-4 to 10

In [65]:
%%time
grid.fit(get_ingrs(train), get_labels(train))

CPU times: user 1min 57s, sys: 1.3 s, total: 1min 58s
Wall time: 39min 16s


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function it...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'svc__C': array([  1.00000e-04,   1.00000e-03,   1.00000e-02,   1.00000e-01,
         1.00000e+00,   1.00000e+01]), 'svc__class_weight': [None, 'balanced']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [66]:
grid.best_estimator_

Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function it...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [67]:
grid.best_params_

{'svc__C': 1.0, 'svc__class_weight': None}

In [68]:
grid.best_score_

0.77477749283451502

In [70]:
grid.cv_results_



{'mean_fit_time': array([ 236.65419793,  517.4493014 ,  250.05366262,  581.02797771,
         265.26640757,  569.02121663,  184.71976638,  306.7764136 ,
         110.32019742,  152.12040401,  107.05606683,   83.58189861]),
 'mean_score_time': array([ 76.0904185 ,  81.39194083,  76.64345336,  79.28446945,
         77.04823335,  78.87881748,  62.78142309,  72.68669653,
         49.61088181,  55.21994917,  48.21731702,  37.58955971]),
 'mean_test_score': array([ 0.19706341,  0.03992558,  0.19706341,  0.03323779,  0.40772867,
         0.45391462,  0.67642178,  0.7050083 ,  0.77477749,  0.74935888,
         0.74742294,  0.74158998]),
 'mean_train_score': array([ 0.19706341,  0.03990944,  0.19706341,  0.03322874,  0.41014263,
         0.46456417,  0.69591937,  0.74118759,  0.87562239,  0.86501255,
         0.97152672,  0.95688152]),
 'param_svc__C': masked_array(data = [0.0001 0.0001 0.001 0.001 0.01 0.01 0.10000000000000001
  0.10000000000000001 1.0 1.0 10.0 10.0],
              mask = [Fal

# Based on 10^{-1, -0.75, -0.5, ... , 0.75, 1}

In [12]:
c_vals = np.logspace(-1, 1, 9)

In [13]:
param_grid = {
    'svc__C': c_vals,
}

In [14]:
grid = GridSearchCV(svc,
                    param_grid=param_grid,
                    cv=3,
                    scoring='accuracy',
                    n_jobs=-1)

In [15]:
%%time
grid.fit(get_ingrs(train), get_labels(train))

CPU times: user 1min 34s, sys: 704 ms, total: 1min 35s
Wall time: 17min 58s


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function it...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'svc__C': array([  0.1    ,   0.17783,   0.31623,   0.56234,   1.     ,   1.77828,
         3.16228,   5.62341,  10.     ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [16]:
grid.best_estimator_

Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function it...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [17]:
grid.best_params_

{'svc__C': 1.7782794100389228}

In [18]:
grid.best_score_

0.77563232262281889

In [20]:
10**0.25

1.7782794100389228

In [19]:
grid.cv_results_



{'mean_fit_time': array([ 157.98665675,  141.8716344 ,  130.96095832,  126.57304716,
         118.48142846,  115.55716753,  115.64803076,  112.06342483,
          62.41553322]),
 'mean_score_time': array([ 51.12805931,  47.22033993,  49.61519774,  52.10884198,
         51.88579361,  50.08882594,  50.20968072,  50.35080115,  30.51324789]),
 'mean_test_score': array([ 0.67642178,  0.7184593 ,  0.74800121,  0.76705888,  0.77477749,
         0.77563232,  0.76987479,  0.75911399,  0.74742294]),
 'mean_train_score': array([ 0.69591937,  0.7527655 ,  0.79732985,  0.83901561,  0.87562239,
         0.90834476,  0.93558612,  0.95683115,  0.97152672]),
 'param_svc__C': masked_array(data = [0.10000000000000001 0.17782794100389229 0.31622776601683794
  0.56234132519034907 1.0 1.7782794100389228 3.1622776601683795
  5.6234132519034912 10.0],
              mask = [False False False False False False False False False],
        fill_value = ?),
 'params': [{'svc__C': 0.10000000000000001},
  {'svc__C':

# Based on 10^{-0.1, -0.05, ..., 0.45, 0.5}

In [21]:
c_vals = np.logspace(-0.1, 0.5, 13)

In [22]:
c_vals

array([ 0.79432823,  0.89125094,  1.        ,  1.12201845,  1.25892541,
        1.41253754,  1.58489319,  1.77827941,  1.99526231,  2.23872114,
        2.51188643,  2.81838293,  3.16227766])

In [26]:
param_grid = {
    'svc__C': c_vals,
}

In [27]:
grid = GridSearchCV(svc,
                    param_grid=param_grid,
                    cv=5,
                    scoring='accuracy',
                    n_jobs=-1)

In [28]:
%%time
grid.fit(get_ingrs(train), get_labels(train))

CPU times: user 1min 39s, sys: 1.05 s, total: 1min 40s
Wall time: 43min 32s


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function it...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'svc__C': array([ 0.79433,  0.89125,  1.     ,  1.12202,  1.25893,  1.41254,
        1.58489,  1.77828,  1.99526,  2.23872,  2.51189,  2.81838,  3.16228])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [29]:
grid.best_estimator_

Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function it...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [30]:
grid.best_params_

{'svc__C': 1.2589254117941671}

In [31]:
grid.best_score_

0.78154070498315487

In [33]:
10**0.1

1.2589254117941673

In [32]:
grid.cv_results_



{'mean_fit_time': array([ 143.23508005,  141.68128681,  142.49367027,  138.52516761,
         138.25345206,  141.67964201,  134.36590509,  131.23971028,
         131.31269522,  130.88654017,  136.13537259,  130.27778497,
         114.3429522 ]),
 'mean_score_time': array([ 30.35004473,  30.59682322,  31.32538342,  32.35517311,
         32.77063403,  32.35771828,  33.70293155,  33.04715948,
         31.51931925,  32.57419186,  32.36193271,  32.1674602 ,  29.84162641]),
 'mean_test_score': array([ 0.77749283,  0.77890079,  0.78071102,  0.78116357,  0.7815407 ,
         0.78144014,  0.78131443,  0.78058531,  0.77988133,  0.77935335,
         0.77880022,  0.77698999,  0.77520491]),
 'mean_train_score': array([ 0.86047416,  0.86704884,  0.87342238,  0.87984624,  0.88610037,
         0.8921156 ,  0.89793594,  0.90383173,  0.90970242,  0.91545369,
         0.92033122,  0.92560476,  0.93024972]),
 'param_svc__C': masked_array(data = [0.79432823472428149 0.89125093813374556 1.0 1.12201845430196

## Random Forest

In [33]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

In [25]:
rf = Pipeline([
    ("tfidf_vec", TfidfVectorizer(tokenizer=itself,
                                         preprocessor=itself,
                                        )),
    ("rf", RandomForestClassifier())])

In [29]:
rf.steps[1][1]

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [7]:
def get_ingrs(given):
    ingrs = [[i.lower() for i in recipe['ingredients']] for recipe in given]
    return ingrs

def get_labels(given):
    return [r['cuisine'] for r in given]

In [30]:
%%time
scores = cross_val_score(rf, get_ingrs(train), get_labels(train), cv=5, n_jobs=-1)

CPU times: user 873 ms, sys: 75.7 ms, total: 948 ms
Wall time: 7.37 s


In [31]:
scores

array([ 0.66729465,  0.66436291,  0.65020111,  0.66771475,  0.667925  ])

#### Cross-validation accuracy

In [32]:
scores.mean()

0.66349968338573651

## Extra Trees

In [34]:
extr = Pipeline([
    ("tfidf_vec", TfidfVectorizer(tokenizer=itself,
                                         preprocessor=itself,
                                        )),
    ("extr", ExtraTreesClassifier())])

In [35]:
extr.steps[1][1]

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [36]:
def get_ingrs(given):
    ingrs = [[i.lower() for i in recipe['ingredients']] for recipe in given]
    return ingrs

def get_labels(given):
    return [r['cuisine'] for r in given]

In [37]:
%%time
scores = cross_val_score(extr, get_ingrs(train), get_labels(train), cv=5, n_jobs=-1)

CPU times: user 813 ms, sys: 102 ms, total: 914 ms
Wall time: 10.7 s


In [38]:
scores

array([ 0.68801809,  0.69703443,  0.6936903 ,  0.69186266,  0.69837675])

#### Cross-validation accuracy

In [39]:
scores.mean()

0.69379644360559156

## XGBoost

In [40]:
from xgboost import XGBClassifier

In [41]:
xgb1 = Pipeline([
    ("tfidf_vec", TfidfVectorizer(tokenizer=itself,
                                         preprocessor=itself,
                                        )),
    ("xgb", XGBClassifier())])

In [42]:
xgb1.steps[1][1]

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [36]:
def get_ingrs(given):
    ingrs = [[i.lower() for i in recipe['ingredients']] for recipe in given]
    return ingrs

def get_labels(given):
    return [r['cuisine'] for r in given]

In [43]:
%%time
scores = cross_val_score(xgb1, get_ingrs(train), get_labels(train), cv=5, n_jobs=-1)

CPU times: user 931 ms, sys: 111 ms, total: 1.04 s
Wall time: 1min 50s


In [44]:
scores

array([ 0.67395127,  0.68270922,  0.67697335,  0.67488366,  0.6795017 ])

#### Cross-validation accuracy

In [45]:
scores.mean()

0.67760384131581075