In [1]:
import json

import numpy as np
import pandas as pd


# Read the train and test recipe records (lists of dicts) from JSON.
# The encoding is given explicitly so that decoding the ingredient names does
# not depend on the platform's default locale encoding.
with open('../_data/train.json', 'r', encoding='utf-8') as f:
    train = json.load(f)
with open('../_data/test.json', 'r', encoding='utf-8') as f:
    test = json.load(f)

In [2]:
len(train), len(test)

(39774, 9944)

In [3]:
train[0]

{'cuisine': 'greek',
 'id': 10259,
 'ingredients': ['romaine lettuce',
  'black olives',
  'grape tomatoes',
  'garlic',
  'pepper',
  'purple onion',
  'seasoning',
  'garbanzo beans',
  'feta cheese crumbles']}

## Pipeline using TfidfVectorizer (bag of ingredients) and XGBoost

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [5]:
def itself(x):
    """Identity function: give back the argument exactly as received.

    This notebook passes it as both ``tokenizer`` and ``preprocessor`` to
    ``TfidfVectorizer`` so that each recipe's ingredient list is used as-is,
    one token per ingredient. It is a named module-level function rather than
    a lambda.
    """
    unchanged = x
    return unchanged

In [21]:
from xgboost import XGBClassifier
import xgboost as xgb

In [76]:
# XGBoost classifier for the cuisine labels. `objective` and `num_class` are
# also read back through xgbc.get_xgb_params() by the xgb.cv cell further down.
xgbc = XGBClassifier(
    objective='multi:softmax',
    num_class=20,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
)

In [108]:
# Two-step pipeline: TF-IDF over the ingredient lists, then the XGBoost
# classifier. `itself` is used for both hooks so every ingredient string is
# kept whole as a single token.
pipe = Pipeline(steps=[
    ("tfidf_vec", TfidfVectorizer(preprocessor=itself, tokenizer=itself)),
    ("xgbc", xgbc),
])

In [96]:
xgbc.get_xgb_params()

{'base_score': 0.5,
 'booster': 'gblinear',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.8,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 5,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'nthread': 1,
 'num_class': 20,
 'objective': 'multi:softmax',
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': 0,
 'silent': 1,
 'subsample': 0.8}

In [79]:
# Lower-cased ingredient lists of the training recipes.
# NOTE: get_ingrs is defined in a later cell of this notebook; on a fresh
# kernel that cell has to be run before this one.
train_ingrs = get_ingrs(train)

In [80]:
# Stand-alone TF-IDF vectorizer (same settings as the pipeline step) used to
# build the matrix for xgboost's native cross-validation.
tfidf_vec = TfidfVectorizer(
    preprocessor=itself,
    tokenizer=itself,
)


In [81]:
# Learn the TF-IDF vocabulary from the training ingredient lists and encode
# them as a sparse matrix with one row per recipe.
train_mat = tfidf_vec.fit_transform(train_ingrs)

In [82]:
train_mat

<39774x6703 sparse matrix of type '<class 'numpy.float64'>'
	with 428249 stored elements in Compressed Sparse Row format>

In [83]:
xgbc.get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.8,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 5,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'n_jobs': 1,
 'nthread': None,
 'num_class': 20,
 'objective': 'multi:softmax',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': True,
 'subsample': 0.8}

In [84]:
from sklearn.preprocessing import LabelEncoder

In [85]:
# Encoder that maps the cuisine names to integer class labels, as needed for
# the `label` argument of xgb.DMatrix below.
le = LabelEncoder()

In [86]:
# Integer-encode the cuisine label of every training recipe.
# NOTE: get_labels is defined in a later cell of this notebook; on a fresh
# kernel that cell has to be run before this one.
train_labels = le.fit_transform(get_labels(train))

In [87]:
# Wrap the TF-IDF matrix and the encoded labels in xgboost's own data
# container, which is what xgb.cv takes as `dtrain`.
train_dm = xgb.DMatrix(train_mat, label=train_labels)

In [113]:
# Same classifier settings as before, but with n_estimators raised to 400
# (the earlier get_params() output shows a value of 100 when it is not set).
xgbc = XGBClassifier(
    objective='multi:softmax',
    num_class=20,
    n_estimators=400,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1,
)

In [92]:
# 5-fold cross-validation through xgboost's native API. The booster
# parameters and the number of boosting rounds are taken from `xgbc`, and the
# result reports the `merror` metric (train-/test-merror-mean and -std).
cv_res = xgb.cv(
    xgbc.get_xgb_params(),
    train_dm,
    num_boost_round=xgbc.get_params()['n_estimators'],
    nfold=5,
    metrics='merror',
    early_stopping_rounds=50,
)

In [93]:
cv_res

Unnamed: 0,test-merror-mean,test-merror-std,train-merror-mean,train-merror-std
0,0.491100,0.010000,0.475744,0.004700
1,0.437195,0.010293,0.419483,0.006491
2,0.419973,0.006007,0.399954,0.007211
3,0.407980,0.004936,0.384265,0.005241
4,0.400111,0.004968,0.376817,0.004378
5,0.394730,0.004124,0.369727,0.004602
6,0.391914,0.005298,0.365980,0.004739
7,0.388369,0.004935,0.361656,0.003250
8,0.385051,0.005228,0.357784,0.002693
9,0.383718,0.004857,0.354666,0.003983


In [115]:
# Rebuild the pipeline so its final step is the current `xgbc` object.
# `itself` keeps every ingredient string whole as a single token.
pipe = Pipeline(steps=[
    ("tfidf_vec", TfidfVectorizer(preprocessor=itself, tokenizer=itself)),
    ("xgbc", xgbc),
])

In [116]:
def get_ingrs(given):
    """Return the ingredient list of every recipe, with names lower-cased.

    Args:
        given: iterable of recipe dicts, each with an 'ingredients' list of
            strings.

    Returns:
        A list with one inner list per recipe, in input order.

    Note:
        Cells earlier in this notebook already call this helper, so on a
        fresh kernel this cell has to be run before them.
    """
    lowered = []
    for recipe in given:
        lowered.append([name.lower() for name in recipe['ingredients']])
    return lowered

def get_labels(given):
    """Return the 'cuisine' value of every recipe, in input order.

    Args:
        given: iterable of recipe dicts that each carry a 'cuisine' key.

    Returns:
        A list of the cuisine values.
    """
    labels = []
    for recipe in given:
        labels.append(recipe['cuisine'])
    return labels

In [117]:
%%time
# 5-fold cross-validation of the whole TF-IDF + XGBoost pipeline, so the
# vectorizer is fitted inside each training fold. n_jobs=-1 runs the folds in
# parallel worker processes.
scores = cross_val_score(pipe, get_ingrs(train), get_labels(train), cv=5, n_jobs=-1, verbose=10)

CPU times: user 1.39 s, sys: 288 ms, total: 1.68 s
Wall time: 10min 52s


In [118]:
scores

array([ 0.75194675,  0.75596884,  0.75653595,  0.74569237,  0.75261105])

#### Cross-validation accuracy

In [119]:
scores.mean()

0.75255098901727946

# Grid search to tune `C` parameter

In [8]:
from sklearn.model_selection import GridSearchCV

In [9]:
# TF-IDF + linear-kernel SVC pipeline; the "svc" step name is what the
# 'svc__C' keys of the parameter grids below refer to.
#
# strip_accents is deliberately not passed: scikit-learn's built-in
# preprocessing stage (strip_accents and lowercase) is replaced entirely when
# a custom `preprocessor` is supplied, so strip_accents='unicode' had no
# effect here. Lower-casing is done by get_ingrs() instead.
svc = Pipeline([
    ("tfidf_vec", TfidfVectorizer(tokenizer=itself,
                                  preprocessor=itself,
                                 )),
    ("svc", SVC(kernel="linear")),
])

# Searching `C` on 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1 (each with `class_weight` set to `None` and `'balanced'`)

In [65]:
%%time
# NOTE(review): the `grid` fitted here is not constructed in any cell above.
# Its repr below shows cv=3 and a param_grid over svc__C and
# svc__class_weight, so the defining cell appears to have been removed;
# it must be restored for this cell to run on a fresh kernel.
grid.fit(get_ingrs(train), get_labels(train))

CPU times: user 1min 57s, sys: 1.3 s, total: 1min 58s
Wall time: 39min 16s


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function it...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'svc__C': array([  1.00000e-04,   1.00000e-03,   1.00000e-02,   1.00000e-01,
         1.00000e+00,   1.00000e+01]), 'svc__class_weight': [None, 'balanced']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [66]:
grid.best_estimator_

Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function it...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [67]:
grid.best_params_

{'svc__C': 1.0, 'svc__class_weight': None}

In [68]:
grid.best_score_

0.77477749283451502

In [70]:
grid.cv_results_



{'mean_fit_time': array([ 236.65419793,  517.4493014 ,  250.05366262,  581.02797771,
         265.26640757,  569.02121663,  184.71976638,  306.7764136 ,
         110.32019742,  152.12040401,  107.05606683,   83.58189861]),
 'mean_score_time': array([ 76.0904185 ,  81.39194083,  76.64345336,  79.28446945,
         77.04823335,  78.87881748,  62.78142309,  72.68669653,
         49.61088181,  55.21994917,  48.21731702,  37.58955971]),
 'mean_test_score': array([ 0.19706341,  0.03992558,  0.19706341,  0.03323779,  0.40772867,
         0.45391462,  0.67642178,  0.7050083 ,  0.77477749,  0.74935888,
         0.74742294,  0.74158998]),
 'mean_train_score': array([ 0.19706341,  0.03990944,  0.19706341,  0.03322874,  0.41014263,
         0.46456417,  0.69591937,  0.74118759,  0.87562239,  0.86501255,
         0.97152672,  0.95688152]),
 'param_svc__C': masked_array(data = [0.0001 0.0001 0.001 0.001 0.01 0.01 0.10000000000000001
  0.10000000000000001 1.0 1.0 10.0 10.0],
              mask = [Fal

# Searching on 10^-1, 10^-.75, ..., 10^.75, 10^1

In [12]:
# Nine candidate C values, log-spaced from 10^-1 to 10^1 (exponent step 0.25).
c_vals = np.logspace(start=-1, stop=1, num=9)

In [13]:
# Only the C of the pipeline's "svc" step is searched.
param_grid = dict(svc__C=c_vals)

In [14]:
# 3-fold grid search over the C candidates, scored by accuracy, using all
# available cores.
grid = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)

In [15]:
%%time
# Cross-validate every C candidate on the training recipes; the repr below
# shows refit=True, so the best setting is then refitted on all of them.
grid.fit(get_ingrs(train), get_labels(train))

CPU times: user 1min 34s, sys: 704 ms, total: 1min 35s
Wall time: 17min 58s


GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function it...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'svc__C': array([  0.1    ,   0.17783,   0.31623,   0.56234,   1.     ,   1.77828,
         3.16228,   5.62341,  10.     ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [16]:
grid.best_estimator_

Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function it...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [17]:
grid.best_params_

{'svc__C': 1.7782794100389228}

In [18]:
grid.best_score_

0.77563232262281889

In [20]:
10**0.25

1.7782794100389228

In [19]:
grid.cv_results_



{'mean_fit_time': array([ 157.98665675,  141.8716344 ,  130.96095832,  126.57304716,
         118.48142846,  115.55716753,  115.64803076,  112.06342483,
          62.41553322]),
 'mean_score_time': array([ 51.12805931,  47.22033993,  49.61519774,  52.10884198,
         51.88579361,  50.08882594,  50.20968072,  50.35080115,  30.51324789]),
 'mean_test_score': array([ 0.67642178,  0.7184593 ,  0.74800121,  0.76705888,  0.77477749,
         0.77563232,  0.76987479,  0.75911399,  0.74742294]),
 'mean_train_score': array([ 0.69591937,  0.7527655 ,  0.79732985,  0.83901561,  0.87562239,
         0.90834476,  0.93558612,  0.95683115,  0.97152672]),
 'param_svc__C': masked_array(data = [0.10000000000000001 0.17782794100389229 0.31622776601683794
  0.56234132519034907 1.0 1.7782794100389228 3.1622776601683795
  5.6234132519034912 10.0],
              mask = [False False False False False False False False False],
        fill_value = ?),
 'params': [{'svc__C': 0.10000000000000001},
  {'svc__C':

# Searching on 10^-.1, 10^-.05, ..., 10^.45, 10^.5

In [21]:
# Thirteen candidate C values, log-spaced from 10^-0.1 to 10^0.5
# (exponent step 0.05), around the best value of the previous search.
c_vals = np.logspace(start=-0.1, stop=0.5, num=13)

In [22]:
c_vals

array([ 0.79432823,  0.89125094,  1.        ,  1.12201845,  1.25892541,
        1.41253754,  1.58489319,  1.77827941,  1.99526231,  2.23872114,
        2.51188643,  2.81838293,  3.16227766])

In [26]:
# Only the C of the pipeline's "svc" step is searched.
param_grid = dict(svc__C=c_vals)

In [27]:
# 5-fold grid search over the finer C candidates, scored by accuracy, using
# all available cores.
grid = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

In [28]:
%%time
# Cross-validate every C candidate on the training recipes; the repr below
# shows refit=True, so the best setting is then refitted on all of them.
grid.fit(get_ingrs(train), get_labels(train))

CPU times: user 1min 39s, sys: 1.05 s, total: 1min 40s
Wall time: 43min 32s


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function it...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'svc__C': array([ 0.79433,  0.89125,  1.     ,  1.12202,  1.25893,  1.41254,
        1.58489,  1.77828,  1.99526,  2.23872,  2.51189,  2.81838,  3.16228])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [29]:
grid.best_estimator_

Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function it...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

## Best value: 10^.1

In [30]:
grid.best_params_

{'svc__C': 1.2589254117941671}

In [31]:
grid.best_score_

0.78154070498315487

In [33]:
10**0.1

1.2589254117941673

In [32]:
grid.cv_results_



{'mean_fit_time': array([ 143.23508005,  141.68128681,  142.49367027,  138.52516761,
         138.25345206,  141.67964201,  134.36590509,  131.23971028,
         131.31269522,  130.88654017,  136.13537259,  130.27778497,
         114.3429522 ]),
 'mean_score_time': array([ 30.35004473,  30.59682322,  31.32538342,  32.35517311,
         32.77063403,  32.35771828,  33.70293155,  33.04715948,
         31.51931925,  32.57419186,  32.36193271,  32.1674602 ,  29.84162641]),
 'mean_test_score': array([ 0.77749283,  0.77890079,  0.78071102,  0.78116357,  0.7815407 ,
         0.78144014,  0.78131443,  0.78058531,  0.77988133,  0.77935335,
         0.77880022,  0.77698999,  0.77520491]),
 'mean_train_score': array([ 0.86047416,  0.86704884,  0.87342238,  0.87984624,  0.88610037,
         0.8921156 ,  0.89793594,  0.90383173,  0.90970242,  0.91545369,
         0.92033122,  0.92560476,  0.93024972]),
 'param_svc__C': masked_array(data = [0.79432823472428149 0.89125093813374556 1.0 1.12201845430196

## Fitting on the training data with the new C value and predicting the test data (after fitting the TF-IDF encoding on all of the train+test ingredients)

In [13]:
%%time
# Fit a single TF-IDF vectorizer on the ingredient lists of train and test
# combined, so the vocabulary covers ingredients from both sets.
dvec_all = TfidfVectorizer(
    preprocessor=itself,
    tokenizer=itself,
).fit(get_ingrs(train + test))

CPU times: user 379 ms, sys: 20.2 ms, total: 399 ms
Wall time: 398 ms


In [14]:
# TF-IDF encoding of the test recipes, using the vectorizer fitted on
# train+test above.
test_bag = dvec_all.transform(get_ingrs(test))

In [15]:
# Linear SVC with C fixed to 10^0.1, the best value found by the finest grid
# search above.
svc_linear = SVC(C=10 ** 0.1, kernel='linear')

In [17]:
%%time
# Train on every training recipe, encoded with the train+test vectorizer.
# The result is assigned back so the cell ends in a statement rather than an
# expression.
svc_linear = svc_linear.fit(dvec_all.transform(get_ingrs(train)), get_labels(train))

CPU times: user 1min 31s, sys: 367 ms, total: 1min 31s
Wall time: 1min 31s


In [18]:
svc_linear.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'],
      dtype='<U12')

In [19]:
svc_linear.get_params()

{'C': 1.2589254117941673,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [21]:
# Predict a cuisine for every test recipe. `test_bag` already holds
# dvec_all.transform(get_ingrs(test)) from an earlier cell, so it is reused
# here instead of transforming the test set a second time.
test_preds = svc_linear.predict(test_bag)

In [22]:
test_preds.shape

(9944,)

In [23]:
# Recipe ids in the same order as `test`, to be paired with test_preds.
test_ids = [recipe['id'] for recipe in test]

In [24]:
# Submission table with one row per test recipe. Building it from a column
# mapping keeps `id` as an integer column; the previous list-of-rows +
# transpose() construction turned both columns into object dtype.
# `columns=` pins the column order explicitly.
df_test = pd.DataFrame(
    {'id': test_ids, 'cuisine': list(test_preds)},
    columns=['id', 'cuisine'],
)

In [25]:
# Write the submission file; index=False leaves only the id and cuisine
# columns in the CSV.
df_test.to_csv('../_data/180404_basic_SVM_TFIDF_gridsearch_C.csv', index=False)

## Results
Accuracy 0.78559  
Rank 594

![kaggle image](../_images/180404_bow_svm_tfidf_grid.png)
![kaggle image](../_images/180404_bow_svm_tfidf_grid_standing.png)