In [1]:
import re
import json
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC, LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import FeatureUnion

from time import strftime

def print_time():
    print(strftime('%y%m%d-%H%M%S'))
    return

In [2]:
from sklearn.ensemble import RandomForestClassifier

In [3]:
train = json.load(open('../_data/train.json'))
test = json.load(open('../_data/test.json'))

In [4]:
SPEC_REMOVE = re.compile(r'(\'|\’|\(.*oz.*\)|(\()|(\)))')
SPEC_AND = re.compile(r'\&')
SPEC_ELSE = re.compile(r'[^\w\s\%_]')

def clean_ingr(ingr):
    ingr = re.sub(SPEC_REMOVE, '', ingr)
    ingr = re.sub(SPEC_AND, 'and', ingr)
    ingr = re.sub(SPEC_ELSE, ' ', ingr)
    return ' '.join(ingr.split())

def get_ingrs(given):
    ingrs = [[clean_ingr(i).lower() for i in recipe['ingredients']] for recipe in given]
    return ingrs

def get_labels(given):
    return [r['cuisine'] for r in given]

### Lemmatization

In [5]:
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

In [6]:
def lemmatize_ingrs(ingrs):
    wnl = WordNetLemmatizer()
    return [[' '.join([wnl.lemmatize(w) for w in ing.split(' ')]) for ing in recipe]\
           for recipe in ingrs]

In [7]:
%%time
train_labels = get_labels(train)
train_ingrs = get_ingrs(train)
train_lem = lemmatize_ingrs(train_ingrs)

CPU times: user 5.85 s, sys: 74 ms, total: 5.92 s
Wall time: 5.92 s


In [8]:
%%time
traintest_ingrs = get_ingrs(train+test)
traintest_lem = lemmatize_ingrs(traintest_ingrs)

CPU times: user 5.51 s, sys: 23.6 ms, total: 5.53 s
Wall time: 5.53 s


In [9]:
def itself(x):
    return x

def combine_words(ilist):
    return ' '.join(ilist)

In [10]:
%%time
dvec_all = FeatureUnion([
        ("ingrs", TfidfVectorizer(strip_accents='unicode',
                                  tokenizer=itself,
                                  preprocessor=itself)),
        ("words", TfidfVectorizer(strip_accents='unicode',
                                  preprocessor=combine_words,
                                  ngram_range=(1, 4),
                                  #stop_words='english',
                                  )),
        ]).fit(traintest_lem)

CPU times: user 6.4 s, sys: 156 ms, total: 6.55 s
Wall time: 6.55 s


In [11]:
%%time
train_mat = dvec_all.transform(train_lem)

CPU times: user 3.49 s, sys: 60.1 ms, total: 3.55 s
Wall time: 3.55 s


In [12]:
train_mat

<39774x890750 sparse matrix of type '<class 'numpy.float64'>'
	with 3398466 stored elements in Compressed Sparse Row format>

### RandomForest

In [48]:
rf1 = RandomForestClassifier(
    n_estimators=400,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=8,
    random_state=None,
    verbose=50,
    warm_start=False,
    class_weight=None)

In [49]:
%%time
rf1.fit(train_mat, train_labels)

building tree 2 of 400building tree 3 of 400
building tree 4 of 400building tree 5 of 400building tree 6 of 400building tree 7 of 400building tree 8 of 400building tree 1 of 400






[Parallel(n_jobs=8)]: Done   1 tasks      | elapsed:   24.7s
building tree 9 of 400
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:   24.8s
building tree 10 of 400
[Parallel(n_jobs=8)]: Done   3 tasks      | elapsed:   25.0s
building tree 11 of 400
[Parallel(n_jobs=8)]: Done   4 tasks      | elapsed:   25.2s
building tree 12 of 400
[Parallel(n_jobs=8)]: Done   5 tasks      | elapsed:   25.2s
building tree 13 of 400
[Parallel(n_jobs=8)]: Done   6 tasks      | elapsed:   25.8s
building tree 14 of 400
[Parallel(n_jobs=8)]: Done   7 tasks      | elapsed:   26.1s
building tree 15 of 400
[Parallel(n_jobs=8)]: Done   8 tasks      | elapsed:   26.3s
building tree 16 of 400
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:   49.3s
building tree 17 of 400
[Parallel(n_jobs=8)]: Done  10 tasks      | elapsed:

[Parallel(n_jobs=8)]: Done  96 tasks      | elapsed:  5.0min
building tree 104 of 400
[Parallel(n_jobs=8)]: Done  97 tasks      | elapsed:  5.4min
building tree 105 of 400
[Parallel(n_jobs=8)]: Done  98 tasks      | elapsed:  5.4min
building tree 106 of 400
[Parallel(n_jobs=8)]: Done  99 tasks      | elapsed:  5.4min
building tree 107 of 400
[Parallel(n_jobs=8)]: Done 100 tasks      | elapsed:  5.4min
building tree 108 of 400
[Parallel(n_jobs=8)]: Done 101 tasks      | elapsed:  5.4min
building tree 109 of 400
[Parallel(n_jobs=8)]: Done 102 tasks      | elapsed:  5.4min
building tree 110 of 400
[Parallel(n_jobs=8)]: Done 103 tasks      | elapsed:  5.4min
building tree 111 of 400
[Parallel(n_jobs=8)]: Done 104 tasks      | elapsed:  5.4min
building tree 112 of 400
[Parallel(n_jobs=8)]: Done 105 tasks      | elapsed:  5.8min
building tree 113 of 400
[Parallel(n_jobs=8)]: Done 106 tasks      | elapsed:  5.8min
building tree 114 of 400
[Parallel(n_jobs=8)]: Done 107 tasks      | elapsed:  

[Parallel(n_jobs=8)]: Done 192 tasks      | elapsed: 10.2min
building tree 200 of 400
[Parallel(n_jobs=8)]: Done 193 tasks      | elapsed: 10.4min
building tree 201 of 400
[Parallel(n_jobs=8)]: Done 194 tasks      | elapsed: 10.5min
building tree 202 of 400
[Parallel(n_jobs=8)]: Done 195 tasks      | elapsed: 10.5min
building tree 203 of 400
[Parallel(n_jobs=8)]: Done 196 tasks      | elapsed: 10.5min
building tree 204 of 400
[Parallel(n_jobs=8)]: Done 197 tasks      | elapsed: 10.6min
building tree 205 of 400
[Parallel(n_jobs=8)]: Done 198 tasks      | elapsed: 10.6min
building tree 206 of 400
[Parallel(n_jobs=8)]: Done 199 tasks      | elapsed: 10.6min
building tree 207 of 400
[Parallel(n_jobs=8)]: Done 200 tasks      | elapsed: 10.7min
building tree 208 of 400
[Parallel(n_jobs=8)]: Done 201 tasks      | elapsed: 10.8min
building tree 209 of 400
[Parallel(n_jobs=8)]: Done 202 tasks      | elapsed: 10.9min
building tree 210 of 400
[Parallel(n_jobs=8)]: Done 203 tasks      | elapsed: 1

[Parallel(n_jobs=8)]: Done 288 tasks      | elapsed: 15.2min
building tree 296 of 400
[Parallel(n_jobs=8)]: Done 289 tasks      | elapsed: 15.5min
building tree 297 of 400
[Parallel(n_jobs=8)]: Done 290 tasks      | elapsed: 15.5min
building tree 298 of 400
[Parallel(n_jobs=8)]: Done 291 tasks      | elapsed: 15.5min
building tree 299 of 400
[Parallel(n_jobs=8)]: Done 292 tasks      | elapsed: 15.5min
building tree 300 of 400
[Parallel(n_jobs=8)]: Done 293 tasks      | elapsed: 15.5min
building tree 301 of 400
[Parallel(n_jobs=8)]: Done 294 tasks      | elapsed: 15.6min
building tree 302 of 400
[Parallel(n_jobs=8)]: Done 295 tasks      | elapsed: 15.6min
building tree 303 of 400
[Parallel(n_jobs=8)]: Done 296 tasks      | elapsed: 15.6min
building tree 304 of 400
[Parallel(n_jobs=8)]: Done 297 tasks      | elapsed: 15.9min
building tree 305 of 400
[Parallel(n_jobs=8)]: Done 298 tasks      | elapsed: 15.9min
building tree 306 of 400
[Parallel(n_jobs=8)]: Done 299 tasks      | elapsed: 1

[Parallel(n_jobs=8)]: Done 384 tasks      | elapsed: 20.3min
building tree 392 of 400
[Parallel(n_jobs=8)]: Done 385 tasks      | elapsed: 20.5min
building tree 393 of 400
building tree 394 of 400
building tree 395 of 400
building tree 396 of 400
building tree 397 of 400
building tree 398 of 400
building tree 399 of 400
building tree 400 of 400
[Parallel(n_jobs=8)]: Done 394 out of 400 | elapsed: 21.0min remaining:   19.2s
[Parallel(n_jobs=8)]: Done 400 out of 400 | elapsed: 21.1min finished
CPU times: user 2h 44min 46s, sys: 4.13 s, total: 2h 44min 50s
Wall time: 21min 7s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=8,
            oob_score=False, random_state=None, verbose=50,
            warm_start=False)

#### Prediction with RandomForest

In [35]:
test_lem = lemmatize_ingrs(get_ingrs(test))
test_preds = rf1.predict(dvec_all.transform(test_lem))

In [51]:
test_preds.shape

(9944,)

In [52]:
test_ids = [r['id'] for r in test]

In [53]:
df_test = pd.DataFrame([test_ids, list(test_preds)]).transpose()
df_test.columns = ['id', 'cuisine']

In [54]:
df_test.to_csv('../_data/180414_lemmatized_RF_400.csv', index=False)

### Ensemble (voting, ad-hoc)
* RandomForest 0.72123
  * lemmatized, traintest
* XGBoost 0.79314
  * non-lemmatized, traintest
* LinearSVC 0.79605 (prioritized)
  * non-lemmatized, traintest


In [18]:
%%time
dvec_all_lem = FeatureUnion([
        ("ingrs", TfidfVectorizer(strip_accents='unicode',
                                  tokenizer=itself,
                                  preprocessor=itself)),
        ("words", TfidfVectorizer(strip_accents='unicode',
                                  preprocessor=combine_words,
                                  ngram_range=(1, 4),
                                  stop_words=None)),
        ]).fit(traintest_lem)

CPU times: user 7.25 s, sys: 67.9 ms, total: 7.32 s
Wall time: 7.32 s


In [19]:
%%time
dvec_all = FeatureUnion([
        ("ingrs", TfidfVectorizer(strip_accents='unicode',
                                  tokenizer=itself,
                                  preprocessor=itself)),
        ("words", TfidfVectorizer(strip_accents='unicode',
                                  preprocessor=combine_words,
                                  ngram_range=(1, 4),
                                  stop_words=None)),
        ]).fit(traintest_ingrs)

CPU times: user 7.14 s, sys: 108 ms, total: 7.25 s
Wall time: 7.24 s


In [20]:
import pickle

In [21]:
xgb2 = pickle.load(open('./model.pkl', 'rb'))

In [22]:
xgb2

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0.35, learning_rate=0.05,
       max_delta_step=0, max_depth=7, min_child_weight=1, missing=nan,
       n_estimators=888, n_jobs=6, nthread=None, num_class=20,
       objective='multi:softprob', random_state=0, reg_alpha=0.001,
       reg_lambda=1.0, scale_pos_weight=1, seed=None, silent=True,
       subsample=1.0)

In [23]:
svc3 =  LinearSVC(loss='hinge', C=10**0.1)

In [24]:
%%time
svc3 = svc3.fit(dvec_all.transform(train_ingrs), train_labels)

CPU times: user 15.3 s, sys: 112 ms, total: 15.4 s
Wall time: 15.4 s


#### Getting predictions for each classifier

In [69]:
test_ingrs = get_ingrs(test)
test_lem = lemmatize_ingrs(test_ingrs)
test_bag_lem = dvec_all_lem.transform(test_lem)
test_bag_lem.shape

In [77]:
test_bag = dvec_all.transform(test_ingrs)
test_bag.shape

In [79]:
rf_pred = rf1.predict(test_bag_lem)

[Parallel(n_jobs=8)]: Done   1 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done   3 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done   4 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done   5 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done   6 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done   7 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done   8 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done  11 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done  12 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done  13 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done  14 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done  15 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Do

[Parallel(n_jobs=8)]: Done 157 tasks      | elapsed:    1.5s
[Parallel(n_jobs=8)]: Done 158 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 159 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 160 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 161 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 162 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 163 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 164 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 165 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 166 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 167 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 168 tasks      | elapsed:    1.6s
[Parallel(n_jobs=8)]: Done 169 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done 170 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done 171 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done 172 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Do

[Parallel(n_jobs=8)]: Done 306 tasks      | elapsed:    3.0s
[Parallel(n_jobs=8)]: Done 307 tasks      | elapsed:    3.0s
[Parallel(n_jobs=8)]: Done 308 tasks      | elapsed:    3.0s
[Parallel(n_jobs=8)]: Done 309 tasks      | elapsed:    3.0s
[Parallel(n_jobs=8)]: Done 310 tasks      | elapsed:    3.0s
[Parallel(n_jobs=8)]: Done 311 tasks      | elapsed:    3.1s
[Parallel(n_jobs=8)]: Done 312 tasks      | elapsed:    3.1s
[Parallel(n_jobs=8)]: Done 313 tasks      | elapsed:    3.1s
[Parallel(n_jobs=8)]: Done 314 tasks      | elapsed:    3.1s
[Parallel(n_jobs=8)]: Done 315 tasks      | elapsed:    3.1s
[Parallel(n_jobs=8)]: Done 316 tasks      | elapsed:    3.1s
[Parallel(n_jobs=8)]: Done 317 tasks      | elapsed:    3.1s
[Parallel(n_jobs=8)]: Done 318 tasks      | elapsed:    3.1s
[Parallel(n_jobs=8)]: Done 319 tasks      | elapsed:    3.1s
[Parallel(n_jobs=8)]: Done 320 tasks      | elapsed:    3.1s
[Parallel(n_jobs=8)]: Done 321 tasks      | elapsed:    3.1s
[Parallel(n_jobs=8)]: Do

In [111]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

train_labels = [x['cuisine'] for x in train]
train_labels_ = le.fit_transform(train_labels0)

le.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'],
      dtype='<U12')

In [86]:
%%time
xg_pred = le.inverse_transform(xgb2.predict(test_bag))

CPU times: user 49 s, sys: 7.98 ms, total: 49 s
Wall time: 6.71 s


In [81]:
%%time
svc_pred = svc3.predict(test_bag)

CPU times: user 39.8 ms, sys: 0 ns, total: 39.8 ms
Wall time: 38.5 ms


In [83]:
rf_pred[:10]

array(['southern_us', 'southern_us', 'italian', 'cajun_creole', 'italian',
       'southern_us', 'southern_us', 'chinese', 'mexican', 'southern_us'],
      dtype='<U12')

In [87]:
xg_pred[:10]

array(['irish', 'southern_us', 'italian', 'cajun_creole', 'italian',
       'southern_us', 'italian', 'chinese', 'mexican', 'british'],
      dtype='<U12')

In [82]:
svc_pred[:10]

array(['irish', 'southern_us', 'italian', 'cajun_creole', 'italian',
       'southern_us', 'spanish', 'chinese', 'mexican', 'british'],
      dtype='<U12')

#### Identifying items where SVC is outvoted

In [128]:
doubt = np.intersect1d(np.where(svc_pred != rf_pred)[0], np.where(rf_pred == xg_pred)[0])

In [144]:
doubt.shape

(686,)

In [129]:
svc_pred[doubt]

array(['vietnamese', 'italian', 'jamaican', 'irish', 'greek',
       'southern_us', 'korean', 'southern_us', 'mexican', 'southern_us',
       'italian', 'mexican', 'mexican', 'greek', 'thai', 'british',
       'korean', 'french', 'french', 'russian', 'mexican', 'indian',
       'indian', 'korean', 'british', 'italian', 'greek', 'chinese',
       'italian', 'british', 'southern_us', 'french', 'chinese', 'french',
       'cajun_creole', 'british', 'italian', 'korean', 'british', 'french',
       'moroccan', 'vietnamese', 'british', 'filipino', 'greek', 'french',
       'russian', 'spanish', 'vietnamese', 'spanish', 'italian', 'french',
       'french', 'british', 'italian', 'jamaican', 'french', 'korean',
       'french', 'japanese', 'italian', 'southern_us', 'british', 'thai',
       'greek', 'french', 'french', 'japanese', 'indian', 'italian',
       'french', 'indian', 'vietnamese', 'french', 'italian', 'japanese',
       'japanese', 'french', 'spanish', 'french', 'greek', 'mexican',


In [130]:
rf_pred[doubt]

array(['thai', 'southern_us', 'cajun_creole', 'italian', 'southern_us',
       'french', 'japanese', 'mexican', 'italian', 'french', 'french',
       'cajun_creole', 'southern_us', 'southern_us', 'vietnamese', 'irish',
       'chinese', 'southern_us', 'italian', 'french', 'italian', 'mexican',
       'mexican', 'chinese', 'italian', 'cajun_creole', 'italian',
       'italian', 'mexican', 'southern_us', 'mexican', 'italian', 'thai',
       'southern_us', 'southern_us', 'southern_us', 'french', 'chinese',
       'southern_us', 'italian', 'indian', 'mexican', 'southern_us',
       'vietnamese', 'italian', 'japanese', 'french', 'mexican', 'thai',
       'mexican', 'southern_us', 'italian', 'southern_us', 'french',
       'indian', 'thai', 'british', 'chinese', 'italian', 'chinese',
       'mexican', 'thai', 'italian', 'italian', 'southern_us', 'italian',
       'italian', 'italian', 'italian', 'mexican', 'italian', 'greek',
       'thai', 'italian', 'french', 'southern_us', 'chinese',
    

In [131]:
xg_pred[doubt]

array(['thai', 'southern_us', 'cajun_creole', 'italian', 'southern_us',
       'french', 'japanese', 'mexican', 'italian', 'french', 'french',
       'cajun_creole', 'southern_us', 'southern_us', 'vietnamese', 'irish',
       'chinese', 'southern_us', 'italian', 'french', 'italian', 'mexican',
       'mexican', 'chinese', 'italian', 'cajun_creole', 'italian',
       'italian', 'mexican', 'southern_us', 'mexican', 'italian', 'thai',
       'southern_us', 'southern_us', 'southern_us', 'french', 'chinese',
       'southern_us', 'italian', 'indian', 'mexican', 'southern_us',
       'vietnamese', 'italian', 'japanese', 'french', 'mexican', 'thai',
       'mexican', 'southern_us', 'italian', 'southern_us', 'french',
       'indian', 'thai', 'british', 'chinese', 'italian', 'chinese',
       'mexican', 'thai', 'italian', 'italian', 'southern_us', 'italian',
       'italian', 'italian', 'italian', 'mexican', 'italian', 'greek',
       'thai', 'italian', 'french', 'southern_us', 'chinese',
    

### Updating prediction
Replacing outvoted items in SVC prediction

In [132]:
new_pred = svc_pred.copy()

In [138]:
new_pred[doubt] = xg_pred[doubt]

In [140]:
svc_pred[:50]

array(['irish', 'southern_us', 'italian', 'cajun_creole', 'italian',
       'southern_us', 'spanish', 'chinese', 'mexican', 'british',
       'italian', 'greek', 'indian', 'italian', 'british', 'french',
       'mexican', 'southern_us', 'mexican', 'southern_us', 'japanese',
       'indian', 'irish', 'vietnamese', 'italian', 'southern_us',
       'vietnamese', 'korean', 'italian', 'italian', 'mexican', 'thai',
       'mexican', 'japanese', 'chinese', 'mexican', 'russian', 'indian',
       'indian', 'cajun_creole', 'cajun_creole', 'chinese', 'french',
       'mexican', 'italian', 'italian', 'spanish', 'indian', 'vietnamese',
       'chinese'],
      dtype='<U12')

In [139]:
new_pred[:50]

array(['irish', 'southern_us', 'italian', 'cajun_creole', 'italian',
       'southern_us', 'spanish', 'chinese', 'mexican', 'british',
       'italian', 'greek', 'indian', 'italian', 'british', 'french',
       'mexican', 'southern_us', 'mexican', 'southern_us', 'japanese',
       'indian', 'irish', 'vietnamese', 'italian', 'southern_us', 'thai',
       'korean', 'italian', 'southern_us', 'mexican', 'thai', 'mexican',
       'japanese', 'chinese', 'mexican', 'russian', 'indian', 'indian',
       'cajun_creole', 'cajun_creole', 'chinese', 'french', 'mexican',
       'italian', 'italian', 'spanish', 'indian', 'vietnamese', 'chinese'],
      dtype='<U12')

In [141]:
new_pred.shape

(9944,)

In [142]:
test_ids = [r['id'] for r in test]

In [143]:
df_test = pd.DataFrame([test_ids, list(new_pred)]).transpose()
df_test.columns = ['id', 'cuisine']

In [145]:
df_test.to_csv('../_data/180415_rf1000_xg_svc_ensemble.csv', index=False)

Accuracy: 0.79967  
Rank 217/1388
![Kaggle result](../_images/180416_ensemble.png)
![Kaggle result](../_images/180416_ensemble_standing.png)