In [1]:
import json
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC, LinearSVC
from sklearn.multiclass import OneVsRestClassifier

In [2]:
# Load the raw recipe lists.  Context managers close each file handle as soon
# as it has been parsed instead of leaving that to the garbage collector, and
# the encoding is pinned so non-ASCII ingredient names decode the same way on
# every platform.
with open('../_data/train.json', encoding='utf-8') as train_file:
    train = json.load(train_file)
with open('../_data/test.json', encoding='utf-8') as test_file:
    test = json.load(test_file)

In [3]:
# One lower-cased, space-joined ingredient string per recipe, plus the
# training cuisine labels in the same order.
train_as_text = [str.lower(' '.join(recipe['ingredients'])) for recipe in train]
train_cuisine = [recipe['cuisine'] for recipe in train]

test_as_text = [str.lower(' '.join(recipe['ingredients'])) for recipe in test]

In [4]:
# Spot-check the flattened text of a single training recipe.
train_as_text[283]

'bertolli® classico olive oil boneless skinless chicken breast halves eggs linguine chicken broth bacon, crisp-cooked and crumbled bertolli vineyard premium collect marinara with burgundi wine sauc bread crumb fresh shredded mozzarella cheese'

In [5]:
from time import strftime

def print_time():
    """Print the current local time as a compact ``yymmdd-HHMMSS`` stamp."""
    stamp = strftime('%y%m%d-%H%M%S')
    print(stamp)

In [6]:
from sklearn.pipeline import FeatureUnion

In [7]:
def itself(x):
    """Identity function: hand back ``x`` exactly as received.

    This notebook passes it as both ``tokenizer`` and ``preprocessor`` to a
    ``TfidfVectorizer`` so that already-tokenised input is left untouched.
    """
    return x

In [8]:
import re

In [9]:
# Fragments deleted outright: straight and curly apostrophes, parenthesised
# notes that mention "oz" (e.g. "(14 oz.)"), and any remaining lone
# parentheses.  The "oz" alternative is confined to ONE pair of parentheses
# via [^()]* -- a greedy ".*" would run from the first "(" to the last ")"
# and delete everything between two separate parenthesised groups.
SPEC_REMOVE = re.compile(r"'|’|\([^()]*oz[^()]*\)|[()]")
# Ampersands are spelled out as the word "and".
SPEC_AND = re.compile(r'&')
# Anything that is not a word character (which already includes "_"),
# whitespace or "%" becomes a space.
SPEC_ELSE = re.compile(r'[^\w\s%]')


def clean_ingr(ingr):
    """Normalise one raw ingredient string.

    Steps, in order:
      1. drop apostrophes, "(... oz ...)" notes and stray parentheses;
      2. replace "&" with "and";
      3. turn every other character that is not a word character, whitespace
         or "%" into a space;
      4. collapse runs of whitespace and trim both ends.

    Case is left as-is; callers lower-case the result themselves.

    Parameters
    ----------
    ingr : str
        Raw ingredient text.

    Returns
    -------
    str
        The cleaned ingredient text (possibly empty).
    """
    ingr = SPEC_REMOVE.sub('', ingr)
    ingr = SPEC_AND.sub('and', ingr)
    ingr = SPEC_ELSE.sub(' ', ingr)
    return ' '.join(ingr.split())

In [10]:
def get_ingrs(given):
    """Return one list of cleaned, lower-cased ingredient strings per recipe.

    ``given`` is an iterable of recipe dicts with an ``'ingredients'`` list;
    the output keeps the recipe order and the ingredient order.
    """
    cleaned_recipes = []
    for recipe in given:
        cleaned = [clean_ingr(raw).lower() for raw in recipe['ingredients']]
        cleaned_recipes.append(cleaned)
    return cleaned_recipes

def get_labels(given):
    """Return the ``'cuisine'`` value of every recipe in ``given``, in order."""
    labels = []
    for recipe in given:
        labels.append(recipe['cuisine'])
    return labels

In [11]:
%%time
# Clean the training recipes once up front.  The print_time() calls bracket
# the work with timestamps, in addition to the %%time report for the cell.
print_time()
train_ingrs = get_ingrs(train)
print_time()

180407-020839
180407-020841
CPU times: user 1.82 s, sys: 0 ns, total: 1.82 s
Wall time: 1.82 s


In [12]:
def combine_words(ilist):
    """Concatenate the strings in ``ilist`` into one string, separated by single spaces."""
    separator = ' '
    return separator.join(ilist)

In [13]:
%%time
# Build two TF-IDF views of every recipe and concatenate them with FeatureUnion:
#   "ingrs" - tokenizer and preprocessor are both the identity function, so
#             each whole cleaned ingredient string is one token;
#   "words" - the ingredient list is first joined into a single string and
#             then split by TfidfVectorizer's default tokenizer.
# The union is fitted on the train AND test recipes together, so the
# vocabulary and IDF weights also reflect the test set.
# NOTE(review): scikit-learn documents a callable `preprocessor` as replacing
# the built-in strip_accents/lowercase stage, so strip_accents='unicode' may
# have no effect in either vectorizer - confirm.
dvec_all = FeatureUnion([
        ("ingrs", TfidfVectorizer(strip_accents='unicode',
                                  tokenizer=itself,
                                  preprocessor=itself)),
        ("words", TfidfVectorizer(strip_accents='unicode',
                                  preprocessor=combine_words,
                                  #stop_words='english',
                                  )),
        ]).fit(get_ingrs(train+test))

CPU times: user 3.34 s, sys: 13.2 ms, total: 3.36 s
Wall time: 3.36 s


In [14]:
%%time
# Vectorize the cleaned training recipes with the fitted union; the bare name
# on the last line displays the resulting matrix.
train_mat = dvec_all.transform(train_ingrs)
train_mat

CPU times: user 1.01 s, sys: 7.65 ms, total: 1.02 s
Wall time: 1.02 s


In [15]:
import xgboost as xgb
from xgboost import XGBClassifier

In [42]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50,
             target='Disbursed', metric='auc'):
    """Optionally pick ``n_estimators`` by cross-validation, then fit ``alg`` and print its training accuracy.

    NOTE: this notebook defines ``modelfit`` in two cells; whichever cell was
    executed last is the definition in effect.

    Parameters
    ----------
    alg : estimator
        Must expose ``get_xgb_params``, ``get_params``, ``set_params``,
        ``fit`` and ``predict`` (e.g. ``XGBClassifier``).
    dtrain : column-indexable table (e.g. pandas DataFrame)
        Holds both the predictor columns and the label column.
    predictors : list
        Names of the feature columns in ``dtrain``.
    useTrainCV : bool, default True
        When true, run ``xgb.cv`` with early stopping and set the estimator's
        ``n_estimators`` to the number of rounds the CV result retains.
    cv_folds : int, default 5
        Number of CV folds.
    early_stopping_rounds : int, default 50
        Passed straight to ``xgb.cv``.
    target : str, default 'Disbursed'
        Name of the label column.  The default is the column name the
        previous version hard-coded in the fit/report steps; it also replaces
        an undefined global ``target`` that version read in the CV step.
    metric : str, default 'auc'
        Evaluation metric handed to ``xgb.cv``.  The default keeps the
        previous hard-coded value; the CV cell elsewhere in this notebook
        uses ``'merror'`` for the multi-class problem.

    Returns
    -------
    None
        Results are printed, and ``alg`` is fitted (and possibly
        re-parameterised) in place.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values,
                              label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics=metric,
                          early_stopping_rounds=early_stopping_rounds,
                          # verbose_eval is the spelling this notebook's own
                          # xgb.cv call uses; show_progress was the old name.
                          verbose_eval=False)
        # One row per retained boosting round.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit on the full training table.  No eval_metric is passed: there is no
    # eval_set for it to apply to, and 'accuracy' was not an XGBoost metric
    # name in any case.
    alg.fit(dtrain[predictors], dtrain[target])

    # Predict the training set.
    dtrain_predictions = alg.predict(dtrain[predictors])

    # Training accuracy = share of exact label matches (computed with numpy,
    # so no additional metrics module has to be in scope).
    accuracy = np.mean(np.asarray(dtrain_predictions) == np.asarray(dtrain[target].values))

    # Print model report.
    print("\nModel Report")
    print("Accuracy : %.4g" % accuracy)


In [17]:
# Dimensions of the vectorized training matrix.
train_mat.shape

(39774, 10228)

In [35]:
# Feature vector of recipe 283 as a dense 1-D array.  Slice the sparse matrix
# first and densify only that one row: calling .todense() on the whole matrix
# would materialise every row just to keep a single one.
t0 = train_mat[283].toarray().ravel()

In [36]:
# Confirm the extracted row is one-dimensional.
t0.shape

(10228,)

In [37]:
# Column positions of the non-zero entries in recipe 283's feature vector.
t0ing = np.where(t0 != 0)[0]
# Feature names as a numpy array so they can be indexed with t0ing.
# NOTE(review): newer scikit-learn releases expose get_feature_names_out()
# instead of get_feature_names() - confirm against the installed version.
feats = np.array(dvec_all.get_feature_names())

In [38]:
# Raw ingredient strings of the same recipe, sorted for side-by-side
# comparison with its non-zero feature names.
sorted(train[283]['ingredients'])

['Bertolli® Classico Olive Oil',
 'bacon, crisp-cooked and crumbled',
 'bertolli vineyard premium collect marinara with burgundi wine sauc',
 'boneless skinless chicken breast halves',
 'bread crumb fresh',
 'chicken broth',
 'eggs',
 'linguine',
 'shredded mozzarella cheese']

In [39]:
# Names of the features that are non-zero for recipe 283.
feats[t0ing]

array(['ingrs__bacon crisp cooked and crumbled',
       'ingrs__bertolli classico olive oil',
       'ingrs__bertolli vineyard premium collect marinara with burgundi wine sauc',
       'ingrs__boneless skinless chicken breast halves',
       'ingrs__bread crumb fresh', 'ingrs__chicken broth', 'ingrs__eggs',
       'ingrs__linguine', 'ingrs__shredded mozzarella cheese',
       'words__and', 'words__bacon', 'words__bertolli', 'words__boneless',
       'words__bread', 'words__breast', 'words__broth', 'words__burgundi',
       'words__cheese', 'words__chicken', 'words__classico',
       'words__collect', 'words__cooked', 'words__crisp', 'words__crumb',
       'words__crumbled', 'words__eggs', 'words__fresh', 'words__halves',
       'words__linguine', 'words__marinara', 'words__mozzarella',
       'words__oil', 'words__olive', 'words__premium', 'words__sauc',
       'words__shredded', 'words__skinless', 'words__vineyard',
       'words__wine', 'words__with'],
      dtype='<U76')

In [40]:
# Complete list of feature names produced by the union (very long output).
dvec_all.get_feature_names()

['ingrs__1% low fat buttermilk',
 'ingrs__1% low fat chocolate milk',
 'ingrs__1% low fat cottage cheese',
 'ingrs__1% low fat milk',
 'ingrs__2 1 2 to 3 lb chicken cut into serving pieces',
 'ingrs__2% low fat cheddar chees',
 'ingrs__2% low fat cottage cheese',
 'ingrs__2% lowfat greek yogurt',
 'ingrs__2% milk shredded mozzarella cheese',
 'ingrs__2% reduced fat chocolate milk',
 'ingrs__2% reduced fat milk',
 'ingrs__25% less sodium chicken broth',
 'ingrs__33% less sodium cooked deli ham',
 'ingrs__33% less sodium cooked ham',
 'ingrs__33% less sodium ham',
 'ingrs__33% less sodium smoked fully cooked ham',
 'ingrs__33% less sodium smoked ham',
 'ingrs__40% less sodium taco seasoning',
 'ingrs__40% less sodium taco seasoning mix',
 'ingrs__50% less sodium black beans',
 'ingrs__7 up',
 'ingrs__8 ounc ziti pasta cook and drain',
 'ingrs__95% lean ground beef',
 'ingrs__a taste of thai rice noodles',
 'ingrs__abalone',
 'ingrs__abbamele',
 'ingrs__absinthe',
 'ingrs__abura age',
 'i

In [41]:
# Baseline XGBoost classifier configured for the 20-class cuisine problem.
# n_estimators=1000 is used further down as the upper bound on boosting
# rounds for xgb.cv, which stops early.
# NOTE: the same configuration is built again in a later cell.
# NOTE(review): scale_pos_weight is normally a binary-classification knob;
# presumably it has no effect with 'multi:softmax' - confirm.
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    scale_pos_weight=1,
    num_class=20,
    seed=27)

In [None]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50,
             target='Disbursed', metric='auc'):
    """Optionally pick ``n_estimators`` by cross-validation, then fit ``alg`` and print its training accuracy.

    NOTE: this notebook defines ``modelfit`` in two cells; whichever cell was
    executed last is the definition in effect.

    Parameters
    ----------
    alg : estimator
        Must expose ``get_xgb_params``, ``get_params``, ``set_params``,
        ``fit`` and ``predict`` (e.g. ``XGBClassifier``).
    dtrain : column-indexable table (e.g. pandas DataFrame)
        Holds both the predictor columns and the label column.
    predictors : list
        Names of the feature columns in ``dtrain``.
    useTrainCV : bool, default True
        When true, run ``xgb.cv`` with early stopping and set the estimator's
        ``n_estimators`` to the number of rounds the CV result retains.
    cv_folds : int, default 5
        Number of CV folds.
    early_stopping_rounds : int, default 50
        Passed straight to ``xgb.cv``.
    target : str, default 'Disbursed'
        Name of the label column.  The default is the column name the
        previous version hard-coded in the fit/report steps; it also replaces
        an undefined global ``target`` that version read in the CV step.
    metric : str, default 'auc'
        Evaluation metric handed to ``xgb.cv``.  The default keeps the
        previous hard-coded value; the CV cell elsewhere in this notebook
        uses ``'merror'`` for the multi-class problem.

    Returns
    -------
    None
        Results are printed, and ``alg`` is fitted (and possibly
        re-parameterised) in place.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values,
                              label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics=metric,
                          early_stopping_rounds=early_stopping_rounds,
                          # verbose_eval is the spelling this notebook's own
                          # xgb.cv call uses; show_progress was the old name.
                          verbose_eval=False)
        # One row per retained boosting round.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit on the full training table.  No eval_metric is passed: there is no
    # eval_set for it to apply to, and 'accuracy' was not an XGBoost metric
    # name in any case.
    alg.fit(dtrain[predictors], dtrain[target])

    # Predict the training set.
    dtrain_predictions = alg.predict(dtrain[predictors])

    # Training accuracy = share of exact label matches (computed with numpy,
    # so no additional metrics module has to be in scope).
    accuracy = np.mean(np.asarray(dtrain_predictions) == np.asarray(dtrain[target].values))

    # Print model report.
    print("\nModel Report")
    print("Accuracy : %.4g" % accuracy)


In [43]:
# Booster-level parameters of xgb1 as a plain dict.
xgb_param = xgb1.get_xgb_params()

In [44]:
# Inspect the extracted parameter dict.
xgb_param

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bytree': 0.8,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 5,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 1000,
 'nthread': 1,
 'objective': 'multi:softmax',
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': 27,
 'silent': 1,
 'subsample': 0.8}

In [48]:
# LabelEncoder is already imported in the first cell; this import repeats it.
from sklearn.preprocessing import LabelEncoder
# Encoder that maps cuisine names to integer class codes.
le = LabelEncoder()

In [52]:
# Cuisine name of every training recipe, then the integer code of each name.
# Fitting here also fixes the code -> name mapping stored in le.classes_.
train_labels0 = get_labels(train)
train_labels = le.fit_transform(train_labels0)

In [51]:
# Cuisine names in encoder order; a name's position is its integer code.
le.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'],
      dtype='<U12')

In [50]:
# Integer-encoded training labels.
train_labels

array([ 6, 16,  4, ...,  8,  3, 13])

In [69]:
# Baseline XGBoost classifier configured for the 20-class cuisine problem.
# n_estimators=1000 is used below as the upper bound on boosting rounds for
# xgb.cv, which stops early.
# NOTE: this repeats the configuration already built in an earlier cell.
# NOTE(review): scale_pos_weight is normally a binary-classification knob;
# presumably it has no effect with 'multi:softmax' - confirm.
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    scale_pos_weight=1,
    num_class=20,
    seed=27)

In [70]:
# Booster-level parameters of xgb1 as a plain dict, passed to xgb.cv below.
xgb_param = xgb1.get_xgb_params()

In [71]:
# Wrap the sparse TF-IDF matrix and the integer-encoded labels in XGBoost's
# native DMatrix container, labelling the columns with the vectorizer's
# feature names.
xgtrain = xgb.DMatrix(data=train_mat,
                      label=train_labels,
                      feature_names=feats)

In [80]:
%%time
print_time()

# 3-fold cross-validation scored by multi-class error rate ('merror').
# Boosting runs for at most xgb1's n_estimators rounds, with early stopping
# set to 50 rounds; verbose_eval=True prints the per-round metrics.
cvresult = xgb.cv(xgb_param,
                  xgtrain,
                  num_boost_round=xgb1.get_params()['n_estimators'],
                  nfold=3,
                  metrics='merror',
                  early_stopping_rounds=50,
                  verbose_eval=True)

print_time()

180406-173215
[0]	train-merror:0.401996+0.00735423	test-merror:0.4249+0.00199022
[1]	train-merror:0.349626+0.00592512	test-merror:0.375748+0.0044444
[2]	train-merror:0.326256+0.00443734	test-merror:0.357344+0.00219483
[3]	train-merror:0.311347+0.00391534	test-merror:0.343365+0.00281701
[4]	train-merror:0.303779+0.00331625	test-merror:0.33708+0.00419314
[5]	train-merror:0.297682+0.00422495	test-merror:0.330894+0.00405201
[6]	train-merror:0.292716+0.00451367	test-merror:0.327701+0.00452633
[7]	train-merror:0.288228+0.0047356	test-merror:0.324433+0.00438258
[8]	train-merror:0.284771+0.00512682	test-merror:0.321869+0.00527425
[9]	train-merror:0.281314+0.0045686	test-merror:0.320435+0.00536342
[10]	train-merror:0.277028+0.00424043	test-merror:0.318349+0.00555399
[11]	train-merror:0.274287+0.00328526	test-merror:0.316262+0.00485837
[12]	train-merror:0.271622+0.00301118	test-merror:0.313773+0.00533575
[13]	train-merror:0.269045+0.0034705	test-merror:0.312641+0.00494532
[14]	train-merror:0.266

[118]	train-merror:0.125685+0.00152999	test-merror:0.237442+0.00504962
[119]	train-merror:0.12478+0.00128914	test-merror:0.237416+0.00505204
[120]	train-merror:0.124152+0.00115151	test-merror:0.236989+0.00495805
[121]	train-merror:0.123435+0.00120791	test-merror:0.236612+0.00502155
[122]	train-merror:0.122718+0.00140816	test-merror:0.235983+0.00535974
[123]	train-merror:0.121964+0.00150109	test-merror:0.236184+0.00508109
[124]	train-merror:0.121424+0.00152082	test-merror:0.235958+0.00505616
[125]	train-merror:0.120745+0.00138821	test-merror:0.235832+0.00486602
[126]	train-merror:0.12028+0.00136354	test-merror:0.235757+0.00484628
[127]	train-merror:0.119575+0.00132876	test-merror:0.235581+0.00498409
[128]	train-merror:0.118796+0.00126113	test-merror:0.235505+0.00508198
[129]	train-merror:0.11818+0.0013242	test-merror:0.235304+0.00511047
[130]	train-merror:0.117589+0.0013806	test-merror:0.23528+0.00509501
[131]	train-merror:0.117099+0.00120959	test-merror:0.234927+0.00503882
[132]	train-

[234]	train-merror:0.072497+0.00145221	test-merror:0.22296+0.0040733
[235]	train-merror:0.0723463+0.00138489	test-merror:0.222935+0.0038411
[236]	train-merror:0.0720947+0.00150221	test-merror:0.222708+0.00399226
[237]	train-merror:0.0717803+0.0015378	test-merror:0.222482+0.00382932
[238]	train-merror:0.0713783+0.00187578	test-merror:0.222507+0.00383148
[239]	train-merror:0.0710767+0.00174282	test-merror:0.222331+0.00370407
[240]	train-merror:0.0708503+0.00161099	test-merror:0.222155+0.00384227
[241]	train-merror:0.0706617+0.00147597	test-merror:0.222055+0.0036554
[242]	train-merror:0.070121+0.00140431	test-merror:0.221954+0.00367784
[243]	train-merror:0.06992+0.00149539	test-merror:0.221501+0.00349611
[244]	train-merror:0.069669+0.00151891	test-merror:0.221577+0.00379229
[245]	train-merror:0.0691407+0.00149787	test-merror:0.221552+0.00336739
[246]	train-merror:0.0687633+0.00130711	test-merror:0.221552+0.00329687
[247]	train-merror:0.0685877+0.00133829	test-merror:0.221527+0.00310878
[2

[349]	train-merror:0.0451927+0.00127524	test-merror:0.217378+0.00424561
[350]	train-merror:0.0450167+0.00123793	test-merror:0.217127+0.00427402
[351]	train-merror:0.04469+0.00130912	test-merror:0.216926+0.00390295
[352]	train-merror:0.044401+0.00124723	test-merror:0.216926+0.00400567
[353]	train-merror:0.044225+0.00117664	test-merror:0.216951+0.00418722
[354]	train-merror:0.0441243+0.00120582	test-merror:0.216976+0.00415147
[355]	train-merror:0.0440867+0.00126106	test-merror:0.2168+0.00402202
[356]	train-merror:0.043898+0.00132825	test-merror:0.2168+0.00402202
[357]	train-merror:0.0437977+0.00117018	test-merror:0.217001+0.00423264
[358]	train-merror:0.043609+0.00129526	test-merror:0.217051+0.00441291
[359]	train-merror:0.04337+0.00131087	test-merror:0.216951+0.00433962
[360]	train-merror:0.0431063+0.00140745	test-merror:0.216876+0.00434877
[361]	train-merror:0.0429677+0.00137836	test-merror:0.216498+0.00434157
[362]	train-merror:0.0428297+0.00139289	test-merror:0.216624+0.00437322
[363

[464]	train-merror:0.0277943+0.000886687	test-merror:0.214789+0.00364665
[465]	train-merror:0.027782+0.000844845	test-merror:0.214562+0.00355082
[466]	train-merror:0.0276183+0.000823074	test-merror:0.214562+0.00371373
[467]	train-merror:0.0274803+0.000759549	test-merror:0.214688+0.00367361
[468]	train-merror:0.0273923+0.000838445	test-merror:0.214487+0.00364682
[469]	train-merror:0.027166+0.000786436	test-merror:0.214612+0.00377081
[470]	train-merror:0.0270783+0.000675223	test-merror:0.214512+0.00379453
[471]	train-merror:0.026927+0.000747114	test-merror:0.214437+0.00412041
[472]	train-merror:0.0268263+0.0008853	test-merror:0.214487+0.00407469
[473]	train-merror:0.0266883+0.000852744	test-merror:0.21416+0.0041084
[474]	train-merror:0.0266383+0.000801366	test-merror:0.214235+0.00406358
[475]	train-merror:0.0265627+0.000646913	test-merror:0.214412+0.00406187
[476]	train-merror:0.026437+0.000594684	test-merror:0.214512+0.00403975
[477]	train-merror:0.026349+0.000561515	test-merror:0.21456

In [81]:
# Per-round table of mean/std train and test merror from the CV run.
cvresult

Unnamed: 0,test-merror-mean,test-merror-std,train-merror-mean,train-merror-std
0,0.424900,0.001990,0.401996,0.007354
1,0.375748,0.004444,0.349626,0.005925
2,0.357344,0.002195,0.326256,0.004437
3,0.343365,0.002817,0.311347,0.003915
4,0.337080,0.004193,0.303779,0.003316
5,0.330894,0.004052,0.297682,0.004225
6,0.327701,0.004526,0.292716,0.004514
7,0.324433,0.004383,0.288228,0.004736
8,0.321869,0.005274,0.284771,0.005127
9,0.320435,0.005363,0.281314,0.004569


In [82]:
# Number of rows (boosting rounds) in the CV result; used next as n_estimators.
cvresult.shape[0]

501

In [83]:
# Carry the CV-derived round count over to the sklearn-style estimator.
xgb1.set_params(n_estimators=cvresult.shape[0])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=501,
       n_jobs=1, nthread=None, num_class=20, objective='multi:softmax',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=27, silent=True, subsample=0.8)

In [85]:
from sklearn.model_selection import GridSearchCV

In [91]:
# First tuning pass: 3 x 3 grid over tree depth (3, 5, 7) and minimum child
# weight (1, 3, 5).
param_test1 = {
    'max_depth': range(3, 8, 2),
    'min_child_weight': range(1, 6, 2)
}
# The estimator's own max_depth / min_child_weight are placeholders; the grid
# supplies both for every candidate.  n_estimators is fixed at 200 for this
# search, and unlike xgb1 no seed is passed.
# NOTE(review): the `iid` argument is absent from newer scikit-learn releases
# - confirm against the installed version.
gsearch1 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=200,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    scale_pos_weight=1,
    num_class=20),
                        param_grid=param_test1,
                        scoring='accuracy',
                        n_jobs=-1,
                        iid=False,
                        cv=3,
                        verbose=50)

In [92]:
%%time
print_time()

# Run the first grid search on the full training matrix.
gsearch1.fit(train_mat, train_labels)

print_time()

180406-190658
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] max_depth=3, min_child_weight=1 .................................
[CV] max_depth=3, min_child_weight=1 .................................
[CV] max_depth=3, min_child_weight=1 .................................
[CV] max_depth=3, min_child_weight=3 .................................
[CV] max_depth=3, min_child_weight=3 .................................
[CV] max_depth=3, min_child_weight=3 .................................
[CV] max_depth=3, min_child_weight=5 .................................
[CV] max_depth=3, min_child_weight=5 .................................
[CV]  max_depth=3, min_child_weight=5, score=0.7540538502149483, total= 8.6min
[CV] max_depth=3, min_child_weight=5 .................................
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  9.0min
[CV]  max_depth=3, min_child_weight=5, score=0.7578979114830732, total= 8.7min
[CV] max_depth=5, min_child_weight=1 ..............................

In [93]:
# Best parameter combination found by the first grid search.
gsearch1.best_params_

{'max_depth': 7, 'min_child_weight': 3}

In [94]:
# Mean cross-validated accuracy of that best combination.
gsearch1.best_score_

0.77869955124110046

In [95]:
# Full per-candidate timing and score arrays from the first grid search.
gsearch1.cv_results_



{'mean_fit_time': array([  538.10237368,   517.99939839,   509.5840528 ,   823.01591134,
          783.21375354,   770.10929608,  1063.15835166,  1000.81456772,
          650.79184222]),
 'mean_score_time': array([ 13.66572698,  12.1548841 ,   8.37650228,  23.32987881,
         19.73705904,  13.35118564,  24.81856028,  20.10989674,   7.72592799]),
 'mean_test_score': array([ 0.76009449,  0.75934045,  0.7564238 ,  0.77578306,  0.77442554,
         0.77236391,  0.77764327,  0.77869955,  0.77565733]),
 'mean_train_score': array([ 0.83968164,  0.83026602,  0.8220445 ,  0.91543467,  0.89598736,
         0.88129182,  0.95904349,  0.93598838,  0.9185273 ]),
 'param_max_depth': masked_array(data = [3 3 3 5 5 5 7 7 7],
              mask = [False False False False False False False False False],
        fill_value = ?),
 'param_min_child_weight': masked_array(data = [1 3 5 1 3 5 1 3 5],
              mask = [False False False False False False False False False],
        fill_value = ?),
 'para

In [101]:
# Second tuning pass: search deeper trees (7, 8, 9, 10) with min_child_weight
# fixed at 3, the value the first search selected.
param_test2 = {
    'max_depth': range(7, 11, 1),
}
# The estimator's own max_depth is a placeholder; the grid supplies it for
# every candidate.  Only 2 folds are used here, versus 3 in the first search.
# NOTE(review): the `iid` argument is absent from newer scikit-learn releases
# - confirm against the installed version.
gsearch2 = GridSearchCV(estimator=XGBClassifier(
    learning_rate=0.1,
    n_estimators=200,
    max_depth=5,
    min_child_weight=3,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    scale_pos_weight=1,
    num_class=20),
                        param_grid=param_test2,
                        scoring='accuracy',
                        n_jobs=-1,
                        iid=False,
                        cv=2,
                        verbose=50)

In [102]:
%%time
print_time()

# Run the second grid search on the full training matrix.
gsearch2.fit(train_mat, train_labels)

print_time()

180406-201520
Fitting 2 folds for each of 4 candidates, totalling 8 fits
[CV] max_depth=7 .....................................................
[CV] max_depth=7 .....................................................
[CV] max_depth=8 .....................................................
[CV] max_depth=8 .....................................................
[CV] max_depth=9 .....................................................
[CV] max_depth=9 .....................................................
[CV] max_depth=10 ....................................................
[CV] max_depth=10 ....................................................
[CV] ............ max_depth=7, score=0.7643478698254615, total=12.2min
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 12.7min
[CV] ............. max_depth=7, score=0.768310460966169, total=12.4min
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed: 12.9min remaining: 38.8min
[CV] ............ max_depth=8, score=0.7701201427637863, total=13.7min
[Pa

In [104]:
# Best tree depth found by the second grid search.
gsearch2.best_params_

{'max_depth': 8}