In [1]:
import json
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC, LinearSVC
from sklearn.multiclass import OneVsRestClassifier

In [2]:
# Load the raw recipe records. Context managers close the file handles as soon
# as parsing finishes; JSON is read as UTF-8 regardless of the platform locale.
with open('../_data/train.json', encoding='utf-8') as train_file:
    train = json.load(train_file)
with open('../_data/test.json', encoding='utf-8') as test_file:
    test = json.load(test_file)

In [3]:
# Flatten every recipe into one lower-cased, space-separated ingredient string
# and collect the training targets alongside.
train_as_text = [
    ' '.join(recipe['ingredients']).lower()
    for recipe in train
]
train_cuisine = [recipe['cuisine'] for recipe in train]

test_as_text = [
    ' '.join(recipe['ingredients']).lower()
    for recipe in test
]

In [4]:
# Display one flattened training recipe as a spot check.
train_as_text[283]

'bertolli® classico olive oil boneless skinless chicken breast halves eggs linguine chicken broth bacon, crisp-cooked and crumbled bertolli vineyard premium collect marinara with burgundi wine sauc bread crumb fresh shredded mozzarella cheese'

In [5]:
from time import strftime

def print_time():
    """Print the current local time as ``yymmdd-HHMMSS``; returns nothing."""
    stamp = strftime('%y%m%d-%H%M%S')
    print(stamp)

In [6]:
from sklearn.pipeline import FeatureUnion

In [7]:
def itself(x):
    """Identity function: return the argument unchanged.

    Used in this notebook as the ``tokenizer`` and ``preprocessor`` of a
    TfidfVectorizer so that an already-split list of ingredients passes
    through untouched. Defined with ``def`` (not a lambda) so it has a name.
    """
    return x

In [8]:
import re

In [9]:
# Deleted outright: straight and curly apostrophes, a parenthesised group that
# contains "oz" (e.g. "(14 oz.)"), and any remaining parentheses.
# The "oz" group is confined to a single pair of parentheses via [^()]*, so
# text lying between two separate groups is no longer swallowed.
# NOTE(review): "oz" is matched as a substring, so "(frozen)" is dropped too —
# confirm that is intended.
SPEC_REMOVE = re.compile(r'(\'|\’|\([^()]*oz[^()]*\)|(\()|(\)))')
# "&" is spelled out as the word "and".
SPEC_AND = re.compile(r'\&')
# Every character that is not a word character, whitespace or "%" becomes a space.
SPEC_ELSE = re.compile(r'[^\w\s\%_]')

def clean_ingr(ingr):
    """Normalise one raw ingredient string.

    Steps, in order: delete apostrophes / "(… oz …)" groups / parentheses,
    replace "&" with "and", turn remaining punctuation into spaces, then
    collapse runs of whitespace and trim the ends. Case is left unchanged.

    Args:
        ingr: the ingredient text (str).

    Returns:
        The cleaned string; empty if nothing is left.
    """
    ingr = SPEC_REMOVE.sub('', ingr)
    ingr = SPEC_AND.sub('and', ingr)
    ingr = SPEC_ELSE.sub(' ', ingr)
    # split()/join collapses every whitespace run to a single space.
    return ' '.join(ingr.split())

In [10]:
def get_ingrs(given):
    """Return, for each recipe in ``given``, its list of cleaned ingredients.

    Every entry of a recipe's ``'ingredients'`` list is passed through
    ``clean_ingr`` and then lower-cased; recipe order and ingredient order
    are preserved.
    """
    cleaned = []
    for recipe in given:
        cleaned.append([clean_ingr(raw).lower() for raw in recipe['ingredients']])
    return cleaned

def get_labels(given):
    """Return the ``'cuisine'`` value of every recipe in ``given``, in order."""
    labels = []
    for recipe in given:
        labels.append(recipe['cuisine'])
    return labels

In [11]:
%%time
# Timestamp before and after cleaning so the wall-clock cost shows in the output.
print_time()
train_ingrs = get_ingrs(train)  # one list of cleaned, lower-cased ingredient strings per recipe
print_time()

180412-150910
180412-150912
CPU times: user 1.84 s, sys: 7.15 ms, total: 1.85 s
Wall time: 1.85 s


In [12]:
def combine_words(ilist):
    """Join an iterable of strings into one space-separated string.

    Used as the ``preprocessor`` of the word-level TfidfVectorizer so a
    recipe's ingredient list becomes a single document.
    """
    separator = ' '
    return separator.join(ilist)

In [37]:
%%time
# Two TF-IDF views of each recipe, concatenated column-wise:
#   "ingrs" - identity tokenizer/preprocessor, so every whole cleaned
#             ingredient string in the recipe's list is one token;
#   "words" - the list is joined into a single string (combine_words) and
#             word 1- to 4-grams are extracted from it.
# The vocabulary and IDF weights are fitted on train and test recipes together.
# NOTE(review): the feature names printed further down still contain accented
# characters (e.g. 'épices'), so strip_accents appears to have no effect when a
# custom preprocessor is supplied - confirm against the scikit-learn docs.
dvec_all = FeatureUnion([
        ("ingrs", TfidfVectorizer(strip_accents='unicode',
                                  tokenizer=itself,
                                  preprocessor=itself)),
        ("words", TfidfVectorizer(strip_accents='unicode',
                                  preprocessor=combine_words,
                                  analyzer='word',
                                  ngram_range=(1, 4),
                                  #stop_words='english',
                                  )),
        ]).fit(get_ingrs(train+test))

CPU times: user 9.59 s, sys: 72 ms, total: 9.66 s
Wall time: 9.66 s


In [38]:
%%time
# Vectorise the already-cleaned training recipes with the fitted union.
train_mat = dvec_all.transform(train_ingrs)
train_mat

CPU times: user 4.49 s, sys: 32 ms, total: 4.52 s
Wall time: 4.52 s


In [39]:
import xgboost as xgb
from xgboost import XGBClassifier

In [40]:
# (number of training recipes, number of features from both vectorizers)
train_mat.shape

(39774, 907529)

In [41]:
# Feature names of the fitted union, as a NumPy string array.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when the installed version provides it and fall
# back to the old method otherwise.
if hasattr(dvec_all, 'get_feature_names_out'):
    # list(...) makes NumPy infer a unicode dtype, matching the old method's result.
    feats = np.array(list(dvec_all.get_feature_names_out()))
else:
    feats = np.array(dvec_all.get_feature_names())

In [42]:
# Peek at the feature names; the "ingrs__"/"words__" prefix shows which vectorizer produced each one.
feats

array(['ingrs__1% low fat buttermilk', 'ingrs__1% low fat chocolate milk',
       'ingrs__1% low fat cottage cheese', ..., 'words__épices shallots',
       'words__épices shallots muscovy',
       'words__épices shallots muscovy thyme'],
      dtype='<U76')

In [43]:
from sklearn.preprocessing import LabelEncoder
# Encoder used to map cuisine names to integer class ids, and back again for the submission.
le = LabelEncoder()

In [44]:
# Raw cuisine names of the training recipes, then their integer encoding.
train_labels0 = get_labels(train)
train_labels = le.fit_transform(train_labels0)

In [45]:
# Cuisine names known to the encoder; the position in this array is the integer class id.
le.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'],
      dtype='<U12')

In [46]:
# Integer-encoded training targets.
train_labels

array([ 6, 16,  4, ...,  8,  3, 13])

In [47]:
# Gradient-boosted tree classifier trained on the TF-IDF feature matrix.
# num_class comes from the fitted label encoder instead of a hard-coded 20, so
# it stays correct if the label set changes.
# random_state is stated explicitly because colsample_bytree < 1 samples
# columns per tree.
# NOTE(review): 0 is presumably the library's default seed, so results should
# not change - confirm for the installed xgboost version.
xgb1 = XGBClassifier(
    learning_rate=0.05,
    n_estimators=888,
    max_depth=7,
    min_child_weight=1,
    gamma=0.35,
    subsample=1.0,
    colsample_bytree=0.5,
    reg_alpha=0.001,
    reg_lambda=1.0,
    objective='multi:softmax',
    scale_pos_weight=1,
    num_class=len(le.classes_),
    random_state=0,
    n_jobs=6)

In [48]:
%%time
# Train the classifier; timestamps bracket the run (about 74 minutes wall time in the recorded output).
print_time()
# NOTE(review): no eval_set is passed and the recorded output shows no per-round
# log, so verbose=50 presumably has no effect here - confirm against the xgboost docs.
xgb1.fit(train_mat, train_labels, verbose=50)
print_time()

180412-154232
180412-165624
CPU times: user 6h 44min 9s, sys: 15.4 s, total: 6h 44min 25s
Wall time: 1h 13min 52s


In [49]:
%%time
# Clean the test recipes with the same routine used for the training set.
print_time()
test_ingrs = get_ingrs(test)
print_time()

180412-170411
180412-170411
CPU times: user 498 ms, sys: 4 ms, total: 502 ms
Wall time: 500 ms


In [50]:
%%time
# Vectorise the cleaned test recipes with the same fitted union used for training.
test_mat = dvec_all.transform(test_ingrs)
test_mat

CPU times: user 1.01 s, sys: 20 ms, total: 1.03 s
Wall time: 1.03 s


In [51]:
# Predict an encoded cuisine id for every test recipe.
test_pred = xgb1.predict(test_mat)

In [52]:
# Recipe ids in the same order as the rows of test_mat.
test_ids = [x['id'] for x in test]

In [53]:
# Map the predicted integer class ids back to cuisine names.
test_cuisine = le.inverse_transform(test_pred)

In [54]:
# Submission table; columns= fixes the column order to id, cuisine.
df_pred = pd.DataFrame({'id': test_ids, 'cuisine': test_cuisine}, columns=['id', 'cuisine'])

In [55]:
# Write the submission file; index=False leaves out the DataFrame row index.
df_pred.to_csv('../_data/180412_xgb_ingr_4gramword.csv', index=False)

In [57]:
import pickle

In [61]:
%%time
# Persist the fitted classifier. The context manager flushes and closes the
# file even if pickling raises.
with open('./xgb_ingr_4gram.pkl', 'wb') as model_file:
    pickle.dump(xgb1, model_file)

CPU times: user 269 ms, sys: 36 ms, total: 305 ms
Wall time: 485 ms


In [59]:
%%time
# Reload the classifier from the pickle this notebook writes
# ('./xgb_ingr_4gram.pkl'), so a fresh Restart & Run All finds the file; the
# previous path './model.pkl' is not produced anywhere in the notebook.
# Security: pickle.load can execute arbitrary code - only load files you created.
with open('./xgb_ingr_4gram.pkl', 'rb') as model_file:
    xgb2 = pickle.load(model_file)

CPU times: user 90.2 ms, sys: 8 ms, total: 98.2 ms
Wall time: 96.6 ms


In [60]:
%%time
# Re-run prediction with the unpickled model.
# NOTE(review): this rebinds test_pred, replacing the predictions made earlier with xgb1.
test_pred = xgb2.predict(test_mat)

CPU times: user 37.8 s, sys: 60 ms, total: 37.9 s
Wall time: 6.72 s
