In [1]:
import json
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC, LinearSVC
from sklearn.multiclass import OneVsRestClassifier

In [2]:
# Load the raw recipe records. The context managers guarantee both file
# handles are closed, and the encoding is pinned because ingredient names
# contain non-ASCII characters that a locale-dependent default could
# mis-decode (assumes the files are UTF-8, the JSON standard encoding).
with open('../_data/train.json', encoding='utf-8') as train_file:
    train = json.load(train_file)
with open('../_data/test.json', encoding='utf-8') as test_file:
    test = json.load(test_file)

In [3]:
# Flatten each recipe into a single lowercase string of its ingredients,
# and collect the cuisine label of every training recipe.
train_as_text = [
    ' '.join(recipe['ingredients']).lower()
    for recipe in train
]
train_cuisine = [
    recipe['cuisine']
    for recipe in train
]

test_as_text = [
    ' '.join(recipe['ingredients']).lower()
    for recipe in test
]

In [4]:
# Spot-check one flattened training recipe.
train_as_text[283]

'bertolli® classico olive oil boneless skinless chicken breast halves eggs linguine chicken broth bacon, crisp-cooked and crumbled bertolli vineyard premium collect marinara with burgundi wine sauc bread crumb fresh shredded mozzarella cheese'

In [5]:
from time import strftime

def print_time():
    """Print the current local time as a compact ``yymmdd-HHMMSS`` stamp."""
    stamp = strftime('%y%m%d-%H%M%S')
    print(stamp)

In [6]:
from sklearn.pipeline import FeatureUnion

In [7]:
def itself(x):
    """Identity function: return the argument unchanged.

    Passed to ``TfidfVectorizer`` as ``tokenizer``/``preprocessor`` so that a
    recipe's ingredient list is used as-is, one token per ingredient.
    """
    return x

In [8]:
import re

In [9]:
# Deleted outright: straight and curly apostrophes, a single parenthesised
# note containing "oz" (e.g. "(10 oz.)"), and any leftover parenthesis
# characters. The note pattern uses [^()]* instead of .* so one match can
# never stretch from the first "(" to the last ")" and swallow the text
# between separate parenthesised groups.
SPEC_REMOVE = re.compile(r'(\'|\’|\([^()]*oz[^()]*\)|\(|\))')
# "&" is spelled out as the word "and".
SPEC_AND = re.compile(r'\&')
# Anything that is not a word character, whitespace, "%" or "_" becomes a space.
SPEC_ELSE = re.compile(r'[^\w\s\%_]')


def clean_ingr(ingr):
    """Normalise one raw ingredient string.

    Steps, in order: drop apostrophes, "oz" notes in parentheses and stray
    parentheses; replace "&" with "and"; turn remaining punctuation into
    spaces; collapse runs of whitespace and trim the ends.

    Parameters
    ----------
    ingr : str
        Raw ingredient text.

    Returns
    -------
    str
        The cleaned ingredient; case is left untouched.
    """
    ingr = SPEC_REMOVE.sub('', ingr)
    ingr = SPEC_AND.sub('and', ingr)
    ingr = SPEC_ELSE.sub(' ', ingr)
    # split()/join collapses whitespace introduced by the substitutions.
    return ' '.join(ingr.split())

In [10]:
def get_ingrs(given):
    """Return, for every recipe, its list of cleaned, lowercased ingredients.

    Each recipe in ``given`` must expose an ``'ingredients'`` entry; every
    ingredient is passed through ``clean_ingr`` and then lowercased.
    """
    cleaned_recipes = []
    for recipe in given:
        cleaned_recipes.append(
            [clean_ingr(raw).lower() for raw in recipe['ingredients']]
        )
    return cleaned_recipes

def get_labels(given):
    """Return the ``'cuisine'`` value of every recipe in ``given``, in order."""
    labels = []
    for recipe in given:
        labels.append(recipe['cuisine'])
    return labels

In [11]:
%%time
# Clean and lowercase the ingredients of every training recipe; the
# surrounding print_time() calls log wall-clock stamps before and after.
print_time()
train_ingrs = get_ingrs(train)
print_time()

180412-142706
180412-142708
CPU times: user 1.91 s, sys: 5.51 ms, total: 1.91 s
Wall time: 1.91 s


In [12]:
def combine_words(ilist):
    """Join a list of ingredient strings into one space-separated string."""
    separator = ' '
    return separator.join(ilist)

In [26]:
%%time
dvec_all = FeatureUnion([
        ("ingrs", TfidfVectorizer(strip_accents='unicode',
                                  tokenizer=itself,
                                  preprocessor=itself)),
        ("words", TfidfVectorizer(strip_accents='unicode',
                                  preprocessor=combine_words,
                                  analyzer='word',
                                  ngram_range=(1, 1),
                                  #stop_words='english',
                                  )),
        ]).fit(get_ingrs(train+test))

CPU times: user 3.85 s, sys: 24 ms, total: 3.88 s
Wall time: 3.88 s


In [27]:
%%time
train_mat = dvec_all.transform(train_ingrs)
train_mat

CPU times: user 1.2 s, sys: 0 ns, total: 1.2 s
Wall time: 1.2 s


In [28]:
import xgboost as xgb
from xgboost import XGBClassifier

In [29]:
# Sanity check: (number of training recipes, number of TF-IDF features).
train_mat.shape

(39774, 10228)

In [30]:
# Feature names, prefixed with the transformer name ("ingrs__" / "words__").
# Newer scikit-learn releases expose get_feature_names_out() and no longer
# provide get_feature_names(); fall back to the old name on older installs.
if hasattr(dvec_all, 'get_feature_names_out'):
    feats = np.array(dvec_all.get_feature_names_out())
else:
    feats = np.array(dvec_all.get_feature_names())

In [31]:
# Inspect the combined feature names.
feats

array(['ingrs__1% low fat buttermilk', 'ingrs__1% low fat chocolate milk',
       'ingrs__1% low fat cottage cheese', ..., 'words__ziti',
       'words__zucchini', 'words__épices'],
      dtype='<U76')

In [32]:
from sklearn.preprocessing import LabelEncoder
# Encoder that maps cuisine names to integer class ids (and back).
le = LabelEncoder()

In [33]:
# Cuisine names of the training recipes, then their integer encoding.
train_labels0 = get_labels(train)
train_labels = le.fit_transform(train_labels0)

In [34]:
# Cuisine names learned by the encoder; the position is the class id.
le.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'],
      dtype='<U12')

In [35]:
# Inspect the integer-encoded training labels.
train_labels

array([ 6, 16,  4, ...,  8,  3, 13])

In [36]:
# Gradient-boosted tree classifier trained on the TF-IDF feature matrix.
xgb1 = XGBClassifier(
    learning_rate=0.05,
    n_estimators=888,
    max_depth=7,
    min_child_weight=1,
    gamma=0.35,
    subsample=1.0,
    colsample_bytree=0.5,
    reg_alpha=0.001,
    reg_lambda=1.0,
    objective='multi:softmax',
    scale_pos_weight=1,
    # Derived from the fitted label encoder instead of a hard-coded 20, so it
    # cannot drift from the actual number of cuisines.
    num_class=len(le.classes_),
    # colsample_bytree < 1 makes training stochastic; pin the seed so a rerun
    # reproduces the same model.
    random_state=0,
    n_jobs=4)

In [37]:
%%time
# Train on the full training matrix; no evaluation set is passed.
# NOTE(review): `verbose` presumably only controls logging of eval_set
# metrics, so it likely has no effect here - confirm against the xgboost docs.
xgb1.fit(train_mat, train_labels, verbose=50)

CPU times: user 55min 16s, sys: 3.19 s, total: 55min 19s
Wall time: 13min 50s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.5, gamma=0.35, learning_rate=0.05,
       max_delta_step=0, max_depth=7, min_child_weight=1, missing=None,
       n_estimators=888, n_jobs=4, nthread=None, num_class=20,
       objective='multi:softprob', random_state=0, reg_alpha=0.001,
       reg_lambda=1.0, scale_pos_weight=1, seed=None, silent=True,
       subsample=1.0)

In [39]:
%%time
# Clean and lowercase the ingredients of every test recipe, with
# wall-clock stamps printed before and after.
print_time()
test_ingrs = get_ingrs(test)
print_time()

180412-150428
180412-150429
CPU times: user 480 ms, sys: 0 ns, total: 480 ms
Wall time: 478 ms


In [40]:
%%time
# Vectorise the cleaned test recipes with the same fitted feature union
# that produced the training matrix.
test_mat = dvec_all.transform(test_ingrs)
test_mat

CPU times: user 258 ms, sys: 4 µs, total: 258 ms
Wall time: 257 ms


In [41]:
# Predict an encoded class for every test recipe.
test_pred = xgb1.predict(test_mat)

In [45]:
# Recipe ids of the test set, in the same order as the predictions.
test_ids = [
    recipe['id']
    for recipe in test
]

In [46]:
# Map the predicted class ids back to cuisine names.
test_cuisine = le.inverse_transform(test_pred)

In [49]:
# Two-column prediction table; `columns` fixes the order as id, cuisine.
df_pred = pd.DataFrame({'id': test_ids, 'cuisine': test_cuisine}, columns=['id', 'cuisine'])

In [51]:
# Write the predictions to CSV without the DataFrame index column.
df_pred.to_csv('../_data/180412_xgb_ingr_word.csv', index=False)