In [1]:
import json
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC, LinearSVC
from sklearn.multiclass import OneVsRestClassifier

In [2]:
train = json.load(open('../_data/train.json'))
test = json.load(open('../_data/test.json'))

In [3]:
train_as_text = [' '.join(sample['ingredients']).lower() for sample in train]
train_cuisine = [sample['cuisine'] for sample in train]

test_as_text = [' '.join(sample['ingredients']).lower() for sample in test]

In [4]:
train_as_text[283]

'bertolli® classico olive oil boneless skinless chicken breast halves eggs linguine chicken broth bacon, crisp-cooked and crumbled bertolli vineyard premium collect marinara with burgundi wine sauc bread crumb fresh shredded mozzarella cheese'

In [7]:

tfidf_enc = TfidfVectorizer(binary=True)
lbl_enc = LabelEncoder()

X = tfidf_enc.fit_transform(train_as_text)
X = X.astype('float16')

X_test = tfidf_enc.transform(test_as_text)
X_test = X_test.astype('float16')

y = lbl_enc.fit_transform(train_cuisine)

In [10]:
len(tfidf_enc.vocabulary_)

3010

In [12]:
%%time
clf = SVC(C=100, kernel='rbf', degree=3,
          gamma=1, coef0=1, shrinking=True, 
          probability=False, tol=0.001, cache_size=200,
          class_weight=None, verbose=True, max_iter=-1,
          decision_function_shape=None, random_state=None)
model = OneVsRestClassifier(clf, n_jobs=4)
model.fit(X,y)

CPU times: user 1.26 s, sys: 268 ms, total: 1.53 s
Wall time: 25min 32s


In [41]:
X.shape

(39774, 3010)

In [44]:
%%time
#
clf2 = LinearSVC(penalty='l2',
                 loss='hinge',
                 dual=True,
                 tol=0.0001,
                 C=1.0,
                 multi_class='ovr',
                 fit_intercept=True,
                 intercept_scaling=1,
                 class_weight=None,
                 verbose=10,
                 random_state=None,
                 max_iter=10000)
model2 = OneVsRestClassifier(clf2, n_jobs=4)
model2.fit(X,y)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]CPU times: user 375 ms, sys: 92.1 ms, total: 467 ms
Wall time: 2.88 s


In [85]:
%%time
print_time()

clf3 = XGBClassifier(max_depth=3,
              learning_rate=0.1,
              n_estimators=1000,
              silent=True,
              objective='binary:logistic',
              booster='gbtree',
              n_jobs=1,
              nthread=None,
              gamma=0,
              min_child_weight=1,
              max_delta_step=0,
              subsample=1,
              colsample_bytree=1,
              colsample_bylevel=1,
              reg_alpha=0,
              reg_lambda=1,
              scale_pos_weight=1,
              base_score=0.5,
              random_state=0,
              seed=None,
              missing=None)
model3 = OneVsRestClassifier(clf3, n_jobs=4)
model3.fit(X,y)

print_time()

180406-165603
180406-170601
CPU times: user 715 ms, sys: 272 ms, total: 988 ms
Wall time: 9min 57s


In [86]:
%%time
print_time()

y_test3 = model3.predict(X_test)
test_cuisine3 = lbl_enc.inverse_transform(y_test3)

print_time()

180406-170716
180406-170722
CPU times: user 16.2 s, sys: 8.61 ms, total: 16.2 s
Wall time: 5.72 s


In [87]:
test_id = [sample['id'] for sample in test]

submission_df = pd.DataFrame({'id': test_id, 'cuisine': test_cuisine3}, columns=['id', 'cuisine'])
submission_df.to_csv('../_data/svm_submission5.csv', index=False)

---



In [22]:
xx = X.astype('float32').todense()

In [26]:
np.where(xx[0] != 0)

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([ 184,  247,  522,  738,  958, 1088, 1094, 1169, 1527, 1868, 1872,
        1998, 2181, 2295, 2403, 2770]))

In [27]:
xx[0][xx[0] != 0]

matrix([[ 0.20751953,  0.13989258,  0.14562988,  0.33422852,  0.30395508,
          0.38842773,  0.10528564,  0.3503418 ,  0.26635742,  0.26098633,
          0.16455078,  0.10211182,  0.23913574,  0.34277344,  0.23010254,
          0.15185547]], dtype=float32)

In [28]:
tfidf_enc.vocabulary_

{'romaine': 2295,
 'lettuce': 1527,
 'black': 247,
 'olives': 1868,
 'grape': 1169,
 'tomatoes': 2770,
 'garlic': 1094,
 'pepper': 1998,
 'purple': 2181,
 'onion': 1872,
 'seasoning': 2403,
 'garbanzo': 1088,
 'beans': 184,
 'feta': 958,
 'cheese': 522,
 'crumbles': 738,
 'plain': 2080,
 'flour': 1013,
 'ground': 1204,
 'salt': 2349,
 'thyme': 2747,
 'eggs': 898,
 'green': 1190,
 'yellow': 2983,
 'corn': 676,
 'meal': 1666,
 'milk': 1707,
 'vegetable': 2877,
 'oil': 1860,
 'mayonaise': 1660,
 'cooking': 671,
 'chilies': 550,
 'grilled': 1197,
 'chicken': 536,
 'breasts': 330,
 'powder': 2133,
 'soy': 2545,
 'sauce': 2374,
 'butter': 375,
 'livers': 1553,
 'water': 2922,
 'wheat': 2933,
 'shallots': 2428,
 'cornflour': 681,
 'cayenne': 488,
 'onions': 1873,
 'paste': 1961,
 'lemon': 1518,
 'juice': 1386,
 'chili': 549,
 'passata': 1957,
 'cumin': 763,
 'boneless': 283,
 'skinless': 2491,
 'thigh': 2739,
 'garam': 1087,
 'masala': 1645,
 'double': 849,
 'cream': 708,
 'natural': 1799,
 '

In [29]:

tfidf_enc2 = TfidfVectorizer()
lbl_enc2 = LabelEncoder()

X2 = tfidf_enc2.fit_transform(train_as_text)
X2 = X2.astype('float16')

X2_test = tfidf_enc2.transform(test_as_text)
X2_test = X2_test.astype('float16')

y2 = lbl_enc2.fit_transform(train_cuisine)

In [30]:
xx2 = X2.astype('float32').todense()

In [32]:
np.where(xx[0] != 0)

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([ 184,  247,  522,  738,  958, 1088, 1094, 1169, 1527, 1868, 1872,
        1998, 2181, 2295, 2403, 2770]))

In [31]:
np.where(xx2[0] != 0)

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([ 184,  247,  522,  738,  958, 1088, 1094, 1169, 1527, 1868, 1872,
        1998, 2181, 2295, 2403, 2770]))

In [27]:
xx[0][xx[0] != 0]

matrix([[ 0.20751953,  0.13989258,  0.14562988,  0.33422852,  0.30395508,
          0.38842773,  0.10528564,  0.3503418 ,  0.26635742,  0.26098633,
          0.16455078,  0.10211182,  0.23913574,  0.34277344,  0.23010254,
          0.15185547]], dtype=float32)

In [33]:
xx2[0][xx2[0] != 0]

matrix([[ 0.20751953,  0.13989258,  0.14562988,  0.33422852,  0.30395508,
          0.38842773,  0.10528564,  0.3503418 ,  0.26635742,  0.26098633,
          0.16455078,  0.10211182,  0.23913574,  0.34277344,  0.23010254,
          0.15185547]], dtype=float32)

In [28]:
tfidf_enc.vocabulary_

{'romaine': 2295,
 'lettuce': 1527,
 'black': 247,
 'olives': 1868,
 'grape': 1169,
 'tomatoes': 2770,
 'garlic': 1094,
 'pepper': 1998,
 'purple': 2181,
 'onion': 1872,
 'seasoning': 2403,
 'garbanzo': 1088,
 'beans': 184,
 'feta': 958,
 'cheese': 522,
 'crumbles': 738,
 'plain': 2080,
 'flour': 1013,
 'ground': 1204,
 'salt': 2349,
 'thyme': 2747,
 'eggs': 898,
 'green': 1190,
 'yellow': 2983,
 'corn': 676,
 'meal': 1666,
 'milk': 1707,
 'vegetable': 2877,
 'oil': 1860,
 'mayonaise': 1660,
 'cooking': 671,
 'chilies': 550,
 'grilled': 1197,
 'chicken': 536,
 'breasts': 330,
 'powder': 2133,
 'soy': 2545,
 'sauce': 2374,
 'butter': 375,
 'livers': 1553,
 'water': 2922,
 'wheat': 2933,
 'shallots': 2428,
 'cornflour': 681,
 'cayenne': 488,
 'onions': 1873,
 'paste': 1961,
 'lemon': 1518,
 'juice': 1386,
 'chili': 549,
 'passata': 1957,
 'cumin': 763,
 'boneless': 283,
 'skinless': 2491,
 'thigh': 2739,
 'garam': 1087,
 'masala': 1645,
 'double': 849,
 'cream': 708,
 'natural': 1799,
 '

In [34]:
tfidf_enc2.vocabulary_

{'romaine': 2295,
 'lettuce': 1527,
 'black': 247,
 'olives': 1868,
 'grape': 1169,
 'tomatoes': 2770,
 'garlic': 1094,
 'pepper': 1998,
 'purple': 2181,
 'onion': 1872,
 'seasoning': 2403,
 'garbanzo': 1088,
 'beans': 184,
 'feta': 958,
 'cheese': 522,
 'crumbles': 738,
 'plain': 2080,
 'flour': 1013,
 'ground': 1204,
 'salt': 2349,
 'thyme': 2747,
 'eggs': 898,
 'green': 1190,
 'yellow': 2983,
 'corn': 676,
 'meal': 1666,
 'milk': 1707,
 'vegetable': 2877,
 'oil': 1860,
 'mayonaise': 1660,
 'cooking': 671,
 'chilies': 550,
 'grilled': 1197,
 'chicken': 536,
 'breasts': 330,
 'powder': 2133,
 'soy': 2545,
 'sauce': 2374,
 'butter': 375,
 'livers': 1553,
 'water': 2922,
 'wheat': 2933,
 'shallots': 2428,
 'cornflour': 681,
 'cayenne': 488,
 'onions': 1873,
 'paste': 1961,
 'lemon': 1518,
 'juice': 1386,
 'chili': 549,
 'passata': 1957,
 'cumin': 763,
 'boneless': 283,
 'skinless': 2491,
 'thigh': 2739,
 'garam': 1087,
 'masala': 1645,
 'double': 849,
 'cream': 708,
 'natural': 1799,
 '

In [5]:
from time import strftime

def print_time():
    print(strftime('%y%m%d-%H%M%S'))
    return

In [6]:
from sklearn.pipeline import FeatureUnion

In [8]:
def itself(x):
    return x

In [9]:
import re

In [10]:
SPEC_REMOVE = re.compile(r'(\'|\’|\(.*oz.*\)|(\()|(\)))')
SPEC_AND = re.compile(r'\&')
SPEC_ELSE = re.compile(r'[^\w\s\%_]')

def clean_ingr(ingr):
    ingr = re.sub(SPEC_REMOVE, '', ingr)
    ingr = re.sub(SPEC_AND, 'and', ingr)
    ingr = re.sub(SPEC_ELSE, ' ', ingr)
    return ' '.join(ingr.split())

In [11]:
def get_ingrs(given):
    ingrs = [[clean_ingr(i).lower() for i in recipe['ingredients']] for recipe in given]
    return ingrs

def get_labels(given):
    return [r['cuisine'] for r in given]

In [12]:
%%time
print_time()
train_ingrs = get_ingrs(train)
print_time()

180406-171605
180406-171609
CPU times: user 3.75 s, sys: 28.3 ms, total: 3.78 s
Wall time: 3.8 s


In [13]:
def combine_words(ilist):
    return ' '.join(ilist)

In [14]:
%%time
dvec_all = FeatureUnion([
        ("ingrs", TfidfVectorizer(strip_accents='unicode',
                                  tokenizer=itself,
                                  preprocessor=itself)),
        ("words", TfidfVectorizer(strip_accents='unicode',
                                  preprocessor=combine_words)),
        ]).fit(get_ingrs(train+test))

CPU times: user 5.47 s, sys: 51.9 ms, total: 5.52 s
Wall time: 5.55 s


In [77]:
TfidfVectorizer(strip_accents='unicode',
                                  tokenizer=itself,
                                  preprocessor=itself).fit_transform(get_ingrs(train+test))

<49718x7109 sparse matrix of type '<class 'numpy.float64'>'
	with 535639 stored elements in Compressed Sparse Row format>

In [78]:
TfidfVectorizer(strip_accents='unicode',
                                  preprocessor=combine_words).fit_transform(get_ingrs(train+test))

<49718x3119 sparse matrix of type '<class 'numpy.float64'>'
	with 953376 stored elements in Compressed Sparse Row format>

In [79]:
TfidfVectorizer(strip_accents='unicode',
                                  preprocessor=combine_words).fit_transform(train_ingrs)

<39774x3005 sparse matrix of type '<class 'numpy.float64'>'
	with 762277 stored elements in Compressed Sparse Row format>

In [76]:
all_ingrs = dvec_all.transform(

<49718x10228 sparse matrix of type '<class 'numpy.float64'>'
	with 1489015 stored elements in Compressed Sparse Row format>

In [93]:
train_mat = dvec_all.transform(train_ingrs)

In [94]:
train_mat

<39774x10228 sparse matrix of type '<class 'numpy.float64'>'
	with 1190521 stored elements in Compressed Sparse Row format>

10228

In [81]:
import xgboost as xgb
from xgboost import XGBClassifier

In [None]:
XGBClassifier(max_depth=3,
              learning_rate=0.1,
              n_estimators=100,
              silent=True,
              objective='binary:logistic',
              booster='gbtree',
              n_jobs=1,
              nthread=None,
              gamma=0,
              min_child_weight=1,
              max_delta_step=0,
              subsample=1,
              colsample_bytree=1,
              colsample_bylevel=1,
              reg_alpha=0,
              reg_lambda=1,
              scale_pos_weight=1,
              base_score=0.5,
              random_state=0,
              seed=None,
              missing=None)

In [90]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values,
                              label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds,
                          show_progress=False)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors],
            dtrain['Disbursed'],
            eval_metric='accuracy')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]
        
    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions))


In [97]:
train_mat.shape

(39774, 10228)

In [110]:
t0 = train_mat.todense()[0]

AttributeError: 'matrix' object has no attribute 'to_array'

In [114]:
t0 = np.squeeze(np.asarray(t0))

In [116]:
t0.shape

(10228,)

In [120]:
t0ing = np.where(t0 != 0)[0]

In [119]:
feats = np.array(dvec_all.get_feature_names())

In [122]:
train[0]['ingredients']

['romaine lettuce',
 'black olives',
 'grape tomatoes',
 'garlic',
 'pepper',
 'purple onion',
 'seasoning',
 'garbanzo beans',
 'feta cheese crumbles']

In [121]:
feats[t0ing]

array(['ingrs__black olives', 'ingrs__feta cheese crumbles',
       'ingrs__garbanzo beans', 'ingrs__garlic', 'ingrs__grape tomatoes',
       'ingrs__pepper', 'ingrs__purple onion', 'ingrs__romaine lettuce',
       'ingrs__seasoning', 'words__beans', 'words__black', 'words__cheese',
       'words__crumbles', 'words__feta', 'words__garbanzo',
       'words__garlic', 'words__grape', 'words__lettuce', 'words__olives',
       'words__onion', 'words__pepper', 'words__purple', 'words__romaine',
       'words__seasoning', 'words__tomatoes'],
      dtype='<U76')

In [99]:
dvec_all.get_feature_names()

['words__zaatar',
 'words__zabaglione',
 'words__zatarains',
 'words__zero',
 'words__zest',
 'words__zesty',
 'words__zinfandel',
 'words__ziti',
 'words__zucchini',
 'words__épices']

In [92]:
#Choose all predictors except target & IDcols
#predictors = [x for x in train.columns if x not in [target, IDcol]]

xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    scale_pos_weight=1,
    seed=27)

In [None]:
modelfit(xgb1, train, predictors)