In [1]:
import numpy as np
import json


with open('../_data/train.json', 'r') as f:
    train = json.load(f)
with open('../_data/test.json', 'r') as f:
    test = json.load(f)

In [2]:
len(train), len(test)

(39774, 9944)

In [3]:
train[0]

{'cuisine': 'greek',
 'id': 10259,
 'ingredients': ['romaine lettuce',
  'black olives',
  'grape tomatoes',
  'garlic',
  'pepper',
  'purple onion',
  'seasoning',
  'garbanzo beans',
  'feta cheese crumbles']}

## Pipeline using DictVectorizer (bag of ingredients) and SVC

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [5]:
def itself(x):
    return x

In [87]:
svc = Pipeline([
    ("tfidf_vec", TfidfVectorizer(tokenizer=itself,
                                         preprocessor=itself,
                                        )),
    ("svc", SVC(kernel="linear"))])  # better than other kernels

In [88]:
def get_ingrs(given):
    ingrs = [[i.lower() for i in recipe['ingredients']] for recipe in given]
    return ingrs

def get_labels(given):
    return [r['cuisine'] for r in given]

In [89]:
%%time
scores = cross_val_score(svc, get_ingrs(train), get_labels(train), cv=5, n_jobs=-1)

CPU times: user 976 ms, sys: 216 ms, total: 1.19 s
Wall time: 2min 28s


In [90]:
scores

array([ 0.77982919,  0.78336265,  0.77916038,  0.77487109,  0.78633447])

#### Cross-validation accuracy

In [43]:
scores.mean()

0.78071155518300084

## Fitting to test data (after encoding all of train ingredients)

In [101]:
%%time
dvec_all = TfidfVectorizer(tokenizer=itself,
                           preprocessor=itself,
                          ).fit(get_ingr_dict(train))

CPU times: user 319 ms, sys: 20.1 ms, total: 339 ms
Wall time: 338 ms


In [102]:
test_bag = dvec_all.transform(get_ingr_dict(test))

In [110]:
svc_linear = SVC(kernel='linear')

In [111]:
%%time
svc_linear = svc_linear.fit(dvec_all.transform(get_ingr_dict(train)), get_labels(train))

CPU times: user 1min 35s, sys: 299 ms, total: 1min 36s
Wall time: 1min 36s


In [112]:
svc_linear.classes_

array(['brazilian', 'british', 'cajun_creole', 'chinese', 'filipino',
       'french', 'greek', 'indian', 'irish', 'italian', 'jamaican',
       'japanese', 'korean', 'mexican', 'moroccan', 'russian',
       'southern_us', 'spanish', 'thai', 'vietnamese'],
      dtype='<U12')

In [113]:
svc_linear.get_params()

{'C': 1.0,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'auto',
 'kernel': 'linear',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [114]:
test_preds = svc_linear.predict(dvec_all.transform(get_ingr_dict(test)))

In [115]:
test_preds.shape

(9944,)

In [116]:
test_ids = [r['id'] for r in test]

In [117]:
df_test = pd.DataFrame([test_ids, list(test_preds)]).transpose()
df_test.columns = ['id', 'cuisine']

In [118]:
df_test.to_csv('../_data/180401_basic_SVM_TFIDF_train_words_only.csv', index=False)

In [134]:
df_aaa = pd.Series(['a; b; c', 'd e;f'])

In [136]:
df_aaa.str.split(';? ?')

  f = lambda x: regex.split(x, maxsplit=n)


0    [a, b, c]
1    [d, e, f]
dtype: object

## Results
Accuracy 0.78499 (vocab in train+test)  
0.78459 (vocab in train only)  
Rank 612

![kaggle image](../_images/180401_bow_svm_tfidf.png)
![kaggle image](../_images/180401_bow_svm_tfidf_standing.png)

In [62]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split



In [49]:
train_ingr = get_ingrs(train)
train_labels = get_labels(train)

In [50]:
len(train_ingr), len(train_labels)

(39774, 39774)

In [54]:
X_train, X_test, y_train, y_test = train_test_split(train_ingr,
                                                    train_labels,
                                                    test_size=0.2,
                                                   )

In [5]:
def itself(x):
    return x

In [36]:
svc = Pipeline([
    ("tfidf_vec", TfidfVectorizer(tokenizer=itself,
                                         preprocessor=itself,
                                        )),
    ("svc", SVC(kernel="linear"))])

In [55]:
svc.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function it...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [57]:
y_pred = svc.predict(X_test)

In [70]:
y_pred[:10], y_test[:10]

(array(['indian', 'thai', 'italian', 'southern_us', 'vietnamese',
        'filipino', 'mexican', 'jamaican', 'british', 'italian'],
       dtype='<U12'),
 ['indian',
  'thai',
  'italian',
  'british',
  'vietnamese',
  'jamaican',
  'mexican',
  'jamaican',
  'russian',
  'italian'])

In [61]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

   brazilian       0.73      0.58      0.65        90
     british       0.69      0.39      0.50       172
cajun_creole       0.80      0.67      0.73       341
     chinese       0.79      0.89      0.84       521
    filipino       0.74      0.60      0.66       135
      french       0.59      0.64      0.62       555
       greek       0.84      0.68      0.75       216
      indian       0.86      0.89      0.88       606
       irish       0.66      0.46      0.55       136
     italian       0.78      0.89      0.83      1579
    jamaican       0.82      0.59      0.69       106
    japanese       0.86      0.66      0.75       294
      korean       0.84      0.68      0.75       152
     mexican       0.88      0.92      0.90      1280
    moroccan       0.83      0.78      0.80       165
     russian       0.68      0.37      0.48        92
 southern_us       0.71      0.83      0.77       855
     spanish       0.69    

In [68]:
pd.DataFrame(confusion_matrix(y_test, y_pred), columns=[x[:3] for x in svc.classes_], index=svc.classes_)

Unnamed: 0,bra,bri,caj,chi,fil,fre,gre,ind,iri,ita,jam,jap,kor,mex,mor,rus,sou,spa,tha,vie
brazilian,52,0,0,0,0,2,0,3,0,6,0,0,0,14,0,0,11,0,2,0
british,0,67,0,0,1,32,0,8,11,13,1,0,1,3,0,0,31,3,1,0
cajun_creole,1,0,230,0,0,19,0,1,0,24,0,0,0,12,0,0,52,2,0,0
chinese,2,1,0,464,3,2,0,0,1,5,0,10,9,6,0,0,7,0,9,2
filipino,3,1,1,10,81,4,0,1,1,8,0,3,0,7,0,0,9,0,1,5
french,0,3,3,0,1,356,0,2,6,124,1,1,0,5,2,2,44,5,0,0
greek,0,0,1,1,0,5,147,1,0,44,0,0,0,5,4,1,4,3,0,0
indian,3,1,0,2,2,1,5,539,0,8,2,2,0,16,12,2,6,1,3,1
irish,0,10,0,1,0,19,1,0,63,7,2,0,0,2,3,4,24,0,0,0
italian,2,2,7,2,0,72,16,5,2,1405,2,0,0,21,2,2,28,11,0,0


In [94]:
from xgboost import XGBClassifier

In [49]:
train_ingr = get_ingrs(train)
train_labels = get_labels(train)

In [50]:
len(train_ingr), len(train_labels)

(39774, 39774)

In [54]:
X_train, X_test, y_train, y_test = train_test_split(train_ingr,
                                                    train_labels,
                                                    test_size=0.2,
                                                   )

In [5]:
def itself(x):
    return x

In [95]:
xgb1 = Pipeline([
     ("tfidf_vec", TfidfVectorizer(tokenizer=itself,
                                         preprocessor=itself,
                                        )),
     ("xgb", XGBClassifier(n_jobs=-1))])

In [96]:
%%time
scores = cross_val_score(xgb1, get_ingrs(train), get_labels(train), cv=5, n_jobs=-1)

CPU times: user 1.98 s, sys: 481 ms, total: 2.46 s
Wall time: 39min 48s


In [97]:
scores

array([ 0.67395127,  0.68270922,  0.67697335,  0.67488366,  0.6795017 ])

In [99]:
%%time
scores_svc = cross_val_score(svc, get_ingrs(train), get_labels(train), cv=5, n_jobs=-1)

CPU times: user 981 ms, sys: 184 ms, total: 1.16 s
Wall time: 2min 22s


In [100]:
scores_svc

array([ 0.77982919,  0.78336265,  0.77916038,  0.77487109,  0.78633447])

#### Cross-validation accuracy

In [98]:
scores.mean()

0.67760384131581075

In [55]:
svc.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf_vec', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2',
        preprocessor=<function it...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [57]:
y_pred = svc.predict(X_test)

In [70]:
y_pred[:10], y_test[:10]

(array(['indian', 'thai', 'italian', 'southern_us', 'vietnamese',
        'filipino', 'mexican', 'jamaican', 'british', 'italian'],
       dtype='<U12'),
 ['indian',
  'thai',
  'italian',
  'british',
  'vietnamese',
  'jamaican',
  'mexican',
  'jamaican',
  'russian',
  'italian'])

In [61]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

   brazilian       0.73      0.58      0.65        90
     british       0.69      0.39      0.50       172
cajun_creole       0.80      0.67      0.73       341
     chinese       0.79      0.89      0.84       521
    filipino       0.74      0.60      0.66       135
      french       0.59      0.64      0.62       555
       greek       0.84      0.68      0.75       216
      indian       0.86      0.89      0.88       606
       irish       0.66      0.46      0.55       136
     italian       0.78      0.89      0.83      1579
    jamaican       0.82      0.59      0.69       106
    japanese       0.86      0.66      0.75       294
      korean       0.84      0.68      0.75       152
     mexican       0.88      0.92      0.90      1280
    moroccan       0.83      0.78      0.80       165
     russian       0.68      0.37      0.48        92
 southern_us       0.71      0.83      0.77       855
     spanish       0.69    

In [68]:
pd.DataFrame(confusion_matrix(y_test, y_pred), columns=[x[:3] for x in svc.classes_], index=svc.classes_)

Unnamed: 0,bra,bri,caj,chi,fil,fre,gre,ind,iri,ita,jam,jap,kor,mex,mor,rus,sou,spa,tha,vie
brazilian,52,0,0,0,0,2,0,3,0,6,0,0,0,14,0,0,11,0,2,0
british,0,67,0,0,1,32,0,8,11,13,1,0,1,3,0,0,31,3,1,0
cajun_creole,1,0,230,0,0,19,0,1,0,24,0,0,0,12,0,0,52,2,0,0
chinese,2,1,0,464,3,2,0,0,1,5,0,10,9,6,0,0,7,0,9,2
filipino,3,1,1,10,81,4,0,1,1,8,0,3,0,7,0,0,9,0,1,5
french,0,3,3,0,1,356,0,2,6,124,1,1,0,5,2,2,44,5,0,0
greek,0,0,1,1,0,5,147,1,0,44,0,0,0,5,4,1,4,3,0,0
indian,3,1,0,2,2,1,5,539,0,8,2,2,0,16,12,2,6,1,3,1
irish,0,10,0,1,0,19,1,0,63,7,2,0,0,2,3,4,24,0,0,0
italian,2,2,7,2,0,72,16,5,2,1405,2,0,0,21,2,2,28,11,0,0


## Possible improvements
* Grid search on hyperparameters (kernel type, etc.)

In [137]:
with open('../_data/train.json', 'r') as f:
    train11 = json.load(f)

In [154]:
train11[283]

{'cuisine': 'italian',
 'id': 23437,
 'ingredients': ['Bertolli® Classico Olive Oil',
  'boneless skinless chicken breast halves',
  'eggs',
  'linguine',
  'chicken broth',
  'bacon, crisp-cooked and crumbled',
  'bertolli vineyard premium collect marinara with burgundi wine sauc',
  'bread crumb fresh',
  'shredded mozzarella cheese']}

In [159]:
def clean_ingredient(s):
    """
    Method that returns a cleaned up version of the entered ingredient.
    """
    from re import sub
    return sub('[^A-Za-z0-9]+', ' ', s)
def get_train(x):
    """
    Method that returns a dictionary of data for the training set.
    """
    return {
        'cuisine': x['cuisine'],
        'ingredients': ', '.join([clean_ingredient(x) for x in x['ingredients']]),
    }

In [162]:
pd.Series(get_train(train11[283])).str.split(',? ')

cuisine                                                [italian]
ingredients    [Bertolli, Classico, Olive, Oil, boneless, ski...
dtype: object

In [163]:
def stripString(s):
    return ', '.join([''.join(y.lower() for y in x if y.isalnum()) for x in s.split(',')])

In [165]:
tfff = TfidfVectorizer(
    strip_accents='unicode',analyzer="char",
    ngram_range=(2,6),preprocessor=stripString)

In [172]:
train_ingr_joined = [', '.join(x) for x in train_ingr]

In [174]:
stripString(train_ingr_joined[0])

'romainelettuce, blackolives, grapetomatoes, garlic, pepper, purpleonion, seasoning, garbanzobeans, fetacheesecrumbles'

In [170]:
tfff.fit(train_ingr_joined[:100])

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(2, 6), norm='l2',
        preprocessor=<function stripString at 0x7fccec368a60>,
        smooth_idf=True, stop_words=None, strip_accents='unicode',
        sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, use_idf=True, vocabulary=None)

In [171]:
tfff.vocabulary_

{'ro': 11908,
 'om': 10196,
 'ma': 8649,
 'ai': 1868,
 'in': 7055,
 'ne': 9301,
 'el': 4768,
 'le': 8032,
 'et': 5441,
 'tt': 13639,
 'tu': 13662,
 'uc': 13722,
 'ce': 3135,
 'e,': 4071,
 ', ': 960,
 ' b': 40,
 'bl': 2877,
 'la': 7893,
 'ac': 1690,
 'ck': 3364,
 'ko': 7718,
 'ol': 10127,
 'li': 8179,
 'iv': 7428,
 've': 14126,
 'es': 5222,
 's,': 12228,
 ' g': 337,
 'gr': 6026,
 'ra': 11336,
 'ap': 2268,
 'pe': 10853,
 'to': 13489,
 'at': 2600,
 'oe': 10006,
 'ga': 5770,
 'ar': 2322,
 'rl': 11831,
 'ic': 6658,
 'c,': 2995,
 ' p': 602,
 'ep': 4988,
 'pp': 11061,
 'er': 5019,
 'r,': 11165,
 'pu': 11138,
 'ur': 13930,
 'rp': 12014,
 'pl': 11002,
 'eo': 4975,
 'on': 10232,
 'ni': 9500,
 'io': 7223,
 'n,': 9048,
 ' s': 728,
 'se': 12500,
 'ea': 4231,
 'as': 2502,
 'so': 12798,
 'ng': 9399,
 'g,': 5741,
 'rb': 11451,
 'ba': 2782,
 'an': 2112,
 'nz': 9855,
 'zo': 14584,
 'ob': 9927,
 'be': 2845,
 'ns': 9670,
 ' f': 279,
 'fe': 5590,
 'ta': 13141,
 'ch': 3253,
 'he': 6259,
 'ee': 4562,
 'ec': 

In [153]:
for i, x in enumerate(train11):
    if any([i for i in x['ingredients'] if '®' in i]):
        print(i)

283
404
1168
1464
1526
1533
1599
2682
2858
3278
3281
3493
4110
4146
4228
4327
4537
4577
4866
5139
5256
5387
5472
5688
6145
6845
6879
7324
8195
8320
8465
8584
9039
9783
10566
11750
11954
12061
12696
12997
13117
13223
13460
13658
14070
14118
14177
14267
14744
15401
15575
15589
16156
16169
16476
16642
16717
17116
17148
17588
18075
18092
18295
18327
18647
19282
19388
19786
20197
20418
20944
21166
21282
21801
21847
22313
22827
22913
23398
23607
24012
24036
24476
24817
25049
25050
25057
25238
25708
26451
26468
26879
27385
27784
28107
28161
28164
28498
28699
29192
29527
29600
29637
30770
30980
31078
31225
31516
31660
31726
32101
32680
33041
33461
33773
33834
34608
34638
34786
34937
35000
35150
35198
35439
35929
36019
36055
36447
36550
36581
37938
37964
38016
38222
38781
38818
38855
39490
39544
39597
39622
39682
39695


In [179]:
from nltk.stem import WordNetLemmatizer
import nltk

In [180]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


Exception in Tkinter callback
Traceback (most recent call last):
  File "/home/aeatda/anaconda3/envs/cooking/lib/python3.6/tkinter/__init__.py", line 1699, in __call__
    return self.func(*args)
  File "/home/aeatda/anaconda3/envs/cooking/lib/python3.6/site-packages/nltk/draw/table.py", line 194, in _resize_column_motion_cb
    lb['width'] = max(3, lb['width'] + (x1-x2) // charwidth)
  File "/home/aeatda/anaconda3/envs/cooking/lib/python3.6/tkinter/__init__.py", line 1486, in __setitem__
    self.configure({key: value})
  File "/home/aeatda/anaconda3/envs/cooking/lib/python3.6/tkinter/__init__.py", line 1479, in configure
    return self._configure('configure', cnf, kw)
  File "/home/aeatda/anaconda3/envs/cooking/lib/python3.6/tkinter/__init__.py", line 1470, in _configure
    self.tk.call(_flatten((self._w, cmd)) + self._options(cnf))
_tkinter.TclError: expected integer but got "45.0"
Exception in Tkinter callback
Traceback (most recent call last):
  File "/home/aeatda/anaconda3/envs

Exception in Tkinter callback
Traceback (most recent call last):
  File "/home/aeatda/anaconda3/envs/cooking/lib/python3.6/tkinter/__init__.py", line 1699, in __call__
    return self.func(*args)
  File "/home/aeatda/anaconda3/envs/cooking/lib/python3.6/site-packages/nltk/draw/table.py", line 194, in _resize_column_motion_cb
    lb['width'] = max(3, lb['width'] + (x1-x2) // charwidth)
  File "/home/aeatda/anaconda3/envs/cooking/lib/python3.6/tkinter/__init__.py", line 1486, in __setitem__
    self.configure({key: value})
  File "/home/aeatda/anaconda3/envs/cooking/lib/python3.6/tkinter/__init__.py", line 1479, in configure
    return self._configure('configure', cnf, kw)
  File "/home/aeatda/anaconda3/envs/cooking/lib/python3.6/tkinter/__init__.py", line 1470, in _configure
    self.tk.call(_flatten((self._w, cmd)) + self._options(cnf))
_tkinter.TclError: expected integer but got "36.0"
Exception in Tkinter callback
Traceback (most recent call last):
  File "/home/aeatda/anaconda3/envs

True

In [177]:
wnl = WordNetLemmatizer()

In [181]:
s

NameError: name 's' is not defined