# Vectorization improvement
We will use this notebook to experiment with vectorization techniques, that:
- vectorizes ingredients, not words
- takes ingredients' positions on their list into account

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json

import matplotlib.pyplot as plt
import seaborn as sns

import os

In [2]:
train = json.load(open('./input/cooking_train.json', 'r'))
test = json.load(open('./input/cooking_test.json', 'r'))

## Vectorizing ingredients
We can use preprocessing to squash ingredient words together and make sklearns' TfIdfVectorizer work on them.

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

from typing import List

In [4]:
%%time
all_recipes = train + test
print(len(all_recipes))

39774
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 2.02 ms


In [5]:
def preprocess_ingredients(recipe_list: List[str]) -> str:
    strip_ingredient = lambda ingredient: "".join([word.lower() for word in ingredient.split(" ") if word.isalnum()])
    return ", ".join([strip_ingredient(ingredient) for ingredient in recipe_list])

In [6]:
%%time
vectorizer = TfidfVectorizer(preprocessor=preprocess_ingredients)
all_ingredients = [r['ingredients'] for r in all_recipes]
all_vectors = vectorizer.fit_transform(all_ingredients)
print(type(all_vectors))
assert(len(all_recipes) == all_vectors.shape[0])
print(all_vectors.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(39774, 6462)
CPU times: user 940 ms, sys: 0 ns, total: 940 ms
Wall time: 938 ms


# Assembling model input

In [7]:
import scipy as sp

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_selection import SelectPercentile, chi2

In [8]:
# assert(all_vectors.shape[0] == features.shape[0])
# data = sp.sparse.hstack([all_vectors, sp.sparse.csr_matrix(features)], format='csr')
# type(data)
# there are no features right now

In [9]:
cousine_names = [r['cuisine'] for r in train]
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder(categories='auto')
labels = label_encoder.fit_transform(cousine_names)
labels_onehot = onehot_encoder.fit_transform(labels.reshape(-1, 1)).toarray()

In [10]:
# feature_selector = SelectPercentile(chi2, percentile=67)
# train_data = feature_selector.fit_transform(data[:len(labels)], labels)
# test_data = feature_selector.transform(data[len(labels):])

In [11]:
train_data, test_data = all_vectors[:len(labels)].toarray(), all_vectors[len(labels):].toarray()

In [12]:
train_data.shape, test_data.shape, labels.shape, labels_onehot.shape

((30000, 6462), (9774, 6462), (30000,), (30000, 20))

# Model training and cross-validation

In [13]:
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="3"

In [14]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split

from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense
from keras.optimizers import Adam
import keras.backend as K
from keras.wrappers.scikit_learn import KerasClassifier

Using TensorFlow backend.


In [64]:
def build_model(use_embedding=True, embedding_dim=512):
    K.clear_session()
    model = Sequential()
    if use_embedding:
        model.add(Embedding(input_dim=train_data.shape[1], output_dim=embedding_dim))
    model.add(Conv1D(filters=embedding_dim//2, kernel_size=3, padding='valid'))
    model.add(MaxPooling1D(pool_size=2, padding='valid'))
    model.add(Conv1D(filters=embedding_dim//8, kernel_size=3, padding='valid'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(embedding_dim//16, activation='relu'))
    model.add(Dense(labels_onehot.shape[1], activation='sigmoid'))
    model.compile(optimizer=Adam(lr=0.0002137), loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [65]:
build_model().summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 512)         3308544   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 256)         393472    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 256)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 64)          49216     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                2080      
_________________________________________________________________
dense_2 (Dense)              (None, 20)                660       
Total para

In [66]:
train_y_.shape

(22500, 20)

In [70]:
%%time
splitter = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
results = []
train_X_, eval_X_, train_y_, eval_y_ = train_test_split(train_data, labels_onehot, stratify=labels, test_size=0.25, shuffle=True, random_state=42)
fit_params = {
    'batch_size': 128,
    'epochs': 4,
    'validation_data': (eval_X_, eval_y_),
    'shuffle': True,
    'verbose': 1,
}
model = build_model()
model.fit(train_X_, train_y_, **fit_params)
score = model.evaluate(eval_X_, eval_y_)[0]
print(f"val_accuracy={score:.4f}")
results.append({
    'score': score,
    'model': model
})

Train on 22500 samples, validate on 7500 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
val_accuracy=0.1778
CPU times: user 3min 44s, sys: 43.1 s, total: 4min 27s
Wall time: 5min 23s


# Submission generation
Code below is more complex than it must be, but its copied from baseline and wil allow us for easy ensembling of various models later.

In [71]:
sample_subm = pd.read_csv('./input/sample_submission.csv')
print(sample_subm.shape)
sample_subm.head()

(9774, 2)


Unnamed: 0,Id,cuisine
0,24888,italian
1,43564,italian
2,21898,italian
3,6991,italian
4,37700,italian


In [78]:
preds = model.predict(test_data, batch_size=128, verbose=1)



In [75]:
result_ids = [r['id'] for r in test]
print(f"Using label encored: {label_encoder}")
print(f"Using result ids: {result_ids[:7]}...")

def generate_predictions(model_data) -> pd.DataFrame:
    model = model_data['model']
    preds = model.predict(test_data, batch_size=128, verbose=1)
    pred_labels = onehot_encoder.inverse_transform(preds)
    pred_names = label_encoder.inverse_transform(pred_labels.ravel())
    return pd.DataFrame({
        'id': result_ids,
        'cuisine': pred_names
    })

Using label encored: LabelEncoder()
Using result ids: [24888, 43564, 21898, 6991, 37700, 43546, 20544]...


In [76]:
%%time
subm_dfs = [generate_predictions(model_data) for model_data in results]

CPU times: user 7.65 s, sys: 1.61 s, total: 9.26 s
Wall time: 8.55 s


In [77]:
subm = pd.concat(subm_dfs)
print(subm.shape)
subm.head()

(9774, 2)


Unnamed: 0,id,cuisine
0,24888,italian
1,43564,italian
2,21898,italian
3,6991,italian
4,37700,italian


In [50]:
%%time
_sf = subm.groupby('id').cuisine.apply(lambda arr: sp.stats.mode(arr).mode[0])
subm_final = pd.DataFrame({
    'Id': _sf.index,
    'cuisine': _sf.values
})



CPU times: user 2 s, sys: 76 ms, total: 2.08 s
Wall time: 1.97 s


In [40]:
subm_final.head()

Unnamed: 0,Id,cuisine
0,16,indian
1,22,mexican
2,24,southern_us
3,32,japanese
4,48,indian


In [44]:
# sanity checks
assert(subm_final.notna().all().all())
assert(sorted(sample_subm['Id'].unique()) == sorted(subm_final['Id'].unique()))
assert(sample_subm.shape == subm_final.shape)

In [48]:
scores = [model_data['score'] for model_data in results]
mean_cv_score = np.mean(scores)
std_cv_score = np.std(scores)
model_name = 'LGBM'
subm_filename = f'{model_name}-cvmean={mean_cv_score:.4f}-cvstd={std_cv_score:.4f}.csv'
subm_path = os.path.join('./submissions/', subm_filename)
subm_path

'./submissions/LGBM-cvmean=0.7815-cvstd=0.0079.csv'

In [52]:
subm_final.to_csv(subm_path, index=False)

In [53]:
!kaggle competitions submit -f {subm_path} -m "Baseline" ml1819-whats-cooking

100%|████████████████████████████████████████| 136k/136k [00:02<00:00, 53.8kB/s]
Successfully submitted to ML1819 - What's Cooking?