# Vectorization improvement
We will use this notebook to experiment with vectorization techniques, that:
- vectorizes ingredients, not words
- takes ingredients' positions on their list into account

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json

import matplotlib.pyplot as plt
import seaborn as sns

import os

In [2]:
train = json.load(open('./input/cooking_train.json', 'r'))
test = json.load(open('./input/cooking_test.json', 'r'))

## Vectorizing ingredients
We can use preprocessing to squash ingredient words together and make sklearns' TfIdfVectorizer work on them.

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

from typing import List

In [4]:
%%time
all_recipes = train + test
print(len(all_recipes))

39774
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 1.92 ms


In [5]:
def preprocess_ingredients(recipe_list: List[str]) -> str:
    strip_ingredient = lambda ingredient: "".join([word.lower() for word in ingredient.split(" ") if word.isalnum()])
    return ", ".join([strip_ingredient(ingredient) for ingredient in recipe_list])

In [6]:
%%time
vectorizer = TfidfVectorizer(preprocessor=preprocess_ingredients)
all_ingredients = [r['ingredients'] for r in all_recipes]
all_vectors = vectorizer.fit_transform(all_ingredients)
print(type(all_vectors))
assert(len(all_recipes) == all_vectors.shape[0])
print(all_vectors.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(39774, 6462)
CPU times: user 1.11 s, sys: 12 ms, total: 1.12 s
Wall time: 1.11 s


# Assembling model input

In [7]:
import scipy as sp

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_selection import SelectPercentile, chi2

In [8]:
# assert(all_vectors.shape[0] == features.shape[0])
# data = sp.sparse.hstack([all_vectors, sp.sparse.csr_matrix(features)], format='csr')
# type(data)
# there are no features right now

In [9]:
cousine_names = [r['cuisine'] for r in train]
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder(categories='auto')
labels = label_encoder.fit_transform(cousine_names)
labels_onehot = onehot_encoder.fit_transform(labels.reshape(-1, 1)).toarray()

In [10]:
# feature_selector = SelectPercentile(chi2, percentile=67)
# train_data = feature_selector.fit_transform(data[:len(labels)], labels)
# test_data = feature_selector.transform(data[len(labels):])

In [11]:
train_data, test_data = all_vectors[:len(labels)].toarray(), all_vectors[len(labels):].toarray()

In [12]:
train_data.shape, test_data.shape, labels.shape, labels_onehot.shape

((30000, 6462), (9774, 6462), (30000,), (30000, 20))

# Model training and cross-validation

In [37]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from imblearn.over_sampling import SMOTE

import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.optim as optim

In [71]:
class RecipesDataset(Dataset):
    def __init__(self, X, y=None):
        if y is not None:
            assert(X.shape[0] == y.shape[0])
        self.X = X.astype(np.float32)
        self.y = y.astype(np.float32)
        
    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self, index):
        return (self.X[index], self.y[index])

In [None]:
class BaselineCnn(nn.Module):
    def __init__(self):
        super(BaselineCnn, self).__init__()
        self.conv1 = nn.Conv1d(
            in_channels=1, 
            out_channels=5, 
            kernel_size=3, 
            stride=1
        )
        self.conv2 = nn.Conv1d(
            in_channels=5, 
            out_channels=5, 
            kernel_size=3, 
            stride=1
        )
        self.fc1 = nn.Linear(in_features=5*5*5, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=20)
        self.relu = nn.ReLU()
        self.max_pool1d = nn.MaxPool1d(kernel_size=2, stride=2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.relu(self.conv1(x))
        x = self.max_pool1d(x)
        x = self.relu(self.conv2(x))
        x = self.max_pool1d(x)
        x = x.view(-1, 5*5*5)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return self.softmax(x)

# def build_model(use_embedding=False, embedding_dim=512):
#     K.clear_session()
#     model = Sequential()
#     if use_embedding:
#         model.add(Embedding(input_dim=train_data.shape[1], output_dim=embedding_dim))
#         model.add(Conv1D(filters=embedding_dim//2, kernel_size=3, padding='valid', activation='relu'), input_shape=(1,embedding_dim))
#     else:
#         model.add(Reshape((1, train_data.shape[1]), input_shape=(train_data.shape[1],)))
#         model.add(Conv1D(filters=embedding_dim//2, kernel_size=3, padding='valid', activation='relu'))
#     model.add(MaxPooling1D(pool_size=2, padding='valid'))
#     model.add(Conv1D(filters=embedding_dim//8, kernel_size=3, padding='valid', activation='relu'))
#     model.add(GlobalMaxPooling1D())
#     model.add(Dense(embedding_dim//16, activation='relu'))
#     model.add(Dense(labels_onehot.shape[1], activation='sigmoid'))
#     model.compile(optimizer=Adam(lr=0.0002137), loss='binary_crossentropy', metrics=['accuracy'])
#     return model

In [None]:
print(model)

In [88]:
def train(log_interval, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device).reshape((data.shape[0], 1, data.shape[-1])), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss_f = nn.NLLLoss()
        print(output.shape, target.shape)
        loss = loss_f(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, 
                batch_idx * len(data), 
                len(train_loader.dataset),
                100. * batch_idx / len(train_loader), 
                loss.item()
            ))

In [89]:
def eval_model(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    loss_f = nn.NLLLoss(reduction='sum')
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_loader):
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += loss_f(output, target).item() # sum up batch loss
            pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [90]:
# Training settings
batch_size = 100
test_batch_size = 100
epochs = 10
lr = 0.0001
momentum = 0.5
seed = 42
log_interval = 100
save_model = False
device = 3
X, y = train_data, labels

In [91]:
type(X), type(y)

(numpy.ndarray, numpy.ndarray)

In [92]:
X_train, X_eval, y_train, y_eval = train_test_split(
    X, y, 
    test_size=0.2, 
    shuffle=True, 
    stratify=y, 
    random_state=42
)
train_loader = torch.utils.data.DataLoader(
    RecipesDataset(X_train, y_train),
    batch_size=batch_size, 
    shuffle=False  # already shuffled on split
)
eval_loader = torch.utils.data.DataLoader(
    RecipesDataset(X_eval, y_eval),
    batch_size=test_batch_size, 
    shuffle=False  # already shuffled on split
)

In [93]:
del model
torch.cuda.empty_cache()
model = BaselineCnn().to(device)
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)

In [94]:
for epoch in range(1, epochs + 1):
    train(log_interval, model, device, train_loader, optimizer, epoch)
    eval_model(model, device, eval_loader)

torch.Size([6456, 64]) torch.Size([100])


ValueError: Expected input batch_size (6456) to match target batch_size (100).

In [27]:
%%time
splitter = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
results = []
train_X_, eval_X_, train_y_, eval_y_ = train_test_split(train_data, labels_onehot, stratify=labels, test_size=0.25, shuffle=True, random_state=42)
fit_params = {
    'batch_size': 128,
    'epochs': 4,
    'validation_data': (eval_X_, eval_y_),
    'shuffle': True,
    'verbose': 1,
}
model = build_model()
model.fit(train_X_, train_y_, **fit_params)
score = model.evaluate(eval_X_, eval_y_)[0]
print(f"val_accuracy={score:.4f}")
results.append({
    'score': score,
    'model': model
})

ValueError: Error when checking input: expected conv1d_1_input to have 3 dimensions, but got array with shape (22500, 6462)

# Submission generation
Code below is more complex than it must be, but its copied from baseline and wil allow us for easy ensembling of various models later.

In [71]:
sample_subm = pd.read_csv('./input/sample_submission.csv')
print(sample_subm.shape)
sample_subm.head()

(9774, 2)


Unnamed: 0,Id,cuisine
0,24888,italian
1,43564,italian
2,21898,italian
3,6991,italian
4,37700,italian


In [78]:
preds = model.predict(test_data, batch_size=128, verbose=1)



In [75]:
result_ids = [r['id'] for r in test]
print(f"Using label encored: {label_encoder}")
print(f"Using result ids: {result_ids[:7]}...")

def generate_predictions(model_data) -> pd.DataFrame:
    model = model_data['model']
    preds = model.predict(test_data, batch_size=128, verbose=1)
    pred_labels = onehot_encoder.inverse_transform(preds)
    pred_names = label_encoder.inverse_transform(pred_labels.ravel())
    return pd.DataFrame({
        'id': result_ids,
        'cuisine': pred_names
    })

Using label encored: LabelEncoder()
Using result ids: [24888, 43564, 21898, 6991, 37700, 43546, 20544]...


In [76]:
%%time
subm_dfs = [generate_predictions(model_data) for model_data in results]

CPU times: user 7.65 s, sys: 1.61 s, total: 9.26 s
Wall time: 8.55 s


In [77]:
subm = pd.concat(subm_dfs)
print(subm.shape)
subm.head()

(9774, 2)


Unnamed: 0,id,cuisine
0,24888,italian
1,43564,italian
2,21898,italian
3,6991,italian
4,37700,italian


In [50]:
%%time
_sf = subm.groupby('id').cuisine.apply(lambda arr: sp.stats.mode(arr).mode[0])
subm_final = pd.DataFrame({
    'Id': _sf.index,
    'cuisine': _sf.values
})



CPU times: user 2 s, sys: 76 ms, total: 2.08 s
Wall time: 1.97 s


In [40]:
subm_final.head()

Unnamed: 0,Id,cuisine
0,16,indian
1,22,mexican
2,24,southern_us
3,32,japanese
4,48,indian


In [44]:
# sanity checks
assert(subm_final.notna().all().all())
assert(sorted(sample_subm['Id'].unique()) == sorted(subm_final['Id'].unique()))
assert(sample_subm.shape == subm_final.shape)

In [48]:
scores = [model_data['score'] for model_data in results]
mean_cv_score = np.mean(scores)
std_cv_score = np.std(scores)
model_name = 'LGBM'
subm_filename = f'{model_name}-cvmean={mean_cv_score:.4f}-cvstd={std_cv_score:.4f}.csv'
subm_path = os.path.join('./submissions/', subm_filename)
subm_path

'./submissions/LGBM-cvmean=0.7815-cvstd=0.0079.csv'

In [52]:
subm_final.to_csv(subm_path, index=False)

In [53]:
!kaggle competitions submit -f {subm_path} -m "Baseline" ml1819-whats-cooking

100%|████████████████████████████████████████| 136k/136k [00:02<00:00, 53.8kB/s]
Successfully submitted to ML1819 - What's Cooking?