In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json

import matplotlib.pyplot as plt
import seaborn as sns

from typing import List

import os
from tqdm import tqdm

In [2]:
train = json.load(open('./input/cooking_train.json', 'r'))
test = json.load(open('./input/cooking_test.json', 'r'))

In [3]:
len(train)

30000

In [4]:
len(test)

9774

In [5]:
train[0]

{'cuisine': 'chinese',
 'id': 29565,
 'ingredients': ['romaine lettuce',
  'sliced almonds',
  'vegetable oil',
  'scallions',
  'soy sauce',
  'cooked chicken',
  'napa cabbage',
  'chopped cilantro fresh',
  'sugar',
  'sesame seeds',
  'wonton wrappers',
  'fresh lemon juice',
  'white vinegar',
  'black pepper',
  'sesame oil',
  'salt',
  'snow peas']}

In [6]:
all_recipes = train + test

## Count unique ingredients

In [7]:
train_meta = pd.DataFrame(index=[r['id'] for r in train], data={
    'ingred_len': [len(r['ingredients']) for r in train],
    'cuisine': [r['cuisine'] for r in train],
    'train': 1
})

test_meta = pd.DataFrame(index=[r['id'] for r in test], data={
    'ingred_len': [len(r['ingredients']) for r in test],
    'train': 0
})

In [8]:
train_meta.head()

Unnamed: 0,ingred_len,cuisine,train
29565,17,chinese,1
15528,8,italian,1
38015,15,cajun_creole,1
20511,19,italian,1
44111,14,chinese,1


In [9]:
meta = pd.concat([train_meta, test_meta], sort=True)
len(meta)

39774

# Generating vectors

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

In [11]:
%%time
all_recipes = train + test
print(len(all_recipes))

39774
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 828 µs


In [12]:
%%time
vectorizer = CountVectorizer(ngram_range=(1,2), analyzer='word')
all_ingredients = [', '.join(r['ingredients']) for r in all_recipes]
all_vectors = vectorizer.fit_transform(all_ingredients)
print(type(all_vectors))
assert(len(all_recipes) == all_vectors.shape[0])
print(all_vectors.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(39774, 81045)
CPU times: user 1.74 s, sys: 44 ms, total: 1.78 s
Wall time: 1.78 s


Bigrams are way too big feature space for this dataset, let's see if we can decrease their number by only using the most meaningful words / ingredients.

In [13]:
%%time
ingred_cuisine_dfs = [
    pd.DataFrame({
        'ingredient': recipe['ingredients'],
        recipe['cuisine']: 1
    }) for recipe in tqdm(train)
]

100%|██████████| 30000/30000 [00:13<00:00, 2195.30it/s]

CPU times: user 13.9 s, sys: 556 ms, total: 14.4 s
Wall time: 13.7 s





In [21]:
%%time
ingredient_cuisines = pd.concat(ingred_cuisine_dfs, sort=False).fillna(0).groupby('ingredient').sum()

CPU times: user 30.4 s, sys: 496 ms, total: 30.9 s
Wall time: 24.5 s


In [22]:
ingredient_cuisines.head()

Unnamed: 0_level_0,chinese,italian,cajun_creole,southern_us,spanish,british,mexican,korean,indian,thai,irish,filipino,greek,jamaican,vietnamese,french,moroccan,japanese,russian,brazilian
ingredient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
( oz.) tomato sauce,0.0,6.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
( oz.) tomato paste,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(10 oz.) frozen chopped spinach,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(14.5 oz.) diced tomatoes,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(15 oz.) refried beans,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
cuisine_colnames = ingredient_cuisines.columns
ingredient_cuisines['sum'] = ingredient_cuisines[cuisine_colnames].sum(axis=1)
ingredient_cuisines['std'] = ingredient_cuisines[cuisine_colnames].std(axis=1)
ingredient_cuisines['nunique'] = ingredient_cuisines[cuisine_colnames].nunique(axis=1)
ingredient_cuisines.sort_values(by='sum', ascending=False, inplace=True)

In [24]:
ingredient_cuisines.head()

Unnamed: 0_level_0,chinese,italian,cajun_creole,southern_us,spanish,british,mexican,korean,indian,thai,...,jamaican,vietnamese,french,moroccan,japanese,russian,brazilian,sum,std,nunique
ingredient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
salt,709.0,2586.0,560.0,1739.0,348.0,301.0,2071.0,194.0,1476.0,306.0,...,261.0,197.0,908.0,304.0,332.0,211.0,145.0,13681.0,708.865994,20
olive oil,103.0,2345.0,234.0,224.0,296.0,39.0,999.0,27.0,272.0,88.0,...,46.0,29.0,377.0,303.0,49.0,36.0,92.0,6026.0,531.868019,20
onions,212.0,940.0,382.0,358.0,190.0,108.0,1124.0,155.0,922.0,122.0,...,128.0,82.0,274.0,213.0,124.0,107.0,97.0,6024.0,312.643836,20
water,575.0,784.0,207.0,518.0,112.0,84.0,689.0,183.0,609.0,241.0,...,109.0,169.0,335.0,136.0,288.0,79.0,66.0,5604.0,227.222034,20
garlic,593.0,1129.0,282.0,180.0,93.0,26.0,982.0,216.0,559.0,321.0,...,98.0,167.0,162.0,108.0,129.0,16.0,66.0,5574.0,308.252989,20


In [25]:
popular_ingredients = list(ingredient_cuisines[ingredient_cuisines['sum'] > 512].index)
other_ingredients = [ingredient for ingredient in ingredient_cuisines.index if not ingredient in set(popular_ingredients)]
len(popular_ingredients), len(other_ingredients)

(117, 6134)

In [26]:
present_everywhere = list(ingredient_cuisines[ingredient_cuisines['nunique'] == len(cuisine_colnames)].index)
len(present_everywhere)

13

In [27]:
set(present_everywhere) - set(popular_ingredients)

set()

We could use position of these popular ingredients as features, and ignore them in vectorization of less popular ones.

In [28]:
def preprocess_recipe(recipe: dict, popular_ingredients: List[str]=popular_ingredients, other_ingredients: List[str]=other_ingredients) -> dict:
    recipe['ingredients'] = np.array(recipe['ingredients'], dtype=str)
    recipe['positions'] = [
        np.argmin(recipe['ingredients'] == ingredient) + 1 if ingredient in set(recipe['ingredients']) else 0 
        for ingredient in popular_ingredients
    ]
    recipe['ingredients'] = np.array([ingredient for ingredient in recipe['ingredients'] if not ingredient in set(popular_ingredients)])
    return recipe

In [29]:
%%time
preprocessed_recipes = [preprocess_recipe(recipe) for recipe in tqdm(all_recipes)]

100%|██████████| 39774/39774 [00:23<00:00, 1672.40it/s]

CPU times: user 24 s, sys: 984 ms, total: 25 s
Wall time: 23.8 s





# Generating features

In [30]:
meta_feature_columns = ['ingred_len']
meta.head()

Unnamed: 0,cuisine,ingred_len,train
29565,chinese,17,1
15528,italian,8,1
38015,cajun_creole,15,1
20511,italian,19,1
44111,chinese,14,1


In [31]:
%%time
pos_feature_columns = popular_ingredients
pos_feature_dfs = [
    pd.DataFrame(data=[recipe.get('positions', np.zeros(len(popular_ingredients)))], columns=popular_ingredients, index=[recipe['id']])
    for recipe in tqdm(all_recipes)
]
pos_features = pd.concat(pos_feature_dfs, sort=False)

100%|██████████| 39774/39774 [03:48<00:00, 174.38it/s]


CPU times: user 3min 51s, sys: 4.68 s, total: 3min 55s
Wall time: 3min 50s


In [32]:
meta[meta_feature_columns].shape, pos_features.shape

((39774, 1), (39774, 117))

In [33]:
%%time
features = pd.concat([meta[meta_feature_columns], pos_features], axis=1, ignore_index=True)
print(features.shape)

(39774, 118)
CPU times: user 312 ms, sys: 16 ms, total: 328 ms
Wall time: 20.9 ms


In [34]:
%%time
vectorizer = CountVectorizer(analyzer='word')  # no ngrams this time
all_ingredients = [', '.join(r['ingredients']) for r in all_recipes]  # all recipes contain preprocessed recipes
all_vectors = vectorizer.fit_transform(all_ingredients)
print(type(all_vectors))
assert(len(all_recipes) == all_vectors.shape[0])
print(all_vectors.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(39774, 3008)
CPU times: user 716 ms, sys: 8 ms, total: 724 ms
Wall time: 721 ms


# Assembling model input

In [35]:
import scipy as sp
from sklearn.preprocessing import LabelEncoder

In [41]:
assert(all_vectors.shape[0] == features.shape[0])
data = sp.sparse.hstack([all_vectors, sp.sparse.csr_matrix(features.astype(float))], format='csr')
type(data), data.shape

(scipy.sparse.csr.csr_matrix, (39774, 3126))

In [42]:
cousine_names = [r['cuisine'] for r in train]
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(cousine_names)

In [43]:
train_data = data[:len(labels)]
test_data = data[len(labels):]

# Model training and cross-validation

In [44]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold

In [45]:
%%time
splitter = StratifiedKFold(n_splits=6, random_state=42, shuffle=True)
results = []
for fold, (train_idx_, eval_idx_) in enumerate(splitter.split(train_data, labels)):
    train_X_, train_y_ = train_data[train_idx_], labels[train_idx_]
    eval_X_, eval_y_ = train_data[eval_idx_], labels[eval_idx_]
    model_params = {
        'objective': 'multiclass',
        'n_estimators': 512,
        'n_classes': len(np.unique(labels)),
        'n_jobs': 12,
        'random_state': 42,
        'silent': True,
    }
    fit_params = {
        'eval_set': (eval_X_, eval_y_),
        'eval_metric': 'multi_error',
        'early_stopping_rounds': 64,
        'verbose': 16,
    }
    model = LGBMClassifier(**model_params)
    model.fit(train_X_, train_y_, **fit_params)
    score = model.score(eval_X_, eval_y_)
    print(f"Fold {fold}, val_accuracy={score}")
    results.append({
        'score': score,
        'model': model
    })

Training until validation scores don't improve for 64 rounds.
[16]	valid_0's multi_logloss: 1.30516	valid_0's multi_error: 0.304174
[32]	valid_0's multi_logloss: 1.03403	valid_0's multi_error: 0.27162
[48]	valid_0's multi_logloss: 0.907695	valid_0's multi_error: 0.251248
[64]	valid_0's multi_logloss: 0.837092	valid_0's multi_error: 0.238466
[80]	valid_0's multi_logloss: 0.794376	valid_0's multi_error: 0.233273
[96]	valid_0's multi_logloss: 0.767856	valid_0's multi_error: 0.225484
[112]	valid_0's multi_logloss: 0.750516	valid_0's multi_error: 0.22109
[128]	valid_0's multi_logloss: 0.740433	valid_0's multi_error: 0.219093
[144]	valid_0's multi_logloss: 0.733222	valid_0's multi_error: 0.214899
[160]	valid_0's multi_logloss: 0.727541	valid_0's multi_error: 0.2145
[176]	valid_0's multi_logloss: 0.724324	valid_0's multi_error: 0.215299
[192]	valid_0's multi_logloss: 0.721958	valid_0's multi_error: 0.213102
[208]	valid_0's multi_logloss: 0.720539	valid_0's multi_error: 0.212303
[224]	valid_0'

# Submission generation
For a start, we will just perform simple voting from out-of-fold predictions

In [46]:
sample_subm = pd.read_csv('./input/sample_submission.csv')
print(sample_subm.shape)
sample_subm.head()

(9774, 2)


Unnamed: 0,Id,cuisine
0,24888,italian
1,43564,italian
2,21898,italian
3,6991,italian
4,37700,italian


In [47]:
result_ids = [r['id'] for r in test]
print(f"Using label encored: {label_encoder}")
print(f"Using result ids: {result_ids[:7]}...")

def generate_predictions(model_data) -> pd.DataFrame:
    model = model_data['model']
    preds = model.predict(test_data, num_iteration=model.best_iteration_)
    pred_names = label_encoder.inverse_transform(preds)
    return pd.DataFrame({
        'id': result_ids,
        'cuisine': pred_names
    })

Using label encored: LabelEncoder()
Using result ids: [24888, 43564, 21898, 6991, 37700, 43546, 20544]...


In [48]:
%%time
subm_dfs = [generate_predictions(model_data) for model_data in results]

CPU times: user 31.6 s, sys: 264 ms, total: 31.9 s
Wall time: 2.66 s


In [49]:
subm = pd.concat(subm_dfs)
print(subm.shape)
subm.head()

(58644, 2)


Unnamed: 0,id,cuisine
0,24888,italian
1,43564,spanish
2,21898,italian
3,6991,moroccan
4,37700,spanish


In [50]:
sp.stats.mode([1,2,2,3]).mode[0]

2

In [51]:
%%time
_sf = subm.groupby('id').cuisine.apply(lambda arr: sp.stats.mode(arr).mode[0])
subm_final = pd.DataFrame({
    'Id': _sf.index,
    'cuisine': _sf.values
})



CPU times: user 2.29 s, sys: 40 ms, total: 2.33 s
Wall time: 2.2 s


In [52]:
subm_final.head()

Unnamed: 0,Id,cuisine
0,16,indian
1,22,mexican
2,24,southern_us
3,32,japanese
4,48,indian


In [53]:
# sanity checks
assert(subm_final.notna().all().all())
assert(sorted(sample_subm['Id'].unique()) == sorted(subm_final['Id'].unique()))
assert(sample_subm.shape == subm_final.shape)

In [54]:
scores = [model_data['score'] for model_data in results]
mean_cv_score = np.mean(scores)
std_cv_score = np.std(scores)
model_name = 'LGBM'
subm_filename = f'{model_name}-cvmean={mean_cv_score:.4f}-cvstd={std_cv_score:.4f}.csv'
subm_path = os.path.join('./submissions/', subm_filename)
subm_path

'./submissions/LGBM-cvmean=0.7871-cvstd=0.0082.csv'

In [55]:
subm_final.to_csv(subm_path, index=False)

In [56]:
!kaggle competitions submit -f {subm_path} -m "Position-based vectors for top ingredients, Tf-Idf for the rest" ml1819-whats-cooking

100%|████████████████████████████████████████| 136k/136k [00:03<00:00, 39.5kB/s]
Successfully submitted to ML1819 - What's Cooking?

# Possible improvements

LB = 0.79, did not improve the baseline.
- Preprocessing ingredients even more