In [37]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json

import matplotlib.pyplot as plt
import seaborn as sns

from typing import List
from tqdm import tqdm
import os
import re

In [17]:
# Load the raw recipe records. Context managers close the file handles
# deterministically instead of leaving them to the garbage collector.
with open('./input/cooking_train.json', 'r') as train_file:
    train = json.load(train_file)
with open('./input/cooking_test.json', 'r') as test_file:
    test = json.load(test_file)

In [18]:
len(train)

30000

In [19]:
len(test)

9774

In [20]:
train[0]

{'cuisine': 'chinese',
 'id': 29565,
 'ingredients': ['romaine lettuce',
  'sliced almonds',
  'vegetable oil',
  'scallions',
  'soy sauce',
  'cooked chicken',
  'napa cabbage',
  'chopped cilantro fresh',
  'sugar',
  'sesame seeds',
  'wonton wrappers',
  'fresh lemon juice',
  'white vinegar',
  'black pepper',
  'sesame oil',
  'salt',
  'snow peas']}

In [21]:
# All recipe records in one list: the training records followed by the test records.
all_data = train + test

## Count ingredients per recipe

In [22]:
# Per-recipe metadata indexed by recipe id: number of ingredients, the cuisine
# label and a flag marking the row as coming from the training split.
train_meta = pd.DataFrame(
    data={
        'ingred_len': [len(recipe['ingredients']) for recipe in train],
        'cuisine': [recipe['cuisine'] for recipe in train],
        'train': 1,
    },
    index=[recipe['id'] for recipe in train],
)

# Same layout for the test split; no `cuisine` column is built here.
test_meta = pd.DataFrame(
    data={
        'ingred_len': [len(recipe['ingredients']) for recipe in test],
        'train': 0,
    },
    index=[recipe['id'] for recipe in test],
)

In [23]:
train_meta.head()

Unnamed: 0,ingred_len,cuisine,train
29565,17,chinese,1
15528,8,italian,1
38015,15,cajun_creole,1
20511,19,italian,1
44111,14,chinese,1


In [24]:
# Stack train and test metadata (train rows first). The two frames have different
# columns, so sort=True orders the resulting columns alphabetically.
meta = pd.concat([train_meta, test_meta], sort=True)
len(meta)

39774

# Analyzing ingredients

In [30]:
%%time
def _ingredient_frame(recipe):
    """Build a frame with one row per ingredient and a column of 1s named after the recipe's cuisine."""
    return pd.DataFrame({
        'ingredient': recipe['ingredients'],
        recipe['cuisine']: 1
    })


# One small frame per training recipe; they are stacked and summed in a later cell.
ingred_cuisine_dfs = [_ingredient_frame(recipe) for recipe in tqdm(train)]

100%|██████████| 30000/30000 [00:15<00:00, 1989.20it/s]

CPU times: user 15.1 s, sys: 524 ms, total: 15.6 s
Wall time: 15.1 s





In [31]:
%%time
# Stack the per-recipe frames, treat "cuisine column absent" as 0, then sum per
# ingredient to get an ingredient x cuisine count table.
ingredient_cuisines = (
    pd.concat(ingred_cuisine_dfs, sort=False)
    .fillna(0)
    .groupby('ingredient')
    .sum()
)

CPU times: user 30.7 s, sys: 700 ms, total: 31.4 s
Wall time: 25.7 s


In [32]:
ingredient_cuisines.head()

Unnamed: 0_level_0,chinese,italian,cajun_creole,southern_us,spanish,british,mexican,korean,indian,thai,irish,filipino,greek,jamaican,vietnamese,french,moroccan,japanese,russian,brazilian
ingredient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
( oz.) tomato sauce,0.0,6.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
( oz.) tomato paste,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(10 oz.) frozen chopped spinach,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(14.5 oz.) diced tomatoes,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(15 oz.) refried beans,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Let's analyze what the ingredient names actually contain - some characters, such as brackets, can be easily removed to eliminate noise from the data. If many names contain numerical values, we can separate those values into meaningful features.

In [79]:
# Per ingredient name: how many characters are neither ASCII letters nor whitespace,
# and how many separate runs of digits it contains.
# Raw strings are required so that `\s` and `\d` reach the regex engine as written;
# in a plain string literal they are invalid escape sequences and trigger a warning.
ingredient_cuisines['count_notalpha'] = [len(re.findall(r'[^a-zA-Z\s]', name)) for name in ingredient_cuisines.index]
ingredient_cuisines['count_numbers'] = [len(re.findall(r'\d+', name)) for name in ingredient_cuisines.index]

In [80]:
ingredient_cuisines['count_notalpha'].agg(['min', 'max', 'mean'])

min     0.000000
max     7.000000
mean    0.085266
Name: count_notalpha, dtype: float64

In [55]:
ingredient_cuisines['count_numbers'].agg(['min', 'max', 'mean'])

min     0.000000
max     4.000000
mean    0.006239
Name: count_numbers, dtype: float64

In [61]:
len(ingredient_cuisines[ingredient_cuisines['count_numbers'] > 0])

35

# Generating vectors

In [81]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [82]:
%%time
# Train records first, then test records; later cells rely on this row order
# when slicing the combined matrix back into train and test parts.
all_recipes = train + test
print(len(all_recipes))

39774
CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 940 µs


In [93]:
def preprocess_ingredients(recipe_list: List[str]) -> str:
    """Collapse every ingredient of a recipe into one lowercase token and join them.

    For each ingredient, every character that is not an ASCII letter or whitespace
    is dropped, space characters are removed and the rest is lowercased, so a
    multi-word ingredient such as ``'romaine lettuce'`` becomes ``'romainelettuce'``.

    Args:
        recipe_list: Ingredient names of a single recipe.

    Returns:
        The normalized ingredients joined with ``', '`` (an empty string for an
        empty list).
    """
    def normalize(ingredient: str) -> str:
        # Raw string: `\s` in a plain literal is an invalid escape sequence.
        letters_and_ws = re.sub(r'[^a-zA-Z\s]', '', ingredient)
        # Only the plain space character is removed, exactly as before.
        return letters_and_ws.replace(' ', '').lower()

    return ", ".join(normalize(ingredient) for ingredient in recipe_list)

In [92]:
preprocess_ingredients(all_recipes[0]['ingredients'])

'romainelettuce, slicedalmonds, vegetableoil, scallions, soysauce, cookedchicken, napacabbage, choppedcilantrofresh, sugar, sesameseeds, wontonwrappers, freshlemonjuice, whitevinegar, blackpepper, sesameoil, salt, snowpeas'

In [94]:
# Fit TF-IDF on the normalized ingredient strings of all recipes, i.e. on the
# training and the test records together (all_recipes = train + test).
vectorizer = TfidfVectorizer()
all_ingredients = [preprocess_ingredients(r['ingredients']) for r in all_recipes]
all_vectors = vectorizer.fit_transform(all_ingredients)
print(type(all_vectors))
# The matrix must have exactly one row per recipe.
assert(len(all_recipes) == all_vectors.shape[0])
print(all_vectors[0].shape)

<class 'scipy.sparse.csr.csr_matrix'>
(1, 6650)


# Generating features

In [95]:
# Columns of `meta` that are used as additional numeric model features.
meta_feature_columns = ['ingred_len']
meta.head()

Unnamed: 0,cuisine,ingred_len,train
29565,chinese,17,1
15528,italian,8,1
38015,cajun_creole,15,1
20511,italian,19,1
44111,chinese,14,1


In [96]:
# Dense array of the selected meta features. Its rows follow `meta`, which was
# built as train rows followed by test rows.
features = meta[meta_feature_columns].values
features.shape

(39774, 1)

# Assembling model input

In [97]:
import scipy as sp
from sklearn.preprocessing import LabelEncoder

In [98]:
# Both parts must describe the same recipes row by row before they are joined.
assert(all_vectors.shape[0] == features.shape[0])
# Append the meta feature column(s) to the right of the TF-IDF columns; the result
# is requested in CSR format, which is row-sliced in a later cell.
data = sp.sparse.hstack([all_vectors, sp.sparse.csr_matrix(features)], format='csr')
type(data)

scipy.sparse.csr.csr_matrix

In [99]:
# Cuisine name of every training recipe, encoded as integer class ids.
# `label_encoder` is kept to map predictions back to cuisine names.
cousine_names = [r['cuisine'] for r in train]
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(cousine_names)

In [100]:
# `data` holds the training recipes first (one label each), so the first
# len(labels) rows are the training part and the remainder is the test part.
train_data = data[:len(labels)]
test_data = data[len(labels):]

# Model training and cross-validation

In [101]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold

In [None]:
%%time
# Stratified 4-fold cross-validation: one LightGBM classifier is fitted per fold
# and stored in `results` together with its score on the held-out fold.
splitter = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)
results = []
for fold, (train_idx_, eval_idx_) in enumerate(splitter.split(train_data, labels)):
    train_X_, train_y_ = train_data[train_idx_], labels[train_idx_]
    eval_X_, eval_y_ = train_data[eval_idx_], labels[eval_idx_]
    # The hyper-parameters do not depend on the fold.
    model_params = {
        'objective': 'multiclass',
        # NOTE(review): DART is combined with early stopping below. Recent LightGBM
        # releases report early stopping as unavailable in dart mode - confirm that
        # `best_iteration_` is meaningful for the installed version.
        'boosting': 'dart',
        'learning_rate': 0.2137,
        'n_estimators': 512,
        # NOTE(review): `n_classes` does not look like a LightGBM parameter name
        # (the documented one is `num_class`); presumably it is ignored - verify.
        'n_classes': len(np.unique(labels)),
        'n_jobs': 16,
        'random_state': 42,
        'silent': True,
    }
    # The held-out fold is used both for the evaluation log and for early stopping.
    fit_params = {
        'eval_set': (eval_X_, eval_y_),
        'eval_metric': 'multi_error',
        'early_stopping_rounds': 64,
        'verbose': 16,
    }
    model = LGBMClassifier(**model_params)
    model.fit(train_X_, train_y_, **fit_params)
    # Score of the fitted model on the held-out fold (printed as val_accuracy).
    score = model.score(eval_X_, eval_y_)
    print(f"Fold {fold}, val_accuracy={score}")
    results.append({
        'score': score,
        'model': model
    })

Training until validation scores don't improve for 64 rounds.
[16]	valid_0's multi_error: 0.322185	valid_0's multi_logloss: 1.23227
[32]	valid_0's multi_error: 0.309793	valid_0's multi_logloss: 1.20159
[48]	valid_0's multi_error: 0.292205	valid_0's multi_logloss: 1.0781
[64]	valid_0's multi_error: 0.281279	valid_0's multi_logloss: 1.05662
[80]	valid_0's multi_error: 0.278614	valid_0's multi_logloss: 1.01923
[96]	valid_0's multi_error: 0.269554	valid_0's multi_logloss: 0.973676
[112]	valid_0's multi_error: 0.268754	valid_0's multi_logloss: 0.990077
[128]	valid_0's multi_error: 0.271686	valid_0's multi_logloss: 0.990391
[144]	valid_0's multi_error: 0.262625	valid_0's multi_logloss: 0.950828
[160]	valid_0's multi_error: 0.263824	valid_0's multi_logloss: 0.959737
[176]	valid_0's multi_error: 0.25956	valid_0's multi_logloss: 0.931005
[192]	valid_0's multi_error: 0.259027	valid_0's multi_logloss: 0.922669
[208]	valid_0's multi_error: 0.258628	valid_0's multi_logloss: 0.936602
[224]	valid_0's

# Submission generation
For a start, we will just perform simple majority voting over the test-set predictions of the models trained on the individual folds

In [34]:
# Sample submission file; used later as the reference for ids and shape.
sample_subm = pd.read_csv('./input/sample_submission.csv')
print(sample_subm.shape)
sample_subm.head()

(9774, 2)


Unnamed: 0,Id,cuisine
0,24888,italian
1,43564,italian
2,21898,italian
3,6991,italian
4,37700,italian


In [35]:
# Recipe ids of the test records, in the order in which they appear in `test`.
result_ids = [r['id'] for r in test]
print(f"Using label encored: {label_encoder}")
print(f"Using result ids: {result_ids[:7]}...")

def generate_predictions(model_data) -> pd.DataFrame:
    """Predict cuisines for the test set with one fitted fold model.

    Args:
        model_data: Mapping that holds the fitted classifier under ``'model'``.

    Returns:
        DataFrame with an ``id`` column (taken from ``result_ids``) and a
        ``cuisine`` column containing the decoded class names.
    """
    fold_model = model_data['model']
    # Predict with the iteration count the model stored as `best_iteration_`.
    encoded_preds = fold_model.predict(test_data, num_iteration=fold_model.best_iteration_)
    decoded_preds = label_encoder.inverse_transform(encoded_preds)
    submission = pd.DataFrame({
        'id': result_ids,
        'cuisine': decoded_preds
    })
    return submission

Using label encored: LabelEncoder()
Using result ids: [24888, 43564, 21898, 6991, 37700, 43546, 20544]...


In [36]:
%%time
# One prediction frame per entry of `results`.
subm_dfs = [generate_predictions(model_data) for model_data in results]

CPU times: user 22.3 s, sys: 444 ms, total: 22.8 s
Wall time: 2.19 s


In [37]:
# Long table of votes: every test id appears once per frame in `subm_dfs`.
subm = pd.concat(subm_dfs)
print(subm.shape)
subm.head()

(58644, 2)


Unnamed: 0,id,cuisine
0,24888,italian
1,43564,spanish
2,21898,italian
3,6991,moroccan
4,37700,spanish


In [38]:
# Quick check of how the most frequent value is read from scipy's mode result.
# NOTE(review): newer SciPy releases appear to return a scalar `.mode` for 1-D
# input, in which case the `[0]` index would fail - confirm against the installed version.
sp.stats.mode([1,2,2,3]).mode[0]

2

In [39]:
%%time
# Majority vote per test recipe over the predictions of all fold models.
# `Series.mode()` returns the most frequent values in sorted order, so `.iloc[0]`
# resolves ties by taking the alphabetically first cuisine - the same choice
# `scipy.stats.mode` made. Unlike it, pandas handles string labels on every
# version and always returns an indexable result.
_sf = subm.groupby('id').cuisine.apply(lambda votes: votes.mode().iloc[0])
subm_final = pd.DataFrame({
    'Id': _sf.index,
    'cuisine': _sf.values
})



CPU times: user 3.12 s, sys: 192 ms, total: 3.31 s
Wall time: 3.14 s


In [40]:
subm_final.head()

Unnamed: 0,Id,cuisine
0,16,indian
1,22,mexican
2,24,southern_us
3,32,japanese
4,48,indian


In [44]:
# Sanity checks against the sample submission:
# no missing values, the same set of ids, and the same shape.
assert subm_final.notna().all().all()
assert sorted(sample_subm['Id'].unique()) == sorted(subm_final['Id'].unique())
assert sample_subm.shape == subm_final.shape

In [48]:
# Encode the mean and standard deviation of the per-fold validation scores in the
# submission file name so that submissions can be told apart later.
scores = [model_data['score'] for model_data in results]
mean_cv_score = np.mean(scores)
std_cv_score = np.std(scores)
model_name = 'LGBM'
subm_filename = f'{model_name}-cvmean={mean_cv_score:.4f}-cvstd={std_cv_score:.4f}.csv'
subm_path = os.path.join('./submissions/', subm_filename)
subm_path

'./submissions/LGBM-cvmean=0.7815-cvstd=0.0079.csv'

In [52]:
# Create the target directory first: `to_csv` does not create missing folders
# and would fail on a fresh checkout without `./submissions/`.
os.makedirs(os.path.dirname(subm_path) or '.', exist_ok=True)
subm_final.to_csv(subm_path, index=False)

In [53]:
!kaggle competitions submit -f {subm_path} -m "Baseline" ml1819-whats-cooking

100%|████████████████████████████████████████| 136k/136k [00:02<00:00, 53.8kB/s]
Successfully submitted to ML1819 - What's Cooking?

# Possible improvements

- For a baseline, each ingredient is collapsed into a single token before vectorization, so multi-word ingredients are treated the same as single-word ones -
  it may be beneficial to separate the ingredient from its modifiers
- TF-IDF does not take the position of an ingredient in the list into account - other vectorization techniques are worth trying
- Testing other models: NNs in particular might work well on such a dataset - if well designed, they can
  take the order and the comma-separation of ingredients vs. their modifiers into account