In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
import spacy

import matplotlib.pyplot as plt
import seaborn as sns

from typing import List
from tqdm import tqdm
import os
import re

In [2]:
train = json.load(open('./input/cooking_train.json', 'r'))
test = json.load(open('./input/cooking_test.json', 'r'))

In [3]:
len(train)

30000

In [4]:
len(test)

9774

In [5]:
train[0]

{'cuisine': 'chinese',
 'id': 29565,
 'ingredients': ['romaine lettuce',
  'sliced almonds',
  'vegetable oil',
  'scallions',
  'soy sauce',
  'cooked chicken',
  'napa cabbage',
  'chopped cilantro fresh',
  'sugar',
  'sesame seeds',
  'wonton wrappers',
  'fresh lemon juice',
  'white vinegar',
  'black pepper',
  'sesame oil',
  'salt',
  'snow peas']}

In [6]:
all_data = train + test

## Count unique ingredients

In [7]:
train_meta = pd.DataFrame(index=[r['id'] for r in train], data={
    'ingred_len': [len(r['ingredients']) for r in train],
    'cuisine': [r['cuisine'] for r in train],
    'train': 1
})

test_meta = pd.DataFrame(index=[r['id'] for r in test], data={
    'ingred_len': [len(r['ingredients']) for r in test],
    'train': 0
})

In [8]:
train_meta.head()

Unnamed: 0,ingred_len,cuisine,train
29565,17,chinese,1
15528,8,italian,1
38015,15,cajun_creole,1
20511,19,italian,1
44111,14,chinese,1


In [9]:
meta = pd.concat([train_meta, test_meta], sort=True)
len(meta)

39774

# Processing ingredients using spaCy

In [10]:
!python -m spacy download en


[93m    Linking successful[0m
    /home/kk385830/miniconda3/envs/kaggle-cooking/lib/python3.6/site-packages/en_core_web_sm
    -->
    /home/kk385830/miniconda3/envs/kaggle-cooking/lib/python3.6/site-packages/spacy/data/en

    You can now load the model via spacy.load('en')



In [11]:
nlp = spacy.load('en')

In [12]:
recipe = all_data[0]
ingredient = recipe['ingredients'][0]
ingredient

'romaine lettuce'

In [68]:
def process_recipe(recipe, nlp=nlp):
    recipe_tokens = nlp(', '.join(recipe['ingredients']))
    nouns = [t for t in recipe_tokens if t.pos == spacy.parts_of_speech.NOUN]
    modifiers = [t for t in recipe_tokens if t.is_alpha and not t in set(nouns)]
    return {
        'nouns': nouns,
        'modifiers': modifiers
    }

In [49]:
%%time
ingred_maps = [process_recipe(recipe) for recipe in tqdm(all_data)]

100%|██████████| 39774/39774 [08:39<00:00, 76.61it/s]

CPU times: user 2h 51min 57s, sys: 7.7 s, total: 2h 52min 4s
Wall time: 8min 39s





In [53]:
ingred_maps[0]

{'nouns': [romaine,
  lettuce,
  vegetable,
  oil,
  scallions,
  soy,
  sauce,
  chicken,
  cabbage,
  cilantro,
  sugar,
  sesame,
  seeds,
  wrappers,
  lemon,
  juice,
  vinegar,
  pepper,
  oil,
  salt,
  snow,
  peas],
 'modifiers': [sliced,
  almonds,
  cooked,
  napa,
  chopped,
  fresh,
  wonton,
  fresh,
  white,
  black,
  sesame]}

This is not nearly perfect, but using a larger model should improve it.

In [50]:
%%time
noun_counts = [
    pd.DataFrame(
        data=[[
            recipe.get('cuisine', None), 
            len(im['nouns']), 
            len(im['modifiers'])]],
        columns=['label', 'len_nouns', 'len_modifiers'],
        index=[recipe['id']]
    ) for recipe, im in tqdm(zip(all_data, ingred_maps))
]

39774it [00:28, 1392.32it/s]

CPU times: user 31.6 s, sys: 592 ms, total: 32.2 s
Wall time: 28.6 s





In [51]:
%%time
nc_df = pd.concat(noun_counts)

CPU times: user 6.04 s, sys: 272 ms, total: 6.32 s
Wall time: 6.39 s


In [54]:
nc_df.sample(5)

Unnamed: 0,label,len_nouns,len_modifiers
39122,french,16,6
45567,chinese,16,6
47032,french,10,6
14325,indian,28,7
24999,korean,20,4


In [55]:
nc_df.agg(['min', 'max', 'mean', 'std'])

Unnamed: 0,len_nouns,len_modifiers
min,0.0,0.0
max,99.0,42.0
mean,14.915246,5.665963
std,6.540612,3.367649


Looks like by using just nouns we can greatly reduce the number of input parameters to the model.

In [56]:
nc_df[nc_df['len_nouns'] == 0]

Unnamed: 0,label,len_nouns,len_modifiers
29570,thai,0,1
10816,greek,0,1
16116,japanese,0,1


These 3 samples should not throw a model off, but let's remember about them for checking whether a larger model really improves the POS tagging.

In [58]:
len(nc_df[nc_df['len_modifiers'] == 0])

852

In [62]:
%%time
unique_nouns = set().union(*[set([noun.text for noun in im['nouns']]) for im in ingred_maps])

CPU times: user 824 ms, sys: 52 ms, total: 876 ms
Wall time: 871 ms


In [65]:
len(unique_nouns)  # reducing feature space by >30%, not bad

2143

# Generating vectors

In [67]:
!python -m spacy download en_core_web_md

Collecting en_core_web_md==2.0.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.0.0/en_core_web_md-2.0.0.tar.gz#egg=en_core_web_md==2.0.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.0.0/en_core_web_md-2.0.0.tar.gz (120.8MB)
[K    100% |████████████████████████████████| 120.9MB 1.1MB/s 
[?25hInstalling collected packages: en-core-web-md
  Running setup.py install for en-core-web-md ... [?25ldone
[?25hSuccessfully installed en-core-web-md-2.0.0

[93m    Linking successful[0m
    /home/kk385830/miniconda3/envs/kaggle-cooking/lib/python3.6/site-packages/en_core_web_md
    -->
    /home/kk385830/miniconda3/envs/kaggle-cooking/lib/python3.6/site-packages/spacy/data/en_core_web_md

    You can now load the model via spacy.load('en_core_web_md')



In [69]:
nlp_bigger = spacy.load('en_core_web_md')

In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [71]:
%%time
all_recipes = train + test
print(len(all_recipes))

39774
CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 6.89 ms


In [110]:
def preprocess_ingredients(recipe_list: List[dict]) -> List[str]:
    noun_lists = [process_recipe(recipe, nlp=nlp_bigger)['nouns'] for recipe in recipe_list]
    return [" ".join([noun.text for noun in nouns]) for nouns in noun_lists]

In [120]:
%%time
len(preprocess_ingredients(all_recipes[:1000]))  # will take 30x that

CPU times: user 3min 55s, sys: 132 ms, total: 3min 55s
Wall time: 11.8 s


1000

In [122]:
%%time
all_ingredients = preprocess_ingredients(all_recipes)

<class 'scipy.sparse.csr.csr_matrix'>
(1, 2220)
CPU times: user 2h 31min 23s, sys: 6.57 s, total: 2h 31min 30s
Wall time: 7min 37s


In [150]:
vectorizer = TfidfVectorizer()
all_vectors = vectorizer.fit_transform(all_ingredients)
print(type(all_vectors))
assert(len(all_recipes) == all_vectors.shape[0])
print(all_vectors.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(39774, 2220)


Feature space is significantly smaller - let's see if we can reduce it further and generate bigrams that will actually capture the order of ingredients.

In [163]:
def keep_popular_ingredients(ingredients: List[str], threshold: int = 10):    
    joined_ingredients = " ".join(ingredients).split(" ")
    unique, counts = np.unique(joined_ingredients, return_counts=True)
    popular_enough = set(unique[counts > threshold])
    print(f"Using {len(popular_enough)} most popular ingredients")
    filtered_ingredients = [" ".join([word for word in ingred_str.split(" ") if word in popular_enough]) for ingred_str in ingredients]
    return filtered_ingredients

In [182]:
%%time
filtered_ingredients = keep_popular_ingredients(all_ingredients, threshold=len(all_ingredients)/10)

Using 38 most popular ingredients
CPU times: user 3.56 s, sys: 140 ms, total: 3.7 s
Wall time: 657 ms


In [183]:
all_ingredients[:10]

['lettuce almonds vegetable oil scallions soy sauce chicken napa cabbage cilantro sugar sesame seeds wonton wrappers lemon juice vinegar pepper sesame oil salt snow peas',
 'pistachios fig bread ciabatta olive oil sugar wine vinegar water cheese',
 'oil lemon chili sauce shrimp butter sauce lemon juice ground pepper paprika cloves oregano bread worcestershire sauce seasoning parsley',
 'basil olive oil potato gnocchi garlic ground beef pepper meatballs cream breadcrumbs baby spinach leaves cheese worcestershire sauce onion seasoning eggs salt sodium chicken broth sea salt bay leaf',
 'honey mushroom tamari soy sauce snow peas oil pepper baby corn chestnut mushrooms garlic ginger root chili peppers lemon wine vinegar noodles',
 'sugar cooking spray purpose flour butter buttermilk baking soda baking powder corn meal eggs salt',
 'egg whites lemon juice food coloring sugar margarine spread milk crumbs lime juice',
 'corn meal olive oil rosemary thyme sage salt pizza doughs cooking spray l

In [184]:
filtered_ingredients[:10]

['vegetable oil soy sauce chicken cilantro sugar lemon juice vinegar pepper oil salt',
 'olive oil sugar wine vinegar water cheese',
 'oil lemon chili sauce butter sauce lemon juice ground pepper cloves sauce',
 'olive oil garlic ground pepper cream leaves cheese sauce onion eggs salt chicken broth salt',
 'soy sauce oil pepper corn garlic ginger chili lemon wine vinegar',
 'sugar purpose flour butter powder corn eggs salt',
 'lemon juice sugar milk lime juice',
 'corn olive oil salt',
 'eggs bell pepper onions cloves ground leaves tomatoes olive oil salt chicken broth',
 'water pepper salt chicken broth onions ground ginger flour oil eggs garlic']

In [188]:
%%time
bigram_vectorizer = TfidfVectorizer(ngram_range=(1,2))
bigram_vectors = bigram_vectorizer.fit_transform(filtered_ingredients)
print(type(bigram_vectors))
assert(len(all_recipes) == bigram_vectors.shape[0])
print(bigram_vectors.shape)

<class 'scipy.sparse.csr.csr_matrix'>
(39774, 1363)
CPU times: user 764 ms, sys: 12 ms, total: 776 ms
Wall time: 772 ms


# Generating features

In [189]:
meta_feature_columns = ['ingred_len']
meta.head()

Unnamed: 0,cuisine,ingred_len,train
29565,chinese,17,1
15528,italian,8,1
38015,cajun_creole,15,1
20511,italian,19,1
44111,chinese,14,1


In [190]:
features = meta[meta_feature_columns].values
features.shape

(39774, 1)

# Assembling model input

In [191]:
import scipy as sp
from sklearn.preprocessing import LabelEncoder

In [192]:
assert(all_vectors.shape[0] == features.shape[0])
data = sp.sparse.hstack([all_vectors, bigram_vectors, sp.sparse.csr_matrix(features)], format='csr')
type(data)

scipy.sparse.csr.csr_matrix

In [193]:
cousine_names = [r['cuisine'] for r in train]
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(cousine_names)

In [194]:
train_data = data[:len(labels)]
test_data = data[len(labels):]

# Model training and cross-validation

In [195]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
%%time
splitter = StratifiedKFold(n_splits=4, random_state=42, shuffle=True)
results = []
for fold, (train_idx_, eval_idx_) in enumerate(splitter.split(train_data, labels)):
    train_X_, train_y_ = train_data[train_idx_], labels[train_idx_]
    eval_X_, eval_y_ = train_data[eval_idx_], labels[eval_idx_]
    model_params = {
        'objective': 'multiclass',
        'boosting': 'dart',
        'learning_rate': 0.2137,
        'n_estimators': 512,
        'n_classes': len(np.unique(labels)),
        'n_jobs': 16,
        'random_state': 42,
        'silent': True,
    }
    fit_params = {
        'eval_set': (eval_X_, eval_y_),
        'eval_metric': 'multi_error',
        'early_stopping_rounds': 64,
        'verbose': 16,
    }
    model = LGBMClassifier(**model_params)
    model.fit(train_X_, train_y_, **fit_params)
    preds = model.predict(eval_X_, num_iteration=model.best_iteration_)
    score = accuracy_score(eval_y_, preds)
    conf_mat = confusion_matrix(eval_y_, preds)
    print(f"Fold {fold}, val_accuracy={score}")
    results.append({
        'score': score,
        'model': model,
        'confusion_matrix': conf_mat
    })

Training until validation scores don't improve for 64 rounds.
[16]	valid_0's multi_logloss: 1.15359	valid_0's multi_error: 0.3006
[32]	valid_0's multi_logloss: 1.10612	valid_0's multi_error: 0.283678
[48]	valid_0's multi_logloss: 0.982007	valid_0's multi_error: 0.26982
[64]	valid_0's multi_logloss: 0.954849	valid_0's multi_error: 0.256362
[80]	valid_0's multi_logloss: 0.920612	valid_0's multi_error: 0.253831
[96]	valid_0's multi_logloss: 0.87845	valid_0's multi_error: 0.247035
[112]	valid_0's multi_logloss: 0.890867	valid_0's multi_error: 0.246636
[128]	valid_0's multi_logloss: 0.891077	valid_0's multi_error: 0.247302
[144]	valid_0's multi_logloss: 0.857787	valid_0's multi_error: 0.242505
[160]	valid_0's multi_logloss: 0.864567	valid_0's multi_error: 0.242638
[176]	valid_0's multi_logloss: 0.841042	valid_0's multi_error: 0.241306
[192]	valid_0's multi_logloss: 0.833041	valid_0's multi_error: 0.238241
[208]	valid_0's multi_logloss: 0.844134	valid_0's multi_error: 0.240906
[224]	valid_0'

In [198]:
[r['score'] for r in results]

[0.7682878081279148,
 0.7537989869368168,
 0.7504335067360277,
 0.7612059765208111]

# Submission generation
For a start, we will just perform simple voting from out-of-fold predictions

In [34]:
sample_subm = pd.read_csv('./input/sample_submission.csv')
print(sample_subm.shape)
sample_subm.head()

(9774, 2)


Unnamed: 0,Id,cuisine
0,24888,italian
1,43564,italian
2,21898,italian
3,6991,italian
4,37700,italian


In [35]:
result_ids = [r['id'] for r in test]
print(f"Using label encored: {label_encoder}")
print(f"Using result ids: {result_ids[:7]}...")

def generate_predictions(model_data) -> pd.DataFrame:
    model = model_data['model']
    preds = model.predict(test_data, num_iteration=model.best_iteration_)
    pred_names = label_encoder.inverse_transform(preds)
    return pd.DataFrame({
        'id': result_ids,
        'cuisine': pred_names
    })

Using label encored: LabelEncoder()
Using result ids: [24888, 43564, 21898, 6991, 37700, 43546, 20544]...


In [36]:
%%time
subm_dfs = [generate_predictions(model_data) for model_data in results]

CPU times: user 22.3 s, sys: 444 ms, total: 22.8 s
Wall time: 2.19 s


In [37]:
subm = pd.concat(subm_dfs)
print(subm.shape)
subm.head()

(58644, 2)


Unnamed: 0,id,cuisine
0,24888,italian
1,43564,spanish
2,21898,italian
3,6991,moroccan
4,37700,spanish


In [38]:
sp.stats.mode([1,2,2,3]).mode[0]

2

In [39]:
%%time
_sf = subm.groupby('id').cuisine.apply(lambda arr: sp.stats.mode(arr).mode[0])
subm_final = pd.DataFrame({
    'Id': _sf.index,
    'cuisine': _sf.values
})



CPU times: user 3.12 s, sys: 192 ms, total: 3.31 s
Wall time: 3.14 s


In [40]:
subm_final.head()

Unnamed: 0,Id,cuisine
0,16,indian
1,22,mexican
2,24,southern_us
3,32,japanese
4,48,indian


In [44]:
# sanity checks
assert(subm_final.notna().all().all())
assert(sorted(sample_subm['Id'].unique()) == sorted(subm_final['Id'].unique()))
assert(sample_subm.shape == subm_final.shape)

In [48]:
scores = [model_data['score'] for model_data in results]
mean_cv_score = np.mean(scores)
std_cv_score = np.std(scores)
model_name = 'LGBM'
subm_filename = f'{model_name}-cvmean={mean_cv_score:.4f}-cvstd={std_cv_score:.4f}.csv'
subm_path = os.path.join('./submissions/', subm_filename)
subm_path

'./submissions/LGBM-cvmean=0.7815-cvstd=0.0079.csv'

In [52]:
subm_final.to_csv(subm_path, index=False)

In [53]:
!kaggle competitions submit -f {subm_path} -m "Baseline" ml1819-whats-cooking

100%|████████████████████████████████████████| 136k/136k [00:02<00:00, 53.8kB/s]
Successfully submitted to ML1819 - What's Cooking?

# Possible improvements

- For a baseline, we have only vectorized words, so multi-word ingredients are treated the same as single-word ones - 
  it may be beneficial to separate the ingredient and its modifiers
- TfIdf does not take position on the list into account - need to try other vectorization techniques
- Testing other models: NNs in particular might work well on such dataset - if well-made we can use them 
  to take order and comma-separation of ingredients vs their modifiers into account