# Using fastText library by Facebook Research
https://github.com/facebookresearch/fastText

## Data preprocessing

In [86]:
from sklearn.model_selection import train_test_split

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json

from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

import os

In [87]:
train = json.load(open('./input/cooking_train.json', 'r'))
test = json.load(open('./input/cooking_test.json', 'r'))

In [88]:
def process_recipe(recipe: dict) -> str:
    """ Preprocess recipe to a fastText-compatible labeled txt file. """
    doc = ", ".join(recipe['ingredients'])
    try:
        return f"__label__{recipe['cuisine']} {doc}\n"
    except KeyError:
        return f"{doc}\n"

In [89]:
%%time
labeled_docs = [process_recipe(recipe) for recipe in tqdm(train)]
unlabeled_docs = [process_recipe(recipe) for recipe in tqdm(test)]


  0%|          | 0/30000 [00:00<?, ?it/s][A
100%|██████████| 30000/30000 [00:00<00:00, 721125.11it/s][A
  0%|          | 0/9774 [00:00<?, ?it/s][A
100%|██████████| 9774/9774 [00:00<00:00, 602214.17it/s][A

CPU times: user 68 ms, sys: 0 ns, total: 68 ms
Wall time: 64.5 ms


In [90]:
PREPROCESSED_TRAIN_PATH = './data/train-labeled.txt'
PREPROCESSED_EVAL_PATH = './data/eval-labeled.txt'
PREPROCESSED_TEST_PATH = './data/test.txt'
MODEL_PATH = './models/fasttext'

In [91]:
train_docs, eval_docs = train_test_split(labeled_docs, test_size=0.2, random_state=42)

In [92]:
len(train_docs), len(eval_docs)

(24000, 6000)

In [93]:
with open(PREPROCESSED_TRAIN_PATH, 'w') as train_file:
    train_file.writelines(train_docs)
    
with open(PREPROCESSED_EVAL_PATH, 'w') as eval_file:
    eval_file.writelines(eval_docs)
    
with open(PREPROCESSED_TEST_PATH, 'w') as test_file:
    test_file.writelines(unlabeled_docs)

## Model training

In [144]:
!./fastText/fasttext supervised -input {PREPROCESSED_TRAIN_PATH} -output {MODEL_PATH} -lr 0.1 -epoch 30 -wordNgrams 1

Read 0M words
Number of words:  4169
Number of labels: 20
Progress: 100.0% words/sec/thread:  404843 lr:  0.000000 loss:  0.591967 ETA:   0h 0m


In [145]:
!./fastText/fasttext test {MODEL_PATH}.bin {PREPROCESSED_EVAL_PATH}

N	6000
P@1	0.77
R@1	0.77


In [124]:
predicted_labels = !./fastText/fasttext predict {MODEL_PATH}.bin {PREPROCESSED_TEST_PATH}

In [125]:
len(predicted_labels)

9774

In [126]:
predicted_labels[0], predicted_labels[-1]

('__label__italian', '__label__british')

# Submission generation

In [127]:
sample_subm = pd.read_csv('./input/sample_submission.csv')
sample_subm.head()

Unnamed: 0,Id,cuisine
0,24888,italian
1,43564,italian
2,21898,italian
3,6991,italian
4,37700,italian


In [128]:
def extract_label(fasttext_label: str) -> str:
    return fasttext_label[len('__label__'):]

In [129]:
subm = pd.DataFrame({
    'Id': [recipe['id'] for recipe in test],
    'cuisine': [extract_label(ft_label) for ft_label in predicted_labels]
})
subm.head()

Unnamed: 0,Id,cuisine
0,24888,italian
1,43564,brazilian
2,21898,italian
3,6991,moroccan
4,37700,spanish


In [130]:
# sanity checks
assert(subm.notna().all().all())
assert(sorted(sample_subm['Id'].unique()) == sorted(subm['Id'].unique()))
assert(sample_subm.shape == subm.shape)

In [131]:
subm_path = os.path.join('./submissions/', 'fasttext-tuned.csv')
subm_path

'./submissions/fasttext-tuned.csv'

In [132]:
subm.to_csv(subm_path, index=False)

In [133]:
!kaggle competitions submit -f {subm_path} -m "fastText" ml1819-whats-cooking

100%|████████████████████████████████████████| 136k/136k [00:02<00:00, 58.8kB/s]
Successfully submitted to ML1819 - What's Cooking?

## Better preprocessing
We will now squash ingredients together to check how it changes accuracy.

In [147]:
from typing import List

In [148]:
def preprocess_ingredients(recipe_list: List[str]) -> str:
    strip_ingredient = lambda ingredient: "".join([word.lower() for word in ingredient.split(" ") if word.isalnum()])
    return ", ".join([strip_ingredient(ingredient) for ingredient in recipe_list])

In [149]:
def process_recipe_with_preprocessing(recipe: dict) -> str:
    """ Preprocess recipe to a fastText-compatible labeled txt file. """
    doc = preprocess_ingredients(recipe['ingredients'])
    try:
        return f"__label__{recipe['cuisine']} {doc}\n"
    except KeyError:
        return f"{doc}\n"

In [150]:
%%time
labeled_docs_2 = [process_recipe_with_preprocessing(recipe) for recipe in tqdm(train)]
unlabeled_docs_2 = [process_recipe_with_preprocessing(recipe) for recipe in tqdm(test)]


  0%|          | 0/30000 [00:00<?, ?it/s][A
 15%|█▍        | 4429/30000 [00:00<00:00, 44285.50it/s][A
 34%|███▍      | 10341/30000 [00:00<00:00, 47889.55it/s][A
 54%|█████▍    | 16185/30000 [00:00<00:00, 50630.11it/s][A
 73%|███████▎  | 21976/30000 [00:00<00:00, 52604.01it/s][A
 93%|█████████▎| 27936/30000 [00:00<00:00, 54522.94it/s][A
100%|██████████| 30000/30000 [00:00<00:00, 55882.87it/s][A
  0%|          | 0/9774 [00:00<?, ?it/s][A
 60%|██████    | 5894/9774 [00:00<00:00, 58933.45it/s][A
100%|██████████| 9774/9774 [00:00<00:00, 63710.55it/s][A

CPU times: user 684 ms, sys: 24 ms, total: 708 ms
Wall time: 698 ms


In [151]:
PREPROCESSED_TRAIN_PATH_2 = './data/train-labeled-2.txt'
PREPROCESSED_EVAL_PATH_2 = './data/eval-labeled-2.txt'
PREPROCESSED_TEST_PATH_2 = './data/test-2.txt'
MODEL_PATH_2 = './models/fasttext-2'

In [154]:
train_docs_2, eval_docs_2 = train_test_split(labeled_docs_2, test_size=0.2, random_state=42)

In [155]:
len(train_docs), len(eval_docs)

(24000, 6000)

In [156]:
with open(PREPROCESSED_TRAIN_PATH_2, 'w') as train_file:
    train_file.writelines(train_docs_2)
    
with open(PREPROCESSED_EVAL_PATH_2, 'w') as eval_file:
    eval_file.writelines(eval_docs_2)
    
with open(PREPROCESSED_TEST_PATH_2, 'w') as test_file:
    test_file.writelines(unlabeled_docs_2)

## Model training

In [190]:
!./fastText/fasttext supervised -input {PREPROCESSED_TRAIN_PATH_2} -output {MODEL_PATH_2} -lr 0.275 -epoch 50 -wordNgrams 3

Read 0M words
Number of words:  7688
Number of labels: 20
Progress: 100.0% words/sec/thread:  235806 lr:  0.000000 loss:  0.078481 ETA:   0h 0m


In [191]:
!./fastText/fasttext test {MODEL_PATH_2}.bin {PREPROCESSED_EVAL_PATH_2}

N	6000
P@1	0.744
R@1	0.744


In [181]:
predicted_labels = !./fastText/fasttext predict {MODEL_PATH}.bin {PREPROCESSED_TEST_PATH}

In [182]:
len(predicted_labels)

9774

In [183]:
predicted_labels[0], predicted_labels[-1]

('__label__italian', '__label__british')

# Ensembling predictions

# Submission generation

In [127]:
sample_subm = pd.read_csv('./input/sample_submission.csv')
sample_subm.head()

Unnamed: 0,Id,cuisine
0,24888,italian
1,43564,italian
2,21898,italian
3,6991,italian
4,37700,italian


In [128]:
def extract_label(fasttext_label: str) -> str:
    return fasttext_label[len('__label__'):]

In [129]:
subm = pd.DataFrame({
    'Id': [recipe['id'] for recipe in test],
    'cuisine': [extract_label(ft_label) for ft_label in predicted_labels]
})
subm.head()

Unnamed: 0,Id,cuisine
0,24888,italian
1,43564,brazilian
2,21898,italian
3,6991,moroccan
4,37700,spanish


In [130]:
# sanity checks
assert(subm.notna().all().all())
assert(sorted(sample_subm['Id'].unique()) == sorted(subm['Id'].unique()))
assert(sample_subm.shape == subm.shape)

In [131]:
subm_path = os.path.join('./submissions/', 'fasttext-tuned.csv')
subm_path

'./submissions/fasttext-tuned.csv'

In [132]:
subm.to_csv(subm_path, index=False)

In [133]:
!kaggle competitions submit -f {subm_path} -m "fastText" ml1819-whats-cooking

100%|████████████████████████████████████████| 136k/136k [00:02<00:00, 58.8kB/s]
Successfully submitted to ML1819 - What's Cooking?