In [None]:
import json
import os
import random
import re

import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer
# Seed Python's global RNG so the random masking choices made below are reproducible.
random.seed(101)

In [None]:
# One-time setup: extract recipes/raw/dataset.zip into recipes/raw/ (uncomment the next line to run it)
# !unzip -oq recipes/raw/dataset.zip -d recipes/raw/

In [None]:
# Load the "recipe_nlg" dataset, pointing the loader at the extracted files in recipes/raw/dataset.
dataset = load_dataset("recipe_nlg", data_dir="recipes/raw/dataset")

In [None]:
# bert-base-uncased tokenizer; used below to tokenize recipes and to measure src/tgt lengths in tokens.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
def mask_recipe_proportion(recipe, tokenizer, proportion):
    """Replace one random contiguous token span of ``recipe`` with ``[MASK]``.

    The recipe is tokenized, a span of ``int(n_tokens * proportion)`` tokens is
    chosen at a random start offset, and the text is rebuilt by decoding the
    tokens before and after the span around a literal ``" [MASK] "``.

    Args:
        recipe: Recipe text to mask.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids`` and
            ``decode``.
        proportion: Fraction of the tokens to hide (the notebook uses ``0.3``).

    Returns:
        dict with keys ``"src_len"`` (token count of the *unmasked* recipe),
        ``"tgt_len"`` (token count of the re-tokenized target text),
        ``"src"`` (masked text) and ``"tgt"`` (hidden text).
    """
    def detokenize(piece):
        # tokens -> ids -> text, as rendered by the tokenizer
        return tokenizer.decode(tokenizer.convert_tokens_to_ids(piece))

    pieces = tokenizer.tokenize(recipe)
    n_tokens = len(pieces)
    # Latest start offset considered; random.randint is inclusive on both ends.
    last_start = int(n_tokens * (1 - proportion))
    begin = random.randint(0, last_start)
    end = begin + int(n_tokens * proportion)

    hidden_text = detokenize(pieces[begin:end])
    # Everything outside the span stays visible, with [MASK] in place of the span.
    visible_text = detokenize(pieces[:begin]) + " [MASK] " + detokenize(pieces[end:])
    return {
        "src_len": n_tokens,
        "tgt_len": len(tokenizer.tokenize(hidden_text)),
        "src": visible_text,
        "tgt": hidden_text,
    }

def is_mask_valid(mask, max_src_len=512, max_tgt_len=128):
    """Return True when a mask record fits within the length budget.

    Args:
        mask: dict with integer ``"src_len"`` and ``"tgt_len"`` entries (as
            produced by ``mask_recipe_proportion``), or ``None``/empty when no
            mask could be built.
        max_src_len: Exclusive upper bound on ``src_len`` (default 512).
        max_tgt_len: Exclusive upper bound on ``tgt_len`` (default 128).

    Returns:
        bool: False for a missing mask or when either length reaches its bound.
    """
    if not mask:
        # No mask was produced for this sample; treat it as invalid rather than raising.
        return False
    return mask["src_len"] < max_src_len and mask["tgt_len"] < max_tgt_len


In [None]:
# Build, for every training recipe, two masked variants:
#   * mask_human: a structure-aware mask (whole ingredient list, or a run of direction steps)
#   * mask_30:    a random contiguous span covering 30% of the tokens
# Recipes for which either variant is missing or too long are counted in `failed`.
failed = 0
masked_recipes = []
progress = tqdm.tqdm(dataset["train"], desc="Masking recipes", total=len(dataset["train"]))
for sample in progress:
    ingredients_text = " ".join(sample["ingredients"])
    recipe = sample["title"] + " Ingredients: " + ingredients_text + " Directions: -" + " -".join(sample["directions"])

    # Reset per sample. Without this, a recipe with too few directions would silently
    # reuse the previous recipe's mask (or raise NameError on the very first sample).
    mask_human = None
    src = tgt = None

    # MASKING: ~15% of samples hide the ingredient list (random.random() > 0.85),
    # the remaining ~85% hide a run of directions.
    # NOTE(review): the original comment said 90%/10% - confirm the intended ratio.
    if random.random() > 0.85:
        # mask ingredients
        src = sample["title"] + " Ingredients: " + " [MASK] " + " Directions: -" + " -".join(sample["directions"])
        tgt = ingredients_text
    elif len(sample["directions"]) > 2:
        # Hide between 1 and len(directions)-2 consecutive steps, so at least two steps stay visible.
        n_directions = len(sample["directions"])
        num_directions_to_mask = random.randint(1, n_directions - 2)
        mask_0 = random.randint(0, n_directions - num_directions_to_mask)
        mask_1 = mask_0 + num_directions_to_mask
        src_masked_directions = sample["directions"][:mask_0] + ["[MASK]"] + sample["directions"][mask_1:]
        tgt = " -".join(sample["directions"][mask_0:mask_1])
        src = sample["title"] + " Ingredients: " + ingredients_text + " Directions: -" + " -".join(src_masked_directions)
    # else: two or fewer directions - no human mask can be built; the sample is counted as failed.

    if src is not None:
        mask_human = {
            "src": src,
            "tgt": tgt,
            "src_len": len(tokenizer.tokenize(src)),
            "tgt_len": len(tokenizer.tokenize(tgt)),
        }

    # Computed unconditionally so the RNG stream is consumed the same way for every sample.
    mask_30 = mask_recipe_proportion(recipe, tokenizer, 0.3)
    if mask_human is not None and is_mask_valid(mask_human) and is_mask_valid(mask_30):
        masked_recipes.append({
            "id": sample["id"],
            "mask_30": mask_30,
            "mask_human": mask_human,
        })
    else:
        failed += 1
        # The bar description is fixed at construction time; the postfix shows the live count.
        progress.set_postfix(failed=failed, refresh=False)

In [None]:
# Number of recipes that did not pass the validity check in the masking loop above.
failed

In [None]:
# Number of recipes that were kept (each has both a mask_30 and a mask_human entry).
len(masked_recipes)

In [None]:
# Write masked_recipes to disk as JSON so it can be reloaded without re-running the masking loop
with open("recipes/raw/masked_recipes.json", "w") as json_out:
    json.dump(masked_recipes, json_out)

In [None]:
# Read the masked recipes back from the JSON file written above
with open("recipes/raw/masked_recipes.json", "r") as json_in:
    masked_recipes = json.load(json_in)

In [None]:
# Inspect one record: a dict with "id", "mask_30" and "mask_human" entries.
masked_recipes[0]

In [None]:
# Generate train.src/.tgt, dev.src/.tgt and test.src/.tgt for each masking scheme.
# Fixed seed so the split is reproducible.
random.seed(101)
# Shuffle a copy: shuffling masked_recipes in place would make this cell
# non-idempotent (a second run would shuffle the already-shuffled list and
# produce different splits).
shuffled_recipes = list(masked_recipes)
random.shuffle(shuffled_recipes)

# 80% train / 15% dev / 5% test
train_end = int(len(shuffled_recipes) * 0.8)
valid_end = int(len(shuffled_recipes) * 0.95)
train = shuffled_recipes[:train_end]
valid = shuffled_recipes[train_end:valid_end]
test = shuffled_recipes[valid_end:]

for split, recipes in zip(["train", "dev", "test"], [train, valid, test]):
    for folder in ["mask_30", "mask_human"]:
        out_dir = f"recipes/{folder}/org_data"
        os.makedirs(out_dir, exist_ok=True)
        # One example per line; embedded newlines are removed so src and tgt stay line-aligned.
        # Each file is written from its own field (previously .tgt was written from "src").
        for field in ("src", "tgt"):
            with open(f"{out_dir}/{split}.{field}", "w", encoding="utf-8") as f:
                f.write("\n".join(recipe[folder][field].replace("\n", "") for recipe in recipes))


In [None]:
# Sanity check: number of lines the mask_human train .src file contains after joining.
# The scheme is named explicitly instead of relying on the `folder` loop variable
# leaked from the writer cell.
len("\n".join([re.sub("\n","",recipe["mask_human"]["src"]) for recipe in train]).split("\n"))

In [None]:
# Number of mask_human train sources; should equal the line count computed above.
# Uses an explicit scheme name rather than the leaked `folder` loop variable.
len([re.sub("\n","",recipe["mask_human"]["src"]) for recipe in train])

In [None]:
# Number of mask_human targets in the train split (one per recipe in `train`).
len([recipe["mask_human"]["tgt"] for recipe in train])

In [None]:
# count the number of lines in each file in recipes/mask_human/org_data
!wc -l recipes/mask_human/org_data/dev.src

In [None]:
!wc -l recipes/mask_human/org_data/dev.tgt

In [None]:
# `masking30_src` is only a local variable inside mask_recipe_proportion, so it is
# undefined here on a fresh kernel; show an example from the generated data instead.
print(train[0]["mask_30"]["src"])

In [None]:
# `masking30_tgt` is only a local variable inside mask_recipe_proportion, so it is
# undefined here on a fresh kernel; show the matching target from the generated data.
print(train[0]["mask_30"]["tgt"])

In [None]:
# Raw repr of an example mask_30 target (`masking30_tgt` is not defined at notebook scope).
train[0]["mask_30"]["tgt"]

In [None]:
# Raw repr of an example mask_human target (the original cell referenced the
# undefined name `masking30_tgt` and duplicated the cell above).
train[0]["mask_human"]["tgt"]