In [None]:
import json
import os
import random
import re

import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer
random.seed(101)

In [None]:
# extract .zip file in recipes/raw/dataset.zip
# !unzip -oq recipes.zip -d recipes

In [None]:
# Load the "recipe_nlg" dataset; data_dir points at the locally extracted archive (see the unzip note above)
dataset = load_dataset("recipe_nlg", data_dir="recipes/raw/dataset")

In [None]:
# Tokenizer used below both to pick the masked token span and to measure src/tgt lengths in tokens
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [None]:
def mask_recipe_proportion(recipe, tokenizer, proportion):
    """Replace one random contiguous token span of ``recipe`` with ``[MASK]``.

    The recipe is tokenized, a span of ``int(n_tokens * proportion)`` tokens is
    chosen at a random start offset, and the text is rebuilt around it.

    Args:
        recipe: Full recipe text.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids``
            and ``decode``.
        proportion: Fraction of the recipe's tokens to mask.

    Returns:
        dict with keys, in this order:
            ``src_len`` - token count of the full, unmasked recipe,
            ``tgt_len`` - token count of the re-tokenized target text,
            ``src``     - decoded prefix + " [MASK] " + decoded suffix,
            ``tgt``     - decoded text of the masked span.
    """
    def detokenize(piece):
        # tokens -> ids -> text
        return tokenizer.decode(tokenizer.convert_tokens_to_ids(piece))

    recipe_tokens = tokenizer.tokenize(recipe)
    n_tokens = len(recipe_tokens)

    # The largest start offset still leaves room for the whole span.
    start = random.randint(0, int(n_tokens * (1 - proportion)))
    end = start + int(n_tokens * proportion)

    target_text = detokenize(recipe_tokens[start:end])
    # substitute the span with a single [MASK] placeholder
    source_text = " [MASK] ".join(
        (detokenize(recipe_tokens[:start]), detokenize(recipe_tokens[end:]))
    )

    return {
        "src_len": n_tokens,
        "tgt_len": len(tokenizer.tokenize(target_text)),
        "src": source_text,
        "tgt": target_text,
    }

def is_mask_valid(mask, max_src_len=512, max_tgt_len=128):
    """Return True when a mask fits within the source/target token budgets.

    Args:
        mask: dict with integer ``src_len`` and ``tgt_len`` entries.
        max_src_len: Exclusive upper bound for ``src_len`` (default 512).
        max_tgt_len: Exclusive upper bound for ``tgt_len`` (default 128).

    Returns:
        bool: True only if both lengths are strictly below their limits.
    """
    return mask["src_len"] < max_src_len and mask["tgt_len"] < max_tgt_len


In [None]:
# Build two masked variants of every training recipe:
#   * mask_human - masks either the whole ingredient list or a run of direction steps
#   * mask_30    - masks a random contiguous 30% of the recipe's tokens
# A recipe is kept only when both variants exist and pass is_mask_valid.
failed = 0
masked_recipes = []
# Keep a handle on the bar: the f-string given as `desc=` is evaluated only once,
# so the description must be updated explicitly to show the live failure count.
progress = tqdm.tqdm(dataset["train"], desc=f"Failed {failed}", total=len(dataset["train"]))
for sample in progress:
    # Reset per sample. Otherwise a recipe that produces no human mask would silently
    # reuse the previous sample's mask under the wrong id (or raise NameError on the
    # very first iteration).
    mask_human = None
    recipe = sample["title"] + " Ingredients: " + " ".join(sample["ingredients"]) + " Directions: -" + " -".join(sample["directions"])
    # MASKING: ~85% of samples mask directions, ~15% mask the ingredients (threshold 0.85)
    if random.random()>0.85:
        # mask ingredients
        src = sample["title"] + " Ingredients: " + " [MASK] " + " Directions: -" + " -".join(sample["directions"])
        tgt = " ".join(sample["ingredients"])
        mask_human = {
            "src":src,
            "tgt":tgt,
            "src_len":len(tokenizer.tokenize(src)),
            "tgt_len":len(tokenizer.tokenize(tgt))
        }
    elif len(sample["directions"]) > 2:
        # mask a contiguous run of 1 .. len(directions)-2 direction steps;
        # recipes with two or fewer steps get no human mask and count as failed
        num_directions_to_mask = random.randint(1, len(sample["directions"])-2)
        mask_0 = random.randint(0, len(sample["directions"])-num_directions_to_mask)
        mask_1 = mask_0 + num_directions_to_mask
        src_masked_directions = sample["directions"][:mask_0] + ["[MASK]"] + sample["directions"][mask_1:]
        tgt = " -".join(sample["directions"][mask_0:mask_1])
        src = sample["title"] + " Ingredients: " + " ".join(sample["ingredients"]) + " Directions: -" + " -".join(src_masked_directions)
        mask_human = {
            "src":src,
            "tgt":tgt,
            "src_len":len(tokenizer.tokenize(src)),
            "tgt_len":len(tokenizer.tokenize(tgt))
        }

    # Always computed, even when mask_human is missing, so the sequence of random
    # draws does not depend on which samples fail.
    mask_30 = mask_recipe_proportion(recipe, tokenizer, 0.3)
    if mask_human and is_mask_valid(mask_human) and mask_30 and is_mask_valid(mask_30):
        masked_recipes.append({
            "id":sample["id"],
            "mask_30":mask_30,
            "mask_human":mask_human
        })
    else:
        failed += 1
        # refresh=False: the text is picked up on the bar's next regular redraw
        progress.set_description(f"Failed {failed}", refresh=False)

In [None]:
# number of recipes skipped because a mask was missing or failed is_mask_valid
failed

In [None]:
# number of recipes kept, i.e. with both a valid mask_human and a valid mask_30
len(masked_recipes)

In [None]:
# save masked_recipes as json
# Persist the masked recipes to disk as a single JSON document
with open("recipes/raw/masked_recipes.json", mode="w") as json_out:
    json.dump(masked_recipes, fp=json_out)

In [None]:
# load masked_recipes from json
# Restore the masked recipes from the JSON file written by the save cell
with open("recipes/raw/masked_recipes.json", mode="r") as json_in:
    masked_recipes = json.load(fp=json_in)

In [14]:
# Generate {train,dev,test-all}.src / .tgt under recipes/<folder>/org_data
# from an 80% / 15% / 5% split of the masked recipes.
# set seed so the shuffle, and therefore the split, is reproducible
random.seed(101)
# Shuffle a copy: shuffling masked_recipes in place would make this cell produce a
# different split every time it is re-run without reloading the JSON first.
shuffled_recipes = list(masked_recipes)
random.shuffle(shuffled_recipes)
# split dataset into train, valid, test
train_end = int(len(shuffled_recipes)*0.8)
valid_end = int(len(shuffled_recipes)*0.95)
train = shuffled_recipes[:train_end]
valid = shuffled_recipes[train_end:valid_end]
test = shuffled_recipes[valid_end:]

for split, recipes in zip(["train", "dev", "test-all"], [train, valid, test]):
    for folder in ["mask_30"]: #,"mask_human"
        out_dir = f"recipes/{folder}/org_data"
        # create the output directory on first run instead of failing in open()
        os.makedirs(out_dir, exist_ok=True)
        for field in ["src", "tgt"]:
            # one recipe per line; embedded newlines / carriage returns are removed
            # so that .src and .tgt stay line-aligned
            with open(f"{out_dir}/{split}.{field}", "w", encoding="utf-8") as f:
                f.write("\n".join([re.sub("\\n|\\r","",recipe[folder][field]) for recipe in recipes]))


In [None]:
# total number of masked recipes (train + dev + test-all combined)
len(masked_recipes)

In [None]:
# count the number of lines in the mask_human training source file
# NOTE(review): the split cell currently writes only the "mask_30" folder — confirm this file exists
!wc -l recipes/mask_human/org_data/train.src

In [None]:
# line count of the matching target file, for comparison with the .src count
!wc -l recipes/mask_human/org_data/train.tgt

In [15]:
# print the first 5 source/target pairs from recipes/mask_30/org_data/{split}.src and .tgt
split = 'test-all'
# Read with the same utf-8 encoding the split files were written with.
# src_lines / tgt_lines are kept as globals: the test-subset cell further down slices them.
with open(f'recipes/mask_30/org_data/{split}.src', 'r', encoding='utf-8') as f:
    src_lines = f.readlines()
with open(f'recipes/mask_30/org_data/{split}.tgt', 'r', encoding='utf-8') as f:
    tgt_lines = f.readlines()
# zip over slices so a split with fewer than 5 lines does not raise IndexError
for src_line, tgt_line in zip(src_lines[:5], tgt_lines[:5]):
    print(f'src: {src_line}')
    print(f'tgt: {tgt_line}')
    print()

src: harissa lamb and quinoa burgers ingredients : 400 g ground lamb 1 cup quinoa, cooked 1 tablespoon harissa, plus extra for speading 1 garlic clove, crushed salt pepper 1 tablespoon olive oil 6 bread rolls, halved and toasted 1 cup tzatziki 3 cups salad leaves directions : - put mince, quinoa harissa, garlic, salt and pepper in [MASK] the tzatziki and extra harissa. top with green leaves and patties to serve.

tgt: a bowl and mix well to combine. - shape the lamb into 6 patties. - heat oil in a large frying pan over medium heat. cook patties 4 - 5mins each side. - spread the rolls with


src: ez asian chicken or turkey stir fry ingredients : 2 lbs fresh turkey breast, slices ( 1 louis rich pkg. ) 1 bunch bok choy ( 1 lb. ) 8 ounces sliced water chestnuts 1 cup fresh mushrooms ( 2 oz. ) 3 green onions 1 tablespoon oil 1 cup chicken broth 14 cup soy sauce ( i used the low - sodium soy sauce ) 2 tablespoons dry sherry 2 tablespoons cornstarch 14 teaspoon garlic powder 14 teaspoon groun

In [None]:
# Read the .src and then the .tgt file of each mask_human split into a list of
# whitespace-stripped lines.
# NOTE(review): no visible cell writes recipes/mask_human/org_data/* or a "test" split
# (the split cell writes "test-all" for mask_30 only) — confirm these files exist.
for split in ["train", "dev", "test"]: 
    src = []
    train_src_path = f"recipes/mask_human/org_data/{split}.src"
    with open(train_src_path, "r", encoding="utf-8") as ifile:
        for line in tqdm.tqdm(ifile):
            line = line.strip()
            text = line
            src.append(text)
    # NOTE(review): bare expression inside the loop body — evaluated and discarded,
    # so nothing is displayed here.
    src[69696:69700]
    # From here on `src` holds the target lines and `train_src_path` the .tgt path.
    src = []
    train_src_path = f"recipes/mask_human/org_data/{split}.tgt"
    with open(train_src_path, "r", encoding="utf-8") as ifile:
        for line in tqdm.tqdm(ifile):
            line = line.strip()
            text = line
            # Blank target lines are skipped here, whereas blank source lines are kept
            # above, so indexes of the two lists may not line up.
            if not text:
                continue
            src.append(text)
# Cell output: items 69696-69699 of the last list built, i.e. the non-empty .tgt
# lines of the final split ("test").
src[69696:69700]

In [16]:
# Write a smaller mask_30 test set (test.src / test.tgt) made of lines 100-199 of test-all.
# Relies on `src_lines` / `tgt_lines` as read from
# recipes/mask_30/org_data/test-all.src and .tgt by the preview cell above.
# utf-8 matches the encoding the split files were written with.
with open("recipes/mask_30/org_data/test.src", "w", encoding="utf-8") as f:
    f.write("".join(src_lines[100:200]))
with open("recipes/mask_30/org_data/test.tgt", "w", encoding="utf-8") as f:
    f.write("".join(tgt_lines[100:200]))