In [1]:
# https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb#scrollTo=UIvgZ3S6AO0z
# https://thepythoncode.com/article/pretraining-bert-huggingface-transformers-in-python
# https://www.youtube.com/watch?v=IcrN_L2w0_Y
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
import pandas as pd
from unidecode import unidecode
from transformers import RobertaConfig, RobertaTokenizerFast, LineByLineTextDataset
import torch

In [2]:
# Read only the `directions` column, capped at the first 1.5M rows of the CSV.
df = pd.read_csv(
    '/kaggle/input/recipenlg/RecipeNLG_dataset.csv',
    nrows=1_500_000,
    usecols=['directions'],
)

In [3]:
# Preview the loaded frame (the notebook display truncates it to the head and tail rows).
df

Unnamed: 0,directions
0,"[""In a heavy 2-quart saucepan, mix brown sugar..."
1,"[""Place chipped beef on bottom of baking dish...."
2,"[""In a slow cooker, combine all ingredients. C..."
3,"[""Boil and debone chicken."", ""Put bite size pi..."
4,"[""Combine first four ingredients and press in ..."
...,...
1499995,"[""In a large bowl, beat the cream cheese, sour..."
1499996,"[""Preheat oven to 350\u00b0. In a large skille..."
1499997,"[""Cook pasta according to package directions. ..."
1499998,"[""In a small bowl, combine the first five ingr..."


In [4]:
# Raw `directions` value of the second row: a list of steps serialized as one string.
df.iloc[1,0]

'["Place chipped beef on bottom of baking dish.", "Place chicken on top of beef.", "Mix soup and cream together; pour over chicken. Bake, uncovered, at 275\\u00b0 for 3 hours."]'

In [5]:
# Drop rows with a missing `directions` value (the only column loaded).
# This mutates `df` in place, so the raw frame is only recoverable by re-running the load cell.
df.dropna(inplace=True)

In [6]:
# Strip the square brackets left over from the serialized list.
# regex=False makes '[' and ']' literal characters regardless of the installed
# pandas version (the default for `regex` changed between major releases, and
# a bare '[' is not a valid regular expression).
# NOTE(review): this removes every bracket, including any that occur inside a
# step's text, not only the outer list delimiters.
df['directions'] = (
    df['directions']
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
)

In [7]:
# Check the first row after bracket removal; the per-step quotes and commas are still present.
df.iloc[0,0]

'"In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.", "Stir over medium heat until mixture bubbles all over top.", "Boil and stir 5 minutes more. Take off heat.", "Stir in vanilla and cereal; mix well.", "Using 2 teaspoons, drop and shape into 30 clusters on wax paper.", "Let stand until firm, about 30 minutes."'

In [8]:
# Sequential 90/10 split by row position: the first `train_size` rows go to
# training and the remainder to validation (no shuffling is done here).
n = df.shape[0]
train_size = int(n * 0.9)

In [9]:
# Number of rows assigned to the training split.
train_size

1350000

In [10]:
# Second row after bracket removal. Note the text still carries a literal
# backslash-u escape (\u00b0) rather than the decoded degree sign.
df.iloc[1,0]

'"Place chipped beef on bottom of baking dish.", "Place chicken on top of beef.", "Mix soup and cream together; pour over chicken. Bake, uncovered, at 275\\u00b0 for 3 hours."'

In [11]:
# Dry run of the text clean-up on the first ten recipes: collapse the
# step separators (`.",` -> `.`) and then drop the remaining double quotes.
for raw in df['directions'].iloc[:10]:
    print(raw.replace('.",', '.').replace('"', ''))

In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine. Stir over medium heat until mixture bubbles all over top. Boil and stir 5 minutes more. Take off heat. Stir in vanilla and cereal; mix well. Using 2 teaspoons, drop and shape into 30 clusters on wax paper. Let stand until firm, about 30 minutes.
Place chipped beef on bottom of baking dish. Place chicken on top of beef. Mix soup and cream together; pour over chicken. Bake, uncovered, at 275\u00b0 for 3 hours.
In a slow cooker, combine all ingredients. Cover and cook on low for 4 hours or until heated through and cheese is melted. Stir well before serving. Yields 6 servings.
Boil and debone chicken. Put bite size pieces in average size square casserole dish. Pour gravy and cream of mushroom soup over chicken; level. Make stuffing according to instructions on box (do not make too moist). Put stuffing on top of chicken and gravy; level. Sprinkle shredded cheese on top and bake at 350\u00b0 for appr

In [12]:
# Prepare training examples
# Writes one cleaned, lower-cased recipe per line to train.txt.
# The file is opened in 'w' mode so that re-running this cell rewrites the
# corpus instead of appending a second copy of every example, and with an
# explicit UTF-8 encoding so the output does not depend on the platform default.
with open('train.txt', 'w', encoding='utf-8') as f:
    for txt in df['directions'][:train_size]:
        # Collapse the step separators, then drop the remaining quotes.
        txt = txt.replace('.",', '.')
        txt = txt.replace('"', '')
        f.write(txt.lower() + "\n")

In [13]:
# Prepare validation examples
# Writes one cleaned, lower-cased recipe per line to test.txt, using the rows
# after the first `train_size`.
# The file is opened in 'w' mode so that re-running this cell rewrites the
# file instead of appending duplicate examples, and with an explicit UTF-8
# encoding so the output does not depend on the platform default.
with open('test.txt', 'w', encoding='utf-8') as f:
    for txt in df['directions'][train_size:]:
        # Collapse the step separators, then drop the remaining quotes.
        txt = txt.replace('.",', '.')
        txt = txt.replace('"', '')
        f.write(txt.lower() + "\n")

In [14]:
# Train a byte-level BPE tokenizer from scratch on the training corpus only.
files = ['train.txt']
special_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=files,
    vocab_size=30_000,
    min_frequency=2,
    special_tokens=special_tokens,
)






In [15]:
# Create the output directory from Python rather than with `!mkdir`:
#  - exist_ok=True keeps the cell re-runnable when the directory already exists;
#  - no shell subprocess is forked after the tokenizer has used parallelism,
#    which is what triggers the tokenizers "process just got forked" warning.
Path("recipe_roberta").mkdir(exist_ok=True)

# Writes vocab.json and merges.txt into the directory and returns their paths.
tokenizer.save_model("recipe_roberta")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


['recipe_roberta/vocab.json', 'recipe_roberta/merges.txt']

In [16]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

# Rebuild the tokenizer from the two files saved by the previous step.
vocab_path = "recipe_roberta/vocab.json"
merges_path = "recipe_roberta/merges.txt"
tokenizer = ByteLevelBPETokenizer(vocab_path, merges_path)

In [17]:
# Wrap every encoded sequence as <s> ... </s>. BertProcessing takes the
# closing (separator) pair first and the opening (classifier) pair second.
sep_pair = ("</s>", tokenizer.token_to_id("</s>"))
cls_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(sep_pair, cls_pair)

# Cap encodings at 256 tokens; the limit was chosen for resource reasons.
tokenizer.enable_truncation(max_length=256)

In [18]:
# Sample text (first row of the frame) used below to sanity-check the tokenizer.
df.iloc[0,0]

'"In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine.", "Stir over medium heat until mixture bubbles all over top.", "Boil and stir 5 minutes more. Take off heat.", "Stir in vanilla and cereal; mix well.", "Using 2 teaspoons, drop and shape into 30 clusters on wax paper.", "Let stand until firm, about 30 minutes."'

In [19]:
# Encode the sample row; the repr reports the token count and the available attributes.
tokenizer.encode(df.iloc[0,0])

Encoding(num_tokens=109, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [20]:
# Show the token strings for the sample row.
# NOTE(review): the input here is the frame's text (mixed case, quotes kept),
# while the lines written to train.txt were lower-cased and de-quoted. That is
# likely why capitalised words split into single letters; confirm by encoding
# a cleaned, lower-cased line instead.
print(tokenizer.encode(df.iloc[0,0]).tokens)

['<s>', '"', 'I', 'n', 'Ġa', 'Ġheavy', 'Ġ2', '-', 'quart', 'Ġsaucepan', ',', 'Ġmix', 'Ġbrown', 'Ġsugar', ',', 'Ġnuts', ',', 'Ġevaporated', 'Ġmilk', 'Ġand', 'Ġbutter', 'Ġor', 'Ġmargarine', '.', '"', ',', 'Ġ', '"', 'S', 't', 'ir', 'Ġover', 'Ġmedium', 'Ġheat', 'Ġuntil', 'Ġmixture', 'Ġbubbles', 'Ġall', 'Ġover', 'Ġtop', '.', '"', ',', 'Ġ', '"', 'B', 'oil', 'Ġand', 'Ġstir', 'Ġ5', 'Ġminutes', 'Ġmore', '.', 'Ġ', 'T', 'ake', 'Ġoff', 'Ġheat', '.', '"', ',', 'Ġ', '"', 'S', 't', 'ir', 'Ġin', 'Ġvanilla', 'Ġand', 'Ġcereal', ';', 'Ġmix', 'Ġwell', '.', '"', ',', 'Ġ', '"', 'U', 'sing', 'Ġ2', 'Ġteaspoons', ',', 'Ġdrop', 'Ġand', 'Ġshape', 'Ġinto', 'Ġ30', 'Ġclusters', 'Ġon', 'Ġwax', 'Ġpaper', '.', '"', ',', 'Ġ', '"', 'L', 'et', 'Ġstand', 'Ġuntil', 'Ġfirm', ',', 'Ġabout', 'Ġ30', 'Ġminutes', '.', '"', '</s>']


In [21]:
# Check that a CUDA device is visible to PyTorch before building the model.
torch.cuda.is_available()

True

In [22]:
# Reduced RoBERTa: 6 hidden layers and 12 attention heads; all settings not
# listed here keep the library defaults.
config = RobertaConfig(
    vocab_size=30_000,                # same value that was passed to tokenizer.train
    max_position_embeddings=256 + 2,  # 256-token window plus 2; presumably RoBERTa's position offset, confirm
    num_hidden_layers=6,
    num_attention_heads=12,
    type_vocab_size=1,
)

In [None]:
# Load the saved vocab/merges as a transformers fast tokenizer. This rebinds
# `tokenizer`, replacing the ByteLevelBPETokenizer object used in earlier cells.
# `model_max_length` is the documented name of the length limit; the previous
# `max_len` spelling is only a backward-compatibility alias for it.
tokenizer = RobertaTokenizerFast.from_pretrained("recipe_roberta", model_max_length=256)

In [24]:
from transformers import RobertaForMaskedLM

# Build the masked-LM model directly from the config, i.e. without loading
# any pretrained checkpoint.
model = RobertaForMaskedLM(config)

In [25]:
# Report the model's parameter count.
model.num_parameters()

66389808

In [None]:
# Build the training dataset from train.txt, which holds one recipe per line.
# NOTE(review): block_size=128 is smaller than the 256-token limit configured
# on the tokenizer; presumably it caps the tokens kept per line. Confirm that
# the shorter limit is intended.
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="train.txt",
    block_size=128,
)

In [27]:
from transformers import DataCollatorForLanguageModeling

# Collator for masked-language-model training (mlm=True), configured with a
# masking probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [28]:
from transformers import Trainer, TrainingArguments

# Training outputs go to the same directory that holds the tokenizer files.
training_args = TrainingArguments(
    output_dir="recipe_roberta",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    prediction_loss_only=True,
    save_steps=10_000,
    save_total_limit=2,
)

# Only a training set is supplied. If this is extended to several epochs, an
# eval_dataset and early stopping must be added.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [30]:
# Run training; the arguments above configure a single epoch.
trainer.train()

Step,Training Loss
500,5.3232
1000,4.743
1500,4.3285
2000,3.9199
2500,3.4554
3000,2.9483
3500,2.6275
4000,2.4244
4500,2.2858
5000,2.1998




TrainOutput(global_step=10547, training_loss=2.6477651721755056, metrics={'train_runtime': 12169.8943, 'train_samples_per_second': 110.929, 'train_steps_per_second': 0.867, 'total_flos': 4.496366997386035e+16, 'train_loss': 2.6477651721755056, 'epoch': 1.0})

In [31]:
# Save the trained model into the directory that already holds the tokenizer
# files, so the directory can be loaded as one unit for inference.
trainer.save_model("recipe_roberta")

## Inference

In [32]:
from transformers import pipeline

# Fill-mask pipeline that loads both the model and the tokenizer from the
# local "recipe_roberta" directory.
fill_mask = pipeline(
    "fill-mask",
    model="recipe_roberta",
    tokenizer="recipe_roberta"
)

In [34]:
# Probe: what follows "baking" in an ingredient list.
# NOTE(review): the prompts in these probes are mixed case, whereas the training
# text was lower-cased before being written to train.txt. Consider lower-casing
# prompts to match.
fill_mask('In a small bowl, combine 1-1/3 cups flour, baking <mask>')

[{'score': 0.5234920978546143,
  'token': 627,
  'token_str': ' powder',
  'sequence': 'In a small bowl, combine 1-1/3 cups flour, baking powder'},
 {'score': 0.16120609641075134,
  'token': 18,
  'token_str': '.',
  'sequence': 'In a small bowl, combine 1-1/3 cups flour, baking.'},
 {'score': 0.15107285976409912,
  'token': 885,
  'token_str': ' soda',
  'sequence': 'In a small bowl, combine 1-1/3 cups flour, baking soda'},
 {'score': 0.027595501393079758,
  'token': 545,
  'token_str': ' dish',
  'sequence': 'In a small bowl, combine 1-1/3 cups flour, baking dish'},
 {'score': 0.006660874467343092,
  'token': 372,
  'token_str': ' pan',
  'sequence': 'In a small bowl, combine 1-1/3 cups flour, baking pan'}]

In [35]:
# Probe: unit of measure between a quantity and "of water".
fill_mask('Take 2 <mask> of water.')

[{'score': 0.42869141697883606,
  'token': 740,
  'token_str': ' cups',
  'sequence': 'Take 2 cups of water.'},
 {'score': 0.2618587017059326,
  'token': 2339,
  'token_str': ' quarts',
  'sequence': 'Take 2 quarts of water.'},
 {'score': 0.03628673776984215,
  'token': 2262,
  'token_str': ' cans',
  'sequence': 'Take 2 cans of water.'},
 {'score': 0.03198911249637604,
  'token': 893,
  'token_str': ' tablespoons',
  'sequence': 'Take 2 tablespoons of water.'},
 {'score': 0.013823837973177433,
  'token': 1493,
  'token_str': ' quart',
  'sequence': 'Take 2 quart of water.'}]

In [37]:
# Probe: modifier placed before "sugar".
fill_mask('Take 2 spoons of <mask> sugar')

[{'score': 0.2557569742202759,
  'token': 1307,
  'token_str': ' powdered',
  'sequence': 'Take 2 spoons of powdered sugar'},
 {'score': 0.07180331647396088,
  'token': 1132,
  'token_str': ' white',
  'sequence': 'Take 2 spoons of white sugar'},
 {'score': 0.05636908859014511,
  'token': 1908,
  'token_str': ' confectioners',
  'sequence': 'Take 2 spoons of confectioners sugar'},
 {'score': 0.05463339015841484,
  'token': 2401,
  'token_str': ' granulated',
  'sequence': 'Take 2 spoons of granulated sugar'},
 {'score': 0.038134872913360596,
  'token': 872,
  'token_str': ' cinnamon',
  'sequence': 'Take 2 spoons of cinnamon sugar'}]

In [40]:
# Probe: word following "lemon".
fill_mask('Add lemon <mask> to water and keep it aside')

[{'score': 0.8541980981826782,
  'token': 601,
  'token_str': ' juice',
  'sequence': 'Add lemon juice to water and keep it aside'},
 {'score': 0.025953233242034912,
  'token': 1669,
  'token_str': ' zest',
  'sequence': 'Add lemon zest to water and keep it aside'},
 {'score': 0.008921769447624683,
  'token': 2043,
  'token_str': ' rind',
  'sequence': 'Add lemon rind to water and keep it aside'},
 {'score': 0.004515011329203844,
  'token': 1037,
  'token_str': ' syrup',
  'sequence': 'Add lemon syrup to water and keep it aside'},
 {'score': 0.004182823467999697,
  'token': 390,
  'token_str': ' sugar',
  'sequence': 'Add lemon sugar to water and keep it aside'}]

In [44]:
# Probe: noun completing "form a soft ...".
fill_mask('Stir in enough remaining flour to form a soft <mask>.')

[{'score': 0.5582666993141174,
  'token': 783,
  'token_str': ' ball',
  'sequence': 'Stir in enough remaining flour to form a soft ball.'},
 {'score': 0.28826451301574707,
  'token': 604,
  'token_str': ' dough',
  'sequence': 'Stir in enough remaining flour to form a soft dough.'},
 {'score': 0.03335752338171005,
  'token': 1025,
  'token_str': ' balls',
  'sequence': 'Stir in enough remaining flour to form a soft balls.'},
 {'score': 0.006526618264615536,
  'token': 773,
  'token_str': ' batter',
  'sequence': 'Stir in enough remaining flour to form a soft batter.'},
 {'score': 0.005940621253103018,
  'token': 1266,
  'token_str': ' paste',
  'sequence': 'Stir in enough remaining flour to form a soft paste.'}]

In [47]:
# Probe: open-ended modifier before "cake"; many words fit, so low scores are expected.
fill_mask('Place the <mask> cake on the table.')

[{'score': 0.04948220029473305,
  'token': 1739,
  'token_str': ' baked',
  'sequence': 'Place the baked cake on the table.'},
 {'score': 0.048075467348098755,
  'token': 2079,
  'token_str': ' favorite',
  'sequence': 'Place the favorite cake on the table.'},
 {'score': 0.033369794487953186,
  'token': 907,
  'token_str': ' other',
  'sequence': 'Place the other cake on the table.'},
 {'score': 0.01927982084453106,
  'token': 1479,
  'token_str': ' cooled',
  'sequence': 'Place the cooled cake on the table.'},
 {'score': 0.017457179725170135,
  'token': 2975,
  'token_str': ' angel',
  'sequence': 'Place the angel cake on the table.'}]