# Install and Import Dependencies 

In [None]:
!pip3 install torch torchvision torchaudio
# MacOS Binaries dont support CUDA, install from source if CUDA is needed

In [None]:
# install transformers library
!pip3 install transformers

In [None]:
# install natural language tool kit library
!pip3 install nltk

In [None]:
# install pandas
!pip3 install pandas

In [None]:
# install seaborn
!pip3 install seaborn

In [1]:
#for data preprocessing
from transformers import pipeline, GPT2Tokenizer
import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
from csv import reader
from pathlib import Path
from collections import defaultdict
import ast # for json string of list of dicts to list of dicts
import re
import nltk
nltk.download('punkt')
import pandas as pd
import math
import seaborn as sns
import numpy as np

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# for the actual model
import os
import time
import datetime
from transformers import GPT2LMHeadModel, GPT2Config, GPT2ForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
import random

In [3]:
# for evaluation
import matplotlib.pyplot as plt

# Custom Formatter

In [10]:
# Method to arrange subrecipe ingredients into groups by ingredient type
def arrange_into_sub_groups (ingredients):
    ing_list = []
    for key in ingredients:
      key_ = key.replace(':', '')
      object = defaultdict(list)
      for ing in ingredients[key][0]:
        object[ing["type"]].append(ing["ingredient"])
      ing_list.append({key_: dict(object)})
    return ing_list

In [11]:
# Class to embed tokens in each text
class Formatter ():
    
    def _init_ (self):
        result = None
        raw = None
    
    def createInput (self):
        input = self.raw["keywords"]
        self.result += "<INPUT_START> "
        for i in range(4):
          self.result += input[i]+ " <NEXT_INPUT> "
        self.result += input[4]+ " <INPUT_END> "
          
    def process_ingredient_type(self, ingredients, typeOf):
        self.result +=  "<"+typeOf+"_START> "
        for i in range(0, len(ingredients)):
            # missed "recipe follows" ingredients when scraping
            if "recipe follows" in ingredients[i]:
              continue
            self.result +=  ingredients[i]
            if i == len(ingredients)-1:
                self.result += " <"+typeOf+"_END> "
            else:
                self.result +=  " <NEXT_"+typeOf+"> "
        
    def process_sub_group (self, group, typeOf):
        self.result +=  "<"+typeOf+"_INGREDIENTS_START> "
        if "premade" in group:
            self.process_ingredient_type(group["premade"], "PREMADE")
        if "prep" in group:
            self.process_ingredient_type(group["prep"], "PREP")
        if "fat" in group:
            self.process_ingredient_type(group["fat"], "FAT")
        if "structural" in group:
            self.process_ingredient_type(group["structural"], "STRUCTURAL")
        if "moistening" in group:
            self.process_ingredient_type(group["moistening"], "MOISTENING")
        if "sweetener" in group:
            self.process_ingredient_type(group["sweetener"], "SWEETENER")
        if "leavener" in group:
            self.process_ingredient_type(group["leavener"], "LEAVENER")
        if "flavoring" in group:
            self.process_ingredient_type(group["flavoring"], "FLAVORING")
        self.result +=  "<"+typeOf+"_INGREDIENTS_END> "
        
    def processLabel(self, text):
        return re.sub(r"[^a-zA-Z]+", '', text).strip()
        
        
    def createIngredients (self):
        
        ingredients = self.raw["ingredients"]
        self.result += "<INGREDIENTS_START> "
        # group_names = [k for d in ingredients for k in d.keys()]
        # add by group
        for group in ingredients:
          key = list(group.keys())[0]
          self.process_sub_group(group[key], self.processLabel(key.upper()))
        self.result += "<INGREDIENTS_END> "
        
            
    def createSteps (self):
        steps = self.raw["steps"]
        self.result += "<STEP_START> "
        for i in range(0, len(steps)):
            self.result += steps[i]
            if i == len(steps)-1:
                self.result += " <STEP_END> "
            else:
                self.result += " <NEXT_STEP> "    
    
    def createTitle (self):
        self.result += "<TITLE_START> " + self.raw["title"] + " <TITLE_END> "
    
    def createYield (self):
        self.result += "<YIELD_START> " + self.raw["yield"] + " <YIELD_END> "
        
    
    def buildString (self):
        self.result = "<RECIPE_START> "
        self.createInput()
        self.createYield()
        self.createIngredients()
        self.createSteps()
        self.createTitle()
        self.result += "<RECIPE_END>"
    
    def buildIngredientString(self):
      self.createInput()
      self.createYield()
      self.createIngredients()

    def buildStepString(self):
      self.createInput()
      self.createSteps()
    
    def run (self, object):
        self.raw = object
        self.buildString()
  
    def runIngredients (self, object):
        self.raw = object
        self.result = ""
        self.buildIngredientString()
    
    def runSteps (self, object):
        self.raw = object
        self.result = ""
        self.buildStepString()
    
    def getString (self):
        return self.result
    

# Custom Embed Tokens Tokenizer

In [None]:
my_tokenizer = GPT2Tokenizer.from_pretrained("gpt2", do_lower_case=False)
special_tokens = {
    "additional_special_tokens": [
        "<TITLE_START>",
        "<TITLE_END>",
        "<YIELD_START>",
        "<YIELD_END>",
        "<STEP_START>",
        "<NEXT_STEP>",
        "<STEP_END>",
        "<INGREDIENTS_START>",
        "<NEXT_INGREDIENT>",
        "<INGREDIENTS_END>",
        "<PREMADE_START>",
        "<NEXT_PREMADE>",
        "<PREMADE_END>",
        "<PREP_START>",
        "<NEXT_PREP>",
        "<PREP_END>",
        "<STRUCTURAL_START>",
        "<NEXT_STRUCTURAL>",
        "<STRUCTURAL_END>",
        "<FAT_START>",
        "<NEXT_FAT>",
        "<FAT_END>",
        "<FLAVORING_START>",
        "<NEXT_FLAVORING>",
        "<FLAVORING_END>",
        "<SWEETENER_START>",
        "<NEXT_SWEETENER>",
        "<SWEETENER_END>",
        "<MOISTENING_START>",
        "<NEXT_MOISTENING>",
        "<MOISTENING_END>",
        "<LEAVENER_START>",
        "<NEXT_LEAVENER>",
        "<LEAVENER_END>",
        "<RECIPE_START>",
        "<RECIPE_END>",
        "<INPUT_START>",
        "<INPUT_END>",
        "<NEXT_INPUT>",
        "<MAIN_INGREDIENTS_START>",
         "<MAIN_INGREDIENTS_END>",
        "<FROSTING_INGREDIENTS_START>",
         "<FROSTING_INGREDIENTS_END>",
        "<FILLING_INGREDIENTS_START>",
         "<FILLING_INGREDIENTS_END>",
        "<FROSTING_INGREDIENTS_START>",
         "<FROSTING_INGREDIENTS_END>",
        "<GANACHE_INGREDIENTS_START>",
         "<GANACHE_INGREDIENTS_END>",
         "<CURD_INGREDIENTS_START>",
         "<CURD_INGREDIENTS_END>",
         "<SYRUP_INGREDIENTS_START>",
         "<SYRUP_INGREDIENTS_END>",
         "<TOPPING_INGREDIENTS_START>",
         "<TOPPING_INGREDIENTS_END>",
         "<CRUST_INGREDIENTS_START>",
         "<CRUST_INGREDIENTS_END>",
         "<TOPPING_INGREDIENTS_START>",
         "<TOPPING_INGREDIENTS_END>"
    ]
}

my_tokenizer.add_special_tokens(special_tokens)
my_tokenizer.add_special_tokens({'pad_token': '<PAD>'})

end_token_id = my_tokenizer.convert_tokens_to_ids(["<RECIPE_END>"])[0]
pad_token_id = my_tokenizer.convert_tokens_to_ids(["<PAD>"])[0]


# Import Data

In [12]:
# our data from csv
df = pd.read_csv('https://raw.githubusercontent.com/lail-lei/bim-gpt2-finetuning/main/data/baking_instructions_4852.csv')

In [13]:
# # method to create docs from csv
def createDocs ():
    
  # create objects, and save to list
  objects = []
  # randomize 
  # The frac keyword argument specifies the fraction of rows to return in the random sample,  
  # so frac=1 means return all rows (in random order).
  # specifying drop=True prevents .reset_index from creating a column containing the old index entries.
  random = df.sample(frac=1, random_state=42).reset_index(drop=True) 

    
  for ind in random.index:
    obj = {}
    # for formatting
    fm = Formatter()
    obj["title"] = random['Title'][ind]
    obj["yield"] = random['Yield'][ind]
    obj["ingredients"] = arrange_into_sub_groups(ast.literal_eval(random['processed_ingredients'][ind]))
    obj["steps"] = ast.literal_eval(random['Steps'][ind])
    obj["keywords"] = ast.literal_eval(random["keywords"][ind]) 
    fm.run(obj) # format
    length = len(fm.getString().split())
    # save only the recipes less than or equal to
    # the embed token limit for gpt2 (768)
    if length <= 768:
      objects.append(fm.getString()) #and append formatted obj
  
  return objects

In [14]:
# now this is a list of recipes with embed tokens 
docs = createDocs()
len(docs)

4452

In [None]:
docs[1004]

# Create Training and Testing Datasets

In [None]:
# batch size is the number of 768 token examples fed to the model
# at each iteration
batch_size = 4

Every tensor passed to the model should be the same length. If the recipe is shorter 
than 768 tokens, it will be padded to a length of 768 using the padding token. 
In addition, an attention mask will be returned that needs to be passed to the model to tell it to ignore the padding tokens.

In [None]:
class GPT2Dataset(Dataset):

  # max length is number of embed tokens for gpt2 small
  def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):

    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []

    for txt in txt_list:
      encodings_dict = tokenizer(txt, truncation=True, max_length=max_length, padding="max_length")
      self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
      self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
    

  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.attn_masks[idx] 

In [None]:
# Dataset object loads training or test data into memory

dataset = GPT2Dataset(docs, my_tokenizer, max_length=768)

# Split into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=torch.Generator().manual_seed(42))

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

In [None]:
train_dataset[0]

In [None]:
# DataLoader object fetches data from a Dataset 
# and serves the data up in batches.

# Create the DataLoaders for our training and validation datasets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

# Finetuning the model!

In [None]:
# # set configuration
# configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)
# # configuration.gradient_checkpointing: True
# # configuration.n_head = 12
# # configuration.n_layer = 12
# # #increase dropout
# # configuration.resid_pdrop = 0.25
# # configuration.attn_pdrop = 0.25
# # configuration.embd_pdrop = 0.25
# configuration

In [None]:
# instantiate the model
# The GPT2 Model transformer with a language modeling head on top 
# (linear layer with weights tied to the input embeddings).
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)


In [None]:
# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(my_tokenizer))

# fix model padding token id
model.config.pad_token_id = pad_token_id
#model.config.gradient_accumulation_steps = 10;

# Tell pytorch to run this model on the GPU.
device = torch.device("cuda")
model.cuda()

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
# some parameters I cooked up that work reasonably well
epochs = 1
learning_rate = 9e-4
warmup_steps = 1e2
epsilon = 1e-8

# this produces sample output every 100 steps
sample_every = 200

In [None]:
#Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
optimizer = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon,
#                   weight_decay = 2,
#                   correct_bias = False,
                )


In [None]:
# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = total_steps)

In [None]:
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

In [None]:
total_t0 = time.time()

training_stats = []

model = model.to(device)

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time() # reset timer

    total_train_loss = 0 # reset training loss

    model.train() # set model to "training" mode
    
    # Always clear any previously calculated gradients before performing a
    # backward pass. PyTorch doesn't do this automatically because 
    # accumulating the gradients is "convenient while training RNNs". 
    # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
    model.zero_grad()   
    
    # keep track of ste
    process_count = 0;

    # for each step and batch (group of examples) in our training dataset
    # steps = training samples / batch size 
    for step, batch in enumerate(train_dataloader):
        
#         print(step)
        
        # Unpack this training batch from our dataloader. As we unpack the batch,
        # we'll also copy each tensor to the GPU using the `to` method.
        #
        # `batch` contains two pytorch tensors:
        #   [0]: input ids /labels 
        #   [1]: attention masks
        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)
        
        
        # Model arguments: 

        # The INPUT_IDS are often the only required parameters
        # to be passed to the model as input. 
        # They are token indices, numerical representations of tokens 
        # building the sequences that will be used as input by the model.
        # Tokenize text (products of gpt2 tokenizer) are converted into IDs
        # which are understandable by the model. 
        # This can be done by directly feeding the sentence to the tokenizer, 
        # which leverages the Rust implementation of huggingface/tokenizers for peak performance.
        
        # LABELS are an optional argument 
        # which can be passed in order for the model to compute the loss itself. 
        # These labels should be the expected prediction of the model: 
        # it will use the standard loss in order to compute the loss 
        # between its predictions and the expected value (the label).
        
        # Since BIM is working on generation, the input ids and 
        # the labels are the same thing- the predicted value of the generated text
        # is the generated text itself 

        # The ATTENTION_MASK argument indicates to the model 
        # which tokens should be attended to, and which should not.
        # Since each training "sample" in the batch must be the same length (768 tokens)
        # shorter samples included special padding tokens which will be ignored by the model
        # The attention mask, b_masks, is a binary tensor 
        # indicating the position of the padded indices (and other tokens) 
        # so that the model does not attend to them.
        
        # Here, we perform a forward pass (evaluate the model on this training batch).
        # returns a tuple containing various elements depending on the configuration.
        # Based on current config, outputs include:
        # language modeling loss, prediction scores of the language modeling head,
        # and past: a tensor that contains pre-computed 
        # hidden-states (key and values in the attention blocks). 
        # Can be used (see past input) to speed up sequential decoding. 
        # When past is used as input, the input_ids which have their past 
        # given to this model should not be passed as input_ids 
        # as they have already been computed.
        outputs = model(  b_input_ids,
                          labels=b_labels, 
                          attention_mask = b_masks,
                          token_type_ids=None
                        )

      
        loss = outputs[0]  
        
        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        batch_loss = loss.item()
        total_train_loss += batch_loss

        # Get sample every x batches.
        if step % sample_every == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

            model.eval() # set mode to evaluate 
            # now we generate a sample
            
            sample_outputs = model.generate(
                                    bos_token_id=random.randint(1,30000), #The id of the beginning-of-sequence token.
                                    do_sample=True,   
                                    top_k=50, 
                                    max_length = 200,
                                    top_p=0.95, 
                                    num_return_sequences=1
                                )
            for i, sample_output in enumerate(sample_outputs):
                  print("{}: {}".format(i, my_tokenizer.decode(sample_output, skip_special_tokens=True)))
            
            model.train() # now let's set mode back to train

        # backward pass- compute the gradients for each parameter from loss
        loss.backward() 
        process_count += 1
        
        if process_count == batch_size:
            process_count = 0    
            optimizer.step()  # update the parameters using gradients computed in backward pass
            scheduler.step() # change the learning rate as the training loop progresses
                            # warm up vs actual training 
            optimizer.zero_grad()
            model.zero_grad() # Reset gradients tensors
        
# if we are at end of batch, we can do gradients (if (step+1) % accumulation_steps == 0:  
#optimizer.step() scheduler.step() model.zero_grad() 

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)       
    
    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
        
    # ========================================
    #               Validation
    # ========================================

    print("")
    print("Running Validation...")

    t0 = time.time() # reset timer

    model.eval() # set mode to evaluate 
    
    # set variables to 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        b_input_ids = batch[0].to(device) # get inputs
        b_labels = batch[0].to(device) # get labels
        b_masks = batch[1].to(device) # get mask
        
        # no gradients! Disabling gradient calculation is useful for inference, 
        # when you are sure that you will not call Tensor.backward().
        with torch.no_grad():        
            # input the inputs 
            outputs  = model(b_input_ids, 
#                            token_type_ids=None, 
                             attention_mask = b_masks,
                            labels=b_labels)
            # get the losss
            loss = outputs[0]  
            
        batch_loss = loss.item()
        total_eval_loss += batch_loss        

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    validation_time = format_time(time.time() - t0)    

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time,
            "Perplexity": math.exp(loss)
        }
    )
    
    print("Perplexity:", math.exp(loss))

print("")
print("Training complete!")
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))



Let's view the summary of the training process.

In [None]:
#Display floats with two decimal places.
pd.set_option('precision', 3)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table.
df_stats

In [None]:
#perplexity score
math.exp(loss)

In [None]:
## Use plot styling from seaborn.
sns.set(style='darkgrid')

# Increase the plot size and font size.
sns.set(font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curve.
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")
plt.plot(df_stats['Perplexity'], 'r-o', label="Perplexity")

# Label the plot.
plt.title("Training, Validation Loss and Perplexity")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.xticks([1, 2])

plt.show()

# Model Info

In [None]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The GPT-2 model has {:} different named parameters.\n'.format(len(params)))

print('==== Embedding Layer ====\n')

for p in params[0:2]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== First Transformer ====\n')

for p in params[2:14]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in params[-2:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

# Saving and Loading the Model

In [None]:
Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()

output_dir = './model_80-20-train/'

# # Create output directory if needed
# if not os.path.exists(output_dir):
#     os.makedirs(output_dir)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
my_tokenizer.save_pretrained(output_dir)

# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))

In [None]:
# !ls -l --block-size=K ./model_80-20-train/

In [None]:
# !ls -l --block-size=M ./model_80-20-train/pytorch_model.bin

# LOAD an existing model

In [4]:
output_dir = './model_80-20-train/'
#Load a trained model and vocabulary that you have fine-tuned
model = GPT2LMHeadModel.from_pretrained(output_dir)
my_tokenizer = GPT2Tokenizer.from_pretrained(output_dir)

# Tell pytorch to run this model on the GPU.
device = torch.device("cuda")
model.cuda()

model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50313, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

# Generate Text

In [6]:
model.eval()
prompt = "<RECIPE_START> <INPUT_START> cupcakes <NEXT_INPUT> " 
generated = torch.tensor(my_tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

sample_outputs = model.generate(
                                generated, 
                                #bos_token_id=random.randint(1,30000),
                                do_sample=True,   
                                top_k=75, 
                                max_length = 1024,
                                top_p=0.95, 
                                num_return_sequences=1,
                                no_repeat_ngram_size = 7, 
                                temperature=0.75 
                                )


for i, sample_output in enumerate(sample_outputs):
  print("{}: {}\n\n".format(i, my_tokenizer.decode(sample_output, skip_special_tokens=True)))

0: cupcakesapplecheeselemonjuice24 cupcakes1 1/2 sticks unsalted butter at room temperature2 1/2 cups all purpose flour4 large eggs at room temperature1 cup milk1 cup sugar1 1/4 teaspoons baking powder1/4 teaspoon fine salt1 teaspoon pure vanilla extract4 ounces cream cheese at room temperature1/4 cup sugar1/4 teaspoon grated lemon zest2 teaspoons pure vanilla extract1 teaspoon lemon juice2 sticks unsalted butter at at room temperature2 sticks unsalted butter at medium speed4 cups confectioners sugar3 tablespoons lemon juice1 tablespoon pure vanilla extractPreheat the oven to 350 degrees F. Line 24 muffin tins with paper liners.Put the flour salt baking powder and 3/4 cup sugar in a large bowl and whisk with an electric mixer until combined.Add the butter and beat until light and fluffy about 3 minutes.Add the eggs one at a time beating well after each addition.Add the vanilla and lemon juice and beat until incorporated.Reduce the mixer speed to low and gradually add the flour mixture 

In [95]:
decoded = []
for item in sample_outputs:
    decoded.append(my_tokenizer.decode(item, skip_special_tokens=False))

In [96]:
sample_dataframe = pd.DataFrame(data=decoded, columns = ["generated"])

# Parser and Format for HungerRice

In [50]:
# Parser superclass
class Parser:
  def get_string(self, text, pattern):
    truncated = re.search(pattern, text)
    if truncated == None: 
      return None
    return truncated.group()

In [51]:
# Ingredients Parser object extends Parser superclass
class IngredientsParser(Parser):
  def __init__(self, text):
    self.ingredients = []
    self.string = self.get_string(text, "<INGREDIENTS_START>.*?<INGREDIENTS_END>")
  
  # get the name of each ingredient list (main, frosting, filling, etc)
  def get_group_names (self, string):
    # get all ingredients_start tokens in list
    names= re.findall("\w+_INGREDIENTS_START", string)
    # isolate list name from token
    return [re.sub("_INGREDIENTS_START", "", name) for name in names]

  # remove empty strings from array
  def remove_empty_strings (self, item):
    stripped = item.strip()
    if  stripped == "":
      return
    else:
      return stripped
  # hungerrice uses abbreviations for units
  def abbreviate_unit (self, unit):
    unit = unit.lower()
    if unit == "pinches":
      return "pinch";
    if unit == "dashes":
      return "dash";
    if unit == "teaspoons" or unit =="teaspoon":
      return "tsp";
    if unit == "tablespoon" or unit == "t" or unit== "tablespoons":
      return "tbsp";
    if unit == "cups" or unit == "cup":
        return "cup";
    if unit == "pint" or unit == "pints":
      return "pt";
    if unit == "quart" or unit == "quarts":
      return "qt";
    if unit == "gallon" or unit == "gallons": 
      return "gal";
    if unit == "fluid ounce" or unit == unit == "fluid ounces" or unit == "floz" or unit == "fl oz":
      return "fl oz (US)"
    if unit == "ounce" or unit == unit == "ounces":
      return "oz"
    if unit == "pound" or unit == "pounds":
      return "lbs"
    if unit == "g" or unit == "gs" or unit == "grams" or unit == "gms":
      return "gm";
    if unit == "kilos" or unit == "kilograms" or unit == "kgs":
      return "kg";
    if unit == "milliliter" or unit == "milliliters" or unit == "millilitre" or unit == "millilitres" or unit == "ml":
      return "mL";
    if unit == "liter" or unit == "liters" or unit == "litre" or unit == "litres" or unit == "l":
      return "L";
    
    return unit

  # convert ingredient string item to dict
  def create_ingredient_object (self, item, group):
    unit = None
    amount = None
    plus = None
    name = ""

    units = re.findall("cups*|pinche*s*|dashe*s*|pints*|tsp|teaspoons*|tbsp|tablespoons*|qt|quarts*|pt|pints*|gal|gallons*|fluid\soz|fl\soz|pounds*|lbs\s|ounces*|oz|\d+\s*g\s|gms*|grams*|kilos*|kilograms*|kg|\sml\s|\sl\s", item, re.IGNORECASE)
    # set unit to first found unit 
    if len(units) > 0:
      unit = self.abbreviate_unit(units[0])
    
    # more than 1 unit, (1 cup 2 tablespoons), need a plus phrase
    if len(units) == 2:
      second_amount_pattern1 = "plus\s\d/*\d*"
      second_amount_pattern2 = "and\s\d/*\d*"
      second_amount_pattern3 = "with\s\d/*\d*"
      second_amount = re.search(f"{second_amount_pattern2}|{second_amount_pattern3}|{second_amount_pattern1}", item)
      if second_amount != None:
        second_amount = re.sub("plus\s|and\s|with\s", "", second_amount.group())
        plus = f"{second_amount} {units[1]}"
  
    # different patterns amount can come in
    amount_pattern1 = "^\d*/*\d*"
    amount_pattern2 = "^\d/*\d*\sto\s\d/*\d*"
    amount_pattern3 = "^\d/*\d*\sand\s\d/*\d*"
    amount = re.search(f"{amount_pattern2}|{amount_pattern3}|{amount_pattern1}", item)
    
    if amount != None:
      amount = amount.group()
    
    # get last unit or amount to find part where ingredient name begins
    if len(units) > 0:
      last_unit = units[len(units) - 1]
      name = item.split(last_unit)[1].strip()
      if plus != None:
        name += f", plus {plus}"
    elif amount != None:
      try:
        name = item.split(amount)[1].strip()
      except ValueError:
        name = item.strip()
    # if no unit nor amount, entire string is ingredient name
    else:
      name = item.strip()  
    
    return {"amount": amount, "unit": unit, "name": name, "group": group.lower()}
  

  def get_ingredients(self):
    return self.ingredients
    
  def run_parser (self):
    # can't run if no ingredient string
    if self.string == None:
      return False;
    # get group names 
    names = self.get_group_names(self.string)
    # return false if no ingredients
    if len(names) == 0:
      return False;

    # create an array encountered ingredients
    for name in names:
      # isolate substring including only list ingredients
      pattern = f"<{name}_INGREDIENTS_START>.*?<{name}_INGREDIENTS_END>"
      list_string = re.search(pattern, self.string)
      # skip this loop iteration if pattern doesn't exist in ingredients string
      if list_string is None:
        continue;
      else:
        list_string = list_string.group()
      # remove list start and end tokens
      list_string = re.sub(f"<{name}_INGREDIENTS_.*?>", '', list_string)
      #remove ingredient category tokens
      list_array = re.split("<.*?> ", list_string)
      list_array = list(filter(lambda x: x != None, [self.remove_empty_strings(item) for item in list_array]))      
    
      # create objects for each ingredient
      for item in list_array:
        obj = self.create_ingredient_object(item, name)
        self.ingredients.append(obj)
    
    return True
  

In [52]:
# Steps Parser object extends Parser superclass
class StepsParser(Parser):
  def __init__(self, text):
    self.steps = []
    self.string = self.get_string(text, "<STEP_START>.*?<STEP_END>")
  
  def get_steps(self):
    return self.steps

  def run_parser(self):
    # can't run if no ingredient string
    if self.string == None:
      return False;
    self.steps = re.sub("\s\s+", "\n", re.sub("<.*?>", "", self.string).strip()).split("\n")
    return True


In [53]:
#Yields parser object extends parser superclass
class YieldsParser(Parser):
  def __init__(self, text):
    self.amount = 12
    self.label = None
    self.string = self.get_string(text, "<YIELD_START>.*?<YIELD_END>")
  
  def get_yield(self):
    return {"amount": self.amount, "label": self.label}

  def run_parser(self):
    # can't run if no ingredient string
    if self.string == None:
      return False;
    
    stripped = re.sub("<.*?>", '', self.string).strip()
    string_amount = re.search("^\d+", stripped)
    if string_amount != None:
       self.amount = int(string_amount.group())
       string_label = re.sub(string_amount.group(), "", stripped)
       if string_label != None and len(string_label) > 0:
         self.label = string_label.strip()
    return True

In [54]:
#Tags parser object extends parser superclass
class TagsParser(Parser):
  def __init__(self, text):
    self.tags = ["bim", "bot"]
    self.string = self.get_string(text, "<INPUT_START>.*?<INPUT_END>")
  
  def get_tags(self):
    return self.tags
  # tags must be hyphenated in hungerrice
  def remove_characters(self, item):
    return re.sub("\W", "_", item)

  def run_parser(self):
    # can't run if no ingredient string
    if self.string == None:
      return False;
    
    input = re.sub("\s\s+", "\n", re.sub("<.*?>", "", self.string).strip()).split("\n")
    input = [self.remove_characters(i) for i in input]
    self.tags = [*self.tags, *input]
    return True

In [55]:
# converts recipe to hungerrice recipe object
class RecipeParser(Parser):
  
  def __init__(self, text):
    self.ingredients = []
    self.steps = []
    self.yields = None
    self.tags = None
    self.images = [{"image_url": "https://hungerrice-images.s3.us-east-2.amazonaws.com/ee464b52-e778-4b69-a9ec-1a7329cb06ad", "position": 0}]
    self.title = f"BIM GENERATED RECIPE-{time.time()}"
    self.notes = f"This recipe was generated at {time.time()} by a bot named BIM-GPT2. Do you think BIM did a good job? Leave a rating below!"
    self.owner_id = 1053
    self.status = 1
    self.string = self.get_string(text, "<RECIPE_START>.*?<RECIPE_END>")

  def parse_ingredients(self, string):
    parser = IngredientsParser(string)
    success = parser.run_parser()
    if success:
      self.ingredients = parser.get_ingredients()
      return True
    else: 
      return False

  def parse_steps(self, string):
    parser = StepsParser(string)
    success = parser.run_parser()
    if success:
      self.steps = parser.get_steps()
      return True
    else: 
      return False

  def parse_yield(self, string):
    parser = YieldsParser(string)
    success = parser.run_parser()
    if success:
      self.yields = parser.get_yield()
      return True
    else: 
      return False
  
  def parse_tags(self, string):
    parser = TagsParser(string)
    success = parser.run_parser()
    if success:
      self.tags = parser.get_tags()
      return True
    else: 
      return False

  def extract_title (self, text):
    title = re.search("<TITLE_START>.*?<TITLE_END>", text)
    if title == None:
      return None;
    title = title.group()
    return re.sub("<.*?>", '', title).strip()

  def run_parser (self):
    if self.string == None:
      print("No parseable string found")
      return False;
    if self.parse_ingredients(self.string) == False:
      print("No parseable ingredients found")
      return False
    if self.parse_steps(self.string) == False:
      print("No parseable steps found")
      return False
    
    extracted_title = self.extract_title(self.string)
    
    if extracted_title != None:
      self.title = extracted_title
    
    self.parse_yield(self.string)
    self.parse_tags(self.string)

    return True

  
  def get_recipe (self):
    return {"title": self.title, 
            "owner_id": self.owner_id, 
            "original_owner_id": self.owner_id,
            "status": self.status,
            "yield": self.yields,
            "ingredients": self.ingredients,
            "steps": self.steps,
            "notes": self.notes,
            "tags" : self.tags
            }

In [56]:
def parse_recipe (text):
  parse = RecipeParser(text)
  success = parse.run_parser()
  if success:
    return parse.get_recipe()
  else:
    return None

In [97]:
parsed = sample_dataframe.generated.apply(parse_recipe)

In [98]:
sample_dataframe["parsed"] = parsed

In [99]:
sample_dataframe

Unnamed: 0,generated,parsed
0,<RECIPE_START> <INPUT_START> cupcakes <NEXT_IN...,{'title': 'Strawberry Cupcakes with Creamy But...
1,<RECIPE_START> <INPUT_START> cupcakes <NEXT_IN...,"{'title': 'Peanut Butter Cupcakes', 'owner_id'..."
2,<RECIPE_START> <INPUT_START> cupcakes <NEXT_IN...,"{'title': 'Cupcake', 'owner_id': 1053, 'origin..."
3,<RECIPE_START> <INPUT_START> cupcakes <NEXT_IN...,"{'title': 'Carrot Cake Cupcakes', 'owner_id': ..."
4,<RECIPE_START> <INPUT_START> cupcakes <NEXT_IN...,{'title': 'Chocolate Cupcakes with Marshmallow...
...,...,...
65,<RECIPE_START> <INPUT_START> cupcakes <NEXT_IN...,"{'title': 'Chocolate Spider Web Cupcakes', 'ow..."
66,<RECIPE_START> <INPUT_START> cupcakes <NEXT_IN...,"{'title': 'Coconut Cupcakes', 'owner_id': 1053..."
67,<RECIPE_START> <INPUT_START> cupcakes <NEXT_IN...,"{'title': 'Cherry Cupcakes', 'owner_id': 1053,..."
68,<RECIPE_START> <INPUT_START> cupcakes <NEXT_IN...,"{'title': 'Candy Cane Cupcakes', 'owner_id': 1..."


In [100]:
time_created = time.time()

In [101]:
sample_dataframe.to_csv(f"./generated_recipes/{time_created}.csv", index=False) 

In [102]:
normalized = pd.json_normalize(sample_dataframe['parsed'])

In [103]:
normalized.to_csv(f"./generated_recipes/{time_created}_normalized.csv", index=False) 