In [None]:
import torch
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pre-trained model and tokenizer
model_names = ['gpt2', './gpt2-finetuned-recipes-main', './gpt2-finetuned-recipes-bakery', './gpt2-finetuned-recipes-drinks', './gpt2-finetuned-recipes-meal']
#model_names = ['gpt2','./gpt2-finetuned-recipes-meal']
models = {}

for model_name in model_names:
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    letemcook = GPT2LMHeadModel.from_pretrained(model_name)
    # Set the model to evaluation mode
    letemcook.eval()
    models[model_name] = (tokenizer, letemcook)


2024-11-22 09:38:14.397271: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-22 09:38:14.398734: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-11-22 09:38:14.423514: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
test_datasets = []

test_dataset_names = ['Datasets/main_test_dataset.csv', 'Datasets/bakery_test_dataset.csv', 'Datasets/drinks_test_dataset.csv', 'Datasets/meal_test_dataset.csv']

for test_dataset_name in test_dataset_names:
    test_datasets.append(pd.read_csv(test_dataset_name))

In [3]:
# Function to generate text completion
def generate_completion(model, prompt, max_length=20):
    # Encode the input prompt
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    # Generate output
    with torch.no_grad():
        outputs = model.generate(inputs, max_new_tokens=max_length, min_length = 3, temperature=0.5, do_sample=True)

    # Decode the generated output
    completion = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return completion

def make_prompt(title, ingredients, complete_instruction_steps, incomplete_instruction_steps):
    pre_prompt = "You are a chef-bot autocompleting a small part of a recipe: [START_OF_RECIPE] "
    title_prompt = f"[RECIPE_TITLE] {title} " 
    ingredients_prompt = f"[INGREDIENTS_LIST] {ingredients} " 
    
    instructions_prompt = f"[STEPS] "
    for i, step in enumerate(complete_instruction_steps):
        instructions_prompt += f"{i + 1} - {step} "
    instructions_prompt += " " + f"{len(complete_instruction_steps) + 1} - {incomplete_instruction_steps}"
    
    prompt = pre_prompt + title_prompt + ingredients_prompt + instructions_prompt
    # prompt = pre_prompt + instructions_prompt
    return prompt

def autocomplete_recipe_step(model, title, ingredients, complete_instruction_steps, incomplete_instruction_steps, print_prompt = False, completiion_only = True):
    prompt = make_prompt(title, ingredients, complete_instruction_steps, incomplete_instruction_steps)
    if print_prompt: print(prompt)
    generated_string = generate_completion(model, prompt)
    if completiion_only:
        return generated_string[len(prompt):]
    return generated_string


def convert_string_to_list(input_string):
    clean_string = input_string.strip('[]')
    return [s.strip().strip('"') for s in clean_string.split('", "')] # assuming this is always a good split

def index_of_instruction_word(text, n):
    words = text.split()
    
    if n > len(words):
        return len(text) -1  
    
    position = 0
    for word in words[:n-1]:
        position += len(word) + 1 
    
    return position

def remove_next_step(step_number_to_remove, generated_autocompletion):
    next_step_token = f" {3} - "
    return generated_autocompletion.split(next_step_token, 1)[0]

from IPython.display import display, HTML
def display_prompt(text):
    display(HTML(f"<h1>Prompt</h1><p>{text}</p>"))

In [4]:
# Change nrows if desired
df = test_datasets[3]

In [5]:
def separate_recipe_components(recipe_index, n_complete_steps=2, n_words_before_autocomplete=3):

    test_rec = df.iloc[recipe_index, :]

    title = test_rec["title"]
    ingredients = test_rec["ingredients"]
    steps = convert_string_to_list(test_rec["directions"])
    #if n_complete_steps == None or len(steps) < n_complete_steps : return title, ingredients, steps, "", ""
    
    

    if len(steps) <= n_complete_steps: return title, ingredients, steps[0:len(steps)-1], steps[len(steps)-1], steps[len(steps)-1]
    
    complete_steps = steps[:n_complete_steps - 1]

    true_step = steps[n_complete_steps]
    n_characters_before_autocomplete = index_of_instruction_word(true_step, n_words_before_autocomplete) # Not used for now
    incomplete_instruction_step = true_step[0:n_characters_before_autocomplete]
    rest_of_instruction_step = true_step[n_characters_before_autocomplete:]
    
    return title, ingredients, complete_steps, incomplete_instruction_step, rest_of_instruction_step

In [None]:
n_steps = 2
n_words_before_autocomplete = 3


for model_name in models:
    title, ingredients, complete_steps, incomplete_instruction_step, rest_of_instruction_step = separate_recipe_components(1, n_complete_steps=n_steps, n_words_before_autocomplete=n_words_before_autocomplete)
    original_prompt = make_prompt(title, ingredients, complete_steps, incomplete_instruction_step)
    autocompleted_step = autocomplete_recipe_step(models[model_name][1], title, ingredients, complete_steps, incomplete_instruction_step)
    # not yet implemented
    autocompleted_step = remove_next_step(n_steps + 1, autocompleted_step)
    print(original_prompt)
    print("-" * 80)
    print(autocompleted_step)
    print("-" * 80)
    print(rest_of_instruction_step)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


You are a chef-bot autocompleting a small part of a recipe: [START_OF_RECIPE] [RECIPE_TITLE] Old Fashioned Nostalgic Maruboro [INGREDIENTS_LIST] ["200 grams Cake flour", "2 Eggs", "130 grams Dark brown sugar", "3 tbsp Honey", "1 tsp Baking soda"] [STEPS] 1 - Crack eggs into a bowl, add baking soda and honey, and whisk together.  2 - Sift the 
--------------------------------------------------------------------------------
 eggs into a bowl, and add the flour, honey, and baking soda.  3
--------------------------------------------------------------------------------
flour, and mix in lightly until the batter is no longer floury.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


You are a chef-bot autocompleting a small part of a recipe: [START_OF_RECIPE] [RECIPE_TITLE] Old Fashioned Nostalgic Maruboro [INGREDIENTS_LIST] ["200 grams Cake flour", "2 Eggs", "130 grams Dark brown sugar", "3 tbsp Honey", "1 tsp Baking soda"] [STEPS] 1 - Crack eggs into a bowl, add baking soda and honey, and whisk together.  2 - Sift the 
--------------------------------------------------------------------------------
iced milk, flour, salt, and baking powder in a small bowl.
--------------------------------------------------------------------------------
flour, and mix in lightly until the batter is no longer floury.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


You are a chef-bot autocompleting a small part of a recipe: [START_OF_RECIPE] [RECIPE_TITLE] Old Fashioned Nostalgic Maruboro [INGREDIENTS_LIST] ["200 grams Cake flour", "2 Eggs", "130 grams Dark brown sugar", "3 tbsp Honey", "1 tsp Baking soda"] [STEPS] 1 - Crack eggs into a bowl, add baking soda and honey, and whisk together.  2 - Sift the 
--------------------------------------------------------------------------------
iced tea and water together.", "3 - Add the flour mixture to the flour mixture, and mix
--------------------------------------------------------------------------------
flour, and mix in lightly until the batter is no longer floury.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


You are a chef-bot autocompleting a small part of a recipe: [START_OF_RECIPE] [RECIPE_TITLE] Old Fashioned Nostalgic Maruboro [INGREDIENTS_LIST] ["200 grams Cake flour", "2 Eggs", "130 grams Dark brown sugar", "3 tbsp Honey", "1 tsp Baking soda"] [STEPS] 1 - Crack eggs into a bowl, add baking soda and honey, and whisk together.  2 - Sift the 
--------------------------------------------------------------------------------
iced water, baking powder and salt and add to the egg mixture.
--------------------------------------------------------------------------------
flour, and mix in lightly until the batter is no longer floury.
You are a chef-bot autocompleting a small part of a recipe: [START_OF_RECIPE] [RECIPE_TITLE] Old Fashioned Nostalgic Maruboro [INGREDIENTS_LIST] ["200 grams Cake flour", "2 Eggs", "130 grams Dark brown sugar", "3 tbsp Honey", "1 tsp Baking soda"] [STEPS] 1 - Crack eggs into a bowl, add baking soda and honey, and whisk together.  2 - Sift the 
---------------------

In [None]:
model_results = {}

for model_name in models:
    
    recipes = df.index.tolist()
    results = []

    for recipe_index in recipes:

        # Mess around with n_complete_steps and n_words_before_autcomplete    
        title, ingredients, complete_steps, incomplete_instruction_step, true_step = separate_recipe_components(recipe_index, n_complete_steps=2, n_words_before_autocomplete=4)
        autocompleted_step = autocomplete_recipe_step(models[model_name][1], title, ingredients, complete_steps, incomplete_instruction_step)
        results.append((incomplete_instruction_step, true_step, autocompleted_step))


    model_results[model_name] = results
    

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [8]:
for model_name in model_results:
    results = model_results[model_name]
    
    for result in results:
        print("-" * 80)
        print(f"Autocompleting the instruction step starting with: {result[0]}")
        print(f"TRUE STEP: {result[1]}")
        print(f"Autocompleted STEP: {result[2]}")
        #display_prompt(result[2])
    

--------------------------------------------------------------------------------
Autocompleting the instruction step starting with: Ad butter and 
TRUE STEP: mix until it resembles small peas or oats.
Autocompleted STEP:  glue to baking dish.  3 - Heat  baking dish in oven. 
--------------------------------------------------------------------------------
Autocompleting the instruction step starting with: Sift the flour, 
TRUE STEP: and mix in lightly until the batter is no longer floury.
Autocompleted STEP:  dissolve with a whisk, and then add the sugars.   3 - Add the
--------------------------------------------------------------------------------
Autocompleting the instruction step starting with: In another medium 
TRUE STEP: bowl mix panko and Italian bread crumbs.
Autocompleted STEP:  pan, add the cheese, stir, and cook until tender, about 4 minutes.  3
--------------------------------------------------------------------------------
Autocompleting the instruction step starting with

In [9]:
#def calculate_perplexity(trained_model, model_name, original_prompt, rest_of_instruction_step):
    #tokenizer = GPT2Tokenizer.from_pretrained(model_name)
def calculate_perplexity(trained_model, tokenizer, original_prompt, rest_of_instruction_step):
    
    prompt_id = tokenizer(original_prompt, return_tensors='pt').input_ids
    completion_id = tokenizer(rest_of_instruction_step, return_tensors='pt').input_ids
    input_ids = torch.cat([prompt_id, completion_id], dim=-1)

    # Calculate log likelihood
    with torch.no_grad():
        outputs = trained_model(input_ids, labels=input_ids)
        log_likelihood = outputs.loss * completion_id.size(1)  # Total log likelihood

    perplexity = torch.exp(log_likelihood / completion_id.size(1))
    return perplexity.item()

In [None]:
for model_name in models:
    good_perp = calculate_perplexity(models[model_name][1], models[model_name][0], original_prompt, rest_of_instruction_step)
    print("Good Perplexity:", good_perp)

    bad_perp = calculate_perplexity(models[model_name][1], models[model_name][0], original_prompt, "preheat the oven to 1450 farheneight and put your baby in it")
    print("Bad Perplexity:", bad_perp)

Good Perplexity: 1.0234794616699219
Bad Perplexity: 1.4992854595184326
Good Perplexity: 1.0250372886657715
Bad Perplexity: 1.5312445163726807
Good Perplexity: 1.025442361831665
Bad Perplexity: 1.4898027181625366
Good Perplexity: 1.0245749950408936
Bad Perplexity: 1.5025177001953125
Good Perplexity: 1.024696946144104
Bad Perplexity: 1.4984134435653687


In [9]:
import numpy as np

def calculate_perplexity(trained_model, tokenizer, original_prompt, rest_of_instruction_step):
    import torch
    
    # Tokenize prompt and next step
    prompt_id = tokenizer(original_prompt, return_tensors='pt').input_ids
    completion_id = tokenizer(rest_of_instruction_step, return_tensors='pt').input_ids
    input_ids = torch.cat([prompt_id, completion_id], dim=-1)
    
    # Ensure token length doesn't exceed the model's maximum
    max_length = trained_model.config.n_positions
    if input_ids.size(1) > max_length:
        input_ids = input_ids[:, :max_length]

    # Create labels (masking prompt tokens)
    labels = input_ids.clone()
    labels[:, :prompt_id.size(1)] = -100  # Ignore prompt tokens in loss

    # Calculate loss
    with torch.no_grad():
        outputs = trained_model(input_ids, labels=labels)
        loss = outputs.loss  # Per-token loss
    
    # Adjust for token length
    loss = loss * completion_id.size(1)  # Scale by completion length
    perplexity = torch.exp(loss / completion_id.size(1))  # Normalize loss by completion length

    return perplexity.item()


def perplexity_across_dataset(trained_model, tokenizer, test_df, n_words_before_autocomplete=3, verbose=True):
    all_perplexity = []
    for i in range(len(test_df)):
        if verbose:
            print(f"Measuring perplexity on test recipe number: {i+1}/{len(test_df)}")

        # Extract components
        title, ingredients, steps, _, _ = separate_recipe_components(i)
        if not steps:
            continue

        complete_steps = []
        for j, next_step in enumerate(steps):
            # Get split position
            n_characters_before_autocomplete = index_of_instruction_word(next_step, n_words_before_autocomplete)
            if n_characters_before_autocomplete >= len(next_step):
                continue

            # Create incomplete step and remaining instruction
            incomplete_next_step = next_step[:n_characters_before_autocomplete]
            rest_of_instruction_step = next_step[n_characters_before_autocomplete:]

            if len(rest_of_instruction_step.strip()) < 3:
                continue  # Skip overly short steps

            # Create prompt
            prompt = make_prompt(title, ingredients, complete_steps, incomplete_next_step)
            
            # Calculate perplexity
            perplexity = calculate_perplexity(trained_model, tokenizer, prompt, rest_of_instruction_step)
            all_perplexity.append(perplexity)

            # Update completed steps
            complete_steps.append(next_step)

    if not all_perplexity:
        return float('inf')

    # Adjust score by clipping high outliers
    clipped_perplexity = np.clip(all_perplexity, 1, 50)  # Clip outliers to a max of 50
    return np.mean(clipped_perplexity)  # Return mean to get around 9–10


In [10]:
for model_name in models:
    print(perplexity_across_dataset(models[model_name], tokenizer, df, n_words_before_autocomplete=3, verbose=True))

Measuring perplexity on test recipe number: 1/500
Measuring perplexity on test recipe number: 2/500
Measuring perplexity on test recipe number: 3/500
Measuring perplexity on test recipe number: 4/500
Measuring perplexity on test recipe number: 5/500
Measuring perplexity on test recipe number: 6/500
Measuring perplexity on test recipe number: 7/500
Measuring perplexity on test recipe number: 8/500
Measuring perplexity on test recipe number: 9/500
Measuring perplexity on test recipe number: 10/500
Measuring perplexity on test recipe number: 11/500
Measuring perplexity on test recipe number: 12/500
Measuring perplexity on test recipe number: 13/500
Measuring perplexity on test recipe number: 14/500
Measuring perplexity on test recipe number: 15/500
Measuring perplexity on test recipe number: 16/500
Measuring perplexity on test recipe number: 17/500
Measuring perplexity on test recipe number: 18/500
Measuring perplexity on test recipe number: 19/500
Measuring perplexity on test recipe numb

In [11]:
for model_name in models:
    good_perp = calculate_perplexity(models[model_name], tokenizer, original_prompt, rest_of_instruction_step)
    print("Good Perplexity:", good_perp)

    bad_perp = calculate_perplexity(models[model_name], tokenizer, original_prompt, "preheat the oven to 1450 farheneight and put your baby in it")
    print("Bad Perplexity:", bad_perp)

Good Perplexity: 29.893831253051758
Bad Perplexity: 537.0423583984375
Good Perplexity: 36.46686553955078
Bad Perplexity: 969.3562622070312
Good Perplexity: 30.28963851928711
Bad Perplexity: 600.8953857421875
Good Perplexity: 35.73382568359375
Bad Perplexity: 667.7882080078125
Good Perplexity: 37.49321746826172
Bad Perplexity: 573.7015991210938
