In [1]:
import pandas as pd
from generate_recipe_task import generate_task_file_from_df

# Experiment configuration shared by the cells below.
recipe_id = 2                                # positional row index into the recipe CSV
actor_specs = ["Alice", "She", 1, 0.5, 0]    # element 0 is used later as the actor's name; NOTE(review): meaning of the remaining fields is defined by generate_recipe_task — confirm
transcipt_steps = 3                          # forwarded to generate_task_file_from_df; presumably the number of transcript steps — confirm

recipe_df = pd.read_csv("csv/cake_recipes_formatted.csv")
selected_row = recipe_df.iloc[recipe_id]

# Show columns 0, 3 and 4 of the selected row, one per line.
print(*(selected_row.iloc[position] for position in (0, 3, 4)), sep="\n")

Rhubarb Bread
brownsugar = Ingredient('BRSU', 'firmly packed brown sugar', 330, over='too sweet', under='too bland')
vegetableoil = Ingredient('OIL1', 'vegetable oil', 145, over='too oily', under='too dry')
buttermilk = Ingredient('BTML', 'buttermilk', 240, over='too tangy', under='too flat')
egg = Ingredient('EGG1', 'egg', 50, over='too rubbery', under='too dry')
vanilla = Ingredient('VNL1', 'vanilla', 5, over='too strong', under='too bland')
flour = Ingredient('FLR2', 'flour', 300, over='too dense', under='too wet')
salt = Ingredient('SLT1', 'salt', 6, over='too salty', under='too bland')
bakingsoda = Ingredient('BKSD', 'baking soda', 5, over='too bitter', under='too flat')
rhubarb = Ingredient('RHUB', 'finely chopped rhubarb', 183, over='too tart', under='too bland')
nuts = Ingredient('NUTS', 'chopped nuts', 60, over='too greasy', under='too plain')
sugar = Ingredient('SUGR', 'sugar', 25, over='too sweet', under='too bland')


ingredients = [brownsugar, vegetableoil, buttermilk, egg

In [2]:
# Build the task for the selected recipe from the same CSV that was previewed above.
# NOTE(review): presumably this writes recipe_task/recipe_task.py, which the next
# cell executes — confirm against generate_recipe_task.
generate_task_file_from_df('csv/cake_recipes_formatted.csv', recipe_id, actor_specs, transcipt_steps)

In [3]:
# Run the recipe task script through IPython's shell escape.
# NOTE(review): the following cell reads recipe_task/output.txt, so this script is
# presumably what writes it — confirm.
!python 'recipe_task/recipe_task.py'

In [4]:
# Load the transcript text from recipe_task/output.txt.
# A context manager is used so the file handle is closed deterministically instead
# of being left open until garbage collection.
with open("recipe_task/output.txt", "r") as transcript_file:
    output_text = transcript_file.read()
print(output_text)

Alice mixes firmly packed brown sugar, vegetable oil, buttermilk, egg and vanilla to make brown_sugar_mixture.
She mixes flour, salt and baking soda to make dry_mixture.
She stirs brown_sugar_mixture and dry_mixture to make combined_batter.



# Mistaken Ingredient Test

In [19]:
import random
from openai import OpenAI
# Module-level API client; WrongIngTest.run_test issues its requests through it.
# NOTE(review): no credentials are passed here, so they are presumably picked up
# from the environment (e.g. OPENAI_API_KEY) — confirm before sharing the notebook.
client = OpenAI()

class WrongIngTest:
    """Test whether a model can tell which recipe variant an actor is following
    after the actor adds an ingredient that the variant does not contain.

    From one CSV row the class builds "Recipe 1" (the full ingredient list) and,
    for every ingredient that does not yet appear in the transcript, one
    leave-one-out variant ("Recipe 2", "Recipe 3", ...). ``run_test`` then
    appends a sentence in which the actor mixes in that missing ingredient and
    reacts, and asks the model which recipe is being cooked. The expected answer
    is the variant that lacks the ingredient.

    Args:
        transcript: Text of the actions performed so far.
        num_steps: Stored on the instance; not otherwise used by this class.
        df_row: Row object indexed positionally through ``.iloc``.
            ``iloc[1]`` must be a string of the form ``["a", "b", ...]`` and
            ``iloc[3]`` a newline-separated block of ``Ingredient('ID', 'name', ...)``
            lines, in the same order as ``iloc[1]``.
        actor_specs: Sequence whose first element is the actor's name.
        add_ingredients: When true, each leave-one-out variant additionally
            receives one random entry from ``extra_ingredients``.
    """

    def __init__(self, transcript, num_steps, df_row, actor_specs, add_ingredients=False):
        self.transcript = transcript
        self.num_steps = num_steps
        self.df_row = df_row
        self.actor_specs = actor_specs

        # Drop the leading '["' and trailing '"]', then split on the quote/comma
        # separator to get the human-readable ingredient strings.
        self.recipe_ingredients = df_row.iloc[1][2:-2].split('", "')

        # The ingredient name is the second "', "-separated field of each line,
        # minus its opening quote. Blank or whitespace-only lines are skipped so
        # trailing newlines or stray spaces do not raise IndexError.
        self.variable_ingredients = [
            line.split("', ")[1][1:]
            for line in df_row.iloc[3].split('\n')
            if line.strip()
        ]

        # Positional pairing: the i-th parsed name maps to the i-th recipe string.
        self.ing_name_map = {self.variable_ingredients[i]: self.recipe_ingredients[i] for i in range(len(self.recipe_ingredients))}

        self.used_ingredients = []
        self.unused_ingredients = []
        self.missing_ing_map = {}
        self.outputs = {}

        self.recipe_text = ''

        # Reaction suffixes appended to the "mixes in" sentence, indexed by the
        # ``specification_level`` argument of ``run_test``.
        self.spec_level = [
            f'. {actor_specs[0]} appears shocked.',
            f' and glances back at the recipe. {actor_specs[0]} appears shocked.',
            f' and glances back at the recipe. {actor_specs[0]} appears shocked and begins trying to remove them.',
        ]

        # Distractor ingredients that can be appended to the variants.
        self.extra_ingredients = [
            '2 tbsp orange zest',
            '2 tbsp lemon zest',
            '5 Oreo cookies',
            '1/2 cup strawberry jam',
            '1/2 cup grape jelly',
            '1/2 cup orange marmalade',
        ]

        self.perturb_recipe(add_ingredients=add_ingredients)


    def perturb_recipe(self, add_ingredients=False):
        """Split ingredients into used/unused and build the candidate recipe text.

        Populates ``used_ingredients``, ``unused_ingredients``, ``recipe_text``
        and ``missing_ing_map`` (unused ingredient name -> label of the variant
        that omits it).

        Args:
            add_ingredients: When true, append one random entry of
                ``extra_ingredients`` to every leave-one-out variant.
        """
        # NOTE(review): this is a plain substring test, so a short name such as
        # 'sugar' counts as used whenever a longer name containing it (e.g.
        # 'firmly packed brown sugar') appears in the transcript. Left unchanged
        # because tightening it would alter which variants are generated.
        used_ings = []
        unused_ings = []
        for variable_name in self.variable_ingredients:
            if variable_name in self.transcript:
                used_ings.append(variable_name)
            else:
                unused_ings.append(variable_name)
        self.used_ingredients = used_ings
        self.unused_ingredients = unused_ings

        # Rebuilt from scratch so calling this method again does not keep stale entries.
        self.missing_ing_map = {}
        self.recipe_text = f'Recipe 1\n {self.recipe_ingredients}\n\n'

        for i, unused in enumerate(self.unused_ingredients):
            # Leave-one-out: every ingredient except the one under test.
            loo = self.variable_ingredients.copy()
            loo.remove(unused)
            loo_names = [self.ing_name_map[ing] for ing in loo]
            if add_ingredients:
                loo_names.append(random.choice(self.extra_ingredients))
            # Variants are numbered from 2 because "Recipe 1" is the full recipe.
            self.recipe_text += f'Recipe {i + 2}\n {loo_names}\n\n'
            self.missing_ing_map[unused] = f'Recipe {i + 2}'


    def run_test(self, specification_level=0, model='gpt-5', effort='medium', n=5, prompt=None, api_client=None):
        """Query the model ``n`` times for every unused ingredient.

        Args:
            specification_level: Index into ``spec_level`` selecting the reaction
                sentence appended after the mistaken ingredient.
            model: Model name forwarded to the API.
            effort: Reasoning effort forwarded to the API.
            n: Number of samples per ingredient.
            prompt: Task instruction; a default asking for the recipe name is
                used when omitted.
            api_client: Object exposing ``responses.create(...)``. Defaults to the
                module-level ``client``.

        Returns:
            dict mapping each unused ingredient to the list of raw model outputs.
            The same dict is stored in ``self.outputs``.
        """
        self.outputs = {}
        if prompt is None:
            prompt = f'Given a transcript of actions performed by {self.actor_specs[0]}, output the name of the recipe {self.actor_specs[0]} is trying to cook.'
        if api_client is None:
            api_client = client

        # Identical for every request, so build it once.
        instructions = (
            f"Below are a list of recipes {self.actor_specs[0]} may be attempting to make, followed by their ingredients: \n"
            f"{self.recipe_text}"
            f"{prompt}"
        )

        for ing in self.unused_ingredients:
            mistake_text = f"{self.actor_specs[0]} mixes in the {ing}{self.spec_level[specification_level]}"
            self.outputs[ing] = []
            print(f"Running recipe with '{ing}', spec level {specification_level}.")
            print('[', end='')
            for _ in range(n):
                response = api_client.responses.create(
                    model=model,
                    reasoning={"effort": effort},
                    instructions=instructions,
                    # Use the transcript this instance was built from rather than
                    # whatever a notebook global happens to hold.
                    input=self.transcript + mistake_text,
                )
                self.outputs[ing].append(response.output_text)
                print('*', end='')
            print(']')
        return self.outputs


    def print_results(self, seperate_outputs=False):
        """Print expected vs. actual answers for every unused ingredient.

        Args:
            seperate_outputs: When false, print the outputs as one list followed
                by the exact-match accuracy. When true, print each output on its
                own, separated by rules (no accuracy is computed).
        """
        print('='*20)
        for ing in self.unused_ingredients:
            expected = self.missing_ing_map[ing]
            # Tolerate print_results being called before run_test.
            recorded = self.outputs.get(ing, [])
            print(f'Expected: {expected}')
            if not seperate_outputs:
                print(f'Actual: {recorded}')
                if recorded:
                    # Surrounding whitespace is ignored so an answer with a
                    # trailing newline is not scored as wrong.
                    correct = sum(1 for output in recorded if output.strip() == expected)
                    print(f'Accuracy: {correct / len(recorded)}')
                else:
                    print('Accuracy: n/a (no outputs recorded)')
            else:
                print('Actual:')
                print('-'*20)
                for output in recorded:
                    print(f'Output: {output}')
                    print('-'*20)
            print('='*20)

In [None]:
# Run the 50-sample experiment twice: first with a random distractor ingredient
# appended to each variant, then without. A blank line separates the two reports.
for run_index, with_extras in enumerate((True, False)):
    if run_index:
        print()
    test = WrongIngTest(output_text, transcipt_steps, recipe_df.iloc[recipe_id], actor_specs, add_ingredients=with_extras)
    test.run_test(n=50)
    test.print_results()

Running recipe with 'finely chopped rhubarb', spec level 0.
[

In [16]:
# 10 samples per unused ingredient, with a distractor ingredient added to each variant.
test = WrongIngTest(
    transcript=output_text,
    num_steps=transcipt_steps,
    df_row=recipe_df.iloc[recipe_id],
    actor_specs=actor_specs,
    add_ingredients=True,
)
test.run_test(n=10)
test.print_results()

Running recipe with 'finely chopped rhubarb', spec level 0.
[**********]
Running recipe with 'chopped nuts', spec level 0.
[**********]
Expected: Recipe 2
Actual: ['Recipe 2', 'Recipe 2', 'Recipe 1', 'Recipe 1', 'Recipe 2', 'Recipe 2', 'Recipe 2', 'Recipe 2', 'Recipe 2', 'Recipe 2']

Expected: Recipe 3
Actual: ['Recipe 3', 'Recipe 3', 'Recipe 3', 'Recipe 1', 'Recipe 3', 'Recipe 3', 'Recipe 3', 'Recipe 3', 'Recipe 2', 'Recipe 3']



In [17]:
# 10 samples per unused ingredient, with no distractor ingredients.
test = WrongIngTest(
    transcript=output_text,
    num_steps=transcipt_steps,
    df_row=recipe_df.iloc[recipe_id],
    actor_specs=actor_specs,
    add_ingredients=False,
)
test.run_test(n=10)
test.print_results()

Running recipe with 'finely chopped rhubarb', spec level 0.
[**********]
Running recipe with 'chopped nuts', spec level 0.
[**********]
Expected: Recipe 2
Actual: ['Recipe 3', 'Recipe 3', 'Recipe 2', 'Recipe 2', 'Recipe 2', 'Recipe 2', 'Recipe 2', 'Recipe 2', 'Recipe 3', 'Recipe 2']

Expected: Recipe 3
Actual: ['Recipe 3', 'Recipe 1', 'Recipe 1', 'Recipe 3', 'Recipe 3', 'Recipe 3', 'Recipe 1', 'Recipe 2', 'Recipe 3', 'Recipe 3']



In [71]:
# Default variants (no distractors), sampled 10 times at high reasoning effort.
test = WrongIngTest(
    transcript=output_text,
    num_steps=transcipt_steps,
    df_row=recipe_df.iloc[recipe_id],
    actor_specs=actor_specs,
)
test.run_test(effort='high', n=10)
test.print_results()

Running recipe with 'finely chopped rhubarb', spec level 0.
Running recipe with 'chopped nuts', spec level 0.
Expected: Recipe 2
Actual: ['Recipe 2', 'Recipe 3', 'Recipe 2', 'Recipe 3', 'Recipe 2', 'Recipe 3', 'Recipe 2', 'Recipe 3', 'Recipe 2', 'Recipe 3']
Expected: Recipe 3
Actual: ['Recipe 1', 'Recipe 3', 'Recipe 3', 'Recipe 3', 'Recipe 3', 'Recipe 1', 'Recipe 3', 'Recipe 3', 'Recipe 1', 'Recipe 1']


In [10]:
# Distractor variants with a custom prompt that asks for step-by-step reasoning;
# each response is printed separately because the answers are free-form text.
test = WrongIngTest(
    transcript=output_text,
    num_steps=transcipt_steps,
    df_row=recipe_df.iloc[recipe_id],
    actor_specs=actor_specs,
    add_ingredients=True,
)
test.run_test(
    n=10,
    prompt=f'Given a transcript of actions performed by {actor_specs[0]}, output the name of the recipe {actor_specs[0]} is trying to cook. First, think step by step and provide reasoning based on her actions and reactions before giving your answer.',
)
test.print_results(seperate_outputs=True)

Recipe 1
 ['1 1/2 c. firmly packed brown sugar', '2/3 c. vegetable oil', '1 c. buttermilk', '1 egg', '1 tsp. vanilla', '2 1/2 c. flour', '1 tsp. salt', '1 tsp. baking soda', '1 1/2 c. finely chopped rhubarb', '1/2 c. chopped nuts', '2 Tbsp. sugar']

Recipe 2
 ['1 1/2 c. firmly packed brown sugar', '2/3 c. vegetable oil', '1 c. buttermilk', '1 egg', '1 tsp. vanilla', '2 1/2 c. flour', '1 tsp. salt', '1 tsp. baking soda', '1/2 c. chopped nuts', '2 Tbsp. sugar', '2 tbsp lemon zest']

Recipe 3
 ['1 1/2 c. firmly packed brown sugar', '2/3 c. vegetable oil', '1 c. buttermilk', '1 egg', '1 tsp. vanilla', '2 1/2 c. flour', '1 tsp. salt', '1 tsp. baking soda', '1 1/2 c. finely chopped rhubarb', '2 Tbsp. sugar', '5 Oreo cookies']


Running recipe with 'finely chopped rhubarb', spec level 0.
Running recipe with 'chopped nuts', spec level 0.
Expected: Recipe 2
Actual:
--------------------
Output: - The initial wet and dry mixtures match the common base for all three recipes.
- She then adds finely