# Query GPT4

I'm going to automate my first-pass GPT-4 data generation using the API.

In [15]:
from pathlib import Path
import os
from openai import OpenAI
from dotenv import load_dotenv
import random
import time
import pandas as pd
from tqdm.notebook import tqdm

# Pull settings from a local .env file before anything reads os.environ.
load_dotenv()

# Convenience patch: `some_dir.ls()` returns the directory's entries as a list.
Path.ls = lambda directory: [*directory.iterdir()]

In [16]:
# ID of the assistant to run, taken from the environment (None when unset).
ASST_ID = os.getenv("ASST_ID")

# Project layout, relative to the notebook's working directory.
PROJECT = Path("../../")
PROMPT = PROJECT.joinpath("prompt.txt")
DATA = PROJECT.joinpath("data")

# Number of entries in the recipes directory.
len(list((DATA / "recipes").iterdir()))

426

In [17]:
# Load the recipe metadata table and preview its first rows.
df = pd.read_csv(DATA.joinpath("recipes-meta.csv"))
df.head(n=5)

Unnamed: 0,_id,name,ingredients,url,image,ts,cookTime,source,recipeYield,datePublished,prepTime,description,totalTime,creator,recipeCategory,dateModified,recipeInstructions,simple_url
0,{'$oid': '516b456d96cc6251ae131a46'},Green Curry Broth,1 tablespoon coriander seeds\n1 1/2 teaspoon w...,http://www.101cookbooks.com/archives/green-cur...,http://www.101cookbooks.com/mt-static/images/f...,{'$date': 1365984621817},PT15M,101cookbooks,Serves 4.,2010-08-24,PT20M,"A beautiful, thin green curry broth, fragrant ...",,,,,,101cookbooks.com
1,{'$oid': '516b454f96cc6251ae1319e7'},Heirloom Beans &amp; Seitan Recipe,"1 small-medium head of broccoli or broccolini,...",http://www.101cookbooks.com/archives/heirloom-...,http://www.101cookbooks.com/mt-static/images/f...,{'$date': 1365984591273},PT20M,101cookbooks,,2009-11-30,PT10M,This is simply heirloom beans and roasted broc...,,,,,,101cookbooks.com
2,{'$oid': '516b451796cc6251ae13196d'},Spiced Caramel Corn,"4 ounces whole macadamia nuts (about 1 cup), o...",http://www.101cookbooks.com/archives/000169.html,http://www.101cookbooks.com/mt-static/images/f...,{'$date': 1365984535371},,101cookbooks,,2005-05-24,,101 Cookbooks: From the Archives: Spiced Caram...,,,,,,101cookbooks.com
3,{'$oid': '516b455196cc6251ae1319ed'},Buttermilk Squash Soup Recipe,1 teaspoon cumin seeds\n1/4 cup / 2 oz / 55g u...,http://www.101cookbooks.com/archives/buttermil...,http://www.101cookbooks.com/mt-static/images/f...,{'$date': 1365984593142},PT40M,101cookbooks,,2010-07-06,PT15M,A nice way to use up a good amount of summer s...,,,,,,101cookbooks.com
4,{'$oid': '516b45b296cc6251ae131b0a'},Honey-sweetened Thumbprint Cookie Recipe,2/3 cup honey (I use a clover honey)\n1/3 cup ...,http://www.101cookbooks.com/archives/honeyswee...,http://www.101cookbooks.com/mt-static/images/f...,{'$date': 1365984690972},,101cookbooks,Makes a few dozen cookies.,2009-03-05,,A simple thumbprint cookie recipe. Made with w...,,,,,,101cookbooks.com


## Setup

In [18]:
# Passing the key explicitly mirrors the SDK default and could be omitted.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Fetch the assistant identified by ASST_ID; later cells run it on each recipe.
assistant = client.beta.assistants.retrieve(ASST_ID)

In [19]:
import logging

# Show this notebook's INFO-level progress messages...
logging.basicConfig(level=logging.INFO)

# ...but only let warnings and above through from the "httpx" logger.
_httpx_logger = logging.getLogger("httpx")
_httpx_logger.setLevel(logging.WARNING)


def get_response(recipe, poll_interval=1):
    """Send a recipe to the assistant and return the resulting thread messages.

    Creates a fresh thread, posts `recipe` as the user message, starts a run of
    the module-level `assistant` on it and polls the run until it finishes.

    Args:
        recipe: Recipe text to post as the user message.
        poll_interval: Seconds to wait between status checks.

    Returns:
        The result of `client.beta.threads.messages.list` for the thread.

    Raises:
        RuntimeError: If the run stops in a state from which it cannot reach
            "completed". Without this check the polling loop never exits.
    """
    # States this function cannot recover from. "requires_action" is included
    # because no tool outputs are ever submitted here, so such a run would
    # only sit waiting.
    dead_end_statuses = {
        "failed",
        "cancelled",
        "expired",
        "incomplete",
        "requires_action",
    }

    thread = client.beta.threads.create()
    client.beta.threads.messages.create(
        thread_id=thread.id,
        role="user",
        content=recipe,
    )
    run = client.beta.threads.runs.create(
        thread_id=thread.id, assistant_id=assistant.id, instructions=""
    )

    logging.info("Waiting for response...")
    while True:
        run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
        if run.status == "completed":
            break
        if run.status in dead_end_statuses:
            raise RuntimeError(
                f"Run {run.id} on thread {thread.id} ended with status {run.status!r}"
            )
        # Only sleep when another poll is actually needed.
        time.sleep(poll_interval)
    logging.info("Response received 🥳")

    return client.beta.threads.messages.list(thread_id=thread.id)


def get_file_id(messages):
    """Return the ID of the file attached to the first message, if any.

    Looks at the first entry of the dumped message list and scans the
    annotations of its content parts for a `file_path` annotation.

    Args:
        messages: Object exposing `model_dump()` that returns a dict with a
            "data" list of message dicts.

    Returns:
        The `file_id` of the first `file_path` annotation found, or None when
        there are no messages, no content, or no such annotation.
    """
    payload = messages.model_dump()

    # `or []` also covers keys that are present but empty/None, which a plain
    # `.get(key, default)[0]` would turn into an IndexError/TypeError.
    data = payload.get("data") or []
    if not data:
        return None

    for part in data[0].get("content") or []:
        text = part.get("text") or {}
        # Check every annotation, not just the first: a file_path annotation
        # may follow other annotation kinds.
        for annotation in text.get("annotations") or []:
            file_id = (annotation.get("file_path") or {}).get("file_id")
            if file_id is not None:
                return file_id
    return None


def download_file(file_id, fp):
    """Download the content of an OpenAI file to a local path.

    Args:
        file_id: ID of the file to fetch.
        fp: Destination path; opened in binary write mode.

    Returns:
        True if the content was written to `fp`, False if the API answered
        with a status other than 200 (in which case nothing is written).
    """
    api_response = client.files.with_raw_response.retrieve_content(file_id)
    if api_response.status_code != 200:
        # Previously this case was dropped silently, leaving no trace of the
        # missing file.
        logging.warning(
            f"Download of {file_id} failed with status "
            f"{api_response.status_code}; nothing written to {fp}"
        )
        return False

    with open(fp, "wb") as f:
        f.write(api_response.content)
    logging.info(f"File downloaded successfully to {fp}")
    return True

## Process recipes

In [20]:
# A recipe counts as generated once its directory contains a gantt.tsv.
already_generated = [
    tsv.parent.name for tsv in (DATA / "recipes").glob("*/gantt.tsv")
]

# Keep only the metadata rows whose recipe still has no chart.
pending_mask = ~df["name"].isin(already_generated)
gdf = df.loc[pending_mask].copy()
len(gdf)

138

In [None]:
# Names of recipes for which the assistant returned no downloadable file.
missed = []

# The progress total must match the frame being iterated (gdf, the recipes
# still to do), not the full metadata table.
for name in tqdm(gdf["name"], total=len(gdf)):
    recipe_dir = DATA / "recipes" / name
    assert recipe_dir.exists(), f"Missing recipe directory: {recipe_dir}"
    logging.info(f"Processing {recipe_dir.parts[-1]}")

    recipe = (recipe_dir / "recipe.txt").read_text()
    messages = get_response(recipe)
    file_id = get_file_id(messages)
    if file_id is not None:
        download_file(file_id, recipe_dir / "gantt.tsv")
    else:
        logging.info("No file produced")
        missed.append(recipe_dir.parts[-1])

# save missed to a file
with open("missed.txt", "w") as f:
    f.write("\n".join(missed))

In [29]:
# Every generated chart, one per recipe directory.
tsv_paths = list((DATA / "recipes").glob("*/gantt.tsv"))
already_generated = [tsv.parent.name for tsv in tsv_paths]
print(f"Number of recipes: {len(already_generated)}")

# total storage used by tsv files
total_bytes = sum(tsv.stat().st_size for tsv in tsv_paths)
dataset_size = total_bytes / 2**20
print(f"Total dataset size: {dataset_size:.2f} MiB")

Number of recipes: 288
Total dataset size: 0.47 MiB


So we have a very small dataset, but I think it's worth having a go at finetuning with it, at least as a POC. I'm not keen to spend more of my own money generating a bigger dataset. Hopefully I can get _some_ improvements and then show that all this problem needs is a bigger dataset.

## Collate and convert

Going to load up all the data and convert it into a single JSONL file.

In [35]:
# One record per generated recipe: the raw recipe text and its Gantt TSV.
# The texts go straight into the dict rather than into variables named
# `input`/`output`, which would rebind the `input` builtin for the whole kernel.
rows = []
for recipe in already_generated:
    dp = DATA / "recipes" / recipe
    rows.append(
        {
            "input": (dp / "recipe.txt").read_text(),
            "output": (dp / "gantt.tsv").read_text(),
        }
    )

# Naming the columns keeps the frame's schema stable even when `rows` is empty.
odf = pd.DataFrame(rows, columns=["input", "output"])

Now I need to add the instruction column. I'm really interested in training a model that can map directly from recipe to gantt chart, without the painful CoT prompting that I had to do with GPT4. That way it should be quicker and maybe less prone to differences in formatting etc?

I think this leaves me with two options:
1. Write a short prompt for the instruction column explaining the desired output similar to the GPT4 prompt but importantly _without_ the CoT.
    - This might give the model somewhere to start from (important with such a small dataset)
    - However, it seems a bit strange/redundant to be giving the same instruction in every example. This information would be redundant and would increase computational overhead.
1. Put the input into the instruction column and provide no input.
    - The model _should_ simply treat the instructions similarly to the input (the only difference is where they appear in the prompt)
    - The task _should_ be implicitly learnable from the input and output pairs
    - This option should have a smaller computational overhead
    - I'd just be worried that it might be a bit too much to ask the model to infer the structure of the task when I have such a small training set

In light of these options, I think I'll include an instruction column (with identical instructions for each) for now and then I can always choose to include it / exclude it later depending on what I find.

### Instructions

Your task is to transform cooking recipes from raw text into a Gantt chart .tsv file which conveys all the same information but graphically so one can see which ingredients are involved in each step. In the end, we wish to produce a downloadable .tsv file containing a table. It will be structured as follows:

- the column headers will contain the full text description of each step in the recipe (verbatim as in the original recipe)
- Each row will refer to a different ingredient
- If a particular ingredient is used in a particular method step then the corresponding cell is marked with an “X” otherwise it’s left blank

Tip: It's very important that you break down every single ingredient verbatim (with any preparation information - no changes!) as a separate row and copy the method descriptions and verbatim (without making any changes!) from each step to each column header.

Here’s an example:

```
Ingredients

vegetable oil
2 large free-range eggs
100 g plain flour
100 ml milk

Method

1. Preheat the oven to 225°C/425°F/gas 9.
2. Get yourself a cupcake tin and add a tiny splash of vegetable oil into each of the 12 compartments.
3. Pop into the oven for 10 to 15 minutes so the oil gets really hot.
4. Meanwhile, beat the eggs, flour, milk and a pinch of salt and pepper together in a jug until light and smooth.
5. Carefully remove the tray from the oven, then confidently pour the batter evenly into the compartments.
6. Pop the tray back in the oven to cook for 12 to 15 minutes, or until risen and golden.
```

would output this tsv file:
```
Preheat the oven to 225°C/425°F/gas 9.	Get yourself a cupcake tin and add a tiny splash of vegetable oil into each of the 12 compartments.	Pop into the oven for 10 to 15 minutes so the oil gets really hot.	Meanwhile, beat the eggs, flour, milk and a pinch of salt and pepper together in a jug until light and smooth.	Carefully remove the tray from the oven, then confidently pour the batter evenly into the compartments.	Pop the tray back in the oven to cook for 12 to 15 minutes, or until risen and golden.
vegetable oil		X	X		X	X
2 large free-range eggs				X	X	X
100 g plain flour				X	X	X
100 ml milk				X	X	X
```

In [38]:
# Instruction text stored with every training example. The curly quotes,
# apostrophes and degree signs are written as real Unicode characters so no
# mis-decoded byte sequences end up in the dataset.
instruction = """Your task is to transform cooking recipes from raw text into a Gantt chart .tsv file which conveys all the same information but graphically so one can see which ingredients are involved in each step. In the end, we wish to produce a downloadable .tsv file containing a table. It will be structured as follows:

- the column headers will contain the full text description of each step in the recipe (verbatim as in the original recipe)
- Each row will refer to a different ingredient
- If a particular ingredient is used in a particular method step then the corresponding cell is marked with an “X” otherwise it’s left blank

Tip: It's very important that you break down every single ingredient verbatim (with any preparation information - no changes!) as a separate row and copy the method descriptions and verbatim (without making any changes!) from each step to each column header.

Here’s an example:

```
Ingredients

vegetable oil
2 large free-range eggs
100 g plain flour
100 ml milk

Method

1. Preheat the oven to 225°C/425°F/gas 9.
2. Get yourself a cupcake tin and add a tiny splash of vegetable oil into each of the 12 compartments.
3. Pop into the oven for 10 to 15 minutes so the oil gets really hot.
4. Meanwhile, beat the eggs, flour, milk and a pinch of salt and pepper together in a jug until light and smooth.
5. Carefully remove the tray from the oven, then confidently pour the batter evenly into the compartments.
6. Pop the tray back in the oven to cook for 12 to 15 minutes, or until risen and golden.
```

would output this tsv file:
```
Preheat the oven to 225°C/425°F/gas 9.	Get yourself a cupcake tin and add a tiny splash of vegetable oil into each of the 12 compartments.	Pop into the oven for 10 to 15 minutes so the oil gets really hot.	Meanwhile, beat the eggs, flour, milk and a pinch of salt and pepper together in a jug until light and smooth.	Carefully remove the tray from the oven, then confidently pour the batter evenly into the compartments.	Pop the tray back in the oven to cook for 12 to 15 minutes, or until risen and golden.
vegetable oil		X	X		X	X
2 large free-range eggs				X	X	X
100 g plain flour				X	X	X
100 ml milk				X	X	X
```"""

In [39]:
# Give every example the same instruction text (a scalar broadcast to all rows).
odf = odf.assign(instruction=instruction)

In [41]:
# Preview the first rows of the collated dataset.
odf.head(n=5)

Unnamed: 0,instruction,input,output
0,Your task is to transform cooking recipes from...,Ingredients\n\n5 pounds boneless chicken thigh...,"\tCombine the meat, salt, pepper, garlic, basi..."
1,Your task is to transform cooking recipes from...,"Ingredients\n\n12 oz fresh crabmeat, drained\n...","\tIn a medium bowl, combine crab, lime juice a..."
2,Your task is to transform cooking recipes from...,"Ingredients\n\n3 pounds tomatoes, preferably o...",\tFirst make the soup. Preheat the oven to 400...
3,Your task is to transform cooking recipes from...,Ingredients\n\n1/2 cup fresh herbs (minced)\n4...,Ingredient\tPreheat your grill ‚Äî medium-low he...
4,Your task is to transform cooking recipes from...,Ingredients\n\n200 g caster sugar\n200 ml wate...,Sorbets are always a nice way to finish a meal...


In practice I can choose either to use the instruction, or to copy the input into the instruction column and leave the input blank (which looks like it is the convention).

In [44]:
# odf.to_json(DATA/"dataset.jsonl", orient='records', lines=True)