# Query GPT4

I'm going to automate my first pass data GPT-4 generation using the API. 

In [1]:
from pathlib import Path
import os
from openai import OpenAI
from dotenv import load_dotenv
import random
import time
import pandas as pd
from tqdm.notebook import tqdm

load_dotenv()

Path.ls = lambda x: list(x.iterdir())

In [2]:
ASST_ID = os.environ.get("ASST_ID")

PROJECT = Path("../../")

PROMPT = PROJECT / "prompt.txt"

DATA = PROJECT / "data"
len((DATA / "recipes").ls())

426

In [3]:
df = pd.read_csv(DATA / "recipes-meta.csv")
df.head()

Unnamed: 0,_id,name,ingredients,url,image,ts,cookTime,source,recipeYield,datePublished,prepTime,description,totalTime,creator,recipeCategory,dateModified,recipeInstructions,simple_url
0,{'$oid': '516b456d96cc6251ae131a46'},Green Curry Broth,1 tablespoon coriander seeds\n1 1/2 teaspoon w...,http://www.101cookbooks.com/archives/green-cur...,http://www.101cookbooks.com/mt-static/images/f...,{'$date': 1365984621817},PT15M,101cookbooks,Serves 4.,2010-08-24,PT20M,"A beautiful, thin green curry broth, fragrant ...",,,,,,101cookbooks.com
1,{'$oid': '516b454f96cc6251ae1319e7'},Heirloom Beans &amp; Seitan Recipe,"1 small-medium head of broccoli or broccolini,...",http://www.101cookbooks.com/archives/heirloom-...,http://www.101cookbooks.com/mt-static/images/f...,{'$date': 1365984591273},PT20M,101cookbooks,,2009-11-30,PT10M,This is simply heirloom beans and roasted broc...,,,,,,101cookbooks.com
2,{'$oid': '516b451796cc6251ae13196d'},Spiced Caramel Corn,"4 ounces whole macadamia nuts (about 1 cup), o...",http://www.101cookbooks.com/archives/000169.html,http://www.101cookbooks.com/mt-static/images/f...,{'$date': 1365984535371},,101cookbooks,,2005-05-24,,101 Cookbooks: From the Archives: Spiced Caram...,,,,,,101cookbooks.com
3,{'$oid': '516b455196cc6251ae1319ed'},Buttermilk Squash Soup Recipe,1 teaspoon cumin seeds\n1/4 cup / 2 oz / 55g u...,http://www.101cookbooks.com/archives/buttermil...,http://www.101cookbooks.com/mt-static/images/f...,{'$date': 1365984593142},PT40M,101cookbooks,,2010-07-06,PT15M,A nice way to use up a good amount of summer s...,,,,,,101cookbooks.com
4,{'$oid': '516b45b296cc6251ae131b0a'},Honey-sweetened Thumbprint Cookie Recipe,2/3 cup honey (I use a clover honey)\n1/3 cup ...,http://www.101cookbooks.com/archives/honeyswee...,http://www.101cookbooks.com/mt-static/images/f...,{'$date': 1365984690972},,101cookbooks,Makes a few dozen cookies.,2009-03-05,,A simple thumbprint cookie recipe. Made with w...,,,,,,101cookbooks.com


## Setup

In [4]:
client = OpenAI(
    # This is the default and can be omitted
    api_key=os.environ.get("OPENAI_API_KEY"),
)
assistant = client.beta.assistants.retrieve(ASST_ID)

In [5]:
import logging

# Set up logging configuration
logging.basicConfig(level=logging.INFO)
logging.getLogger("httpx").setLevel(logging.WARNING)


def get_response(recipe):
    thread = client.beta.threads.create()
    message = client.beta.threads.messages.create(
        thread_id=thread.id,
        role="user",
        content=recipe,
    )
    run = client.beta.threads.runs.create(
        thread_id=thread.id, assistant_id=assistant.id, instructions=""
    )
    status = None
    logging.info("Waiting for response...")
    while status != "completed":
        run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
        status = run.status
        time.sleep(1)
    logging.info("Response received 🥳")
    messages = client.beta.threads.messages.list(thread_id=thread.id)
    return messages


def get_file_id(messages):
    annotations = (
        messages.model_dump()
        .get("data", [{}])[0]
        .get("content", [{}])[0]
        .get("text", {})
        .get("annotations", [{}])
    )
    if annotations:
        file_id = annotations[0].get("file_path", {}).get("file_id")
    else:
        file_id = None
    return file_id


def download_file(file_id, fp):
    api_response = client.files.with_raw_response.retrieve_content(file_id)
    if api_response.status_code == 200:
        content = api_response.content
        with open(fp, "wb") as f:
            f.write(content)
        logging.info(f"File downloaded successfully to {fp}")

## Process recipes

In [6]:
already_generated = [p.parts[-2] for p in (DATA / "recipes").glob("*/gantt.tsv")]
gdf = df.loc[~df["name"].isin(already_generated)].copy()
len(gdf)

421

In [7]:
missed = []
for i, row in tqdm(df.iterrows(), total=len(df)):
    dir = DATA / "recipes" / row["name"]
    assert dir.exists()
    logging.info(f"Processing {dir.parts[-1]}")
    recipe = dir / "recipe.txt"
    recipe = recipe.read_text()
    messages = get_response(recipe)
    file_id = get_file_id(messages)
    if file_id is not None:
        download_file(file_id, dir / "gantt.tsv")
    else:
        logging.info("No file produced")
        missed.append(dir.parts[-1])

# save missed to a file
with open("missed.txt", "w") as f:
    f.write("\n".join([str(x) for x in missed]))

  0%|          | 0/426 [00:00<?, ?it/s]

INFO:root:Processing Green Curry Broth
INFO:root:Waiting for response...
INFO:root:Response received 🥳
INFO:root:File downloaded successfully to ../../data/recipes/Green Curry Broth/gantt.tsv
INFO:root:Processing Heirloom Beans &amp; Seitan Recipe
INFO:root:Waiting for response...
INFO:root:Response received 🥳
INFO:root:File downloaded successfully to ../../data/recipes/Heirloom Beans &amp; Seitan Recipe/gantt.tsv
INFO:root:Processing Spiced Caramel Corn
INFO:root:Waiting for response...
INFO:root:Response received 🥳
INFO:root:File downloaded successfully to ../../data/recipes/Spiced Caramel Corn/gantt.tsv
INFO:root:Processing Buttermilk Squash Soup Recipe
INFO:root:Waiting for response...
INFO:root:Response received 🥳
INFO:root:File downloaded successfully to ../../data/recipes/Buttermilk Squash Soup Recipe/gantt.tsv
INFO:root:Processing Honey-sweetened Thumbprint Cookie Recipe
INFO:root:Waiting for response...
INFO:root:Response received 🥳
INFO:root:File downloaded successfully to ..