In [1]:
import json
import os

import pandas as pd
from dotenv import load_dotenv
from openai import OpenAI

# Load .env BEFORE building the client: OpenAI() looks up OPENAI_API_KEY in the
# environment when it is constructed, so a key that only lives in .env must
# already be loaded at this point (otherwise a fresh kernel fails here).
load_dotenv()
client = OpenAI()

In [2]:
# variables
# Load key/value pairs from a local .env file into the process environment.
load_dotenv()
# NOTE(review): api_key is read here but is not passed to OpenAI() or used in
# any visible cell — confirm whether it is still needed.
api_key = os.getenv("OPENAI_API_KEY")

# models
# Model identifiers passed as the `model` argument of prompt_paper.
# NOTE(review): `cheap` is not referenced in the visible cells.
cheap = 'gpt-4o'
best = 'o3'

# prompts and schemas
# Prompt texts and schemas come from the `prompts` and `schemas` modules
# (presumably project-local). PROMPT_EDGES / SCHEMA_EDGES are imported but not
# yet used by any active cell.
from prompts import PROMPT_FULL, PROMPT_CONCEPTS, PROMPT_INTERVENTIONS, PROMPT_EDGES
from schemas import SCHEMA_FULL, SCHEMA_CONCEPTS, SCHEMA_INTERVENTIONS, SCHEMA_EDGES

# papers
# input_dir: directory containing the PDF to process.
# output_dir: directory the JSON results are written to.
# paper: file name of the PDF inside input_dir; it is also used as the prefix
#        of every output file name.
input_dir = '../inputdata_development_paper_set'
output_dir = 'outputs'
paper = '2307.16513v2.pdf'

In [None]:
# test api
# response = client.responses.create(
#     model=best,
#     input='This is a test, say hello.'
# )

# print(response.output_text)

In [3]:
# prompt with paper only
def prompt_paper(input_path: str, output_path: str, prompt_text: str, json_schema: dict,  model: str = 'gpt-4o'):
    """Upload a paper, prompt the model about it, and save the JSON reply.

    Uses the module-level ``client`` for both the file upload and the request.

    Args:
        input_path: Path of the file to upload; it is opened in binary mode.
        output_path: Path the parsed JSON reply is written to (UTF-8,
            indent=2). Missing parent directories are created.
        prompt_text: Text sent together with the uploaded file in a single
            user message.
        json_schema: Currently unused; kept so existing callers keep working.
            NOTE(review): the schema is not sent with the request, so the
            reply is only expected to be JSON because of the prompt — confirm
            whether it should be passed as a structured-output format.
        model: Model name forwarded to ``client.responses.create``.

    Returns:
        The response object returned by ``client.responses.create``.

    Raises:
        json.JSONDecodeError: If ``response.output_text`` is not valid JSON.
        OSError: If the input file cannot be read or the output cannot be
            written.
    """
    # Create the output directory up front so a missing folder is not
    # discovered only after the (slow, billed) model call has finished.
    output_parent = os.path.dirname(output_path)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)

    # Context manager guarantees the handle is closed even if the upload fails.
    with open(input_path, "rb") as paper_file:
        uploaded = client.files.create(
            file=paper_file,
            purpose="user_data"
        )
    file_id = uploaded.id

    # One user message carrying both the uploaded file and the prompt text.
    # Named `messages` rather than `input` to avoid shadowing the builtin.
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "input_file",
                    "file_id": file_id
                },
                {
                    "type": "input_text",
                    "text": prompt_text
                }
            ]
        }
    ]

    response = client.responses.create(
        model=model,
        input=messages
    )

    # Parse before opening the output file so an invalid reply does not leave
    # an empty or truncated file behind.
    data = json.loads(response.output_text)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

    return response

In [4]:
# Run the full prompt on the selected paper; the parsed reply is saved next to
# the other outputs as "<paper>_full.json".
full_resp = prompt_paper(
    model=best,
    prompt_text=PROMPT_FULL,
    json_schema=SCHEMA_FULL,
    input_path=os.path.join(input_dir, paper),
    output_path=os.path.join(output_dir, f"{paper}_full.json"),
)

In [5]:
# Run the concepts prompt on the selected paper; the parsed reply is saved as
# "<paper>_concepts.json".
concepts_resp = prompt_paper(
    model=best,
    prompt_text=PROMPT_CONCEPTS,
    json_schema=SCHEMA_CONCEPTS,
    input_path=os.path.join(input_dir, paper),
    output_path=os.path.join(output_dir, f"{paper}_concepts.json"),
)

In [6]:
# interventions prompt
# Stored under its own name so the concepts response from the previous cell is
# not overwritten (the original assigned this result to `concepts_resp`).
interventions_resp = prompt_paper(
    input_path=os.path.join(input_dir, paper),
    output_path=os.path.join(output_dir, paper + "_interventions.json"),
    prompt_text=PROMPT_INTERVENTIONS,
    json_schema=SCHEMA_INTERVENTIONS,
    model=best
)

In [None]:
# edges prompt

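# TODO (work in progress): the draft below is not ready to run. Before enabling
# it, call prompt_paper (prompt_json_paper is not defined in the visible cells),
# switch to PROMPT_EDGES / SCHEMA_EDGES, and write to an "_edges.json" path —
# as written it would overwrite the "_full.json" output.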
# edges_resp = prompt_json_paper(
#     input_path=os.path.join(input_dir, paper),
#     output_path=os.path.join(output_dir, paper + "_full.json"),
#     prompt_text=PROMPT_INTERVENTIONS,
#     json_schema=SCHEMA_INTERVENTIONS,
#     model=best
# )