# Notebook for Testing Prompts with GPT-4o

This notebook generates a synthetic dataset by having an LLM (GPT-4o in this case) add different magnitudes of spin to a given abstract.

In [None]:
# import package
from openai import OpenAI
import pandas as pd
from tqdm import tqdm
import os
from dotenv import load_dotenv

In [None]:
def load_api_keys():
    """Load environment variables from a .env file and build an OpenAI client.

    Values from the .env file override variables already set in the
    environment (`override=True`).

    Returns:
        An `OpenAI` client constructed with the `OPENAI_API_KEY` value.
    """
    load_dotenv(override=True)
    api_key = os.getenv("OPENAI_API_KEY")
    return OpenAI(api_key=api_key)

In [None]:
# Load the full abstract table, then keep only the rows whose `split` column
# is 'DEV' (the abstracts used for testing prompts).
df_abstracts = pd.read_csv('../data/FactPico_115_abstracts_with_pmid_pmcid_title.csv')
df_dev_abstracts = df_abstracts.loc[df_abstracts['split'] == 'DEV']

# Report how many DEV abstracts were selected.
print('Number of abstracts in DEV set:', df_dev_abstracts.shape[0])

## PART TO CUSTOMIZE

In [None]:
# Label stored in the `prompt_template_name` column of every output row,
# identifying which prompt variant produced the output.
PROMPT_TEMPLATE_NAME = 'prompt_template'

In [None]:
# System message sent with every request. Any `{title}` / `{abstract}`
# placeholders in it are filled in by gen_gpt4o before the call.
SYSTEM_PROMPT = 'Given the following abstract, please provide the PICO elements (Population, Intervention, Comparison, Outcome)'

In [None]:
# User-message template. gen_gpt4o replaces `{title}` and `{abstract}` with
# the values of the abstract being processed.
PROMPT_TEMPLATE = 'Given the following abstract, please provide the PICO elements. \n\nTitle: {title} \nAbstract: {abstract}'

In [None]:
# Path of the CSV file the generated outputs are written to.
# NOTE(review): "xxxx" looks like a placeholder to be replaced per run — confirm.
NEW_FILENAME = "gpt4o_xxxx.csv"

## RUN generation

In [None]:
def _fill_prompt(template, title, abstract):
    """Substitute the `{title}` and `{abstract}` placeholders in `template`.

    The template is split on `{title}` before anything is inserted, so the
    inserted title and abstract are never re-scanned for placeholders (a title
    that happens to contain the text `{abstract}` is kept verbatim).
    """
    pieces = template.split('{title}')
    return title.join(piece.replace('{abstract}', abstract) for piece in pieces)


def gen_gpt4o(title, abstract, client):
    """Send one title/abstract pair to GPT-4o and return the reply text.

    The system message is built from SYSTEM_PROMPT and the user message from
    PROMPT_TEMPLATE, each with `{title}` / `{abstract}` filled in.

    Args:
        title: Title of the paper, as a string.
        abstract: Abstract text, as a string.
        client: Object exposing `chat.completions.create(model=..., messages=...)`.

    Returns:
        The `content` of the first choice's message in the response.

    Raises:
        TypeError: If `title` or `abstract` is not a string (for example a NaN
            read from an empty CSV cell).
    """
    # Fail early with a clear message instead of deep inside string handling.
    for name, value in (('title', title), ('abstract', abstract)):
        if not isinstance(value, str):
            raise TypeError(f'{name} must be str, not {type(value).__name__}')

    sys_prompt = _fill_prompt(SYSTEM_PROMPT, title, abstract)
    user_prompt = _fill_prompt(PROMPT_TEMPLATE, title, abstract)

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {'role': 'system', 'content': sys_prompt},
            {'role': 'user', 'content': user_prompt},
        ],
    )
    return response.choices[0].message.content

In [None]:
# Build one output record per DEV abstract (source fields, prompt used, model
# name, model reply) and write all records to NEW_FILENAME.
client = load_api_keys()

output_data = []
# `i` from iterrows() is the DataFrame index label, which is not a 0..n-1
# position once the frame has been filtered; `pos` is the row's position here.
for pos, (i, row) in enumerate(
    tqdm(df_dev_abstracts.iterrows(), total=df_dev_abstracts.shape[0])
):
    print(i, len(output_data))
    # Skip rows that already have a record in output_data. The comparison is
    # by position so it lines up with the number of records collected.
    # NOTE(review): output_data is reset just above, so on a fresh run this
    # never skips; presumably it is meant for re-running the loop without the
    # reset after an interruption — confirm.
    if pos < len(output_data):
        continue
    output_data.append({
        'pmid': row['pmid'],
        'pmcid': row['pmcid'],
        'title': row['title'],
        'abstract': row['abstract'],
        'prompt_template_name': PROMPT_TEMPLATE_NAME,
        'prompt_template': SYSTEM_PROMPT + ' ' + PROMPT_TEMPLATE,
        'model_name': 'gpt-4o',
        'model_output': gen_gpt4o(row['title'], row['abstract'], client),
    })

# A list of dicts goes straight to the DataFrame constructor (one row per dict).
new_df = pd.DataFrame(output_data)

new_df.to_csv(NEW_FILENAME)