# Notebook for Testing Prompts with GPT-4o

This notebook generates a synthetic dataset by having an LLM (GPT-4o in this case) add different magnitudes of spin to a given abstract.

In [10]:
# import package
from openai import OpenAI
import pandas as pd
from tqdm import tqdm
import os
from dotenv import load_dotenv

In [11]:
def load_api_keys():
    """Load variables from a .env file and build an OpenAI client.

    ``load_dotenv`` is called with ``override=True``. The client is then
    created with whatever value the ``OPENAI_API_KEY`` environment variable
    holds at that point.

    Returns:
        The ``OpenAI`` client instance.
    """
    load_dotenv(override=True)
    api_key = os.getenv("OPENAI_API_KEY")
    return OpenAI(api_key=api_key)

In [12]:
# Read every abstract, then keep only the rows tagged as the DEV split;
# these are the abstracts used for trying out prompts.
df_abstracts = pd.read_csv(
    '../data/FactPico_115_abstracts_with_pmid_pmcid_title.csv'
)
df_dev_abstracts = df_abstracts.loc[df_abstracts['data_split'] == 'DEV']

# Report the size of the DEV set.
print('Number of abstracts in DEV set:', df_dev_abstracts.shape[0])

Number of abstracts in DEV set: 14


## PART TO CUSTOMIZE

In [13]:
# Label for this prompt/sampling configuration; it is written to the
# 'prompt_template_name' column of every generated row.
PROMPT_TEMPLATE_NAME = 'gpt4o-generated-general-temp0_3-p0_2'

In [14]:
# Content of the 'system' message sent with every request. gen_gpt4o would
# also fill '{title}' / '{abstract}' placeholders here if any were present.
SYSTEM_PROMPT = 'You are an expert in the field of health sciences.'

In [15]:
# Content of the 'user' message. gen_gpt4o substitutes '{title}' and
# '{abstract}' with each paper's values; this template only uses '{abstract}'.
PROMPT_TEMPLATE = 'Reframe the abstract below to make the treatment results appear more compelling, emphasizing effectiveness while maintaining the integrity of the original data. Use language strategies to highlight positive outcomes, such as choosing stronger adjectives, focusing on key benefits, and crafting a tone of optimism. Avoid altering any of the actual numerical results or statistical findings, ensuring all changes are limited to the way the quantitative results are communicated. Provide at least three versions with varying degrees of emphasis (subtle, moderate, and strong).\n\nAbstract: {abstract}'

In [16]:
# Sampling parameters forwarded unchanged to the chat-completions call.
# NOTE(review): the OpenAI API reference generally recommends altering
# temperature or top_p but not both - confirm that tuning both is intended.
TEMPERATURE = 0.3
TOP_P = 0.2

In [17]:
# Path of the CSV that receives the generated outputs.
# NOTE(review): the file stem repeats PROMPT_TEMPLATE_NAME by hand; keep the
# two in sync when customizing.
NEW_FILENAME = "./prompt_engineering/gpt4o-generated-general-temp0_3-p0_2.csv"

## RUN generation

In [18]:
def gen_gpt4o(title, abstract, client, temperature=1, top_p=1, model="gpt-4o"):
    """Fill the prompt templates for one paper and return the model's reply.

    The module-level SYSTEM_PROMPT and PROMPT_TEMPLATE are used as templates:
    every '{title}' and '{abstract}' placeholder in them is replaced with the
    given values, then both are sent as the system and user messages of one
    chat-completions request.

    Args:
        title: Paper title substituted for '{title}'.
        abstract: Abstract text substituted for '{abstract}'.
        client: Object exposing ``chat.completions.create`` (in this notebook,
            the client returned by ``load_api_keys``).
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling value forwarded to the API.
        model: Name of the chat model to query; defaults to "gpt-4o".

    Returns:
        The ``content`` of the first choice's message, returned unchanged.

    Raises:
        TypeError: If a placeholder that occurs in a template is given a
            non-string value (e.g. NaN for a missing CSV cell).
    """
    # Local import keeps this cell self-contained.
    import re

    fills = {'{title}': title, '{abstract}': abstract}

    def fill(template):
        # Substitute both placeholders in a single pass so that text inserted
        # for one placeholder is never re-scanned for the other (a title that
        # happens to contain "{abstract}" is left untouched). A value is only
        # looked at when its placeholder actually occurs in the template.
        return re.sub(
            r'\{title\}|\{abstract\}',
            lambda match: fills[match.group(0)],
            template,
        )

    response = client.chat.completions.create(
        model=model,
        temperature=temperature,
        top_p=top_p,
        messages=[
            {'role': 'system', 'content': fill(SYSTEM_PROMPT)},
            {'role': 'user', 'content': fill(PROMPT_TEMPLATE)},
        ],
    )
    return response.choices[0].message.content

In [19]:
# Generate one model output per DEV abstract and save all rows to NEW_FILENAME.
# Each row records the paper identifiers, the prompt configuration, the model
# name and the raw model output.
client = load_api_keys()

# Create the output folder up front: otherwise a missing directory is only
# discovered by to_csv after every (slow) API call has already finished.
output_dir = os.path.dirname(NEW_FILENAME)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

output_data = []
for _, row in tqdm(df_dev_abstracts.iterrows(), total=df_dev_abstracts.shape[0]):
    output_data.append({
        'pmid': row['pmid'],
        'pmcid': row['pmcid'],
        'title': row['title'],
        'abstract': row['abstract'],
        'prompt_template_name': PROMPT_TEMPLATE_NAME,
        # Stored as the unfilled system prompt and user template joined by a space.
        'prompt_template': SYSTEM_PROMPT + ' ' + PROMPT_TEMPLATE,
        'model_name': 'gpt-4o',
        'model_output': gen_gpt4o(row['title'], row['abstract'], client, TEMPERATURE, TOP_P),
    })

new_df = pd.DataFrame(output_data)

# The DataFrame index is written as the first CSV column, as before.
new_df.to_csv(NEW_FILENAME)

100%|██████████| 14/14 [06:35<00:00, 28.24s/it]
