# Notebook for Testing Prompts with GPT-4o

This notebook is for generating a synthetic dataset, which involves having the LLM (GPT-4o in this case) add different magnitudes of spin to a given abstract.

In [288]:
# import package
from openai import OpenAI
import pandas as pd
from tqdm import tqdm
import os
from dotenv import load_dotenv
from enum import Enum
from pydantic import BaseModel
from json import dumps

In [289]:
def load_api_keys():
    """Build an OpenAI client from the API key found in the environment.

    ``load_dotenv(override=True)`` is called first, so values from a local
    ``.env`` file take precedence over variables that are already set.

    Returns:
        An ``OpenAI`` client created with the ``OPENAI_API_KEY`` value.
    """
    load_dotenv(override=True)
    api_key = os.getenv("OPENAI_API_KEY")
    return OpenAI(api_key=api_key)

In [290]:
# Load all abstracts, then keep only the rows of the DEV split, which is the
# subset used for testing prompts.
df_abstracts = pd.read_csv('../data/FactPico_115_abstracts_with_pmid_pmcid_title.csv')
df_dev_abstracts = df_abstracts.loc[df_abstracts['data_split'].eq('DEV')]

# FOR TESTING (TODO: remove) -- uncomment to run on a random 3-row sample
# df_dev_abstracts = df_dev_abstracts.sample(3)

# Report how many abstracts will be processed.
print('Number of abstracts in DEV set:', df_dev_abstracts.shape[0])

Number of abstracts in DEV set: 14


## PART TO CUSTOMIZE

In [291]:
# How the example table is serialized before it is inserted into the prompt:
# 'json', 'tsv' or 'markdown'. Any other value is treated like 'json'.
TABLULAR_EXAMPLE_FORMAT_TYPE = 'markdown'

# Table of examples that replaces the '{examples}' placeholder in the prompts.
examples_df = pd.read_csv('../data/example_w_wo_spin_boutron_2014.csv')

if TABLULAR_EXAMPLE_FORMAT_TYPE == 'markdown':
    EXAMPLES = examples_df.to_markdown(index=False)
elif TABLULAR_EXAMPLE_FORMAT_TYPE == 'tsv':
    EXAMPLES = examples_df.to_csv(index=False, sep='\t')
else:
    # Covers 'json' as well as any unrecognized format name.
    EXAMPLES = dumps(examples_df.to_dict(orient='records'))

In [292]:
# Identifier of this prompt configuration. It is stored with every output row
# and is used to build the name of the output CSV file.
# NOTE(review): the name appears to encode the example format, temperature and
# top_p -- keep it in sync with those settings when they change.
PROMPT_TEMPLATE_NAME = 'gpt4o-guideline-1-example-remove-spin-markdown-temp0_6-p0_7'

In [293]:
# System message sent with every request. Any '{title}', '{abstract}' or
# '{examples}' placeholder it contains is substituted in gen_gpt4o.
SYSTEM_PROMPT = 'You are an expert in the field of health sciences.'

In [294]:
# User message template. gen_gpt4o replaces the '{examples}' and '{abstract}'
# placeholders (and '{title}', which this template does not currently use)
# before the request is sent. The text is also saved verbatim with each output row.
PROMPT_TEMPLATE = '''Rewrite the provided abstract to remove any instances of spin. The length of the new abstract should be within 25 words of the original. Do not make any changes if there are no instances of spin in the original. Follow the guidelines below for removing spin from an abstract.
Guidelines for writing an abstract without spin:
* In the Context section:
    * Delete all information that could distort the understanding of the aim of the trial.
        * The aim is to evaluate the treatment effect on a secondary outcome.
        * The aim is to evaluate the treatment effect for a subgroup.
        * The aim is to evaluate overall improvement.
* In the Methods section:
    * Clearly report the primary outcome.
    * According to space constraints, report all secondary outcomes evaluated in the Methods section or report no secondary outcome evaluated in the Methods section to avoid specifically highlighting statistically significant secondary outcomes.
    * Delete information that could distort the understanding of the aim (eg, within-group comparison, modified population analysis, subgroup analysis).
* In the Results section:
    * Delete subgroup analyses that were not prespecified, based on the primary outcome, and interpreted in light of the totality of prespecified subgroup analyses undertaken.
    * Delete within-group comparisons.
    * Delete linguistic spin.
    * Report the results for the primary outcome with numbers in both arms (if possible with some measure of variability) with no wording of judgment.
    * Report results for all secondary outcomes, for no secondary outcome, or for the most clinically important secondary outcome.
    * Report safety data including reason for withdrawals; report treatment discontinuation when applicable.
* In the Conclusions section:
    * Delete the author conclusion, and only add the following standardized conclusion: “the treatment A was not more effective than comparator B in patients with….”
    * Specify the primary outcome in the conclusion when some secondary outcomes were statistically significant: “the treatment A was not more effective on overall survival than the comparator B in patients with….”

Example: {examples}

Abstract: {abstract}'''

In [295]:
# Sampling settings passed to gen_gpt4o for every request and recorded in the
# 'temperature' and 'top_p' columns of the output.
TEMPERATURE = 0.6
TOP_P = 0.7

In [296]:
# Path of the CSV file the generation results are written to; one file per
# prompt configuration, named after PROMPT_TEMPLATE_NAME.
NEW_FILENAME = f"./prompt_engineering/{PROMPT_TEMPLATE_NAME}.csv"

In [297]:
# Closed set of spin-severity labels used by AbstractResponse.spin_severity.
# Each member's value is the same lowercase word as its name.
class Severity(Enum):
    subtle = 'subtle'
    moderate = 'moderate'
    strong = 'strong'

# Used for generating rationales for each edit.
# One record per edit; all three fields are free text and are printed by
# format_model_output under the labels "Edit", "Rationale" and "Strategy Applied".
class Documentation(BaseModel):
    edit: str  # presumably a description of the change that was made -- confirm
    rationale: str  # presumably why the change was made -- confirm
    strategy_applied: str  # presumably the name of the strategy behind the edit -- confirm

# One generated abstract together with its severity label and the list of
# documented edits.
class AbstractResponse(BaseModel):
    title: str
    abstract: str
    spin_severity: Severity  # one of the Severity labels
    documentation: list[Documentation]  # one entry per documented edit

# Top-level structured-output schema: the list of generated abstracts.
# It is referenced by the currently commented-out `response_format=Response`
# call in gen_gpt4o and is the input type of format_model_output.
class Response(BaseModel):
    generated: list[AbstractResponse]

## RUN generation

In [298]:
def format_model_output(model_output_response: "Response | None") -> str:
    """Render a structured model response as plain text.

    Each generated abstract becomes one section made of the capitalized
    severity label, the title, the abstract, and a bracketed list of its
    documented edits; every section ends with a blank line.

    Args:
        model_output_response: Parsed response whose ``generated`` attribute
            lists the abstracts, or ``None`` when no response was obtained
            (``gen_gpt4o`` returns ``None`` when the request fails).

    Returns:
        The formatted text, or an empty string when the input is ``None`` or
        contains no generated abstracts.
    """
    # A failed request yields None; format it as "nothing" instead of crashing.
    if model_output_response is None:
        return ""

    sections = []
    for response in model_output_response.generated:
        # Every documented edit ends with ",\n", including the last one.
        formatted_documentation = "".join(
            f"Edit: {documentation.edit}\nRationale: {documentation.rationale}\nStrategy Applied: {documentation.strategy_applied},\n"
            for documentation in response.documentation
        )
        sections.append(
            f"{response.spin_severity.value.capitalize()}:\nTitle: {response.title}\nAbstract: {response.abstract}\nDocumentation: [{formatted_documentation}]\n\n"
        )

    return "".join(sections)

In [299]:
def gen_gpt4o(title, abstract, client, temperature=1, top_p=1):
    """Send one abstract to GPT-4o and return the text of its reply.

    The module-level ``SYSTEM_PROMPT`` and ``PROMPT_TEMPLATE`` are filled in
    by replacing the ``{title}``, ``{abstract}`` and ``{examples}``
    placeholders; ``{examples}`` is taken from the module-level ``EXAMPLES``.

    Args:
        title: Title text substituted for ``{title}``.
        abstract: Abstract text substituted for ``{abstract}``.
        client: OpenAI client providing ``chat.completions.create``.
        temperature: Sampling temperature forwarded to the API.
        top_p: ``top_p`` value forwarded to the API.

    Returns:
        The content of the first choice's message, or ``None`` when the
        request raised an exception (the exception is printed).
    """
    import re  # function-scope import: `re` is not in the notebook's import cell

    substitutions = {'title': title, 'abstract': abstract, 'examples': EXAMPLES}

    def fill(template):
        # Substitute every placeholder in ONE pass. Chained str.replace calls
        # would re-scan text that was just inserted, so an abstract containing
        # the literal text "{examples}" would be altered.
        return re.sub(
            r'\{(title|abstract|examples)\}',
            lambda match: substitutions[match.group(1)],
            template,
        )

    sys_prompt = fill(SYSTEM_PROMPT)
    user_prompt = fill(PROMPT_TEMPLATE)
    messages = [
        {'role': 'system', 'content': sys_prompt},
        {'role': 'user', 'content': user_prompt},
    ]

    try:
        # TODO: switch to structured output:
        # response = client.beta.chat.completions.parse(
        #     model="gpt-4o",
        #     temperature=temperature,
        #     top_p=top_p,
        #     messages=messages,
        #     response_format=Response,
        # )
        # response_message = response.choices[0].message
        # if response_message.parsed:
        #     return response_message.parsed
        # elif response_message.refusal:
        #     # handle refusal
        #     print(response_message.refusal)
        response = client.chat.completions.create(
            model="gpt-4o",
            temperature=temperature,
            top_p=top_p,
            messages=messages,
        )
        return response.choices[0].message.content
    except Exception as e:
        # Best effort: report the failure and let the caller continue with
        # the remaining abstracts.
        print(e)
        return None

In [300]:
# Run every DEV abstract through GPT-4o and write one CSV row per abstract.
# Each row holds the input (pmid, pmcid, title, abstract), the prompt
# configuration (template name and text, model, temperature, top_p) and the
# model output.
client = load_api_keys()

output_data = []
for _, row in tqdm(df_dev_abstracts.iterrows(), total=df_dev_abstracts.shape[0]):
    # model_output is None when the request failed (see gen_gpt4o).
    model_output = gen_gpt4o(row['title'], row['abstract'], client, TEMPERATURE, TOP_P)
    data_dict = {
        'pmid': row['pmid'],
        'pmcid': row['pmcid'],
        'title': row['title'],
        'abstract': row['abstract'],
        'prompt_template_name': PROMPT_TEMPLATE_NAME,
        'prompt_template': 'system prompt: ' + SYSTEM_PROMPT + ' user prompt: ' + PROMPT_TEMPLATE,
        'model_name': 'gpt-4o',
        'temperature': TEMPERATURE,
        'top_p': TOP_P,
        # TODO: store format_model_output(model_output) once gen_gpt4o
        # returns a parsed Response instead of raw text.
        'model_output': model_output,
    }
    output_data.append(data_dict)

new_df = pd.DataFrame(output_data)

# Create the output directory first so the results of the API calls are not
# lost to a missing-directory error when the file is written.
output_dir = os.path.dirname(NEW_FILENAME)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

new_df.to_csv(NEW_FILENAME, index=False)

100%|██████████| 14/14 [01:35<00:00,  6.81s/it]
