In [None]:
!pip install openai==0.28

In [None]:
import getpass
import re
from collections import Counter

import numpy as np
import openai
import pandas as pd
from datasets import load_dataset
from matplotlib import pyplot as plt

In [None]:
# Display tweak that helps when rendering dataframes whose cells hold long strings
pd.options.display.max_colwidth = 0

In [None]:
# Get API key.
# getpass prompts without echoing, so the key is not rendered in the cell
# output (and therefore not saved with the notebook) the way input() would be.
openai.api_key = getpass.getpass("OpenAI API key: ")

In [None]:
# Load the first 1% of the train split of the synthetic clinical notes (~158K in total).
# Dataset card: https://huggingface.co/datasets/starmpcc/Asclepius-Synthetic-Clinical-Notes
ds = load_dataset(
    "starmpcc/Asclepius-Synthetic-Clinical-Notes",
    split="train[:1%]",
)

In [None]:
# Keep only the rows whose task is "Question Answering"
ds_qa = ds.filter(lambda row: row["task"] == "Question Answering")

In [None]:
# Upper bound on how many examples the cells below process
# (defaults to every Question Answering row)
N_EXAMPLES = len(ds_qa)
print(N_EXAMPLES)

In [None]:
# Loop through examples and record ChatGPT answers.
# The columns are read once up front: indexing ds_qa["note"][i] inside the loop
# asks the dataset for the whole column again on every iteration.
qa_notes = ds_qa["note"]
qa_questions = ds_qa["question"]

new_answers_1 = []
for i in range(N_EXAMPLES):
    # Get notes and question
    temp_notes = qa_notes[i]
    temp_question = qa_questions[i]

    # Structure prompt (the question, followed by the quoted notes as context)
    temp_prompt = f"""
        Answer the following question given the context below:
        {temp_question}
        
        "{temp_notes}"
    """

    # Run through OpenAI
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": temp_prompt},
        ]
    )

    # Append the text of the first returned choice
    new_answers_1.append(response.choices[0].message["content"])

In [None]:
# Put the dataset fields next to the model's answers and write a temporary copy to disk
df_1 = pd.DataFrame({
    **{col: ds_qa[col][0:N_EXAMPLES] for col in ("note", "question", "answer")},
    "new_answer": new_answers_1,
})
df_1.to_csv("temp_llm_answers_1.csv", index=False)

In [None]:
def simple_tokenize(string):
    """Split text into lowercase tokens (used when calculating the F1 score).

    Every run of characters other than 0-9, a-z and A-Z is treated as a
    separator, so punctuation and whitespace never end up inside a token.
    Returns a list of strings; an input with no such characters gives [].
    """
    # Turn each run of non-alphanumeric characters into a single space
    alnum_only = re.sub("[^0-9a-zA-Z]+", " ", string)
    # Collapse any remaining whitespace runs
    collapsed = re.sub(r"\s+", " ", alnum_only)

    tokens = collapsed.lower().split()
    return tokens

In [None]:
def calc_f1_score(str1, str2):
    """Token-overlap F1 score between two strings.

    Both strings are tokenized with simple_tokenize. The overlap is a multiset
    intersection, so a repeated token only counts as many times as it appears
    in both strings. Precision is measured against str2's tokens and recall
    against str1's. Returns 0 when the strings share no tokens (which includes
    the case where either string yields no tokens at all).
    """
    tokens1, tokens2 = simple_tokenize(str1), simple_tokenize(str2)

    # Number of tokens the two strings have in common (with multiplicity)
    overlap = sum((Counter(tokens1) & Counter(tokens2)).values())
    if overlap == 0:
        # Precision and recall are both zero here, so the score is 0
        return 0

    # A non-zero overlap means both token lists are non-empty
    precision = overlap / len(tokens2)
    recall = overlap / len(tokens1)
    return 2 * (precision * recall) / (precision + recall)

In [None]:
# Score every new answer against the original answer on the same row
f1_scores = [
    calc_f1_score(original, generated)
    for original, generated in zip(df_1["answer"], df_1["new_answer"])
]

df_1["f1_score"] = f1_scores

In [None]:
# Five examples with the lowest F1 score
df_1.sort_values(by="f1_score").head(n=5)

In [None]:
# Last five rows of the ascending sort by F1 score
df_1.sort_values(by="f1_score").tail(n=5)

In [None]:
# Histogram of per-example F1 scores, with the mean printed alongside
fig, ax = plt.subplots()
ax.hist(df_1["f1_score"])
ax.set_xlabel("F1 Score")
ax.set_ylabel("Number of Examples")

mean_f1 = round(df_1['f1_score'].mean(), 2)
print(f"Mean F1 score: {mean_f1}")
plt.show()

In [None]:
# Compare average answer length in characters: original answers vs. new answers
mean_len_original = np.mean(list(map(len, df_1['answer'])))
mean_len_new = np.mean(list(map(len, df_1['new_answer'])))

print(f"Mean # of characters for original answers: {mean_len_original}")
print(f"Mean # of characters for new answers: {mean_len_new}")

# Part 2: Re-run with more instructions

In [None]:
# Loop through examples and record ChatGPT answers.
# The columns are read once up front: indexing ds_qa["note"][i] inside the loop
# asks the dataset for the whole column again on every iteration.
qa_notes = ds_qa["note"]
qa_questions = ds_qa["question"]
qa_answers = ds_qa["answer"]

# Row 0 is the worked example shown in every prompt, so the loop starts at row 1.
# The example question must come from the "question" column; filling it from
# "note" would show the note twice and never show the model a question.
example_question = qa_questions[0]
example_notes = qa_notes[0]
example_answer = qa_answers[0]

new_answers_2 = []
for i in range(1, N_EXAMPLES):
    # Get notes and question
    temp_notes = qa_notes[i]
    temp_question = qa_questions[i]

    # Structure prompt (one worked example, then the real question and notes)
    temp_prompt = f"""
        Here is an example of a question, context, and correct answer:
        Question: "{example_question}"
        Context: {example_notes}
        Answer: {example_answer}

        
        Now answer the following question given the context below; keep the answer concise:
        {temp_question}
        
        "{temp_notes}"
    """

    # Run through OpenAI
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "user", "content": temp_prompt},
        ]
    )

    # Append the text of the first returned choice
    new_answers_2.append(response.choices[0].message["content"])

In [None]:
# Put the dataset fields (row 1 onward; row 0 served as the prompt example) next to
# the model's answers and write a temporary copy to disk
df_2 = pd.DataFrame({
    **{col: ds_qa[col][1:N_EXAMPLES] for col in ("note", "question", "answer")},
    "new_answer": new_answers_2,
})
df_2.to_csv("temp_llm_answers_2.csv", index=False)

In [None]:
# Score every new answer against the original answer on the same row
f1_scores = [
    calc_f1_score(original, generated)
    for original, generated in zip(df_2["answer"], df_2["new_answer"])
]

df_2["f1_score"] = f1_scores

In [None]:
# Histogram of per-example F1 scores, with the mean printed alongside
fig, ax = plt.subplots()
ax.hist(df_2["f1_score"])
ax.set_xlabel("F1 Score")
ax.set_ylabel("Number of Examples")

mean_f1 = round(df_2['f1_score'].mean(), 2)
print(f"Mean F1 score: {mean_f1}")
plt.show()

In [None]:
# Compare average answer length in characters: original answers vs. new answers
mean_len_original = np.mean(list(map(len, df_2['answer'])))
mean_len_new = np.mean(list(map(len, df_2['new_answer'])))

print(f"Mean # of characters for original answers: {mean_len_original}")
print(f"Mean # of characters for new answers: {mean_len_new}")

# Part 3: Prediction

Use ChatGPT to assign a severity score 1-10 to each example, where 10 is most severe. Ask the model to explain its reasoning.

In [None]:
# Loop through examples and record ChatGPT's severity rating for each note.
# The column is read once up front: indexing ds_qa["note"][i] inside the loop
# asks the dataset for the whole column again on every iteration.
qa_notes = ds_qa["note"]

new_answers_3 = []
for i in range(N_EXAMPLES):
    # Get notes (this part does not use the question)
    temp_notes = qa_notes[i]

    # Structure prompt; the requested "Score: __; Reasoning: __" layout is what
    # the parsing cell relies on
    temp_prompt = f"""
        Based on the notes below, rate the patient's level of illness on a score from 1 to 10, where 10 is the most severe.
        Please explain your reasoning in one sentence.
        Answers should be structured as "Score: __; Reasoning: __"
        
        Notes: "{temp_notes}"
    """

    # Run through OpenAI
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a medical professional reviewing clinical notes."},
            {"role": "user", "content": temp_prompt},
        ]
    )

    # Append the text of the first returned choice
    new_answers_3.append(response.choices[0].message["content"])

In [None]:
def _parse_severity_response(text):
    """Pull (score, reasoning) out of a reply shaped like "Score: __; Reasoning: __".

    Returns NaN for the score and None for the reasoning when the corresponding
    label is not found, so a single off-format reply does not abort the cell
    with an AttributeError.
    """
    score_match = re.search(r"Score:\s*(\d+)", text)
    reasoning_match = re.search(r"Reasoning:\s*(.*)", text)
    score = int(score_match.group(1)) if score_match else float("nan")
    reasoning = reasoning_match.group(1) if reasoning_match else None
    return score, reasoning


# Convert to dataframe and save temporary copy
parsed_answers_3 = [_parse_severity_response(x) for x in new_answers_3]
df_3 = pd.DataFrame({
    "note": ds_qa["note"][0:N_EXAMPLES],
    "score": [score for score, reasoning in parsed_answers_3],
    "reasoning": [reasoning for score, reasoning in parsed_answers_3],
})

# Surface unparseable replies instead of failing or hiding them
n_unparsed = int(df_3["score"].isna().sum())
if n_unparsed:
    print(f"Warning: no score could be parsed from {n_unparsed} response(s); stored as NaN")

# Save the severity results themselves (df_3, not the Part 2 dataframe)
df_3.to_csv("temp_llm_answers_3.csv", index=False)

In [None]:
# First rows of the ascending sort by severity score
df_3.sort_values(by="score").head()

In [None]:
# Histogram of the assigned severity scores, with the mean printed alongside
fig, ax = plt.subplots()
ax.hist(df_3["score"])
ax.set_xlabel("Illness Severity Score")
ax.set_ylabel("Number of Examples")

mean_score = round(df_3['score'].mean(), 2)
print(f"Mean severity score: {mean_score}")
plt.show()