# Testing - Step 5

- We will use ROUGE-1, ROUGE-2, ROUGE-L (sentence-level), ROUGE-L (summary-level), BERTScore (accepting truncation) from the original [ACI-Bench paper](https://www.nature.com/articles/s41597-023-02487-3) and also work by Liu et al. on non-fine-tuned [ChatGPT](https://doi.org/10.1186/s12911-024-02481-8). 
- In the ACI-Bench paper, GPT-4 exhibited the best performance, but was not fine-tuned on the ACI-BENCH clinical note format. 
- In the ChatGPT paper by Liu et al., GPT-3.5 exhibited good performance, but was also not fine-tuned.

In [15]:
import openai
from openai import OpenAI
import os
from dotenv import load_dotenv
import pandas as pd
# for evaluation of results
from rouge_score import rouge_scorer
from bert_score import BERTScorer # use BERTScore and accept truncation

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Read the .env file so that OPENAI_API_KEY is available in the process environment.
load_dotenv()
# Expose the key on the openai module and create the client used by the note-generation calls.
openai.api_key = os.environ.get('OPENAI_API_KEY')
client = OpenAI()

## Generate Test Files

In [7]:
# prepare test files
# Gather every file in the challenge-data folder whose name contains 'test'
# (excluding the ones whose name contains 'metadata') into a single frame.
challenge_dir = './clinical_visit_note_summarization_corpus/data/aci-bench/challenge_data/'
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# all_dfs (and therefore of every saved results file) reproducible between runs.
test_files = sorted(
    name for name in os.listdir(challenge_dir)
    if 'test' in name and 'metadata' not in name
)
dfs = [pd.read_csv(f'{challenge_dir}{name}') for name in test_files]
all_dfs = pd.concat(dfs, ignore_index=True)

In [8]:
def model_note_testing(model_name, all_dfs, save_file, api_client=None, output_dir='./testing_files'):
  """Generate a clinical note for every dialogue with `model_name` and save the results as CSV.

  Args:
    model_name: Model identifier passed to the chat-completions endpoint.
    all_dfs: DataFrame with a 'dialogue' column (model input) and a 'note' column
      (copied to the output as 'human_note').
    save_file: Output file name; any '.csv' in it is removed before '.csv' is appended.
    api_client: Optional object exposing `chat.completions.create`. Defaults to the
      notebook-level `client`.
    output_dir: Directory the CSV is written to; created if it does not exist.

  Returns:
    DataFrame with the columns 'dialogue', 'human_note' and 'ai_note'.
  """
  if api_client is None:
    api_client = client
  # Create the output folder before any request is sent, so a missing folder cannot
  # cause the final to_csv to fail after all completions have already been generated.
  os.makedirs(output_dir, exist_ok=True)

  system_prompt = "You are an expert medical professional. Given a clinical dialogue, create a clinical note outlining key dialogue aspects such as 'CHIEF COMPLAINT' (or 'CC'), 'HISTORY OF PRESENT ILLNESS' ('HPI'), 'REVIEW OF SYSTEMS', 'PHYSICAL EXAMINATION', 'VITALS REVIEWED', 'RESULTS', 'ASSESSMENT AND PLAN', 'INSTRUCTIONS', 'CURRENT MEDICATIONS', 'PAST MEDICAL HISTORY', 'EXAM', 'IMPRESSION', 'PLAN', 'ASSESSMENT', 'PAST HISTORY', 'ALLERGIES', 'SOCIAL HISTORY', 'PHYSICAL EXAM', 'PROCEDURE', 'FAMILY HISTORY', 'MEDICATIONS', 'VITALS', 'MEDICAL HISTORY', 'SURGICAL HISTORY'. You will not use all of these aspects in every dialogue, vary it from dialogue to dialogue."

  ai_generated_notes = []
  # Iterate over the values themselves: label-based lookups such as
  # all_dfs['dialogue'][i] only work when the frame has a default 0..n-1 index.
  for dialogue in all_dfs['dialogue']:
    completion = api_client.chat.completions.create(
      model=model_name,
      messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Dialogue: {dialogue}"}
      ]
    )
    ai_generated_notes.append(completion.choices[0].message.content)

  new_dataframe = pd.DataFrame({
    'dialogue': all_dfs['dialogue'],
    'human_note': all_dfs['note'],
    'ai_note': ai_generated_notes,
  })
  # Normalise the name so callers may pass it with or without the extension.
  save_file = save_file.replace('.csv', '')
  new_dataframe.to_csv(os.path.join(output_dir, f'{save_file}.csv'))
  return new_dataframe

In [9]:
# Run the first fine-tuned model on its own; its notes are saved under the name "id0".
df = model_note_testing(
    model_name="ft:gpt-3.5-turbo-0125:personal:id0-0-5-3-1:9lHCKWt6",
    all_dfs=all_dfs,
    save_file="id0",
)

In [10]:
# Fine-tuned model identifiers; the position in this list is the number used in the "id{i}" file name.
list_of_models = [
    "ft:gpt-3.5-turbo-0125:personal:id0-0-5-3-1:9lHCKWt6",
    "ft:gpt-3.5-turbo-0125:personal:id1-0-5-3-32:9lH86WrE",
    "ft:gpt-3.5-turbo-0125:personal:id2-0-5-3-67:9lH8UvK4",
    "ft:gpt-3.5-turbo-0125:personal:id3-0-5-10-1:9lHvfLG8",
    "ft:gpt-3.5-turbo-0125:personal:id4-0-5-10-32:9lHdGty4",
    "ft:gpt-3.5-turbo-0125:personal:id5-0-5-10-67:9lHc7RWd",
    "ft:gpt-3.5-turbo-0125:personal:id6-1-3-1:9lInZ0Yi",
    "ft:gpt-3.5-turbo-0125:personal:id7-1-3-32:9lIiZDyS",
    "ft:gpt-3.5-turbo-0125:personal:id8-1-3-67:9lIhXPXC",
    "ft:gpt-3.5-turbo-0125:personal:id9-1-10-1:9lJxwuWa",
    "ft:gpt-3.5-turbo-0125:personal:id10-1-10-32:9lJkMCw1",
    "ft:gpt-3.5-turbo-0125:personal:id11-1-10-67:9lJkGqJ5",
    "ft:gpt-3.5-turbo-0125:personal:id12-2-3-1:9lJz2sxA",
    "ft:gpt-3.5-turbo-0125:personal:id13-2-3-32:9lJt3emA",
    "ft:gpt-3.5-turbo-0125:personal:id14-2-3-67:9lK2HhyF",
    "ft:gpt-3.5-turbo-0125:personal:id15-2-10-1:9lKVnn72",
    "ft:gpt-3.5-turbo-0125:personal:id16-2-10-32:9lKCyb7z",
    "ft:gpt-3.5-turbo-0125:personal:id17-2-10-67:9lKF9kn8",
]
# Index 0 is skipped here: only models 1 onwards are generated by this loop.
for i, fine_tuned_model in enumerate(list_of_models[1:], start=1):
    model_note_testing(fine_tuned_model, all_dfs, f"id{i}")
    

- Time taken to cycle through all hyperparameterised models: 326m 50.4s

In [13]:
# Baseline: run the same note-generation procedure with the stock (not fine-tuned) GPT model.
model_note_testing(
    model_name="gpt-3.5-turbo-0125",
    all_dfs=all_dfs,
    save_file='no-fine-tuning',
)

Unnamed: 0,dialogue,human_note,ai_note
0,"[doctor] hi , andrew . how are you ?\n[patient...",CHIEF COMPLAINT\n\nUpper respiratory infection...,**CHIEF COMPLAINT (CC):**\nPatient presents wi...
1,"[doctor] hi andrea , how are you ?\n[patient] ...",CHIEF COMPLAINT\n\nAnnual exam.\n\nHISTORY OF ...,**CHIEF COMPLAINT (CC):**\n- 52-year-old femal...
2,"[doctor] hi , albert . how are you ?\n[patient...",CHIEF COMPLAINT\n\nER follow-up.\n\nHISTORY OF...,**CHIEF COMPLAINT**:\n- Emergency room follow-...
3,"[doctor] hi jerry , how are you doing ?\n[pati...",CHIEF COMPLAINT\n\nAnnual exam.\n\nHISTORY OF ...,**CHIEF COMPLAINT (CC):** Insomnia and follow-...
4,"[doctor] hello , mrs . martinez . good to see ...",CC:\n\nRight arm pain.\n\nHPI:\n\nMs. Martinez...,**Clinical Note:**\n- **CC (Chief Complaint):*...
...,...,...,...
115,[doctor] good afternoon matthew how are you\n[...,CHIEF COMPLAINT\n\nVision changes in the right...,"**CHIEF COMPLAINT (CC):**\n- Vision changes, t..."
116,[doctor] okay well hi joe i understand you've ...,CHIEF COMPLAINT\n\nRight knee injury.\n\nHISTO...,**CHIEF COMPLAINT:** \nPatient presents with ...
117,[doctor] hey angela how are you\n[patient] i'm...,CHIEF COMPLAINT\n\nFollow-up of stage III non-...,**CHIEF COMPLAINT:**\n- Follow-up on neo-adjuv...
118,[doctor] hey joshua good to see you today so t...,CHIEF COMPLAINT\n\nRight flank pain.\n\nHISTOR...,**CHIEF COMPLAINT:** Patient presents with rig...


## Evaluate Test Files

In [16]:
# Score every generated-notes CSV in ./testing_files/ against the human-written notes
# with ROUGE (F-measure) and BERTScore (F1), then average each metric per file.
metrics_file = 'testing_metrics.csv'  # summary saved into the same folder; must not be scored as model output
rouge_types = ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']
score_columns = rouge_types + ['bertScore', 'average']

average_scores = []
bert = BERTScorer(model_type="bert-base-uncased")
rouge = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True, split_summaries=True)
for file in sorted(os.listdir('./testing_files/')):
    # Only real CSV results files are scored; the metrics summary from an earlier run
    # has no 'human_note'/'ai_note' columns and would raise a KeyError.
    if not file.endswith('.csv') or file == metrics_file:
        continue
    temp_df = pd.read_csv(f"./testing_files/{file}")
    # An empty note is read back from CSV as NaN; score it as an empty string instead of crashing.
    human_notes = temp_df['human_note'].fillna('').astype(str).tolist()
    ai_notes = temp_df['ai_note'].fillna('').astype(str).tolist()
    if not human_notes:
        continue

    rouge_rows = []
    for human_note, ai_note in zip(human_notes, ai_notes):
        # RougeScorer.score takes (target, prediction).
        scores = rouge.score(human_note, ai_note)
        rouge_rows.append({name: scores[name].fmeasure for name in rouge_types})
    df = pd.DataFrame(rouge_rows, columns=rouge_types)

    # BERTScorer.score takes (candidates, references); scoring the whole file in one call
    # lets the scorer batch the forward passes instead of running one pair at a time.
    _precision, _recall, f1 = bert.score(ai_notes, human_notes)
    # Store plain floats rather than tensors so the column is numeric.
    df['bertScore'] = f1.tolist()

    file_means = df.mean()
    file_summary = {'id': file.replace('.csv', '')}
    file_summary.update({name: float(file_means[name]) for name in rouge_types + ['bertScore']})
    # 'average' is the unweighted mean of the five per-file metric means.
    file_summary['average'] = float(file_means.mean())
    average_scores.append(file_summary)

all_scores_df = pd.DataFrame(average_scores, columns=['id'] + score_columns)
if not all_scores_df.empty:
    # Final row: mean of each column across all scored files.
    all_scores_df.loc[len(all_scores_df.index)] = ['average'] + [all_scores_df[column].mean() for column in score_columns]

In [17]:
# Show the metric table in the cell output, then write the same table to disk.
metrics_path = './testing_files/testing_metrics.csv'
print(all_scores_df)
all_scores_df.to_csv(metrics_path)

                   id    rouge1    rouge2    rougeL  rougeLsum  bertScore  \
0   default-aci-bench  0.547839  0.233264  0.288649   0.532430   0.683181   
1                 id0  0.461466  0.161440  0.222590   0.447362   0.645535   
2                 id1  0.488685  0.189492  0.254347   0.473521   0.646551   
3                id10  0.504063  0.183951  0.241418   0.489548   0.659889   
4                id11  0.461714  0.156300  0.219464   0.446362   0.642121   
5                id12  0.552501  0.236541  0.296557   0.538425   0.687813   
6                id13  0.401571  0.121552  0.180548   0.388084   0.617024   
7                id14  0.429907  0.139586  0.203204   0.414540   0.632077   
8                id15  0.601781  0.299588  0.354807   0.587760   0.715227   
9                id16  0.548803  0.225141  0.284379   0.532322   0.682128   
10               id17  0.500529  0.182463  0.237837   0.484936   0.654692   
11                id2  0.483402  0.185078  0.252262   0.465780   0.642909   