In [1]:
# !pip install --upgrade openai
import os
import sys
# Resolve the parent of the current working directory and append it to sys.path
# so that the project-local packages imported below (`llm`, `utils`) resolve.
script_dir = os.getcwd()
root_dir = os.path.dirname(os.path.abspath(script_dir))
sys.path.append(root_dir)

import openai
from openai import OpenAI #1.93.0
from dotenv import load_dotenv
# Let python-dotenv populate the environment, then read the OpenAI key from it
# (None when the variable is not set).
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

from llm.inference import run_llm
from utils.prompt import get_prompt
from utils.io import load_object, save_object
from utils.evaluation import calc_eval_metrics

import numpy as np
import pandas as pd
import json
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import colorcet as cc
from statsmodels.stats.multitest import multipletests


### Test run GPT5

In [2]:
model_embed = 'text-embedding-3-small'

# Re-read the key and fail fast with an actionable message if it is unset or empty.
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("Missing API key. Please set OPENAI_API_KEY in your .env file.")

# OpenAI client passed to run_llm in the cells below.
CLIENT = OpenAI(api_key=api_key)

In [12]:
# Smoke-test prompt: asks for sofa recommendations laid out as a JSON object.
# NOTE(review): the braces are doubled as if the text were going through
# str.format, but nothing in this cell formats it — confirm run_llm does,
# otherwise the model receives the literal "{{" / "}}".
input_prompt="""
Hi, I recently moved to a 2-bedroom apartment and I would like to buy a new sofa that is around 70-inch wide and within a reasonable price range. Please give me recommendations. Please provide them in the following JSON schema.
{{
    "Sofa 1": {{
        "Name": ,
        "Brand": ,
        "Price": ,
        "Link":
        "Reason":
        }}
}}
"""

In [None]:
# Settings for the GPT-5 test call.
model_type, model = "gpt_reasoning", "gpt-5-2025-08-07"
temp, random_seed, max_len = 1, 2025, 2048

# run_llm returns two values; the first is printed.
# NOTE(review): binding the second to `input` shadows the builtin for the rest
# of the session.
output, input = run_llm(
    input_prompt, CLIENT, model_type, model, max_len, temp, random_seed
)
print(output)




#### GPT5 with structured RAG

In [None]:
# Query table (first CSV column becomes the index) and the pickled ground-truth
# dict, both read from <root_dir>/data.
moalmanac_data = pd.read_csv(
    os.path.join(root_dir, 'data/moa_fda_queries_answers.csv'),
    index_col=0,
)
synthetic_prompt_groundtruth_dict = load_object(
    os.path.join(root_dir, 'data/synthetic_prompt_groundtruth_dict.pkl')
)

In [7]:
# Load the pickled result dicts for the chosen base model: the LLM-only
# baseline and the structured RAG-LLM run.
base_model = 'gpt5'
llm_res_dict_0 = load_object(
    filename=os.path.join(
        root_dir, f'output/LLM_res_{base_model}/stra0n5temp0.0_res_dict.pkl'
    )
)
rag_struc_res_dict_0 = load_object(
    filename=os.path.join(
        root_dir, f'output/RAG_res_{base_model}/structured/RAGstra0n1temp0.0_res_dict.pkl'
    )
)

In [9]:
# Evaluate the first iteration of the LLM-only baseline and the structured RAG-LLM output
llm_results = calc_eval_metrics(llm_res_dict_0['full output'][0], moalmanac_data['prompt'], synthetic_prompt_groundtruth_dict)
struc_results = calc_eval_metrics(rag_struc_res_dict_0['full output'][0], moalmanac_data['prompt'], synthetic_prompt_groundtruth_dict)

print("-----First iteration-----")
print(f"LLM-only: {llm_results['avg_exact_match_acc']}, {llm_results['avg_partial_match_acc']}, {llm_results['avg_precision']}")
print(f"Structured: {struc_results['avg_exact_match_acc']}, {struc_results['avg_partial_match_acc']}, {struc_results['avg_precision']}")

# Per-run accuracy of the LLM-only baseline.
# A dedicated name is used inside the loop so `llm_results` stays bound to the
# first-iteration metrics printed above instead of silently ending up as the
# last run, and the number of runs comes from the data rather than a literal 5.
for run_output in llm_res_dict_0['full output']:
    run_results = calc_eval_metrics(run_output, moalmanac_data['prompt'], synthetic_prompt_groundtruth_dict)
    print(f"LLM-only: {run_results['avg_exact_match_acc']}, {run_results['avg_partial_match_acc']}")


-----First iteration-----
LLM-only: 0.6923076923076923, 0.8589743589743589, 0.2927492260825598
Structured: 0.8931623931623932, 0.9487179487179487, 0.7032865282865282
LLM-only: 0.6923076923076923, 0.8589743589743589
LLM-only: 0.688034188034188, 0.8589743589743589
LLM-only: 0.6367521367521367, 0.7948717948717948
LLM-only: 0.6666666666666666, 0.8376068376068376
LLM-only: 0.6324786324786325, 0.7905982905982906


### Test run o4-mini

In [2]:
# Same sofa-recommendation smoke-test prompt, reused for the o4-mini test run.
input_prompt = """
Hi, I recently moved to a 2-bedroom apartment and I would like to buy a new sofa that is around 70-inch wide and within a reasonable price range. Please give me recommendations. Please provide them in the following JSON schema.
{{
    "Sofa 1": {{
        "Name": ,
        "Brand": ,
        "Price": ,
        "Link":
        "Reason":
        }}
}}
"""

# Client and sampling settings for the o4-mini cells that follow.
CLIENT = OpenAI(api_key=api_key)
temp, random_seed = 0, 2025

In [3]:
# o4-mini test call with no cap passed for the output length (max_len is None).
model_type, model = "gpt_reasoning", "o4-mini-2025-04-16"
max_len = None

# NOTE(review): `input` shadows the builtin of the same name.
output, input = run_llm(
    input_prompt, CLIENT, model_type, model, max_len, temp, random_seed
)
print(output)

{
  "Sofa 1": {
    "Name": "Serta Rane Collection Upholstered Loveseat",
    "Brand": "Serta",
    "Price": "$399.00",
    "Link": "https://www.amazon.com/dp/B07YZ5TKHF",
    "Reason": "At 71.2\" wide, this loveseat fits snugly in a 2-bedroom living space. It features pocketed coils and foam layers for comfort, and its neutral upholstery pairs well with most decors."
  },
  "Sofa 2": {
    "Name": "Modway Engage Mid Century Modern Glam Performance Fabric Sofa",
    "Brand": "Modway",
    "Price": "$449.00",
    "Link": "https://www.amazon.com/dp/B07GBL9FHP",
    "Reason": "With a 71\" width and durable, easy-clean performance fabric, this sofa blends style and practicality in a compact footprint ideal for smaller apartments."
  },
  "Sofa 3": {
    "Name": "EKTORP 2-seat Sofa",
    "Brand": "IKEA",
    "Price": "$349.99",
    "Link": "https://www.ikea.com/us/en/p/ektorp-2-seat-sofa-olde-brown-80400645/",
    "Reason": "Measuring 68 7/8\" wide, the EKTORP sofa is slightly under 70\" bu

In [None]:
# Build the keyword arguments for a direct chat-completions request
# (sent in the next cell via CLIENT.chat.completions.create(**params)).
model_type = "gpt_reasoning"
model = "o4-mini-2025-04-16"
max_len = None
random_seed = 2025

params = dict(
    model=model,
    messages=[dict(role="user", content=input_prompt)],
    max_completion_tokens=max_len,
    seed=random_seed,
    response_format=dict(type="json_object"),
    reasoning_effort='medium',
)

In [52]:
# Send the request assembled in `params`; this rebinds `output` to the API response object.
output = CLIENT.chat.completions.create(**params)

In [57]:
# Display the token accounting of the response (prompt, completion and reasoning token counts).
output.usage


CompletionUsage(completion_tokens=851, prompt_tokens=91, total_tokens=942, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=576, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0))

### Examine o4-mini output

In [18]:
# Load the pickled o4-mini structured-RAG result dict and show its top-level keys.
output = load_object(
    os.path.join(
        root_dir, 'output/RAG_res_o4mini/structured/RAGstra0n1temp0.0_res_dict.pkl'
    )
)
output.keys()

dict_keys(['full output', 'input prompt', 'runtime'])

In [19]:
# Min / median / max of len() over the first run's input prompts, one value per line.
print(
    *(
        stat([len(prompt) for prompt in output['input prompt'][0]])
        for stat in (np.min, np.median, np.max)
    ),
    sep='\n',
)

6885
8131.0
11146


In [20]:
# Min / median / max of len() over the first run's model outputs, one value per line.
# The comprehension variable is named `response` so it no longer reuses the
# name of the `output` dict it iterates over.
print(
    *(
        stat([len(response) for response in output['full output'][0]])
        for stat in (np.min, np.median, np.max)
    ),
    sep='\n',
)

0
672.5
2467


### Evaluate o4-mini output

In [None]:
# Reload the query table and the pickled ground-truth dict used for scoring.
moalmanac_data = pd.read_csv(
    os.path.join(root_dir, 'data/moa_fda_queries_answers.csv'),
    index_col=0,
)
synthetic_prompt_groundtruth_dict = load_object(
    os.path.join(root_dir, 'data/synthetic_prompt_groundtruth_dict.pkl')
)

#### Structured

In [None]:
# Pickled o4-mini structured-RAG results, kept under a dedicated name.
output_structured = load_object(
    os.path.join(
        root_dir, 'output/RAG_res_o4mini/structured/RAGstra0n1temp0.0_res_dict.pkl'
    )
)

In [50]:
# Score the first run of the structured-RAG results.
# Reads from `output_structured` (loaded in the cell above) rather than the
# generic `output` name, which several other cells rebind to unrelated objects;
# using it made this evaluation depend on cell execution order.
output_structured_eval = calc_eval_metrics(
    output_structured['full output'][0],
    moalmanac_data['prompt'],
    synthetic_prompt_groundtruth_dict,
)
output_structured_eval.keys()

dict_keys(['avg_exact_match_acc', 'avg_partial_match_acc', 'avg_precision', 'avg_recall', 'avg_f1', 'avg_specificity', 'exact_match_acc', 'partial_match_acc', 'precision_ls', 'recall_ls', 'f1_ls', 'specificity_ls', 'pred_drugs_generic_set_ls', 'true_drugs_generic_set_ls'])

In [None]:
max_len = 2048

# Collect, keyed by position, every first-run output whose len() exceeds
# max_len, together with its input prompt and the predicted / true drug sets.
# NOTE(review): len() is compared against a limit that elsewhere is a token
# budget — presumably a rough proxy; confirm.
i_pass_max_len = {}
for idx, text in enumerate(output_structured['full output'][0]):
    if len(text) > max_len:
        record = {
            'input prompt': output_structured['input prompt'][0][idx],
            'full output': text,
            'pred drugs': output_structured_eval['pred_drugs_generic_set_ls'][idx],
            'true drugs': output_structured_eval['true_drugs_generic_set_ls'][idx],
        }
        i_pass_max_len[idx] = record

In [None]:
# Outputs whose len() exceeds max_len (2048): print the position, the input
# prompt, and the predicted vs. true drug sets, one item per line.
for idx, record in i_pass_max_len.items():
    print(
        idx,
        record['input prompt'],
        record['pred drugs'],
        record['true drugs'],
        sep='\n',
    )

33

    Context information is below.
    ---------------------
    ['If a melanoma patient has a somatic variant in gene BRAF (c.1799T>A or p.V600E), and is unresectable or metastatic, one recommended treatment would be Vemurafenib. The U.S. Food and Drug Administration (FDA) granted regular approval to vemurafenib for the treatment of patients with unresectable or metastatic melanoma (MEL) with BRAF V600E mutation, as detected by an FDA-approved test. (Citation: Genentech, Inc. Zelboraf (vemurafenib) [package insert]. U.S. Food and Drug Administration website. https://www.accessdata.fda.gov/drugsatfda_docs/label/2020/202429s019lbl.pdf. Revised May 2020. Accessed November 12, 2020.).', 'If a melanoma patient has a somatic variant in gene BRAF (c.1799T>A or p.V600E), and is unresectable or metastatic, one recommended treatment would be Dabrafenib. The U.S. Food and Drug Administration (FDA) granted approval to dabrafenib as a single agent for the treament of patients with unresectable 

#### Real-world

In [59]:
# Real-world query table (first CSV column becomes the index) and its pickled ground-truth dict.
real_questions = pd.read_csv(
    os.path.join(root_dir, 'data/real_world_queries.csv'),
    index_col=0,
)
real_prompt_groundtruth_dict = load_object(
    os.path.join(root_dir, 'data/real_prompt_groundtruth_dict.pkl')
)

In [None]:
# After setting max_len to None
output_realworld_stra4=load_object(filename=os.path.join(root_dir, 'output/RAG_res_o4mini/realworld_maxlenNone/RAGstra4n5temp0.0_res_dict.pkl'))
# The "Load results from all runs" cell further down also reads
# `output_realworld_stra5`, which was never loaded anywhere in the notebook and
# raised NameError on a fresh kernel.
# NOTE(review): the path is assumed to mirror the strategy-4 file name — confirm.
output_realworld_stra5=load_object(filename=os.path.join(root_dir, 'output/RAG_res_o4mini/realworld_maxlenNone/RAGstra5n5temp0.0_res_dict.pkl'))

In [73]:
# Report every response that is missing (None) or empty, with its run index.
# None is tested first: calling len() before the None check raised TypeError on
# a None response, so the None branch could never run. The loop variable is
# also no longer called `output`, so the global `output` is left untouched.
for i, run_outputs in enumerate(output_realworld_stra4['full output']):
    for response in run_outputs:
        if response is None:
            print('output==None')
            print(i, response)
        elif len(response) == 0:
            print('len(output)==0')
            print(i, response)

In [None]:
# One line per run: min, median and max of len() over that run's outputs.
for run_outputs in output_realworld_stra4['full output']:
    lengths = [len(text) for text in run_outputs]
    print(np.min(lengths), np.median(lengths), np.max(lengths))

71 76.0 10669
71 72.0 75650
71 78.0 1902
71 76.0 163386
71 72.0 6390


Load all input and output

In [59]:
# Gather, per strategy (4 and 5) and per run, the prompts, raw outputs, full
# metric dict, per-question match lists and average accuracies.
result_fields = ('ragllm_input', 'ragllm_output', 'full', 'partial', 'exact', 'avg_partial', 'avg_exact')
all_real_results = {
    strategy: {field: [] for field in result_fields}
    for strategy in (4, 5)
}

for i in range(len(output_realworld_stra4['full output'])):
    outputs_by_strategy = {4: output_realworld_stra4, 5: output_realworld_stra5}
    for strategy, output_dict in outputs_by_strategy.items():
        res = calc_eval_metrics(
            output_dict['full output'][i],
            real_questions['prompt'],
            real_prompt_groundtruth_dict,
        )
        bucket = all_real_results[strategy]
        bucket['ragllm_input'].append(output_dict['input prompt'][i])
        bucket['ragllm_output'].append(output_dict['full output'][i])
        bucket['full'].append(res)
        bucket['partial'].append(res['partial_match_acc'])
        bucket['exact'].append(res['exact_match_acc'])
        bucket['avg_partial'].append(res['avg_partial_match_acc'])
        bucket['avg_exact'].append(res['avg_exact_match_acc'])

def make_eval_df(data, prefix):
    """Build a DataFrame from `data`, transpose it, and prefix every column label.

    Args:
        data: Anything accepted by the `pd.DataFrame` constructor; each
            top-level entry becomes one column of the result.
        prefix: String prepended to each column label after transposing.

    Returns:
        The transposed DataFrame with prefixed column labels.
    """
    transposed = pd.DataFrame(data).transpose()
    return transposed.add_prefix(prefix)

# Per-question match tables (one column per run) for each strategy.
real_res_stra4_partial_acc_df, real_res_stra4_exact_acc_df = (
    make_eval_df(all_real_results[4]['partial'], 'partial_match_acc_'),
    make_eval_df(all_real_results[4]['exact'], 'exact_match_acc_'),
)
real_res_stra5_partial_acc_df, real_res_stra5_exact_acc_df = (
    make_eval_df(all_real_results[5]['partial'], 'partial_match_acc_'),
    make_eval_df(all_real_results[5]['exact'], 'exact_match_acc_'),
)


Sanity check: an empty output means the model failed to generate a response because it hit the max output tokens limit.

In [84]:
stra = 4
# For the first run, print the position and input prompt of every empty response.
for idx, response in enumerate(all_real_results[stra]['ragllm_output'][0]):
    if len(response) != 0:
        continue
    print(idx, all_real_results[stra]['ragllm_input'][0][idx], sep='\n')


4

    Context information is below.
    ---------------------
    ['If a breast cancer patient has a copy number in gene ERBB2, one recommended treatment would be Ado-Trastuzumab Emtansine. HER2-targeted antibody and microtubule inhibitor conjugate indicated, as a single agent, for the treatment of patients with HER2-positive, metastatic breast cancer who previously received trastuzumab and a taxane, separately or in combination. Patients should have either: (1) Received prior therapy for metastatic disease, or (2) Developed disease recurrence during or within six months of completing adjuvant therapy. (Citation: Genentech, Inc. Kadcyla (ado-trastuzumab emtansine) [package insert]. U.S. Food and Drug Administration website. https://www.accessdata.fda.gov/drugsatfda_docs/label/2020/125427s108lbl.pdf. Revised September 2020. Accessed November 12, 2020.).', 'If a breast cancer patient has a copy number in gene ERBB2, one recommended treatment would be Trastuzumab. The U.S. Food and Drug 

Strategy 4 - without providing a JSON schema in the input prompt

In [70]:
stra = 4
# Per-run averages followed by their mean: exact match first, then partial match.
for metric in ('avg_exact', 'avg_partial'):
    scores = all_real_results[stra][metric]
    print(scores, np.mean(scores))

[0.7142857142857143, 0.7142857142857143, 0.7619047619047619, 0.7619047619047619, 0.7619047619047619] 0.7428571428571429
[0.7619047619047619, 0.8095238095238095, 0.8571428571428571, 0.9047619047619048, 0.7619047619047619] 0.819047619047619


Strategy 5 - providing a JSON schema in the input prompt

In [71]:
stra = 5
# Per-run averages followed by their mean: exact match first, then partial match.
for metric in ('avg_exact', 'avg_partial'):
    scores = all_real_results[stra][metric]
    print(scores, np.mean(scores))

[0.6190476190476191, 0.7142857142857143, 0.7142857142857143, 0.7142857142857143, 0.7619047619047619] 0.7047619047619048
[0.8095238095238095, 0.8571428571428571, 0.8095238095238095, 0.7619047619047619, 0.7619047619047619] 0.8


### Evaluate GPT4o output 

#### Real-world

In [93]:
# Reload the real-world query table and its pickled ground-truth dict.
real_questions = pd.read_csv(
    os.path.join(root_dir, 'data/real_world_queries.csv'),
    index_col=0,
)
real_prompt_groundtruth_dict = load_object(
    os.path.join(root_dir, 'data/real_prompt_groundtruth_dict.pkl')
)

In [97]:
# Pickled GPT-4o real-world RAG results for strategy 4 and strategy 5.
real_rag_res_dict_4 = load_object(
    filename=os.path.join(
        root_dir, 'output/RAG_res_gpt4o/realworld/RAGstra4n5temp0.0_res_dict.pkl'
    )
)
real_rag_res_dict_5 = load_object(
    filename=os.path.join(
        root_dir, 'output/RAG_res_gpt4o/realworld/RAGstra5n5temp0.0_res_dict.pkl'
    )
)

In [99]:
# Gather, per strategy (4 and 5) and per run, the prompts, raw outputs, full
# metric dict, per-question match lists and average accuracies.
# NOTE(review): this rebinds `all_real_results`, replacing the o4-mini results
# collected earlier in the notebook.
result_fields = ('ragllm_input', 'ragllm_output', 'full', 'partial', 'exact', 'avg_partial', 'avg_exact')
all_real_results = {
    strategy: {field: [] for field in result_fields}
    for strategy in (4, 5)
}

for i in range(len(real_rag_res_dict_4['full output'])):
    outputs_by_strategy = {4: real_rag_res_dict_4, 5: real_rag_res_dict_5}
    for strategy, output_dict in outputs_by_strategy.items():
        res = calc_eval_metrics(
            output_dict['full output'][i],
            real_questions['prompt'],
            real_prompt_groundtruth_dict,
        )
        bucket = all_real_results[strategy]
        bucket['ragllm_input'].append(output_dict['input prompt'][i])
        bucket['ragllm_output'].append(output_dict['full output'][i])
        bucket['full'].append(res)
        bucket['partial'].append(res['partial_match_acc'])
        bucket['exact'].append(res['exact_match_acc'])
        bucket['avg_partial'].append(res['avg_partial_match_acc'])
        bucket['avg_exact'].append(res['avg_exact_match_acc'])

def make_eval_df(data, prefix):
    """Build a DataFrame from `data`, transpose it, and prefix every column label.

    Args:
        data: Anything accepted by the `pd.DataFrame` constructor; each
            top-level entry becomes one column of the result.
        prefix: String prepended to each column label after transposing.

    Returns:
        The transposed DataFrame with prefixed column labels.
    """
    transposed = pd.DataFrame(data).transpose()
    return transposed.add_prefix(prefix)

# Per-question match tables (one column per run) for each strategy.
real_res_stra4_partial_acc_df, real_res_stra4_exact_acc_df = (
    make_eval_df(all_real_results[4]['partial'], 'partial_match_acc_'),
    make_eval_df(all_real_results[4]['exact'], 'exact_match_acc_'),
)
real_res_stra5_partial_acc_df, real_res_stra5_exact_acc_df = (
    make_eval_df(all_real_results[5]['partial'], 'partial_match_acc_'),
    make_eval_df(all_real_results[5]['exact'], 'exact_match_acc_'),
)


In [126]:
# NOTE(review): this import sits mid-notebook rather than in the import cell at the top.
import tiktoken
# Tokenizer for the named model; a later cell uses it to count the tokens of failed outputs.
encoding = tiktoken.encoding_for_model("gpt-4o")  # or "gpt-4", "gpt-3.5-turbo", etc.


In [131]:
acc_type = 'partial'
stra = 4
# One line per run: run index and that run's average accuracy of the chosen type.
for n, run_avg in enumerate(all_real_results[stra][f'avg_{acc_type}']):
    print(n, run_avg)

0 0.5714285714285714
1 0.5714285714285714
2 0.6666666666666666
3 0.6190476190476191
4 0.5238095238095238


In [None]:
acc_type = 'exact'
stra = 5

# For every run, print its average accuracy, then for each question that did
# not match: its position, the prompt, the ground truth, the raw model output
# and that output's token count.
prompts = real_questions['prompt']
for n, run_matches in enumerate(all_real_results[stra][acc_type]):
    print(n, all_real_results[stra][f'avg_{acc_type}'][n])
    for i, matched in enumerate(run_matches):
        # `== False` is kept deliberately; entries may be numpy bools, for
        # which an `is False` identity test would never match.
        if matched == False:  # noqa: E712
            # `i` is an enumerate position, so index the Series positionally.
            # Plain `Series[i]` is label-based on an integer index and only a
            # deprecated positional fallback otherwise (the CSV's first column
            # is used as the index, so labels need not be 0..n-1).
            question = prompts.iloc[i]
            print(i)
            print(question)
            print(real_prompt_groundtruth_dict[question])

            # Separate name so the global `output` is not overwritten.
            failed_output = all_real_results[stra]['ragllm_output'][n][i]
            num_tokens = len(encoding.encode(failed_output))
            print(failed_output)
            print(num_tokens)

0 0.8095238095238095
3
For a patient with cancer metastatic prostate cancer, advanced stage (most often stage IV, but some stage III or earlier potentially) and with BRCA2 mutation, what drugs approved?
[{'olaparib'}, {'olaparib', 'abiraterone', 'prednisone'}, {'talazoparib', 'enzalutamide'}, {'niraparib', 'abiraterone acetate'}, {'rucaparib'}]
{
    "Status": "success",
    "Treatment 1": {
        "Disease Name": "Metastatic Prostate Cancer",
        "Disease Phase or Condition": "Metastatic Castration-Resistant",
        "Drug Name": "Abiraterone + Prednisone + Olaparib",
        "Prior Treatment or Resistance Status": "Not specified",
        "Genomic Features": "BRCA2 mutation (somatic or germline)",
        "FDA-approval status": "Approved",
        "Link to FDA-approved Label": "https://www.accessdata.fda.gov/drugsatfda_docs/label/2023/208558s025lbl.pdf"
    },
    "Treatment 2": {
        "Disease Name": "Metastatic Prostate Cancer",
        "Disease Phase or Condition": "Metas

#### Structured vs. Unstructured

In [135]:
# Reload the query table and the pickled ground-truth dict used for scoring.
moalmanac_data = pd.read_csv(
    os.path.join(root_dir, 'data/moa_fda_queries_answers.csv'),
    index_col=0,
)
synthetic_prompt_groundtruth_dict = load_object(
    os.path.join(root_dir, 'data/synthetic_prompt_groundtruth_dict.pkl')
)

In [133]:
# Pickled GPT-4o results: unstructured RAG, structured RAG, and the LLM-only baseline.
# NOTE(review): this rebinds `rag_struc_res_dict_0` and `llm_res_dict_0`, which
# held the GPT-5 results earlier in the notebook.
rag_unstruc_res_dict_0 = load_object(
    filename=os.path.join(
        root_dir, 'output/RAG_res_gpt4o/unstructured/RAGstra0n5temp0.0_res_dict.pkl'
    )
)
rag_struc_res_dict_0 = load_object(
    filename=os.path.join(
        root_dir, 'output/RAG_res_gpt4o/structured/RAGstra0n5temp0.0_res_dict.pkl'
    )
)
llm_res_dict_0 = load_object(
    filename=os.path.join(
        root_dir, 'output/LLM_res_gpt4o/stra0n1temp0.0_res_dict.pkl'
    )
)


In [136]:
# Score the first iteration of the unstructured and structured RAG-LLM outputs,
# then print exact-match accuracy, partial-match accuracy and precision for each.
unstruc_results = calc_eval_metrics(
    rag_unstruc_res_dict_0['full output'][0],
    moalmanac_data['prompt'],
    synthetic_prompt_groundtruth_dict,
)
struc_results = calc_eval_metrics(
    rag_struc_res_dict_0['full output'][0],
    moalmanac_data['prompt'],
    synthetic_prompt_groundtruth_dict,
)

print(f"Unstructured: {unstruc_results['avg_exact_match_acc']}, {unstruc_results['avg_partial_match_acc']}, {unstruc_results['avg_precision']}")
print(f"Structured: {struc_results['avg_exact_match_acc']}, {struc_results['avg_partial_match_acc']}, {struc_results['avg_precision']}")

Unstructured: 0.7948717948717948, 0.9017094017094017, 0.49424603174603166
Structured: 0.9444444444444444, 0.9529914529914529, 0.793091168091168


In [140]:
# Print the position of every missing (None) or empty first-run response, and
# the length of every other response.
# None is tested before len(): in the original order a None response raised
# TypeError before the None branch could be reached. The loop variable is also
# no longer called `output`, so the global `output` is left untouched.
for i, response in enumerate(rag_struc_res_dict_0['full output'][0]):
    if response is None or len(response) == 0:
        print(i)
    else:
        print(len(response))

474
1845
2462
1845
1395
1853
1646
1646
1646
1646
1399
1399
907
489
1094
1094
1098
1098
1094
1094
443
399
423
824
918
824
918
1255
900
501
2077
2002
1258
2077
1228
864
2077
1228
1258
2077
421
2077
955
1573
954
484
1501
484
449
1007
456
1769
492
1385
492
449
509
456
955
1525
464
484
480
484
1772
492
488
492
501
904
477
904
458
2256
412
1720
1286
1730
1510
1510
527
527
527
527
527
477
622
622
802
623
429
1269
1275
451
451
973
429
451
451
451
1428
1456
506
506
529
529
1456
1070
1070
929
791
805
962
432
465
1050
502
906
502
502
436
497
521
450
898
513
474
450
898
465
436
450
898
513
1440
907
474
450
898
457
450
898
465
1368
3708
3708
3708
3708
3708
3708
890
923
2837
482
2569
2569
475
542
542
472
536
499
1090
1090
1110
1090
831
451
416
1342
504
1670
1112
2650
1064
1563
945
547
467
430
1802
1802
1368
1368
496
496
468
468
900
427
1772
1772
904
904
429
428
455
914
906
1368
1772
1368
1802
445
893
438
454
460
864
564
577
588
590
591
589
2663
392
399
1657
1286
1721
1286
1721
791
785
428
824
1225
4