In [1]:
import sys
import os
script_dir = os.getcwd()
root_dir = os.path.join(os.path.dirname(os.path.abspath(script_dir)))
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(script_dir))))

import json
import pandas as pd
import numpy as np
from utils.io import load_object, save_object
import requests
from collections import Counter
import spacy
import faiss
from openai import OpenAI
from utils.embedding import get_context_db
from dotenv import load_dotenv
import random
import math


In [2]:
os.chdir(root_dir)
os.getcwd()

'/home/helenajun/rag-llm-cancer-paper'

In [3]:
#split ranges for testing on small batches
def split_ranges(max_int, num_ranges, samples=None, seed=42):
    random.seed(seed)
    step = math.ceil(max_int / num_ranges)
    ranges = [(i*step+1, min((i+1)*step, max_int)) for i in range(num_ranges)]
    
    if samples is None:
        return ranges
    
    # split samples evenly across ranges
    n_per_range = samples // num_ranges
    remainder = samples % num_ranges
    
    result = []
    for i, r in enumerate(ranges):
        count = n_per_range + (1 if i < remainder else 0)
        result.extend(random.randint(r[0], r[1]) for _ in range(count))
    
    return result


# function to subset FDA statements
def subset_db_statements(statements, organization='fda'):
    # subset statements
    subset=[statement for statement in statements if statement['reportedIn'][0]['organization']['id'] == organization]
    return subset

# function to ensure list
def ensure_list(x):
    return x if isinstance(x, list) else [x]

# function to extract biomarker from statement
def extract_biomarker_info(stmt):
    # extract biomarkers from the statement
    biomarkers_list = []
    for i in range(len(stmt['proposition']['biomarkers'])):
        biomarker = stmt["proposition"]["biomarkers"][i]["name"]
        extensions_dict = {item['name']: item['value'] for item in stmt['proposition']['biomarkers'][i]['extensions']}
        presence = extensions_dict.get('present', '')
        biomarkers_list.append(biomarker)
        
        # extract presence information
        if presence == True:
            biomarker += " [present]"
        else:   
            biomarker += " [not present]"
        if i == 0:
            biomarkers_str = biomarker
        else:
            biomarkers_str += f", {biomarker}"
    
    extracted_info = {
        "str": biomarkers_str, 
        "list": biomarkers_list
    }
    
    return extracted_info


# function to extract therapy info from statement
def extract_therapy_info(stmt):
    # extract membership operator
    obj = stmt.get('proposition', {}).get('objectTherapeutic', {})
    operator = obj.get('membership_operator', None)
    
    # extract therapy approach, type, and names
    if operator == 'AND':
        approach = 'Combination therapy'
        therapy_strategyList = []
        therapy_typeList = []
        for therapy in obj.get('therapies', []):
            extensions_dict = {item['name']: item['value'] for item in therapy['extensions']}
            therapy_strategyList.extend(ensure_list(extensions_dict['therapy_strategy']))
            therapy_typeList.extend(ensure_list(extensions_dict['therapy_type']))
        drugList = [drug.get('name', None) for drug in obj.get('therapies', [])]
        
    else:
        approach = 'Monotherapy'
        therapy_strategyList = []
        therapy_typeList = []
        extensions_dict = {item['name']: item['value'] for item in obj['extensions']}
        therapy_strategyList.extend(ensure_list(extensions_dict['therapy_strategy']))
        therapy_typeList.extend(ensure_list(extensions_dict['therapy_type']))
        drugList = [obj.get('name', None)]
    
    # sanity check for drugList
    if any(d is None for d in drugList):
        raise ValueError(f"Found None in drugList for statement {stmt['id']}")
    
    drug_str = " + ".join([d for d in drugList if d is not None])
    therapy_strategy_str = " + ".join([s for s in therapy_strategyList if s is not None])
    therapy_type_str = " + ".join([t for t in therapy_typeList if t is not None])
    
    extracted_info = {
        "str": {
            "drug_str": drug_str, 
            "therapy_approach": approach,
            "therapy_strategy_str": therapy_strategy_str, 
            "therapy_type_str": therapy_type_str
                }, 
        "list": {
            "drugList": drugList, 
            "therapy_approach": approach,
            "therapy_strategyList": therapy_strategyList, 
            "therapy_typeList": therapy_typeList
            }
        }
    
    return extracted_info


# function to flatten statement into summary text to include in context
def flatten_statements(stmt: dict) -> str:
    
    statement_id = stmt.get("id")
    
    # approval status
    approval_status = stmt.get("reportedIn", [{}])[0].get("subtype", "None")
    approval_org = stmt.get("reportedIn", [{}])[0].get("organization", {}).get("id", "Unknown organization")
    approval_url = stmt.get("reportedIn", [{}])[0].get("url", "Unknown URL")
    approval_date = stmt.get("reportedIn", [{}])[0].get("publication_date", "Unknown date")
    
    # description and indication
    description = stmt.get("description", "None")
    indication = stmt.get("indication", {}).get("indication", "None")
    
    # cancer type
    cancer_type = stmt.get("proposition", {}).get("conditionQualifier", {}).get("name", "Unknown cancer")
    
    # biomarkers
    biomarker = extract_biomarker_info(stmt)
    
    # therapy
    therapy_info = extract_therapy_info(stmt)
    
    # create summary text
    summary = (
        f"Indication: {indication}\n"
        f"Cancer type: {cancer_type}\n"
        f"Biomarkers: {biomarker['str']}\n"
        f"Therapy: {therapy_info['str']['drug_str']}\n"
        f"Therapy approach: {therapy_info['str']['therapy_approach']}\n"
        f"Therapy strategy: {therapy_info['str']['therapy_strategy_str']}\n"
        f"Therapy type: {therapy_info['str']['therapy_type_str']}\n"
        f"Description: {description}\n"
        f"Approval status: {approval_status} ({approval_org})\n"
        f"Approval url: {approval_url}\n"
        f"Publication date: {approval_date}"
    )
    
    # create row to add to dataframe
    row = {
        "statement_id": statement_id,
        "approval_status": approval_status,
        "approval_org": approval_org,
        "description": description,
        "indication": indication,
        "cancer_type": cancer_type,
        "biomarker": biomarker['list'],
        "therapy_drug": therapy_info['list']['drugList'],
        "therapy_approach": therapy_info['list']['therapy_approach'],
        "therapy_strategy": therapy_info['list']['therapy_strategyList'],
        "therapy_type": therapy_info['list']['therapy_typeList'],
        "approval_url": approval_url,
        "publication_date": approval_date,
        "context": summary
    }
    
    return summary, row


In [4]:
statements = requests.get('https://api.moalmanac.org/statements').json()['data']
print(len(statements))

1784


In [13]:
fda_statements=subset_db_statements(statements, organization='fda')
print(len(fda_statements))

CACHE_FILE = "db_version_cache.json"
def get_local_version():
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE) as f:
            return json.load(f).get("version")
    return None

version=get_local_version()
with open(f"data/latest_db/fda_statements__{version}.json", "w") as f:
    json.dump(fda_statements, f)

642


In [15]:
with open(f"data/latest_db/fda_statements__{version}.json", "r") as f:
    fda_statements = json.load(f)

## Create synthetic queries and answers

In [16]:
import ast
standardized_to_raw_mapping=pd.read_csv("data/latest_db/moalmanac_core__2025-09-04.csv")
standardized_to_raw_mapping['biomarker']=standardized_to_raw_mapping['biomarker'].apply(ast.literal_eval)
standardized_to_raw_mapping['therapy']=standardized_to_raw_mapping['therapy'].apply(ast.literal_eval)
print(standardized_to_raw_mapping.shape)
standardized_to_raw_mapping.head()

(642, 6)


Unnamed: 0,statement_id,standardized_cancer,raw_cancer,modified_standardized_cancer,biomarker,therapy
0,0,Invasive Breast Carcinoma,early breast cancer,early invasive breast carcinoma,"[HER2-negative, ER positive]","[Tamoxifen, Abemaciclib]"
1,1,Invasive Breast Carcinoma,early breast cancer,early invasive breast carcinoma,"[PR positive, HER2-negative]","[Tamoxifen, Abemaciclib]"
2,2,Invasive Breast Carcinoma,early breast cancer,early invasive breast carcinoma,"[PR positive, HER2-negative, ER positive]","[Tamoxifen, Abemaciclib]"
3,3,Invasive Breast Carcinoma,advanced or metastatic breast cancer,advanced or metastatic invasive breast carcinoma,"[HER2-negative, ER positive]","[Anastrozole, Abemaciclib]"
4,4,Invasive Breast Carcinoma,advanced or metastatic breast cancer,advanced or metastatic invasive breast carcinoma,"[PR positive, HER2-negative]","[Anastrozole, Abemaciclib]"


In [27]:
def create_synthetic_query(db: dict) -> str:
    """
    Create a synthetic query based on the database entry.
    """
    if len(db['biomarker']) > 1:
        biomarker_str = ", ".join(db['biomarker'])
    else:
        biomarker_str = db['biomarker'][0]
    query_text=f"if a patient with {db['modified_standardized_cancer']} cancer has {biomarker_str.lower()}, what therapy is recommended?"
    return query_text

def create_synthetic_answer(db: dict) -> str:
    """
    Create a synthetic answer based on the database entry.
    """
    if len(db['biomarker']) > 1:
        biomarker_str = ", ".join(db['biomarker'])
    else:
        biomarker_str = db['biomarker'][0]
    if len(db['therapy']) > 1:
        therapy_str = " + ".join(db['therapy'])
    else:
        therapy_str = db['therapy'][0]
    answer_text=f"If a patient with {db['modified_standardized_cancer']} cancer has {biomarker_str.lower()}, one recommended therapy is {therapy_str.lower()}."
    return answer_text

synthetic_query_list = []
synthetic_answer_list = []
synthetic_query_therapy_pair_dict = {}
for _, row in standardized_to_raw_mapping.iterrows():
    #create synthetic query and answer and add to dataframe
    query = create_synthetic_query(row)
    answer = create_synthetic_answer(row)
    standardized_to_raw_mapping.at[_, 'prompt'] = query
    
    #add more therapy context to answers (context db)
    summary, stmt_row = flatten_statements(fda_statements[_])
    if len(stmt_row['therapy_type']) > 1:
        therapy_type = ' + '.join(stmt_row['therapy_type'])
    else:
        therapy_type = stmt_row['therapy_type'][0]
    if len(stmt_row['therapy_strategy']) > 1:
        therapy_strategy = ' + '.join(stmt_row['therapy_strategy'])
    else:
        therapy_strategy = stmt_row['therapy_strategy'][0]
        
    standardized_to_raw_mapping.at[_, 'answer'] = (
        f"{answer} therapy type: {therapy_type.lower()}. therapy strategy: {therapy_strategy.lower()}. indication: {stmt_row['indication'].lower()} approval url: {stmt_row['approval_url']}"
    )
    
    #create synthetic query and ground-truth mapping dict
    row['therapy'] = [therapy.lower().strip() for therapy in row['therapy']]
    if query in synthetic_query_therapy_pair_dict.keys():
        synthetic_query_therapy_pair_dict[query].append(set(row['therapy']))
    else:
        synthetic_query_therapy_pair_dict[query] = [set(row['therapy'])]

In [8]:
test_idx_list = split_ranges(standardized_to_raw_mapping.shape[0], 10, 50)

In [9]:
for i, e in enumerate(standardized_to_raw_mapping.loc[test_idx_list].prompt):
    print(i, e)
    if i == 10:
        break

0 if a patient with metastatic castration-resistant prostate adenocarcinoma cancer has bard1 oncogenic variants, what therapy is recommended?
1 if a patient with advanced or metastatic invasive breast carcinoma cancer has pr positive, her2-negative, what therapy is recommended?
2 if a patient with metastatic non-small cell lung cancer cancer has pd-l1 >= 10% tiic, what therapy is recommended?
3 if a patient with philadelphia chromosome-positive chronic myelogenous leukemia cancer has bcr::abl1, what therapy is recommended?
4 if a patient with locally advanced or metastatic non-small cell lung cancer cancer has egfr exon 20 (insertion), what therapy is recommended?
5 if a patient with metastatic colorectal adenocarcinoma cancer has egfr positive, wild type kras, what therapy is recommended?
6 if a patient with metastatic non-small cell lung cancer cancer has met exon 14 (deletion), what therapy is recommended?
7 if a patient with locally advanced or metastatic invasive breast carcinoma 

In [10]:
for i, e in enumerate(standardized_to_raw_mapping.loc[test_idx_list].answer):
    print(i, e)
    if i == 10:
        break

0 If a patient with metastatic castration-resistant prostate adenocarcinoma cancer has bard1 oncogenic variants, one recommended therapy is prednisone + niraparib + abiraterone acetate. therapy type: chemotherapy + targeted therapy + hormone therapy. therapy strategy: corticosteroid + parp inhibition + antiandrogen. indication: akeega is a combination of niraparib, a poly (adp-ribose) polymerase (parp) inhibitor, and abiraterone acetate, a cyp17 inhibitor indicated with prednisone for the treatment of adult patients with deleterious or suspected deleterious brca-mutated (brcam) metastatic castration-resistant prostate cancer (mcrpc). select patients for therapy based on an fda-approved test for akeega. approval url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2023/216793s000lbl.pdf
1 If a patient with advanced or metastatic invasive breast carcinoma cancer has pr positive, her2-negative, one recommended therapy is anastrozole + abemaciclib. therapy type: hormone therapy + targ

In [33]:
save_object(synthetic_query_therapy_pair_dict, "data/latest_db/synthetic_query_therapy_pair_dict.pkl")

In [34]:
for k, v in synthetic_query_therapy_pair_dict.items():
    print(k, v)
    break

if a patient with early invasive breast carcinoma cancer has her2-negative, er positive, what therapy is recommended? [{'tamoxifen', 'abemaciclib'}, {'abemaciclib', 'letrozole'}, {'anastrozole', 'goserelin', 'ribociclib'}, {'goserelin', 'letrozole', 'ribociclib'}, {'anastrozole', 'abemaciclib'}, {'abemaciclib', 'exemestane'}]


### Save synthetic prompts and answers/context

In [38]:
with open(f"data/latest_db/synthetic_prompts__{version}.json", "w") as f:
    json.dump(standardized_to_raw_mapping.prompt.to_list(), f)

with open(f"data/latest_db/synthetic_answers__{version}.json", "w") as f:
    json.dump(standardized_to_raw_mapping.answer.to_list(), f)
    

### Save synthetic answers as context index

In [37]:
_CONTEXT = standardized_to_raw_mapping.answer.to_list()
print(_CONTEXT[0])
print(len(_CONTEXT))

If a patient with early invasive breast carcinoma cancer has her2-negative, er positive, one recommended therapy is tamoxifen + abemaciclib. therapy type: hormone therapy + targeted therapy. therapy strategy: estrogen receptor inhibition + cdk4/6 inhibition. indication: verzenio is a kinase inhibitor indicated in combination with endocrine therapy (tamoxifen or an aromatase inhibitor) for the adjuvant treatment of adult patients with hormone receptor (hr)-positive, human epidermal growth factor receptor 2 (her2)-negative, node positive, early breast cancer at high risk of recurrence. approval url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2023/208716s010s011lbl.pdf
642


In [44]:
def _cache_paths(output_dir: str, embed_name: str, name: str, version: str = "v1"):
    os.makedirs(output_dir, exist_ok=True)
    return (
        f"{output_dir}/{embed_name}_{name}__{version}.faiss",
        f"{output_dir}/{embed_name}_{name}__{version}.json",
    )

In [42]:
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
_CLIENT = OpenAI(api_key=api_key)
_MODEL_EMBED = "text-embedding-3-small"

In [None]:
# Always prefer cached index if present; only build once
force_rebuild=False
index_path, ctx_path =_cache_paths("data/latest_db/indexes", _MODEL_EMBED, "structured_context", version=version)
context_json_path=f"data/latest_db/synthetic_answers__{version}.json"

if (not force_rebuild) and os.path.exists(index_path) and os.path.exists(ctx_path):
    with open(ctx_path, "r") as f:
        _CONTEXT = json.load(f)
    _INDEX = faiss.read_index(index_path)
else:
    with open(context_json_path, "r") as f:
        _CONTEXT = json.load(f)
    _INDEX = get_context_db(_CONTEXT, _CLIENT, _MODEL_EMBED)
    faiss.write_index(_INDEX, index_path)
    with open(ctx_path, "w") as f:
        json.dump(_CONTEXT, f)

## Create validation test dataset

In [59]:
standardized_to_raw_mapping_subset=standardized_to_raw_mapping.sample(frac=.1, random_state=42)
standardized_to_raw_mapping_subset

Unnamed: 0,statement_id,standardized_cancer,raw_cancer,modified_standardized_cancer,biomarker,therapy,prompt,answer
86,87,Colorectal Adenocarcinoma,metastatic colorectal cancer,metastatic colorectal adenocarcinoma,[BRAF p.V600E],"[Cetuximab, Encorafenib]",if a patient with metastatic colorectal adenoc...,If a patient with metastatic colorectal adenoc...
192,193,Acute Lymphoid Leukemia,B-cell precursor acute lymphoblastic leukemia ...,b-cell precursor acute lymphoid leukemia,[CD22 +],[Inotuzumab ozogamicin],if a patient with b-cell precursor acute lymph...,If a patient with b-cell precursor acute lymph...
479,481,Non-Small Cell Lung Cancer,locally advanced or metastatic non-small cell ...,locally advanced or metastatic non-small cell ...,[KRAS p.G12C],[Sotorasib],if a patient with locally advanced or metastat...,If a patient with locally advanced or metastat...
616,619,Invasive Breast Carcinoma,early breast cancer,early invasive breast carcinoma,"[HER2-negative, ER positive]","[Anastrozole, Abemaciclib]",if a patient with early invasive breast carcin...,If a patient with early invasive breast carcin...
72,73,Invasive Breast Carcinoma,locally advanced or metastatic breast cancer,locally advanced or metastatic invasive breast...,"[PTEN splice site variants, HER2-negative, ER ...","[Fulvestrant, Capivasertib]",if a patient with locally advanced or metastat...,If a patient with locally advanced or metastat...
...,...,...,...,...,...,...,...,...
165,166,Invasive Breast Carcinoma,advanced or metastatic breast cancer,advanced or metastatic invasive breast carcinoma,"[PR positive, HER2-negative, ER positive]","[Fulvestrant, Abemaciclib]",if a patient with advanced or metastatic invas...,If a patient with advanced or metastatic invas...
78,79,Non-Small Cell Lung Cancer,metastatic non-small cell lung cancer,metastatic non-small cell lung cancer,[MET Exon 14 (Splice Site)],[Capmatinib],if a patient with metastatic non-small cell lu...,If a patient with metastatic non-small cell lu...
181,182,Chronic Myelogenous Leukemia,philadelphia chromosome-positive chronic myelo...,philadelphia chromosome-positive chronic myelo...,[BCR::ABL1],[Imatinib],if a patient with philadelphia chromosome-posi...,If a patient with philadelphia chromosome-posi...
587,589,Non-Small Cell Lung Cancer,non-small cell lung cancer,non-small cell lung cancer,"[Wild type EGFR, Wild type ALK]","[Carboplatin, Paclitaxel, Nivolumab, Ipilimumab]",if a patient with non-small cell lung cancer c...,If a patient with non-small cell lung cancer c...


In [60]:
standardized_to_raw_mapping_subset.to_csv("data/latest_db/moalmanac_small_testing_dataset.csv", index=False)

## Number of tokens

In [None]:
standardized_to_raw_mapping_subset = pd.read_csv("data/latest_db/moalmanac_prompts_subset.csv")

In [13]:
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-4o")  # or "gpt-4", "gpt-3.5-turbo", etc.

In [14]:
query_size=[len(encoding.encode(prompt)) for prompt in standardized_to_raw_mapping_subset.prompt]
print("# chunks: "+str(len(query_size)))
print("Min: "+str(np.min(query_size)))
print("Max: "+str(np.max(query_size)))
print("Mean: "+str(np.mean(query_size)))
print("Median: "+str(np.median(query_size)))


# chunks: 64
Min: 18
Max: 44
Mean: 25.9375
Median: 25.5


In [17]:
np.sum(query_size)

1660

In [13]:
query_size=[len(encoding.encode(prompt)) for prompt in standardized_to_raw_mapping_subset.answer]
print("# chunks: "+str(len(query_size)))
print("Min: "+str(np.min(query_size)))
print("Max: "+str(np.max(query_size)))
print("Mean: "+str(np.mean(query_size)))
print("Median: "+str(np.median(query_size)))


# chunks: 64
Min: 25
Max: 49
Mean: 35.1875
Median: 33.0


## Quick evaluation

In [62]:
#load test subset
standardized_to_raw_mapping_subset = pd.read_csv("data/latest_db/moalmanac_small_testing_dataset.csv")

#load synthetic query and ground-truth answer pair
synthetic_query_therapy_pair_dict=load_object("data/latest_db/synthetic_query_therapy_pair_dict.pkl")

In [65]:
# output_res = load_object(f"{root_dir}/output/RAG_res_gpt4o/structured_latest_db_prompts/RAGbatchstra0n1temp0.0_res_dict.pkl")
output_res = load_object(f"output/RAG_res_gpt4o/structured_latest_db_prompts_humantextctxv1/RAGbatchstra0n1temp0.0_res_dict.pkl")

### Check batch job status

In [9]:
from openai import OpenAI

load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
_CLIENT = OpenAI(api_key=api_key)
batch = _CLIENT.batches.retrieve("batch_68c05731f23c8190a037c9d60c6b07dc")
print(batch)

Batch(id='batch_68c05731f23c8190a037c9d60c6b07dc', completion_window='24h', created_at=1757435697, endpoint='/v1/chat/completions', input_file_id='file-VJaW3p7b1vTSERTSLueXCE', object='batch', status='failed', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=Errors(data=[BatchError(code='token_limit_exceeded', line=None, message='Enqueued token limit reached for gpt-4o-2024-05-13 in organization org-21HgjHxT7v4qV6YYhGGApRRN. Limit: 90,000 enqueued tokens. Please try again once some in_progress batches have been completed.', param=None)], object='list'), expired_at=None, expires_at=1757522097, failed_at=1757435700, finalizing_at=None, in_progress_at=None, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0), usage={'input_tokens': 0, 'output_tokens': 0, 'total_tokens': 0, 'input_tokens_details': {'cached_tokens': 0}, 'output_tokens_details': {'reasoning_tokens': 0}})


In [63]:
jobs = _CLIENT.batches.list()
for j in jobs.data:
    print(j.id, j.status)

batch_68c06d02ffc08190a4050d97558d64a0 completed
batch_68bf621f8a28819085f7bf43a9986802 completed


Calculate evaluation metrics

In [None]:
from utils.evaluation import calc_eval_metrics

output_eval = calc_eval_metrics(output_res['full output'][0], standardized_to_raw_mapping_subset.reset_index(drop=True)['prompt'], synthetic_query_therapy_pair_dict)
output_eval

Review exact match failed

In [None]:
exact_failed_ind = standardized_to_raw_mapping_subset.reset_index(drop=True)[~np.array(output_eval['exact_match_acc'])].index
print(exact_failed_ind)


Index([3, 19, 32, 34, 37, 38, 39, 40, 45, 49, 50, 55, 57, 58, 59, 61, 62], dtype='int64')

In [78]:
num_true_drugs_list = []
cancer_types = []
biomarkers = []
for i, row in standardized_to_raw_mapping_subset.reset_index(drop=True)[~np.array(output_eval['exact_match_acc'])].iterrows():
    print(row['prompt'])
    print(synthetic_query_answer_pair_dict[row['prompt']])
    num_true_drugs_list.append(len(synthetic_query_answer_pair_dict[row['prompt']]))
    print(len(synthetic_query_answer_pair_dict[row['prompt']]))
    print(output_res['input prompt'][0][i])
    print(output_res['full output'][0][i])
    cancer_types.append(row['standardized_cancer'])
    biomarkers.append(row['biomarker'])


if a patient with early invasive breast carcinoma cancer has her2-negative, er positive, what therapy is recommended?
[{'abemaciclib', 'tamoxifen'}, {'abemaciclib', 'letrozole'}, {'ribociclib', 'goserelin', 'anastrozole'}, {'ribociclib', 'goserelin', 'letrozole'}, {'abemaciclib', 'anastrozole'}, {'exemestane', 'abemaciclib'}]
6

        Context information is below.
        ---------------------
        ['If a patient with invasive breast carcinoma cancer has her2-positive, one recommended therapy is trastuzumab. therapy type: targeted therapy. therapy strategy: her2 inhibition. indication: herceptin is a her2/neu receptor antagonist indicated for the treatment of her2 -overexpressing breast cancer. select patients for therapy based on an fda-approved companion diagnostic for herceptin. approval url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2024/103792s5354lbl.pdf', 'If a patient with early-stage invasive breast carcinoma cancer has her2-positive, one recommended therapy is

In [57]:
print(num_true_drugs_list)
print(np.min(num_true_drugs_list))
print(np.mean(num_true_drugs_list))
print(np.median(num_true_drugs_list))
print(np.max(num_true_drugs_list))

[6, 12, 5, 5, 9, 2, 9, 12, 6, 10, 3, 12, 2, 9, 12, 9, 10]
2
7.823529411764706
9.0
12


In [79]:
Counter(cancer_types).most_common()

[('Invasive Breast Carcinoma', 8),
 ('Non-Small Cell Lung Cancer', 3),
 ('Acute Myeloid Leukemia', 2),
 ('Chronic Myelogenous Leukemia', 2),
 ('Prostate Adenocarcinoma', 1),
 ('Melanoma', 1)]

In [80]:
Counter(biomarkers).most_common()

[("['HER2-negative', 'ER positive']", 3),
 ("['PR positive', 'HER2-negative']", 2),
 ("['BCR::ABL1']", 2),
 ("['PR positive', 'HER2-negative', 'ER positive']", 2),
 ("['Wild type EGFR', 'Wild type ALK']", 2),
 ("['BRCA2 oncogenic variants']", 1),
 ("['EGFR Exon 19 (Deletion)']", 1),
 ("['HER2-positive']", 1),
 ("['IDH1 p.R132S']", 1),
 ("['IDH1 p.R132L']", 1),
 ("['BRAF p.V600E']", 1)]

Here is what I think is happening:
- in many of these exact match failed cases, there are many ground-truth drugs linked to their queries, so there's inherently high chance to miss some of the ground-truth drugs
- many of these cases are associated with breast, lung, and leukemia
- many of these cases are associated with HER2-negative, ER positive, PR positive status and BCR::ABL2 variant
- some retrieved chunks do not have either exactly matching specific cancer type (i.e., the modifier/disease stage) or the gene biomarker (i.e., missing one of the queried biomarkers or a similar but different gene)
- all these cases have been seen in our previous study, and can be handled by the rule-based retrieval approach (e.g., entity matching) 