In [1]:
import pandas as pd 
import numpy as np
import os, sys
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
while os.getcwd().split('/')[-1] != 'rag-llm-cancer-paper':
    os.chdir('..')

In [2]:
import ast
standardized_to_raw_mapping = pd.read_csv('external-validation/non-moa-database/oncokb/oncokb-db/oncokb_core__2025-09.csv')
standardized_to_raw_mapping = standardized_to_raw_mapping.rename(columns={'modified_standard_cancer':'modified_standardized_cancer'})
standardized_to_raw_mapping['biomarker']=standardized_to_raw_mapping['biomarker'].apply(ast.literal_eval)
standardized_to_raw_mapping['therapy']=standardized_to_raw_mapping['therapy'].apply(ast.literal_eval)
print(standardized_to_raw_mapping.shape)
standardized_to_raw_mapping.head()

(625, 6)


Unnamed: 0,statement_id,standardized_cancer,raw_cancer,biomarker,therapy,modified_standardized_cancer
0,0,B-Lymphoblastic Leukemia/Lymphoma,B-Lymphoblastic Leukemia/Lymphoma,[T315I],[Ponatinib],B-Lymphoblastic Leukemia/Lymphoma
1,1,Chronic Myelogenous Leukemia,Chronic Myelogenous Leukemia,[T315I],[Ponatinib],Chronic Myelogenous Leukemia
2,2,Chronic Myelogenous Leukemia,Chronic Myelogenous Leukemia,[T315I],[Asciminib],Chronic Myelogenous Leukemia
3,3,Breast Cancer,Breast Cancer,[E17K],"[Capivasertib,Fulvestrant]",Breast Cancer
4,4,Melanoma,Melanoma,[V600E],[Dabrafenib],Melanoma


In [3]:
def create_synthetic_query(db: dict) -> str:
    """
    Create a synthetic query based on the database entry.
    """
    if len(db['biomarker']) > 1:
        biomarker_str = ", ".join(db['biomarker'])
    else:
        biomarker_str = db['biomarker'][0]
    query_text=f"if a patient with {db['modified_standardized_cancer']} cancer has {biomarker_str.lower()}, what therapy is recommended?"
    return query_text

def create_synthetic_answer(db: dict) -> str:
    """
    Create a synthetic answer based on the database entry.
    """
    if len(db['biomarker']) > 1:
        biomarker_str = ", ".join(db['biomarker'])
    else:
        biomarker_str = db['biomarker'][0]
    if len(db['therapy']) > 1:
        therapy_str = " + ".join(db['therapy'])
    else:
        therapy_str = db['therapy'][0]
    answer_text=f"If a patient with {db['modified_standardized_cancer']} cancer has {biomarker_str.lower()}, one recommended therapy is {therapy_str.lower()}."
    return answer_text

synthetic_query_list = []
synthetic_answer_list = []
synthetic_query_therapy_pair_dict = {}
for _, row in standardized_to_raw_mapping.iterrows():
    #create synthetic query and answer and add to dataframe
    query = create_synthetic_query(row)
    answer = create_synthetic_answer(row)
    standardized_to_raw_mapping.at[_, 'prompt'] = query
    
    #add more therapy context to answers (context db)
    statements = pd.read_csv('external-validation/non-moa-database/civic/civic-db/civic-draft.dereferenced.unique.context_db.csv')
    statement_dict = dict(zip(statements['statement_id'], statements['context']))

    summary = statement_dict.get(row['statement_id'], "No statement available.")
    #summary, stmt_row = flatten_statements(fda_statements[_])
    # if len(stmt_row['therapy_type']) > 1:
    #     therapy_type = ' + '.join(stmt_row['therapy_type'])
    # else:
    # #     therapy_type = stmt_row['therapy_type'][0]
    # if len(stmt_row['therapy_strategy']) > 1:
    #     therapy_strategy = ' + '.join(stmt_row['therapy_strategy'])
    # else:
    #     therapy_strategy = stmt_row['therapy_strategy'][0]
        
    standardized_to_raw_mapping.at[_, 'answer'] = (
        #f"{answer} therapy type: {therapy_type.lower()}. therapy strategy: {therapy_strategy.lower()}. indication: {stmt_row['indication'].lower()} approval url: {stmt_row['approval_url']}"
        f"{answer}"
    )
    
    #create synthetic query and ground-truth mapping dict
    row['therapy'] = [therapy.lower().strip() for therapy in row['therapy']]
    if query in synthetic_query_therapy_pair_dict.keys():
        synthetic_query_therapy_pair_dict[query].append(set(row['therapy']))
    else:
        synthetic_query_therapy_pair_dict[query] = [set(row['therapy'])]

In [4]:
#split ranges for testing on small batches
import random, math
def split_ranges(max_int, num_ranges, samples=None, seed=42):
    random.seed(seed)
    step = math.ceil(max_int / num_ranges)
    ranges = [(i*step+1, min((i+1)*step, max_int)) for i in range(num_ranges)]
    
    if samples is None:
        return ranges
    
    # split samples evenly across ranges
    n_per_range = samples // num_ranges
    remainder = samples % num_ranges
    
    result = []
    for i, r in enumerate(ranges):
        count = n_per_range + (1 if i < remainder else 0)
        result.extend(random.randint(r[0], r[1]) for _ in range(count))
    
    return result
test_idx_list = split_ranges(standardized_to_raw_mapping.shape[0], 10, 50)

In [5]:
for i, e in enumerate(standardized_to_raw_mapping.loc[test_idx_list].prompt):
    print(i, e)
    if i == 10:
        break

0 if a patient with Breast Cancer cancer has e380,s463p,y537,l536,l469v,d538, what therapy is recommended?
1 if a patient with Biliary Tract cancer has v600e, what therapy is recommended?
2 if a patient with Chronic Myelogenous Leukemia cancer has t315i, what therapy is recommended?
3 if a patient with Oligodendroglioma cancer has r132s,r132l,r132c,r132h,r132g, what therapy is recommended?
4 if a patient with Melanoma cancer has v600e,v600k, what therapy is recommended?
5 if a patient with Pilocytic Astrocytoma cancer has v600e, what therapy is recommended?
6 if a patient with Non-Small Cell Lung Cancer cancer has g1202r, what therapy is recommended?
7 if a patient with Chronic Myelogenous Leukemia cancer has v299l,f317l,f317i,f317c,t315a,f317v, what therapy is recommended?
8 if a patient with Non-Small Cell Lung Cancer cancer has oncogenic mutations, what therapy is recommended?
9 if a patient with Breast Cancer cancer has c420r,e542k,e545a,e545d,e545g,e545k,q546e,q546r,h1047l,h1047r,

In [6]:
for i, e in enumerate(standardized_to_raw_mapping.loc[test_idx_list].answer):
    print(i, e)
    if i == 10:
        break

0 If a patient with Breast Cancer cancer has e380,s463p,y537,l536,l469v,d538, one recommended therapy is elacestrant.
1 If a patient with Biliary Tract cancer has v600e, one recommended therapy is dabrafenib,trametinib.
2 If a patient with Chronic Myelogenous Leukemia cancer has t315i, one recommended therapy is asciminib.
3 If a patient with Oligodendroglioma cancer has r132s,r132l,r132c,r132h,r132g, one recommended therapy is vorasidenib.
4 If a patient with Melanoma cancer has v600e,v600k, one recommended therapy is trametinib.
5 If a patient with Pilocytic Astrocytoma cancer has v600e, one recommended therapy is selumetinib.
6 If a patient with Non-Small Cell Lung Cancer cancer has g1202r, one recommended therapy is lorlatinib.
7 If a patient with Chronic Myelogenous Leukemia cancer has v299l,f317l,f317i,f317c,t315a,f317v, one recommended therapy is nilotinib.
8 If a patient with Non-Small Cell Lung Cancer cancer has oncogenic mutations, one recommended therapy is ado-trastuzumab e

### generate synthetic queries

In [8]:
from utils.io import load_object, save_object
save_object(synthetic_query_therapy_pair_dict, 
            "external-validation/non-moa-database/oncokb/oncokb-db/synthetic_query_therapy_pair_dict.pkl")

In [9]:
for k, v in synthetic_query_therapy_pair_dict.items():
    print(k, v)
    break

if a patient with B-Lymphoblastic Leukemia/Lymphoma cancer has t315i, what therapy is recommended? [{'ponatinib'}, {'imatinib'}, {'dasatinib'}, {'nilotinib'}, {'bosutinib'}]


In [10]:
import json
with open(f"external-validation/non-moa-database/oncokb/oncokb-db/synthetic_prompts__2025-09.json", "w") as f:
    json.dump(standardized_to_raw_mapping.prompt.to_list(), f)

with open(f"external-validation/non-moa-database/oncokb/oncokb-db/synthetic_answers__2025-09.json", "w") as f:
    json.dump(standardized_to_raw_mapping.answer.to_list(), f)

In [11]:
_CONTEXT = standardized_to_raw_mapping.answer.to_list()
print(_CONTEXT[0])
print(len(_CONTEXT))

If a patient with B-Lymphoblastic Leukemia/Lymphoma cancer has t315i, one recommended therapy is ponatinib.
625


In [12]:
def _cache_paths(output_dir: str, embed_name: str, name: str, version: str = "v1"):
    os.makedirs(output_dir, exist_ok=True)
    return (
        f"{output_dir}/{embed_name}_{name}__{version}.faiss",
        f"{output_dir}/{embed_name}_{name}__{version}.json",
    )

In [13]:
from dotenv import load_dotenv
from openai import OpenAI
from utils.embedding import retrieve_context, get_text_embedding, store_embedding #get_context_db,
def get_context_db(context_chunks, CLIENT, model_embed):
    context_embeddings=np.array([get_text_embedding(chunk, CLIENT, model_embed) for chunk in context_chunks])
    index=store_embedding(context_embeddings)
    return(index)
import faiss
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
_CLIENT = OpenAI(api_key=api_key)
_MODEL_EMBED = "text-embedding-3-small"

In [14]:
# Always prefer cached index if present; only build once
force_rebuild=True
version = '2025-09'
index_path, ctx_path =_cache_paths("external-validation/non-moa-database/oncokb/oncokb-db/indexes", 
                                   _MODEL_EMBED, 
                                   "structured_context", version=version)
context_json_path=f"external-validation/non-moa-database/oncokb/oncokb-db/synthetic_answers__{version}.json"

def build_index(context_json_path, index_path, ctx_path):
    global _READY, _CLIENT, _CONTEXT, _INDEX, _MODEL_TYPE, _MODEL_NAME, _MODEL_EMBED
 
    load_dotenv()
    api_key = os.getenv("OPENAI_API_KEY")
    _CLIENT = OpenAI(api_key=api_key)
    _MODEL_EMBED = "text-embedding-3-small"
    
    with open(context_json_path, "r") as f:
            _CONTEXT = json.load(f)
    _INDEX = get_context_db(_CONTEXT, _CLIENT, _MODEL_EMBED)
    faiss.write_index(_INDEX, index_path)
    with open(ctx_path, "w") as f:
        json.dump(_CONTEXT, f)

In [15]:
index_path = 'external-validation/non-moa-database/oncokb/oncokb-db/index/oncokb_2025-09_structured_context.faiss'
ctx_path = 'external-validation/non-moa-database/oncokb/oncokb-db/index/oncokb_2025-09_structured_context.json'
context_json_path = f'external-validation/non-moa-database/oncokb/oncokb-db/synthetic_answers__{version}.json'
build_index(context_json_path, index_path, ctx_path)

### perform extraction

In [7]:
biobert_dir = "context_retriever/biobert_ner"
#download model from huggingface
from transformers import AutoTokenizer, AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained("judithrosell/BioBERT_BioNLP13CG_NER_new")
tokenizer = AutoTokenizer.from_pretrained("judithrosell/BioBERT_BioNLP13CG_NER_new")
id2label = model.config.id2label

#save locally
model.save_pretrained(biobert_dir)
tokenizer.save_pretrained(biobert_dir)

('context_retriever/biobert_ner/tokenizer_config.json',
 'context_retriever/biobert_ner/special_tokens_map.json',
 'context_retriever/biobert_ner/vocab.txt',
 'context_retriever/biobert_ner/added_tokens.json',
 'context_retriever/biobert_ner/tokenizer.json')

In [8]:
version = '2025-09'
import json
context_json_path = f'external-validation/non-moa-database/oncokb/oncokb-db/synthetic_answers__{version}.json'
with open(context_json_path, "r") as f:
    _CONTEXT = json.load(f)
#run BioBert NER on all contexts
from utils.entity_prediction import extract_entities, db_extract_entities
context_extracted_entities = []
for ctx in tqdm(_CONTEXT):
    context_extracted_entities.append(extract_entities(ctx, model, tokenizer))
context_extracted_entities

100%|██████████| 625/625 [00:30<00:00, 20.75it/s]


[{'cancer_type': ['b-lymphoblastic leukemia', 'lymphoma cancer'],
  'biomarker': []},
 {'cancer_type': ['chronic myelogenous leukemia cancer'], 'biomarker': []},
 {'cancer_type': ['chronic myelogenous leukemia cancer'], 'biomarker': []},
 {'cancer_type': ['breast cancer cancer'], 'biomarker': []},
 {'cancer_type': ['melanoma cancer'], 'biomarker': []},
 {'cancer_type': ['all solid tumors cancer'], 'biomarker': []},
 {'cancer_type': ['anaplastic thyroid cancer cancer'], 'biomarker': []},
 {'cancer_type': ['hepatobiliary cancer cancer'], 'biomarker': []},
 {'cancer_type': ['biliary tract cancer'], 'biomarker': []},
 {'cancer_type': ['nos cancer', 'low-grade glioma'], 'biomarker': []},
 {'cancer_type': ['melanoma cancer'], 'biomarker': []},
 {'cancer_type': ['non-small cell lung cancer cancer'], 'biomarker': []},
 {'cancer_type': ['colorectal cancer cancer'], 'biomarker': []},
 {'cancer_type': ['colorectal cancer cancer'], 'biomarker': []},
 {'cancer_type': ['nos cancer', 'low-grade gliom

In [18]:
no_cancer_idx = []
no_biomarker_idx = []
for i, e in tqdm(enumerate(context_extracted_entities)):
    if not e['cancer_type']:
        no_cancer_idx.append(i)
    if not e['biomarker']:
        no_biomarker_idx.append(i)

print(f"{len(no_cancer_idx)*100/len(_CONTEXT):.3f}%") #no cancer entity extracted %
print(f"{len(no_biomarker_idx)*100/len(_CONTEXT):.3f}%") #no biomarker entity extracted %

625it [00:00, 1315983.94it/s]

0.000%
67.520%





In [10]:
#save
os.makedirs("external-validation/non-moa-database/oncokb/oncokb-db/context_retriever/entities", exist_ok=True)
with open(f"external-validation/non-moa-database/oncokb/oncokb-db/context_retriever/entities/synthetic_query_ner_entities__{version}.json", "w") as f:
    json.dump(context_extracted_entities, f, indent=4)

### create validation test dataset

In [11]:
standardized_to_raw_mapping_subset=standardized_to_raw_mapping.sample(frac=.1, random_state=42)
standardized_to_raw_mapping_subset

Unnamed: 0,statement_id,standardized_cancer,raw_cancer,biomarker,therapy,modified_standardized_cancer,prompt,answer
447,447,Prostate Cancer,Prostate Cancer,[Oncogenic Mutations],[Rucaparib],Prostate Cancer,if a patient with Prostate Cancer cancer has o...,If a patient with Prostate Cancer cancer has o...
485,485,"Prostate Cancer, NOS","Prostate Cancer, NOS",[Oncogenic Mutations],[Olaparib],"Prostate Cancer, NOS","if a patient with Prostate Cancer, NOS cancer ...","If a patient with Prostate Cancer, NOS cancer ..."
215,215,All Solid Tumors,All Solid Tumors,"[G469A,G469R,G469V,G464]",[Plixorafenib],All Solid Tumors,if a patient with All Solid Tumors cancer has ...,If a patient with All Solid Tumors cancer has ...
212,212,Non-Small Cell Lung Cancer,Non-Small Cell Lung Cancer,[G2032R],[Taletrectinib],Non-Small Cell Lung Cancer,if a patient with Non-Small Cell Lung Cancer c...,If a patient with Non-Small Cell Lung Cancer c...
480,480,Prostate Cancer,Prostate Cancer,[Oncogenic Mutations],[Olaparib],Prostate Cancer,if a patient with Prostate Cancer cancer has o...,If a patient with Prostate Cancer cancer has o...
...,...,...,...,...,...,...,...,...
434,434,"Prostate Cancer, NOS","Prostate Cancer, NOS",[Oncogenic Mutations],"[Talazoparib,Enzalutamide]","Prostate Cancer, NOS","if a patient with Prostate Cancer, NOS cancer ...","If a patient with Prostate Cancer, NOS cancer ..."
210,210,All Solid Tumors,All Solid Tumors,[G595R],[Selitrectinib],All Solid Tumors,if a patient with All Solid Tumors cancer has ...,If a patient with All Solid Tumors cancer has ...
30,30,Non-Small Cell Lung Cancer,Non-Small Cell Lung Cancer,"[Exon 19 in-frame deletions,L858R]",[Osimertinib],Non-Small Cell Lung Cancer,if a patient with Non-Small Cell Lung Cancer c...,If a patient with Non-Small Cell Lung Cancer c...
579,579,Hepatobiliary Cancer,Hepatobiliary Cancer,[Amplification],[Brigimadlin],Hepatobiliary Cancer,if a patient with Hepatobiliary Cancer cancer ...,If a patient with Hepatobiliary Cancer cancer ...


In [12]:
standardized_to_raw_mapping_subset.to_csv("external-validation/non-moa-database/oncokb/oncokb-db/oncokb_small_testing_dataset.csv", index=False)

In [13]:
standardized_to_raw_mapping_subset = pd.read_csv("external-validation/non-moa-database/oncokb/oncokb-db/oncokb_small_testing_dataset.csv")
standardized_to_raw_mapping_subset.head(1)

Unnamed: 0,statement_id,standardized_cancer,raw_cancer,biomarker,therapy,modified_standardized_cancer,prompt,answer
0,447,Prostate Cancer,Prostate Cancer,['Oncogenic Mutations'],['Rucaparib'],Prostate Cancer,if a patient with Prostate Cancer cancer has o...,If a patient with Prostate Cancer cancer has o...


In [14]:
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-4o")
query_size=[len(encoding.encode(prompt)) for prompt in standardized_to_raw_mapping_subset.prompt]
print("# chunks: "+str(len(query_size)))
print("Min: "+str(np.min(query_size)))
print("Max: "+str(np.max(query_size)))
print("Mean: "+str(np.mean(query_size)))
print("Median: "+str(np.median(query_size)))
np.sum(query_size)

# chunks: 62
Min: 17
Max: 114
Mean: 25.79032258064516
Median: 21.0


np.int64(1599)

In [15]:
query_size=[len(encoding.encode(prompt)) for prompt in standardized_to_raw_mapping_subset.answer]
print("# chunks: "+str(len(query_size)))
print("Min: "+str(np.min(query_size)))
print("Max: "+str(np.max(query_size)))
print("Mean: "+str(np.mean(query_size)))
print("Median: "+str(np.median(query_size)))

# chunks: 62
Min: 21
Max: 119
Mean: 30.306451612903224
Median: 27.0


### perform evaluation

In [16]:
standardized_to_raw_mapping_subset = pd.read_csv("external-validation/non-moa-database/oncokb/oncokb-db/oncokb_small_testing_dataset.csv")
standardized_to_raw_mapping_subset.head(1)

Unnamed: 0,statement_id,standardized_cancer,raw_cancer,biomarker,therapy,modified_standardized_cancer,prompt,answer
0,447,Prostate Cancer,Prostate Cancer,['Oncogenic Mutations'],['Rucaparib'],Prostate Cancer,if a patient with Prostate Cancer cancer has o...,If a patient with Prostate Cancer cancer has o...


In [19]:
# ================== GENERAL IMPORTS ==================
import os
import json
from dotenv import load_dotenv

# ================== UTIL FUNCTIONS ==================
from utils.embedding import retrieve_context, get_text_embedding, store_embedding #get_context_db,
from utils.prompt import get_prompt
from llm.run_RAGLLM import run_RAG

# Generate context vector database
def get_context_db(context_chunks, CLIENT, model_embed):
    context_embeddings=np.array([get_text_embedding(chunk, CLIENT, model_embed) for chunk in context_chunks])
    index=store_embedding(context_embeddings)
    return(index)
from utils.io import load_object, save_object


# ================== MODEL & API IMPORTS ==================
from mistralai.client import MistralClient
from openai import OpenAI
from llm.inference import run_llm
import faiss
synthetic_query_therapy_pair_dict=load_object("external-validation/non-moa-database/oncokb/oncokb-db/synthetic_query_therapy_pair_dict.pkl")

In [20]:
#setup env
import faiss
from dotenv import load_dotenv
load_dotenv()
_MODEL = 'gpt-4.1-nano' #this could be gpt-4o-2024-05-13, gpt-4o-mini-2024-07-18, etc.
_MODEL_EMBED = 'text-embedding-3-small'
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("Missing API key. Please set OPENAI_API_KEY in your .env file.")
CLIENT=OpenAI(api_key=api_key)

In [21]:
_VERSION = '2025-09-04'
version = '2025-09'
_QUERY_DF=pd.read_csv("external-validation/non-moa-database/oncokb/oncokb-db/oncokb_small_testing_dataset.csv",)
context_json_path=f"external-validation/non-moa-database/oncokb/oncokb-db/synthetic_answers__{version}.json"
with open(context_json_path, "r") as f:
     _CONTEXT = json.load(f)
index_path = 'external-validation/non-moa-database/oncokb/oncokb-db/index/oncokb_2025-09_structured_context.faiss'
_INDEX = faiss.read_index(index_path)
with open(f"context_retriever/entities/oncokb_db_context_ner_entities__oncokb-202509.json", "r") as f:
     _DB_ENTITY = json.load(f)
with open(f"external-validation/non-moa-database/oncokb/oncokb-db/context_retriever/entities/synthetic_query_ner_entities__{version}.json", "r") as f:
    _QUERY_ENTITY = json.load(f)


In [22]:
# Run RAG-LLM iterations
from llm.run_RAGLLM import run_iterations_rag
output_ls, input_ls, runtime_ls = run_iterations_rag(
    num_iterations=1,#args.num_iter, 
    data=_QUERY_DF, 
    context_chunks=_CONTEXT, 
    db_entity=_DB_ENTITY,
    query_entity=_QUERY_ENTITY, 
    num_vec=10, 
    index=_INDEX,
    client=CLIENT, 
    model=_MODEL, 
    model_embed=_MODEL_EMBED, 
    model_type='gpt',
    strategy=5,
    max_len=None, 
    temp=0.0, 
    random_seed=None
    )

# Save results
res_dict = {
    "full output": output_ls, 
    "input prompt": input_ls, 
    #"retrieval": retrieval_ls,
    "runtime": runtime_ls
    }

os.makedirs('external-validation/non-moa-database/oncokb/rag-llm/small-test', exist_ok=True)
output_dir = 'external-validation/non-moa-database/oncokb/rag-llm/small-test'

result_file=os.path.join(
    output_dir,
    f'RAGstra{str(5)}n{str(1)}temp{str(0)}_res_dict.pkl'
)
from utils.io import save_object
save_object(res_dict, filename=result_file)

Time elapsed for iteration 0: 2.7345 min


In [23]:
output_res = os.path.join(
    output_dir,
    f'RAGstra{str(5)}n{str(1)}temp{str(0)}_res_dict.pkl'
)
output_res = load_object(output_res)

In [24]:
from utils.evaluation import calc_eval_metrics

output_eval = calc_eval_metrics(output_res['full output'][0], 
                                standardized_to_raw_mapping_subset.reset_index(drop=True)['prompt'], 
                                synthetic_query_therapy_pair_dict)
output_eval

{'avg_exact_match_acc': 0.3387096774193548,
 'avg_partial_match_acc': 0.8387096774193549,
 'avg_precision': 0.8360215053763441,
 'avg_recall': 0.4759056579621095,
 'avg_f1': 0.540991235203759,
 'avg_specificity': 0.9990092973561309,
 'exact_match_acc': [False,
  False,
  True,
  False,
  False,
  False,
  True,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  True,
  False,
  False,
  False,
  True,
  False,
  False,
  True,
  True,
  False,
  False,
  False,
  False,
  True,
  True,
  True,
  True,
  False,
  False,
  True,
  False,
  False,
  True,
  True,
  True,
  False,
  True,
  False,
  False,
  False,
  True,
  True,
  False,
  False,
  True,
  False,
  False,
  False,
  True,
  True,
  False,
  True,
  False,
  False,
  False,
  False,
  False],
 'partial_match_acc': [True,
  False,
  True,
  True,
  True,
  True,
  True,
  True,
  False,
  True,
  True,
  True,
  True,
  True,
  True,
  True,
  True,
  True,
  True,
  True,
  False,
  True,
  True,
  

In [25]:
exact_failed_ind = standardized_to_raw_mapping_subset.reset_index(drop=True)[~np.array(output_eval['exact_match_acc'])].index
print(exact_failed_ind)


Index([ 0,  1,  3,  4,  5,  7,  8,  9, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21,
       24, 25, 26, 27, 32, 33, 35, 36, 40, 42, 43, 44, 47, 48, 50, 51, 52, 55,
       57, 58, 59, 60, 61],
      dtype='int64')


In [26]:
from utils.io import load_object, save_object
synthetic_query_therapy_pair_dict = load_object(
            "external-validation/non-moa-database/oncokb/oncokb-db/synthetic_query_therapy_pair_dict.pkl")

In [27]:
num_true_drugs_list = []
cancer_types = []
biomarkers = []
for i, row in standardized_to_raw_mapping_subset.reset_index(drop=True)[~np.array(output_eval['exact_match_acc'])].iterrows():
    print(row['prompt'])
    print(synthetic_query_therapy_pair_dict[row['prompt']])
    num_true_drugs_list.append(len(synthetic_query_therapy_pair_dict[row['prompt']]))
    print(len(synthetic_query_therapy_pair_dict[row['prompt']]))
    print(output_res['input prompt'][0][i])
    print(output_res['full output'][0][i])
    cancer_types.append(row['standardized_cancer'])
    biomarkers.append(row['biomarker'])

if a patient with Prostate Cancer cancer has oncogenic mutations, what therapy is recommended?
[{'olaparib'}, {'talazoparib,enzalutamide'}, {'talazoparib,enzalutamide'}, {'olaparib'}, {'olaparib'}, {'rucaparib'}, {'olaparib,abiraterone,prednisone'}, {'talazoparib,enzalutamide'}, {'niraparib,abiraterone acetate,prednisone'}, {'olaparib'}, {'rucaparib'}, {'olaparib,abiraterone,prednisone'}, {'talazoparib,enzalutamide'}, {'niraparib,abiraterone acetate,prednisone'}, {'olaparib'}, {'olaparib'}, {'talazoparib,enzalutamide'}, {'olaparib'}, {'olaparib'}, {'talazoparib,enzalutamide'}, {'talazoparib,enzalutamide'}, {'olaparib'}, {'talazoparib,enzalutamide'}, {'talazoparib,enzalutamide'}, {'talazoparib,enzalutamide'}, {'olaparib'}, {'talazoparib,enzalutamide'}, {'olaparib'}, {'olaparib'}, {'talazoparib,enzalutamide'}, {'olaparib'}]
31

    Context information is below.
    ---------------------
    ['If a patient with Prostate Cancer cancer has oncogenic mutations, one recommended therapy is tal

In [28]:
print(num_true_drugs_list)
print(np.min(num_true_drugs_list))
print(np.mean(num_true_drugs_list))
print(np.median(num_true_drugs_list))
print(np.max(num_true_drugs_list))

[31, 31, 3, 31, 31, 2, 31, 4, 4, 9, 12, 4, 5, 6, 5, 3, 4, 4, 31, 7, 2, 2, 7, 7, 31, 19, 5, 1, 6, 2, 31, 6, 7, 6, 4, 2, 31, 4, 9, 5, 31]
1
11.609756097560975
6.0
31


In [29]:
from collections import Counter
Counter(cancer_types).most_common()

[('Non-Small Cell Lung Cancer', 12),
 ('Prostate Cancer', 6),
 ('Prostate Cancer, NOS', 4),
 ('Gastrointestinal Stromal Tumor', 3),
 ('Chronic Myelogenous Leukemia', 2),
 ('Colorectal Cancer', 2),
 ('All Solid Tumors', 2),
 ('Small Bowel Cancer', 1),
 ('Langerhans Cell Histiocytosis', 1),
 ('Pancreatic Adenocarcinoma', 1),
 ('Myeloid/Lymphoid Neoplasms with Eosinophilia and Rearrangement of  PDGFRA/PDGFRB or FGFR1 or with PCM1-JAK2',
  1),
 ('B-Lymphoblastic Leukemia/Lymphoma', 1),
 ('Endometrial Cancer', 1),
 ('Inflammatory Myofibroblastic Tumor', 1),
 ('Ganglioglioma', 1),
 ('Melanoma', 1),
 ('Hepatobiliary Cancer', 1)]

In [30]:
Counter(biomarkers).most_common()

[("['Oncogenic Mutations']", 13),
 ("['Amplification']", 4),
 ("['Fusions']", 3),
 ("['D402N,L474P,S478N,E318K']", 2),
 ("['Exon 19 in-frame deletions,L858R']", 2),
 ("['L861Q']", 2),
 ("['G1202R']", 2),
 ("['G2032R']", 1),
 ("['L1196M']", 1),
 ("['T790M']", 1),
 ("['Oncogenic Mutations (excluding V600)']", 1),
 ("['T315I']", 1),
 ("['BCR-ABL1 Fusion']", 1),
 ("['G12C']", 1),
 ("['V299L,F317L,F317I,F317C,T315A,F317V']", 1),
 ("['V600E']", 1),
 ("['D816']", 1),
 ("['V600E,V600K']", 1),
 ("['Y823D']", 1),
 ("['G595R']", 1)]