In [17]:
import pandas as pd 
import numpy as np
import os, sys
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
while os.getcwd().split('/')[-1] != 'rag-llm-cancer-paper':
    os.chdir('..')

In [18]:
import ast
standardized_to_raw_mapping = pd.read_csv('external-validation/non-moa-database/civic/civic-db/civic_core__2025-09.csv')
standardized_to_raw_mapping = standardized_to_raw_mapping.rename(columns={'modified_standard_cancer':'modified_standardized_cancer'})
standardized_to_raw_mapping['biomarker']=standardized_to_raw_mapping['biomarker'].apply(ast.literal_eval)
standardized_to_raw_mapping['therapy']=standardized_to_raw_mapping['therapy'].apply(ast.literal_eval)
print(standardized_to_raw_mapping.shape)
standardized_to_raw_mapping.head()

(1569, 6)


Unnamed: 0,statement_id,standardized_cancer,raw_cancer,biomarker,therapy,modified_standardized_cancer
0,17,Non-Small Cell Lung Cancer,Lung Non-small Cell Carcinoma,[ARAF S214C],[Sorafenib],Non-Small Cell Lung Cancer
1,18,Acute Myeloid Leukemia,Acute Myeloid Leukemia,[DNMT3A R882],[Idarubicin],Acute Myeloid Leukemia
2,19,Polycythemia Vera,Polycythemia Vera,[JAK2 V617F],[Peginterferon Alfa-2b],Polycythemia Vera
3,20,Polycythemia Vera,Polycythemia Vera,[JAK2 V617F],[Fedratinib],Polycythemia Vera
4,21,Melanoma,Melanoma,[NRAS G13D],[Tanespimycin],Melanoma


In [19]:
def create_synthetic_query(db: dict) -> str:
    """
    Create a synthetic query based on the database entry.
    """
    if len(db['biomarker']) > 1:
        biomarker_str = ", ".join(db['biomarker'])
    else:
        biomarker_str = db['biomarker'][0]
    query_text=f"if a patient with {db['modified_standardized_cancer']} cancer has {biomarker_str.lower()}, what therapy is recommended?"
    return query_text

def create_synthetic_answer(db: dict) -> str:
    """
    Create a synthetic answer based on the database entry.
    """
    if len(db['biomarker']) > 1:
        biomarker_str = ", ".join(db['biomarker'])
    else:
        biomarker_str = db['biomarker'][0]
    if len(db['therapy']) > 1:
        therapy_str = " + ".join(db['therapy'])
    else:
        therapy_str = db['therapy'][0]
    answer_text=f"If a patient with {db['modified_standardized_cancer']} cancer has {biomarker_str.lower()}, one recommended therapy is {therapy_str.lower()}."
    return answer_text

synthetic_query_list = []
synthetic_answer_list = []
synthetic_query_therapy_pair_dict = {}
for _, row in standardized_to_raw_mapping.iterrows():
    #create synthetic query and answer and add to dataframe
    query = create_synthetic_query(row)
    answer = create_synthetic_answer(row)
    standardized_to_raw_mapping.at[_, 'prompt'] = query
    
    #add more therapy context to answers (context db)
    statements = pd.read_csv('external-validation/non-moa-database/civic/civic-db/civic-draft.dereferenced.unique.context_db.csv')
    statement_dict = dict(zip(statements['statement_id'], statements['context']))

    summary = statement_dict.get(row['statement_id'], "No statement available.")
    #summary, stmt_row = flatten_statements(fda_statements[_])
    # if len(stmt_row['therapy_type']) > 1:
    #     therapy_type = ' + '.join(stmt_row['therapy_type'])
    # else:
    # #     therapy_type = stmt_row['therapy_type'][0]
    # if len(stmt_row['therapy_strategy']) > 1:
    #     therapy_strategy = ' + '.join(stmt_row['therapy_strategy'])
    # else:
    #     therapy_strategy = stmt_row['therapy_strategy'][0]
        
    standardized_to_raw_mapping.at[_, 'answer'] = (
        #f"{answer} therapy type: {therapy_type.lower()}. therapy strategy: {therapy_strategy.lower()}. indication: {stmt_row['indication'].lower()} approval url: {stmt_row['approval_url']}"
        f"{answer} entry url: https://civicdb.org/links/evidence_items/{row['statement_id']}"
    )
    
    #create synthetic query and ground-truth mapping dict
    row['therapy'] = [therapy.lower().strip() for therapy in row['therapy']]
    if query in synthetic_query_therapy_pair_dict.keys():
        synthetic_query_therapy_pair_dict[query].append(set(row['therapy']))
    else:
        synthetic_query_therapy_pair_dict[query] = [set(row['therapy'])]

In [20]:
#split ranges for testing on small batches
import random, math
def split_ranges(max_int, num_ranges, samples=None, seed=42):
    random.seed(seed)
    step = math.ceil(max_int / num_ranges)
    ranges = [(i*step+1, min((i+1)*step, max_int)) for i in range(num_ranges)]
    
    if samples is None:
        return ranges
    
    # split samples evenly across ranges
    n_per_range = samples // num_ranges
    remainder = samples % num_ranges
    
    result = []
    for i, r in enumerate(ranges):
        count = n_per_range + (1 if i < remainder else 0)
        result.extend(random.randint(r[0], r[1]) for _ in range(count))
    
    return result
test_idx_list = split_ranges(standardized_to_raw_mapping.shape[0], 10, 50)

In [21]:
for i, e in enumerate(standardized_to_raw_mapping.loc[test_idx_list].prompt):
    print(i, e)
    if i == 10:
        break

0 if a patient with Non-Small Cell Lung Cancer cancer has eml4::alk fusion, alk l1196m, what therapy is recommended?
1 if a patient with Neuroblastoma cancer has alk f1174l, what therapy is recommended?
2 if a patient with Breast Cancer cancer has erbb2 l755w, what therapy is recommended?
3 if a patient with Non-Small Cell Lung Cancer cancer has egfr l858r, what therapy is recommended?
4 if a patient with Non-Small Cell Lung Cancer cancer has ddr2 l63v, what therapy is recommended?
5 if a patient with Head And Neck Squamous Cell Carcinoma cancer has mapk1 e322k, what therapy is recommended?
6 if a patient with Glioblastoma cancer has pten expression, what therapy is recommended?
7 if a patient with Lung Adenocarcinoma cancer has egfr l858r, what therapy is recommended?
8 if a patient with Melanoma cancer has erbb4 mutation, what therapy is recommended?
9 if a patient with Melanoma cancer has braf v600e, what therapy is recommended?
10 if a patient with nan cancer has bap1 mutation, wha

In [22]:
for i, e in enumerate(standardized_to_raw_mapping.loc[test_idx_list].answer):
    print(i, e)
    if i == 10:
        break

0 If a patient with Non-Small Cell Lung Cancer cancer has eml4::alk fusion, alk l1196m, one recommended therapy is alectinib. entry url: https://civicdb.org/links/evidence_items/141
1 If a patient with Neuroblastoma cancer has alk f1174l, one recommended therapy is alectinib. entry url: https://civicdb.org/links/evidence_items/37
2 If a patient with Breast Cancer cancer has erbb2 l755w, one recommended therapy is neratinib. entry url: https://civicdb.org/links/evidence_items/284
3 If a patient with Non-Small Cell Lung Cancer cancer has egfr l858r, one recommended therapy is gefitinib. entry url: https://civicdb.org/links/evidence_items/276
4 If a patient with Non-Small Cell Lung Cancer cancer has ddr2 l63v, one recommended therapy is dasatinib. entry url: https://civicdb.org/links/evidence_items/271
5 If a patient with Head And Neck Squamous Cell Carcinoma cancer has mapk1 e322k, one recommended therapy is erlotinib. entry url: https://civicdb.org/links/evidence_items/791
6 If a patien

### generate synthetic queries

In [21]:
from utils.io import load_object, save_object
save_object(synthetic_query_therapy_pair_dict, 
            "external-validation/non-moa-database/civic/civic-db/synthetic_query_therapy_pair_dict.pkl")

In [22]:
for k, v in synthetic_query_therapy_pair_dict.items():
    print(k, v)
    break

if a patient with Non-Small Cell Lung Cancer cancer has araf s214c, what therapy is recommended? [{'sorafenib'}, {'sorafenib'}, {'trametinib'}]


In [24]:
import json
with open(f"external-validation/non-moa-database/civic/civic-db/synthetic_prompts__2025-09.json", "w") as f:
    json.dump(standardized_to_raw_mapping.prompt.to_list(), f)

with open(f"external-validation/non-moa-database/civic/civic-db/synthetic_answers__2025-09.json", "w") as f:
    json.dump(standardized_to_raw_mapping.answer.to_list(), f)

In [25]:
_CONTEXT = standardized_to_raw_mapping.answer.to_list()
print(_CONTEXT[0])
print(len(_CONTEXT))

If a patient with Non-Small Cell Lung Cancer cancer has araf s214c, one recommended therapy is sorafenib. entry url: https://civicdb.org/links/evidence_items/17
1569


In [26]:
def _cache_paths(output_dir: str, embed_name: str, name: str, version: str = "v1"):
    os.makedirs(output_dir, exist_ok=True)
    return (
        f"{output_dir}/{embed_name}_{name}__{version}.faiss",
        f"{output_dir}/{embed_name}_{name}__{version}.json",
    )

In [36]:
from dotenv import load_dotenv
from openai import OpenAI
from utils.embedding import retrieve_context, get_text_embedding, store_embedding #get_context_db,
def get_context_db(context_chunks, CLIENT, model_embed):
    context_embeddings=np.array([get_text_embedding(chunk, CLIENT, model_embed) for chunk in context_chunks])
    index=store_embedding(context_embeddings)
    return(index)
import faiss
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
_CLIENT = OpenAI(api_key=api_key)
_MODEL_EMBED = "text-embedding-3-small"

In [37]:
# Always prefer cached index if present; only build once
force_rebuild=True
version = '2025-09'
index_path, ctx_path =_cache_paths("external-validation/non-moa-database/civic/civic-db/indexes", 
                                   _MODEL_EMBED, 
                                   "structured_context", version=version)
context_json_path=f"external-validation/non-moa-database/civic/civic-db/synthetic_answers__{version}.json"

def build_index(context_json_path, index_path, ctx_path):
    global _READY, _CLIENT, _CONTEXT, _INDEX, _MODEL_TYPE, _MODEL_NAME, _MODEL_EMBED
 
    load_dotenv()
    api_key = os.getenv("OPENAI_API_KEY")
    _CLIENT = OpenAI(api_key=api_key)
    _MODEL_EMBED = "text-embedding-3-small"
    
    with open(context_json_path, "r") as f:
            _CONTEXT = json.load(f)
    _INDEX = get_context_db(_CONTEXT, _CLIENT, _MODEL_EMBED)
    faiss.write_index(_INDEX, index_path)
    with open(ctx_path, "w") as f:
        json.dump(_CONTEXT, f)

In [38]:
index_path = 'external-validation/non-moa-database/civic/civic-db/index/civic_2025-09_structured_context.faiss'
ctx_path = 'external-validation/non-moa-database/civic/civic-db/index/civic_2025-09_structured_context.json'
context_json_path = f'external-validation/non-moa-database/civic/civic-db/synthetic_answers__{version}.json'
build_index(context_json_path, index_path, ctx_path)

### perform extraction

In [9]:
biobert_dir = "context_retriever/biobert_ner"
#download model from huggingface
from transformers import AutoTokenizer, AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained("judithrosell/BioBERT_BioNLP13CG_NER_new")
tokenizer = AutoTokenizer.from_pretrained("judithrosell/BioBERT_BioNLP13CG_NER_new")
id2label = model.config.id2label

#save locally
model.save_pretrained(biobert_dir)
tokenizer.save_pretrained(biobert_dir)

('context_retriever/biobert_ner/tokenizer_config.json',
 'context_retriever/biobert_ner/special_tokens_map.json',
 'context_retriever/biobert_ner/vocab.txt',
 'context_retriever/biobert_ner/added_tokens.json',
 'context_retriever/biobert_ner/tokenizer.json')

In [11]:
version = '2025-09'
import json
context_json_path = f'external-validation/non-moa-database/civic/civic-db/synthetic_answers__{version}.json'
with open(context_json_path, "r") as f:
    _CONTEXT = json.load(f)
#run BioBert NER on all contexts
from utils.entity_prediction import extract_entities, db_extract_entities
context_extracted_entities = []
for ctx in tqdm(_CONTEXT):
    context_extracted_entities.append(extract_entities(ctx, model, tokenizer))
context_extracted_entities

100%|██████████| 1569/1569 [00:58<00:00, 26.75it/s]


[{'cancer_type': ['non-small cell lung cancer cancer'],
  'biomarker': ['araf s214c']},
 {'cancer_type': ['acute myeloid leukemia cancer'], 'biomarker': ['dnmt3a']},
 {'cancer_type': ['polycythemia vera cancer'], 'biomarker': ['jak2']},
 {'cancer_type': ['polycythemia vera cancer'], 'biomarker': ['jak2']},
 {'cancer_type': ['melanoma cancer'], 'biomarker': ['nras g13d']},
 {'cancer_type': ['melanoma cancer'], 'biomarker': []},
 {'cancer_type': ['melanoma cancer'], 'biomarker': []},
 {'cancer_type': ['neuroblastoma cancer'], 'biomarker': []},
 {'cancer_type': ['neuroblastoma cancer'], 'biomarker': []},
 {'cancer_type': ['cancer cancer'], 'biomarker': ['alk']},
 {'cancer_type': ['non-small cell lung cancer cancer'],
  'biomarker': ['araf s214c']},
 {'cancer_type': ['non-small cell lung cancer cancer'],
  'biomarker': ['araf s214c']},
 {'cancer_type': ['melanoma cancer'], 'biomarker': ['l576p']},
 {'cancer_type': ['gastrointestinal stromal tumor cancer'],
  'biomarker': ['pdgfra']},
 {'ca

In [12]:
no_cancer_idx = []
no_biomarker_idx = []
for i, e in tqdm(enumerate(context_extracted_entities)):
    if not e['cancer_type']:
        no_cancer_idx.append(i)
    if not e['biomarker']:
        no_biomarker_idx.append(i)

print(f"{len(no_cancer_idx)*100/len(_CONTEXT):.3f}%") #no cancer entity extracted %
print(f"{len(no_biomarker_idx)*100/len(_CONTEXT):.3f}%") #no biomarker entity extracted %

1569it [00:00, 3470919.29it/s]

0.191%
14.978%





In [13]:
#save
os.makedirs("external-validation/non-moa-database/civic/civic-db/context_retriever/entities", exist_ok=True)
with open(f"external-validation/non-moa-database/civic/civic-db/context_retriever/entities/synthetic_query_ner_entities__{version}.json", "w") as f:
    json.dump(context_extracted_entities, f, indent=4)

### create validation test dataset

In [39]:
standardized_to_raw_mapping_subset=standardized_to_raw_mapping.sample(frac=.1, random_state=42)
standardized_to_raw_mapping_subset

Unnamed: 0,statement_id,standardized_cancer,raw_cancer,biomarker,therapy,modified_standardized_cancer,prompt,answer
1412,11139,Ovarian Cancer,Ovarian Cancer,[BRCA2 Mutation],[Rucaparib],Ovarian Cancer,if a patient with Ovarian Cancer cancer has br...,If a patient with Ovarian Cancer cancer has br...
985,4759,Non-Small Cell Lung Cancer,Lung Non-small Cell Carcinoma,[EGFR L858R],"[Gefitinib, Erlotinib]",Non-Small Cell Lung Cancer,if a patient with Non-Small Cell Lung Cancer c...,If a patient with Non-Small Cell Lung Cancer c...
432,1246,,Ewing Sarcoma Of Bone,[FGFR1 N546K],[Ponatinib],,"if a patient with nan cancer has fgfr1 n546k, ...","If a patient with nan cancer has fgfr1 n546k, ..."
964,4537,Chronic Myelogenous Leukemia,Chronic Myeloid Leukemia,"[BCR::ABL1 Fusion, ABL1 V299L]",[Axitinib],Chronic Myelogenous Leukemia,if a patient with Chronic Myelogenous Leukemia...,If a patient with Chronic Myelogenous Leukemia...
1169,7278,Acute Myeloid Leukemia,Acute Myeloid Leukemia,[IDH1 Mutation],[Ivosidenib],Acute Myeloid Leukemia,if a patient with Acute Myeloid Leukemia cance...,If a patient with Acute Myeloid Leukemia cance...
...,...,...,...,...,...,...,...,...
812,2480,Gastrointestinal Stromal Tumor,Gastrointestinal Stromal Tumor,[KIT EXPRESSION],[Imatinib],Gastrointestinal Stromal Tumor,if a patient with Gastrointestinal Stromal Tum...,If a patient with Gastrointestinal Stromal Tum...
303,991,Colorectal Cancer,Colorectal Cancer,[KRAS Mutation],"[Trametinib, Afatinib]",Colorectal Cancer,if a patient with Colorectal Cancer cancer has...,If a patient with Colorectal Cancer cancer has...
664,1694,,Bladder Carcinoma,[ERBB2 Mutation],[Platinum Compound],,if a patient with nan cancer has erbb2 mutatio...,If a patient with nan cancer has erbb2 mutatio...
538,1440,Lung Adenocarcinoma,Lung Adenocarcinoma,[RICTOR Amplification],"[Sapanisertib, Onatasertib]",Lung Adenocarcinoma,if a patient with Lung Adenocarcinoma cancer h...,If a patient with Lung Adenocarcinoma cancer h...


In [40]:
standardized_to_raw_mapping_subset.to_csv("external-validation/non-moa-database/civic/civic-db/civic_small_testing_dataset.csv", index=False)

In [41]:
standardized_to_raw_mapping_subset = pd.read_csv("external-validation/non-moa-database/civic/civic-db/civic_small_testing_dataset.csv")
standardized_to_raw_mapping_subset.head(1)

Unnamed: 0,statement_id,standardized_cancer,raw_cancer,biomarker,therapy,modified_standardized_cancer,prompt,answer
0,11139,Ovarian Cancer,Ovarian Cancer,['BRCA2 Mutation'],['Rucaparib'],Ovarian Cancer,if a patient with Ovarian Cancer cancer has br...,If a patient with Ovarian Cancer cancer has br...


In [42]:
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-4o")
query_size=[len(encoding.encode(prompt)) for prompt in standardized_to_raw_mapping_subset.prompt]
print("# chunks: "+str(len(query_size)))
print("Min: "+str(np.min(query_size)))
print("Max: "+str(np.max(query_size)))
print("Mean: "+str(np.mean(query_size)))
print("Median: "+str(np.median(query_size)))
np.sum(query_size)

# chunks: 157
Min: 16
Max: 31
Mean: 21.044585987261147
Median: 21.0


np.int64(3304)

In [43]:
query_size=[len(encoding.encode(prompt)) for prompt in standardized_to_raw_mapping_subset.answer]
print("# chunks: "+str(len(query_size)))
print("Min: "+str(np.min(query_size)))
print("Max: "+str(np.max(query_size)))
print("Mean: "+str(np.mean(query_size)))
print("Median: "+str(np.median(query_size)))

# chunks: 157
Min: 34
Max: 52
Mean: 42.82165605095541
Median: 43.0


### perform evaluation

In [15]:
standardized_to_raw_mapping_subset = pd.read_csv("external-validation/non-moa-database/civic/civic-db/civic_small_testing_dataset.csv")
standardized_to_raw_mapping_subset.head(1)

Unnamed: 0,statement_id,standardized_cancer,raw_cancer,biomarker,therapy,modified_standardized_cancer,prompt,answer
0,11139,Ovarian Cancer,Ovarian Cancer,['BRCA2 Mutation'],['Rucaparib'],Ovarian Cancer,if a patient with Ovarian Cancer cancer has br...,If a patient with Ovarian Cancer cancer has br...


In [26]:
# ================== GENERAL IMPORTS ==================
import os
import json
from dotenv import load_dotenv

# ================== UTIL FUNCTIONS ==================
from utils.embedding import retrieve_context, get_text_embedding, store_embedding #get_context_db,
from utils.prompt import get_prompt
from llm.run_RAGLLM import run_RAG

# Generate context vector database
def get_context_db(context_chunks, CLIENT, model_embed):
    context_embeddings=np.array([get_text_embedding(chunk, CLIENT, model_embed) for chunk in context_chunks])
    index=store_embedding(context_embeddings)
    return(index)


# ================== MODEL & API IMPORTS ==================
from mistralai.client import MistralClient
from openai import OpenAI
from llm.inference import run_llm
import faiss
synthetic_query_therapy_pair_dict=load_object("external-validation/non-moa-database/civic/civic-db/synthetic_query_therapy_pair_dict.pkl")

In [27]:
#setup env
import faiss
from dotenv import load_dotenv
load_dotenv()
_MODEL = 'gpt-4.1-nano' #this could be gpt-4o-2024-05-13, gpt-4o-mini-2024-07-18, etc.
_MODEL_EMBED = 'text-embedding-3-small'
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("Missing API key. Please set OPENAI_API_KEY in your .env file.")
CLIENT=OpenAI(api_key=api_key)

In [29]:
_VERSION = '2025-09-04'
version = '2025-09'
_QUERY_DF=pd.read_csv("external-validation/non-moa-database/civic/civic-db/civic_small_testing_dataset.csv",)
context_json_path=f"external-validation/non-moa-database/civic/civic-db/synthetic_answers__{version}.json"
with open(context_json_path, "r") as f:
     _CONTEXT = json.load(f)
index_path = 'external-validation/non-moa-database/civic/civic-db/index/civic_2025-09_structured_context.faiss'
_INDEX = faiss.read_index(index_path)
with open(f"context_retriever/entities/civic_db_context_ner_entities__civic-202509.json", "r") as f:
     _DB_ENTITY = json.load(f)
with open(f"external-validation/non-moa-database/civic/civic-db/context_retriever/entities/synthetic_query_ner_entities__{version}.json", "r") as f:
    _QUERY_ENTITY = json.load(f)


In [30]:
# Run RAG-LLM iterations
from llm.run_RAGLLM import run_iterations_rag
output_ls, input_ls, runtime_ls = run_iterations_rag(
    num_iterations=1,#args.num_iter, 
    data=_QUERY_DF, 
    context_chunks=_CONTEXT, 
    db_entity=_DB_ENTITY,
    query_entity=_QUERY_ENTITY, 
    num_vec=10, 
    index=_INDEX,
    client=CLIENT, 
    model=_MODEL, 
    model_embed=_MODEL_EMBED, 
    model_type='gpt',
    strategy=5,
    max_len=None, 
    temp=0.0, 
    random_seed=None
    )

# Save results
res_dict = {
    "full output": output_ls, 
    "input prompt": input_ls, 
    #"retrieval": retrieval_ls,
    "runtime": runtime_ls
    }

os.makedirs('external-validation/non-moa-database/civic/rag-llm/small-test', exist_ok=True)
output_dir = 'external-validation/non-moa-database/civic/rag-llm/small-test'

result_file=os.path.join(
    output_dir,
    f'RAGstra{str(5)}n{str(1)}temp{str(0)}_res_dict.pkl'
)
from utils.io import save_object
save_object(res_dict, filename=result_file)

Time elapsed for iteration 0: 5.0569 min


In [31]:
output_res = os.path.join(
    output_dir,
    f'RAGstra{str(5)}n{str(1)}temp{str(0)}_res_dict.pkl'
)
output_res = load_object(output_res)

In [32]:
from utils.evaluation import calc_eval_metrics

output_eval = calc_eval_metrics(output_res['full output'][0], 
                                standardized_to_raw_mapping_subset.reset_index(drop=True)['prompt'], 
                                synthetic_query_therapy_pair_dict)
output_eval

{'avg_exact_match_acc': 0.39490445859872614,
 'avg_partial_match_acc': 0.6687898089171974,
 'avg_precision': 0.6687898089171974,
 'avg_recall': 0.49282133201844913,
 'avg_f1': 0.5325042968355069,
 'avg_specificity': 1.0,
 'exact_match_acc': [False,
  False,
  True,
  False,
  False,
  True,
  True,
  False,
  True,
  True,
  False,
  True,
  False,
  False,
  False,
  False,
  False,
  True,
  True,
  False,
  False,
  True,
  False,
  False,
  True,
  False,
  False,
  False,
  True,
  True,
  False,
  False,
  True,
  True,
  False,
  False,
  False,
  False,
  False,
  True,
  True,
  True,
  True,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  False,
  True,
  False,
  True,
  True,
  True,
  True,
  False,
  False,
  False,
  False,
  False,
  False,
  True,
  False,
  False,
  False,
  False,
  False,
  False,
  True,
  False,
  False,
  False,
  True,
  True,
  True,
  False,
  True,
  True,
  True,
  False,
  False,
  False,
  False,
  False,

In [33]:
exact_failed_ind = standardized_to_raw_mapping_subset.reset_index(drop=True)[~np.array(output_eval['exact_match_acc'])].index
print(exact_failed_ind)


Index([  0,   1,   3,   4,   7,  10,  12,  13,  14,  15,  16,  19,  20,  22,
        23,  25,  26,  27,  30,  31,  34,  35,  36,  37,  38,  43,  44,  45,
        46,  47,  48,  49,  50,  51,  52,  54,  59,  60,  61,  62,  63,  64,
        66,  67,  68,  69,  70,  71,  73,  74,  75,  79,  83,  84,  85,  86,
        87,  89,  90,  91,  93,  98, 101, 103, 104, 106, 107, 108, 109, 110,
       112, 113, 114, 115, 116, 117, 120, 121, 122, 125, 126, 127, 131, 136,
       138, 140, 144, 147, 148, 150, 151, 153, 154, 155, 156],
      dtype='int64')


In [37]:
from utils.io import load_object, save_object
synthetic_query_therapy_pair_dict = load_object(
            "external-validation/non-moa-database/civic/civic-db/synthetic_query_therapy_pair_dict.pkl")
synthetic_query_answer_pair_dict = load_object(
            "external-validation/non-moa-database/civic/civic-db/synthetic_query_answer_pair_dict.pkl")

FileNotFoundError: [Errno 2] No such file or directory: 'external-validation/non-moa-database/civic/civic-db/synthetic_query_answer_pair_dict.pkl'

In [41]:
num_true_drugs_list = []
cancer_types = []
biomarkers = []
for i, row in standardized_to_raw_mapping_subset.reset_index(drop=True)[~np.array(output_eval['exact_match_acc'])].iterrows():
    print(row['prompt'])
    print(synthetic_query_therapy_pair_dict[row['prompt']])
    num_true_drugs_list.append(len(synthetic_query_therapy_pair_dict[row['prompt']]))
    print(len(synthetic_query_therapy_pair_dict[row['prompt']]))
    print(output_res['input prompt'][0][i])
    print(output_res['full output'][0][i])
    cancer_types.append(row['standardized_cancer'])
    biomarkers.append(row['biomarker'])

if a patient with Ovarian Cancer cancer has brca2 mutation, what therapy is recommended?
[{'cediranib', 'olaparib'}, {'olaparib'}, {'rucaparib'}, {'rucaparib'}, {'olaparib'}, {'rucaparib'}, {'rucaparib'}]
7

    Context information is below.
    ---------------------
    ['If a patient with Ovarian Cancer cancer has brca2 mutation, one recommended therapy is olaparib. entry url: https://civicdb.org/links/evidence_items/7276', 'If a patient with Ovarian Cancer cancer has brca2 mutation, one recommended therapy is olaparib. entry url: https://civicdb.org/links/evidence_items/1773', 'If a patient with Ovarian Cancer cancer has brca2 mutation, one recommended therapy is rucaparib. entry url: https://civicdb.org/links/evidence_items/11137', 'If a patient with Ovarian Cancer cancer has brca2 mutation, one recommended therapy is rucaparib. entry url: https://civicdb.org/links/evidence_items/11139', 'If a patient with Ovarian Cancer cancer has brca2 mutation, one recommended therapy is cediran

In [42]:
print(num_true_drugs_list)
print(np.min(num_true_drugs_list))
print(np.mean(num_true_drugs_list))
print(np.median(num_true_drugs_list))
print(np.max(num_true_drugs_list))

[7, 30, 3, 2, 4, 15, 1, 30, 4, 32, 22, 6, 5, 7, 2, 3, 1, 4, 1, 3, 1, 3, 1, 1, 9, 2, 4, 1, 1, 32, 1, 1, 5, 3, 1, 2, 3, 1, 2, 2, 1, 2, 11, 1, 3, 6, 2, 1, 1, 5, 2, 1, 3, 1, 2, 9, 4, 2, 9, 2, 4, 2, 6, 2, 5, 3, 2, 19, 8, 6, 5, 32, 1, 6, 3, 3, 1, 1, 1, 30, 15, 4, 1, 5, 16, 32, 1, 30, 32, 3, 2, 6, 1, 1, 1]
1
6.484210526315789
3.0
32


In [43]:
from collections import Counter
Counter(cancer_types).most_common()

[('Non-Small Cell Lung Cancer', 24),
 (nan, 24),
 ('Melanoma', 5),
 ('Lung Adenocarcinoma', 5),
 ('Ovarian Cancer', 4),
 ('Chronic Myelogenous Leukemia', 3),
 ('Acute Myeloid Leukemia', 3),
 ('Cancer', 3),
 ('Colorectal Cancer', 3),
 ('Gastrointestinal Stromal Tumor', 3),
 ('Neuroblastoma', 2),
 ('Solid Tumor', 2),
 ('Glioblastoma', 2),
 ('Pancreatic Cancer', 1),
 ('adrenal', 1),
 ('B-lymphoblastic Leukemia/lymphoma', 1),
 ('Prostate Cancer', 1),
 ('Rhabdomyosarcoma', 1),
 ('Breast Cancer', 1),
 ('Medulloblastoma', 1),
 ('Sarcoma', 1),
 ('Low-Grade Glioma, NOS', 1),
 ('Pancreatic Adenocarcinoma', 1),
 ('Endometrial Cancer', 1),
 ('head and neck', 1)]

In [44]:
Counter(biomarkers).most_common()

[("['EGFR L858R']", 7),
 ("['ERBB2 Amplification']", 6),
 ("['EGFR T790M']", 3),
 ("['BRCA1 Loss-of-function']", 3),
 ("['KRAS Mutation']", 3),
 ("['KIT Exon 11 Mutation']", 3),
 ("['BRCA2 Mutation']", 2),
 ("['BRCA1 Mutation']", 2),
 ("['PML::RARA Fusion']", 2),
 ("['TP53 Mutation']", 2),
 ("['BRAF V600E']", 2),
 ("['PTEN Loss']", 2),
 ("['v::ROS1 Fusion']", 2),
 ("['PTEN Expression']", 2),
 ("['EGFR Exon 19 Deletion']", 2),
 ("['FGFR1 Amplification']", 2),
 ("['BCR::ABL1 Fusion', 'ABL1 V299L']", 1),
 ("['IDH1 Mutation']", 1),
 ("['DDR2 S768R']", 1),
 ("['v::ALK Fusion']", 1),
 ("['ALK F1174L']", 1),
 ("['EML4::ALK Fusion', 'ALK L1196M']", 1),
 ("['PIK3CA Amplification']", 1),
 ("['EGFR R108K']", 1),
 ("['RRM1 Underexpression']", 1),
 ("['EGFR Rare Exon 18-21 Mutation']", 1),
 ("['FGFR3::TACC3 Fusion', 'FGFR2::CCDC6 Fusion']", 1),
 ("['EGFR Overexpression']", 1),
 ("['RB1 Mutation']", 1),
 ("['BRAF V600K']", 1),
 ("['ETV6::ABL1 Fusion']", 1),
 ("['KIAA1549::BRAF Fusion']", 1),
 ("['PD