In [1]:
import pandas as pd 
import numpy as np
import os, sys
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
# Walk up the directory tree until the working directory is the repository
# root, so the relative paths used by the following cells resolve from there.
_REPO_ROOT_NAME = 'rag-llm-cancer-paper'
while os.path.basename(os.getcwd()) != _REPO_ROOT_NAME:
    _parent_dir = os.path.dirname(os.getcwd())
    if _parent_dir == os.getcwd():
        # At the filesystem root the parent is the directory itself; without
        # this guard the loop would spin forever when the notebook is started
        # outside the repository.
        raise FileNotFoundError(
            f"Could not find a '{_REPO_ROOT_NAME}' directory above the starting "
            "working directory; launch the notebook from inside the repository."
        )
    os.chdir(_parent_dir)

In [2]:
# ================== GENERAL IMPORTS ==================
import os
import json
from dotenv import load_dotenv

# ================== UTIL FUNCTIONS ==================
from utils.embedding import retrieve_context, get_text_embedding, store_embedding #get_context_db,
from utils.prompt import get_prompt
from llm.run_RAGLLM import run_RAG

# Generate context vector database
def get_context_db(context_chunks, CLIENT, model_embed):
    """Embed every context chunk and pass the stacked embeddings to ``store_embedding``.

    Args:
        context_chunks: Iterable of chunks; each one is passed to
            ``get_text_embedding``.
        CLIENT: Client object forwarded unchanged to ``get_text_embedding``.
        model_embed: Embedding model identifier forwarded unchanged to
            ``get_text_embedding``.

    Returns:
        Whatever ``store_embedding`` returns for the embedding array
        (presumably a vector index -- confirm against ``utils.embedding``).
    """
    embedded_chunks = []
    for chunk in context_chunks:
        embedded_chunks.append(get_text_embedding(chunk, CLIENT, model_embed))
    # One array holding all per-chunk embeddings, in input order.
    embedding_matrix = np.array(embedded_chunks)
    return store_embedding(embedding_matrix)


# ================== MODEL & API IMPORTS ==================
from mistralai.client import MistralClient
from openai import OpenAI
from llm.inference import run_llm
import faiss

For the sake of this immediate test, we brute-force adapt the `run_RAGLLM.py` script here. We'll introduce it properly once it's stable.

In [3]:
# Environment setup: load variables via dotenv, pick the models, build the client.
load_dotenv()

# Chat model used for generation; alternatives include gpt-4o-2024-05-13,
# gpt-4o-mini-2024-07-18, etc.
_MODEL = 'gpt-4.1-nano'
# Model used to embed text.
_MODEL_EMBED = 'text-embedding-3-small'

# Fail early with a clear message when the key is unset or empty.
api_key = os.getenv("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise ValueError("Missing API key. Please set OPENAI_API_KEY in your .env file.")

CLIENT = OpenAI(api_key=api_key)

In [4]:
# #_VERSION = get_local_version()
# _QUERY_DF=pd.read_csv('data/real_world_db/real_world_validation__v1.csv', index_col = 0)#pd.read_csv(args.csv_path, index_col=0)
# with open(args.context_chunks, "r") as f:
#     _CONTEXT = json.load(f)
# _INDEX = faiss.read_index(f"./data/latest_db/indexes/text-embedding-3-small_structured_context__{_VERSION}.faiss")
# # Load entity database
# with open(f"context_retriever/entities/moalmanac_db_ner_entities__{_VERSION}.json", "r") as f:
#     _DB_ENTITY = json.load(f)
# with open(f"context_retriever/entities/synthetic_query_ner_entities__{_VERSION}.json", "r") as f:
#     _QUERY_ENTITY = json.load(f)

In [5]:
# Version tags for this run. `version` selects both the context JSON and the
# index file below, so the two artifacts cannot drift apart.
_VERSION = '2025-09-04'
version = '2025-09'

# Validation queries; the first CSV column becomes the DataFrame index.
_QUERY_DF = pd.read_csv('data/real_world_db/real_world_validation__v1.csv', index_col=0)

# Context loaded from JSON; passed later as `context_chunks`.
context_json_path = f"external-validation/non-moa-database/oncokb/oncokb-db/synthetic_answers__{version}.json"
with open(context_json_path, "r") as f:
    _CONTEXT = json.load(f)

# Pre-built FAISS index read from disk.
index_path = f'external-validation/non-moa-database/oncokb/oncokb-db/index/oncokb_{version}_structured_context.faiss'
_INDEX = faiss.read_index(index_path)

# Entity JSON files for the database context and for the queries.
with open("context_retriever/entities/oncokb_db_context_ner_entities__oncokb-202509.json", "r") as f:
    _DB_ENTITY = json.load(f)
with open("context_retriever/entities/real_world_query_ner_entities__v1.json", "r") as f:
    _QUERY_ENTITY = json.load(f)

In [6]:
# Run RAG-LLM iterations and persist the results.
from llm.run_RAGLLM import run_iterations_rag
from utils.io import save_object

# Run settings are named once so that the call below and the output filename
# are always derived from the same values.
_STRATEGY = 5
_NUM_ITER = 1
_TEMP = 0.0

output_ls, input_ls, runtime_ls = run_iterations_rag(
    num_iterations=_NUM_ITER,
    data=_QUERY_DF,
    context_chunks=_CONTEXT,
    db_entity=_DB_ENTITY,
    query_entity=_QUERY_ENTITY,
    num_vec=10,
    index=_INDEX,
    client=CLIENT,
    model=_MODEL,
    model_embed=_MODEL_EMBED,
    model_type='gpt',
    strategy=_STRATEGY,
    max_len=None,
    temp=_TEMP,
    random_seed=None
    )

# Save results
res_dict = {
    "full output": output_ls,
    "input prompt": input_ls,
    #"retrieval": retrieval_ls,
    "runtime": runtime_ls
    }

# Single definition of the output directory, used for both creation and the
# result path.
output_dir = 'external-validation/non-moa-database/oncokb/rag-llm/real-world-query'
os.makedirs(output_dir, exist_ok=True)

# `:g` renders 0.0 as "0", which keeps the existing file name
# (RAGstra5n1temp0_res_dict.pkl) unchanged.
result_file = os.path.join(
    output_dir,
    f'RAGstra{_STRATEGY}n{_NUM_ITER}temp{_TEMP:g}_res_dict.pkl'
)
save_object(res_dict, filename=result_file)

Time elapsed for iteration 0: 0.2958 min


In [7]:
# Single iteration with strategy 5; the returned value is left as the last
# expression so the notebook displays it.
run_iterations_rag(
    # inputs
    data=_QUERY_DF,
    context_chunks=_CONTEXT,
    db_entity=_DB_ENTITY,
    query_entity=_QUERY_ENTITY,
    index=_INDEX,
    # models / client
    client=CLIENT,
    model=_MODEL,
    model_embed=_MODEL_EMBED,
    model_type='gpt',
    # run settings
    strategy=5,
    num_iterations=1,
    num_vec=10,
    max_len=None,
    temp=0.0,
    random_seed=None,
)

Time elapsed for iteration 0: 0.2972 min


([['{\n  "Status": "success",\n  "Treatment 1": {\n    "Disease Name": "Cholangiocarcinoma",\n    "Disease Phase or Condition": "metastatic",\n    "Drug Name": "pemigatinib",\n    "Prior Treatment or Resistance Status": "progressed on standard first-line treatments",\n    "Genomic Features": "FGFR2 fusion",\n    "FDA-approval status": "FDA-approved",\n    "Link to FDA-approved Label": "https://www.fda.gov/drugs/resources-information-approved-drugs/pemigatinib"\n  }\n}',
   '{\n  "Status": "no_match",\n  "Message": "No drugs are FDA-approved for the provided context"\n}',
   '{\n  "Status": "no_match",\n  "Message": "No drugs are FDA-approved for the provided context"\n}',
   '{\n  "Status": "no_match",\n  "Message": "No drugs are FDA-approved for the provided context"\n}',
   '{\n  "Status": "success",\n  "Treatment 1": {\n    "Disease Name": "Medullary Thyroid Cancer",\n    "Disease Phase or Condition": "progressive or metastatic",\n    "Drug Name": "selpercatinib",\n    "Prior Treatm

In [8]:
# Single iteration with strategy 0 (all other settings as in the strategy-5
# run); the returned value is left as the last expression so it is displayed.
run_iterations_rag(
    # inputs
    data=_QUERY_DF,
    context_chunks=_CONTEXT,
    db_entity=_DB_ENTITY,
    query_entity=_QUERY_ENTITY,
    index=_INDEX,
    # models / client
    client=CLIENT,
    model=_MODEL,
    model_embed=_MODEL_EMBED,
    model_type='gpt',
    # run settings
    strategy=0,
    num_iterations=1,
    num_vec=10,
    max_len=None,
    temp=0.0,
    random_seed=None,
)

Time elapsed for iteration 0: 0.4069 min


([['{\n  "Treatment 1": {\n    "Disease Name": "Cholangiocarcinoma",\n    "Disease Phase or Condition": "Metastatic",\n    "Drug Name": "Futibatinib",\n    "Prior Treatment or Resistance Status": "Progressed on standard first-line treatments",\n    "Genomic Features": "FGFR2 fusion",\n    "Link to FDA-approved Label": "https://www.fda.gov/drugs/resources-information-approved-drugs/futibatinib"\n  }\n}',
   '{\n  "Treatment 1": {\n    "Disease Name": "Neuroblastoma",\n    "Disease Phase or Condition": "relapsed high-risk",\n    "Drug Name": "ALK inhibitor",\n    "Prior Treatment or Resistance Status": "relapsed",\n    "Genomic Features": "ALK mutation x",\n    "Link to FDA-approved Label": "https://www.fda.gov"\n  }\n}',
   '{\n  "Treatment 1": {\n    "Disease Name": "B-cell Acute Lymphoblastic Leukemia (B-ALL)",\n    "Disease Phase or Condition": "Relapsed",\n    "Drug Name": "Blinatumomab",\n    "Prior Treatment or Resistance Status": "Relapsed",\n    "Genomic Features": "CD19 express