In [1]:
import pandas as pd 
import numpy as np
import os, sys
from tqdm import tqdm
import warnings
# Silence all library warnings for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Move the working directory up to the repository root so the relative paths
# used by later cells resolve. At the filesystem root os.chdir('..') no longer
# changes anything, so the search is stopped there with an explicit error
# instead of spinning forever.
_REPO_DIRNAME = 'rag-llm-cancer-paper'
_launch_dir = os.getcwd()
while os.path.basename(os.getcwd()) != _REPO_DIRNAME:
    _dir_before = os.getcwd()
    os.chdir('..')
    if os.getcwd() == _dir_before:
        os.chdir(_launch_dir)  # leave the session where it started
        raise FileNotFoundError(
            f"Could not find a '{_REPO_DIRNAME}' directory at or above {_launch_dir!r}; "
            "start the notebook from inside the repository."
        )

In [2]:
# ================== GENERAL IMPORTS ==================
import os
import json
from dotenv import load_dotenv

# ================== UTIL FUNCTIONS ==================
from utils.embedding import retrieve_context, get_text_embedding, store_embedding #get_context_db,
from utils.prompt import get_prompt
from llm.run_RAGLLM import run_RAG

# Generate context vector database
def get_context_db(context_chunks, CLIENT, model_embed):
    """Embed every context chunk and hand the stacked vectors to ``store_embedding``.

    Args:
        context_chunks: Iterable of chunks; each one is passed individually to
            ``get_text_embedding``.
        CLIENT: Client object forwarded unchanged to ``get_text_embedding``.
        model_embed: Embedding model identifier forwarded unchanged to
            ``get_text_embedding``.

    Returns:
        Whatever ``store_embedding`` returns for the stacked embeddings
        (presumably a FAISS index, since ``build_index`` passes the result to
        ``faiss.write_index`` -- confirm against utils.embedding).
    """
    vectors = []
    for chunk in context_chunks:
        # One embedding call per chunk, in the order the chunks were given.
        vectors.append(get_text_embedding(chunk, CLIENT, model_embed))
    embedding_matrix = np.array(vectors)
    return store_embedding(embedding_matrix)


# ================== MODEL & API IMPORTS ==================
from mistralai.client import MistralClient
from openai import OpenAI
from llm.inference import run_llm
import faiss

In [3]:
# --- existing module state ---
# Notebook-level state. build_index() assigns _CLIENT, _CONTEXT, _INDEX and
# _MODEL_EMBED; _READY, _MODEL_TYPE and _MODEL_NAME are declared global there
# but are not assigned anywhere in the visible cells.
_READY = False
_CLIENT = _CONTEXT = _INDEX = None
_MODEL_TYPE = _MODEL_NAME = _MODEL_EMBED = None

def _cache_paths(embed_name: str, version: str = "v1"):
    os.makedirs("indexes", exist_ok=True)
    return (
        f"indexes/{embed_name}__{version}.faiss",
        f"indexes/{embed_name}__{version}.context.json",
    )

In [4]:
#make a quick first pass at building the context db
# Structured OncoKB table; build_context_chunks() reads its 'statement_id' column.
structured_db = pd.read_csv('external-validation/non-moa-database/oncokb/oncokb-db/oncokb_core__2025-09.csv')
# Table of context statements with 'statement_id' and 'context' columns.
statements = pd.read_csv('external-validation/non-moa-database/oncokb/oncokb-db/oncokb-draft.dereferenced.unique.context_db.csv')
# Lookup from statement_id to its context text (a later duplicate id overwrites an earlier one).
statement_dict = {
    statement_id: context
    for statement_id, context in zip(statements['statement_id'], statements['context'])
}
import ast
def build_context_chunks(records=None, statements_by_id=None):
    """Return one context string per row of the structured database.

    An earlier draft of this function also formatted biomarker, cancer,
    therapy and level fields into each chunk; only the statement text is used
    now.

    Args:
        records: DataFrame with a 'statement_id' column. Defaults to the
            notebook-level ``structured_db``.
        statements_by_id: Mapping from statement_id to context text. Defaults
            to the notebook-level ``statement_dict``.

    Returns:
        list: For each row, in row order, the mapped context text, or the
        placeholder "No statement available." when the id has no entry in the
        mapping.
    """
    if records is None:
        records = structured_db
    if statements_by_id is None:
        statements_by_id = statement_dict

    # Only the statement_id column is needed, so iterate over that column
    # directly instead of materialising every row with iterrows(). The column
    # has a length, which lets tqdm report progress against a total.
    return [
        statements_by_id.get(statement_id, "No statement available.")
        for statement_id in tqdm(records['statement_id'])
    ]

# Build the chunk list and persist it twice: as a one-column CSV and as a JSON array.
context_chunks = build_context_chunks()
context_db = pd.DataFrame({'context': context_chunks})
context_db.to_csv('external-validation/non-moa-database/oncokb/oncokb-db/context_db.csv', index=False)
with open('external-validation/non-moa-database/oncokb/oncokb-db/context_db.json', 'w') as f:
    # Serialise first, then write; the file content matches json.dump(..., indent=2).
    f.write(json.dumps(context_db['context'].tolist(), indent=2))

625it [00:00, 56337.50it/s]


In [5]:
def build_index(context_json_path, index_path, ctx_path):
    """Embed the context chunks with OpenAI and persist the resulting index.

    Reads the chunk list from ``context_json_path``, builds an index over it
    with ``get_context_db`` using the "text-embedding-3-small" model, writes
    the index with ``faiss.write_index`` and writes the chunk list next to it
    as JSON. The OpenAI API key is read from the ``OPENAI_API_KEY``
    environment variable after ``load_dotenv()`` has run.

    Args:
        context_json_path: Path of the JSON file holding the context chunks.
        index_path: Destination file for the index.
        ctx_path: Destination file for the JSON copy of the context chunks.

    Side effects:
        Sets the notebook-level ``_CLIENT``, ``_CONTEXT``, ``_INDEX`` and
        ``_MODEL_EMBED``. ``_READY``, ``_MODEL_TYPE`` and ``_MODEL_NAME`` are
        not modified here.
    """
    global _CLIENT, _CONTEXT, _INDEX, _MODEL_EMBED

    # Create the output folders up front so a missing directory is not
    # discovered only after every chunk has already been embedded.
    for out_path in (index_path, ctx_path):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    load_dotenv()
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    model_embed = "text-embedding-3-small"

    # Explicit UTF-8 so the chunks decode the same way regardless of locale.
    with open(context_json_path, "r", encoding="utf-8") as f:
        context = json.load(f)

    index = get_context_db(context, client, model_embed)

    # Publish the state in one step once embedding has succeeded, so the
    # globals never describe a half-built index; it is done before the writes
    # so the in-memory index survives a failure to persist it.
    _CLIENT, _CONTEXT, _INDEX, _MODEL_EMBED = client, context, index, model_embed

    faiss.write_index(index, index_path)
    with open(ctx_path, "w", encoding="utf-8") as f:
        json.dump(context, f)

# Input chunk file, plus the destinations for the index and its JSON context copy.
context_json_path = 'external-validation/non-moa-database/oncokb/oncokb-db/context_db.json'
index_path = 'external-validation/non-moa-database/oncokb/oncokb-db/index/oncokb_2025-09.faiss'
ctx_path = 'external-validation/non-moa-database/oncokb/oncokb-db/index/oncokb_2025-09.json'

# The index folder must exist before build_index() writes into it.
os.makedirs('external-validation/non-moa-database/oncokb/oncokb-db/index', exist_ok=True)
build_index(
    context_json_path=context_json_path,
    index_path=index_path,
    ctx_path=ctx_path,
)