## Loads

In [1]:
import sys
import os
script_dir = os.getcwd()
root_dir = os.path.join(os.path.dirname(os.path.abspath(script_dir)))
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(script_dir))))
os.chdir(root_dir)
os.getcwd()


'/home/helenajun/rag-llm-cancer-paper'

In [2]:
import json
import pandas as pd
import numpy as np
from utils.io import load_object, save_object
from collections import Counter
import requests
import random
import math
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
import re
from utils.entity_prediction import extract_entities
import ast


In [3]:
#local db version
CACHE_FILE = "db_version_cache.json"

def get_remote_version():
    agents = requests.get('https://api.moalmanac.org/agents').json()
    return agents['service']['last_updated']

def get_local_version():
    if os.path.exists(CACHE_FILE):
        with open(CACHE_FILE) as f:
            return json.load(f).get("version")
    return None

def save_local_version(version):
    with open(CACHE_FILE, "w") as f:
        json.dump({"version": version}, f)


#split ranges for testing on small batches
def split_ranges(max_int, num_ranges, samples=None, seed=42):
    random.seed(seed)
    step = math.ceil(max_int / num_ranges)
    ranges = [(i*step+1, min((i+1)*step, max_int)) for i in range(num_ranges)]
    
    if samples is None:
        return ranges
    
    # split samples evenly across ranges
    n_per_range = samples // num_ranges
    remainder = samples % num_ranges
    
    result = []
    for i, r in enumerate(ranges):
        count = n_per_range + (1 if i < remainder else 0)
        result.extend(random.randint(r[0], r[1]) for _ in range(count))
    
    return result


_VERSION=get_local_version()
_VERSION

'2025-09-04'

## Entity extraction using BioBert

In [7]:
biobert_dir = "context_retriever/biobert_ner"

In [None]:
#download model from huggingface
model = AutoModelForTokenClassification.from_pretrained("judithrosell/BioBERT_BioNLP13CG_NER_new")
tokenizer = AutoTokenizer.from_pretrained("judithrosell/BioBERT_BioNLP13CG_NER_new")
id2label = model.config.id2label

#save locally
model.save_pretrained(biobert_dir)
tokenizer.save_pretrained(biobert_dir)

In [8]:
#load locally
model = AutoModelForTokenClassification.from_pretrained(biobert_dir)
tokenizer = AutoTokenizer.from_pretrained(biobert_dir)
id2label = model.config.id2label

### Context DB

In [5]:
#load structured context
with open(f"data/latest_db/indexes/text-embedding-3-small_structured_context__{_VERSION}.json", "r") as f:
    _CONTEXT = json.load(f)

for i, e in enumerate(_CONTEXT):
    print(e)
    if i == 5:
        break

If a patient with early invasive breast carcinoma cancer has her2-negative, er positive, one recommended therapy is tamoxifen + abemaciclib. therapy type: hormone therapy + targeted therapy. therapy strategy: estrogen receptor inhibition + cdk4/6 inhibition. indication: verzenio is a kinase inhibitor indicated in combination with endocrine therapy (tamoxifen or an aromatase inhibitor) for the adjuvant treatment of adult patients with hormone receptor (hr)-positive, human epidermal growth factor receptor 2 (her2)-negative, node positive, early breast cancer at high risk of recurrence. approval url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2023/208716s010s011lbl.pdf
If a patient with early invasive breast carcinoma cancer has pr positive, her2-negative, one recommended therapy is tamoxifen + abemaciclib. therapy type: hormone therapy + targeted therapy. therapy strategy: estrogen receptor inhibition + cdk4/6 inhibition. indication: verzenio is a kinase inhibitor indicated in 

In [9]:
test_ind = split_ranges(len(_CONTEXT), 10, 20)
for i, e in enumerate(_CONTEXT):
    if i in test_ind:
        print(f"======={i}=======")
        print(e)
        print(extract_entities(e, model, tokenizer))

If a patient with advanced or metastatic invasive breast carcinoma cancer has pr positive, her2-negative, one recommended therapy is anastrozole + abemaciclib. therapy type: hormone therapy + targeted therapy. therapy strategy: aromatase inhibition + cdk4/6 inhibition. indication: verzenio is a kinase inhibitor indicated in combination with an aromatase inhibitor as initial endocrine-based therapy for the treatment of adult patients with hormone receptor (hr)-positive, human epidermal growth factor receptor 2 (her2)-negative advanced or metastatic breast cancer. approval url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2023/208716s010s011lbl.pdf
{'cancer_type': ['breast carcinoma cancer', 'breast cancer'], 'biomarker': ['pr', 'cdk4', 'human epidermal growth factor receptor 2', 'her2', 'hormone receptor', 'aromatase', 'hr', None]}
If a patient with metastatic castration-resistant prostate adenocarcinoma cancer has bard1 oncogenic variants, one recommended therapy is prednisone 

In [10]:
#run BioBert NER on all contexts
context_extracted_entities = []
for ctx in _CONTEXT:
    context_extracted_entities.append(extract_entities(ctx, model, tokenizer))
context_extracted_entities

[{'cancer_type': ['breast carcinoma cancer', 'early breast cancer'],
  'biomarker': ['aromatase',
   'cdk4',
   'her2',
   'hormone receptor',
   'estrogen receptor',
   'hr)',
   'er',
   'human epidermal growth factor receptor 2']},
 {'cancer_type': ['breast carcinoma cancer', 'breast cancer'],
  'biomarker': ['pr',
   'cdk4',
   'her2',
   'hormone receptor',
   'estrogen receptor',
   'aromatase',
   'hr',
   'human epidermal growth factor receptor 2']},
 {'cancer_type': ['breast carcinoma cancer', 'early breast cancer'],
  'biomarker': ['pr',
   'aromatase',
   'cdk4',
   'her2',
   'hormone receptor',
   'estrogen receptor',
   'hr)',
   'er',
   'human epidermal growth factor receptor 2']},
 {'cancer_type': ['breast carcinoma cancer', 'breast cancer'],
  'biomarker': ['aromatase',
   'cdk4',
   'human epidermal growth factor receptor 2',
   'her2',
   'hormone receptor',
   'er',
   'hr',
   None]},
 {'cancer_type': ['breast carcinoma cancer', 'breast cancer'],
  'biomarker': ['

In [11]:
no_cancer_idx = []
no_biomarker_idx = []
for i, e in enumerate(context_extracted_entities):
    if not e['cancer_type']:
        no_cancer_idx.append(i)
    if not e['biomarker']:
        no_biomarker_idx.append(i)

print(f"{len(no_cancer_idx)/len(_CONTEXT):.3f}%") #no cancer entity extracted %
print(f"{len(no_biomarker_idx)/len(_CONTEXT):.3f}%") #no biomarker entity extracted %

0.089%
0.002%


In [13]:
#save
with open(f"context_retriever/entities/moalmanac_db_ner_entities__{_VERSION}.json", "w") as f:
    json.dump(context_extracted_entities, f)

### Synthetic query dataset

In [14]:
with open(f"data/latest_db/synthetic_prompts__{_VERSION}.json", "r") as f:
    synthetic_query = json.load(f)

In [15]:
#run BioBert NER on all synthetic queries
synthetic_query_extracted_entities = []
for query in synthetic_query:
    synthetic_query_extracted_entities.append(extract_entities(query, model, tokenizer))
synthetic_query_extracted_entities

[{'cancer_type': ['early invasive breast carcinoma cancer'],
  'biomarker': ['er', 'her2']},
 {'cancer_type': ['early invasive breast carcinoma cancer'],
  'biomarker': ['her2', 'pr']},
 {'cancer_type': ['early invasive breast carcinoma cancer'],
  'biomarker': ['er', 'her2', 'pr']},
 {'cancer_type': ['invasive breast carcinoma cancer'],
  'biomarker': ['er', 'her2']},
 {'cancer_type': ['invasive breast carcinoma cancer'],
  'biomarker': ['her2', 'pr']},
 {'cancer_type': ['invasive breast carcinoma cancer'],
  'biomarker': ['er', 'her2', 'pr']},
 {'cancer_type': ['invasive breast carcinoma cancer'],
  'biomarker': ['er', 'her2']},
 {'cancer_type': ['invasive breast carcinoma cancer'],
  'biomarker': ['her2', 'pr']},
 {'cancer_type': ['invasive breast carcinoma cancer'],
  'biomarker': ['er', 'her2', 'pr']},
 {'cancer_type': ['invasive breast carcinoma cancer'],
  'biomarker': ['er', 'her2']},
 {'cancer_type': ['invasive breast carcinoma cancer'],
  'biomarker': ['her2', 'pr']},
 {'canc

In [16]:
#save
with open(f"context_retriever/entities/synthetic_query_ner_entities__{_VERSION}.json", "w") as f:
    json.dump(synthetic_query_extracted_entities, f)

In [17]:
synthetic_query_extracted_entities[0]

{'cancer_type': ['early invasive breast carcinoma cancer'],
 'biomarker': ['er', 'her2']}

In [19]:
#test entities matching
from utils.entity_matching import match_entities

with open(f"context_retriever/entities/moalmanac_db_ner_entities__{_VERSION}.json", "r") as f:
    _DB_ENTITY = json.load(f)
with open(f"context_retriever/entities/synthetic_query_ner_entities__{_VERSION}.json", "r") as f:
    _QUERY_ENTITY = json.load(f)

test_idx=0
print(_QUERY_ENTITY[test_idx])
matched_score_all=match_entities(_QUERY_ENTITY[test_idx], _DB_ENTITY, fuzzy_thres=70)
print(len(matched_score_all))
matched_score_all

{'cancer_type': ['early invasive breast carcinoma cancer'], 'biomarker': ['er', 'her2']}
186


[(591,
  3.0,
  {'cancer_type': ['breast carcinoma cancer', 'breast cancer'],
   'biomarker': ['pi3k',
    'her2)',
    'akt',
    'cdk4',
    'fda',
    'pik3ca',
    'human epidermal growth factor receptor 2',
    'her2',
    'mtor',
    'estrogen receptor',
    'hr)',
    'hormone receptor (',
    'er',
    None]}),
 (593,
  3.0,
  {'cancer_type': ['breast carcinoma cancer', 'breast cancer'],
   'biomarker': ['pi3k',
    'her2)',
    'pr',
    'akt',
    'cdk4',
    'fda',
    'pik3ca',
    'human epidermal growth factor receptor 2',
    'her2',
    'mtor',
    'estrogen receptor',
    'hr)',
    'hormone receptor (',
    'er',
    None]}),
 (0,
  2.5,
  {'cancer_type': ['breast carcinoma cancer', 'early breast cancer'],
   'biomarker': ['aromatase',
    'cdk4',
    'her2',
    'hormone receptor',
    'estrogen receptor',
    'hr)',
    'er',
    'human epidermal growth factor receptor 2']}),
 (2,
  2.5,
  {'cancer_type': ['breast carcinoma cancer', 'early breast cancer'],
   'bioma

In [20]:
matching_idx = [match[0] for match in matched_score_all]
matching_score = [match[1] for match in matched_score_all]

In [23]:
#load groundtruth
synthetic_query_therapy_pair_dict=load_object("data/latest_db/synthetic_query_therapy_pair_dict.pkl")

In [24]:
print(synthetic_query[test_idx])
print(synthetic_query_therapy_pair_dict[synthetic_query[test_idx]])

if a patient with early invasive breast carcinoma cancer has her2-negative, er positive, what therapy is recommended?
[{'abemaciclib', 'tamoxifen'}, {'abemaciclib', 'letrozole'}, {'anastrozole', 'goserelin', 'ribociclib'}, {'letrozole', 'goserelin', 'ribociclib'}, {'anastrozole', 'abemaciclib'}, {'abemaciclib', 'exemestane'}]


In [None]:
#load retrieved context with high matching score
moalmanac_db = pd.read_csv(f"data/latest_db/moalmanac_core__{_VERSION}.csv")
moalmanac_db.therapy = moalmanac_db.therapy.apply(ast.literal_eval) 

['Inavolisib', 'Fulvestrant', 'Palbociclib']


In [30]:
subset = moalmanac_db[
    (moalmanac_db["modified_standardized_cancer"] == moalmanac_db["modified_standardized_cancer"].iloc[test_idx]) &
    (moalmanac_db["biomarker"] == moalmanac_db["biomarker"].iloc[test_idx])
]
subset

Unnamed: 0,statement_id,standardized_cancer,raw_cancer,modified_standardized_cancer,biomarker,therapy
0,0,Invasive Breast Carcinoma,early breast cancer,early invasive breast carcinoma,"['HER2-negative', 'ER positive']","[Tamoxifen, Abemaciclib]"
98,99,Invasive Breast Carcinoma,early breast cancer,early invasive breast carcinoma,"['HER2-negative', 'ER positive']","[Abemaciclib, Letrozole]"
577,579,Invasive Breast Carcinoma,early breast cancer,early invasive breast carcinoma,"['HER2-negative', 'ER positive']","[Goserelin, Anastrozole, Ribociclib]"
580,582,Invasive Breast Carcinoma,early breast cancer,early invasive breast carcinoma,"['HER2-negative', 'ER positive']","[Goserelin, Ribociclib, Letrozole]"
616,619,Invasive Breast Carcinoma,early breast cancer,early invasive breast carcinoma,"['HER2-negative', 'ER positive']","[Anastrozole, Abemaciclib]"
619,622,Invasive Breast Carcinoma,early breast cancer,early invasive breast carcinoma,"['HER2-negative', 'ER positive']","[Exemestane, Abemaciclib]"


In [39]:
Counter(matching_score).most_common()

[(1.5, 64), (2.5, 63), (1, 24), (0.5, 24), (2.0, 9), (3.0, 2)]

In [35]:
for i in subset.index:
    for rank, match in enumerate(matched_score_all):
        if match[0] == i:
            print(rank, match[1], match[2])

2 2.5 {'cancer_type': ['breast carcinoma cancer', 'early breast cancer'], 'biomarker': ['aromatase', 'cdk4', 'her2', 'hormone receptor', 'estrogen receptor', 'hr)', 'er', 'human epidermal growth factor receptor 2']}
29 2.5 {'cancer_type': ['breast carcinoma cancer', 'early breast cancer'], 'biomarker': ['aromatase', 'cdk4 / 6', 'hormone receptor', 'her2', 'er', 'hr', 'human epidermal growth factor receptor 2']}
55 2.5 {'cancer_type': ['breast carcinoma cancer', 'early breast cancer'], 'biomarker': ['gonadotropin-releasing hormone', 'aromatase', 'cdk4', 'kisqali', 'human epidermal growth factor receptor 2', 'her2', 'gnrh', 'hr)', 'hormone receptor (', 'er', None]}
57 2.5 {'cancer_type': ['breast carcinoma cancer'], 'biomarker': ['gonadotropin-releasing hormone', 'cdk4 / 6', 'aromatase', 'kisqali', 'her2', 'gnrh', 'hr)', 'hormone receptor (', 'er', 'human epidermal growth factor receptor 2']}
60 2.5 {'cancer_type': ['breast carcinoma cancer', 'breast cancer'], 'biomarker': ['aromatase', 

ok there are so many retrieved context with the same matching score...