## Loads

In [1]:
import sys
import os
script_dir = os.getcwd()
root_dir = os.path.join(os.path.dirname(os.path.abspath(script_dir)))
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(script_dir))))
os.chdir(root_dir)

In [2]:
os.getcwd()

'/home/helenajun/rag-llm-cancer-paper'

In [3]:
# !pip install rank_bm25
import json
import pandas as pd
import random
import math
from transformers import AutoTokenizer, AutoModelForTokenClassification
from utils.entity_prediction import extract_entities, db_extract_entities
from utils.check_db_version import get_local_version
_VERSION=get_local_version()
print(_VERSION)

def split_ranges(max_int, num_ranges, samples=None, seed=42):
    random.seed(seed)
    step = math.ceil(max_int / num_ranges)
    ranges = [(i*step+1, min((i+1)*step, max_int)) for i in range(num_ranges)]
    
    if samples is None:
        return ranges
    
    # split samples evenly across ranges
    n_per_range = samples // num_ranges
    remainder = samples % num_ranges
    
    result = []
    for i, r in enumerate(ranges):
        count = n_per_range + (1 if i < remainder else 0)
        result.extend(random.randint(r[0], r[1]) for _ in range(count))
    
    return result

2025-09-04


## Load BioBERT NER fine-tuned models

In [4]:
biobert_dir = "context_retriever/biobert_ner"

In [None]:
#download model from huggingface
model = AutoModelForTokenClassification.from_pretrained("judithrosell/BioBERT_BioNLP13CG_NER_new")
tokenizer = AutoTokenizer.from_pretrained("judithrosell/BioBERT_BioNLP13CG_NER_new")
id2label = model.config.id2label

#save locally
model.save_pretrained(biobert_dir)
tokenizer.save_pretrained(biobert_dir)

In [5]:
#load locally
model = AutoModelForTokenClassification.from_pretrained(biobert_dir)
tokenizer = AutoTokenizer.from_pretrained(biobert_dir)
id2label = model.config.id2label

## Extract entities

### Moalmanac DB

#### Context snippet

In [None]:
#load structured context
with open(f"data/latest_db/indexes/text-embedding-3-small_structured_context__{_VERSION}.json", "r") as f:
    _CONTEXT = json.load(f)

for i, e in enumerate(_CONTEXT):
    print(e)
    if i == 5:
        break

If a patient with early invasive breast carcinoma cancer has her2-negative, er positive, one recommended therapy is tamoxifen + abemaciclib. therapy type: hormone therapy + targeted therapy. therapy strategy: estrogen receptor inhibition + cdk4/6 inhibition. indication: verzenio is a kinase inhibitor indicated in combination with endocrine therapy (tamoxifen or an aromatase inhibitor) for the adjuvant treatment of adult patients with hormone receptor (hr)-positive, human epidermal growth factor receptor 2 (her2)-negative, node positive, early breast cancer at high risk of recurrence. approval url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2023/208716s010s011lbl.pdf
If a patient with early invasive breast carcinoma cancer has pr positive, her2-negative, one recommended therapy is tamoxifen + abemaciclib. therapy type: hormone therapy + targeted therapy. therapy strategy: estrogen receptor inhibition + cdk4/6 inhibition. indication: verzenio is a kinase inhibitor indicated in 

In [None]:
test_ind = split_ranges(len(_CONTEXT), 10, 20)
for i, e in enumerate(_CONTEXT):
    if i in test_ind:
        print(f"======={i}=======")
        print(e)
        print(extract_entities(e, model, tokenizer))

If a patient with advanced or metastatic invasive breast carcinoma cancer has pr positive, her2-negative, one recommended therapy is anastrozole + abemaciclib. therapy type: hormone therapy + targeted therapy. therapy strategy: aromatase inhibition + cdk4/6 inhibition. indication: verzenio is a kinase inhibitor indicated in combination with an aromatase inhibitor as initial endocrine-based therapy for the treatment of adult patients with hormone receptor (hr)-positive, human epidermal growth factor receptor 2 (her2)-negative advanced or metastatic breast cancer. approval url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2023/208716s010s011lbl.pdf
{'cancer_type': ['breast carcinoma cancer', 'breast cancer'], 'biomarker': ['pr', 'cdk4', 'human epidermal growth factor receptor 2', 'her2', 'hormone receptor', 'aromatase', 'hr', None]}
If a patient with metastatic castration-resistant prostate adenocarcinoma cancer has bard1 oncogenic variants, one recommended therapy is prednisone 

In [None]:
#run BioBert NER on all contexts
context_extracted_entities = []
for ctx in _CONTEXT:
    context_extracted_entities.append(extract_entities(ctx, model, tokenizer))
context_extracted_entities

[{'cancer_type': ['breast carcinoma cancer', 'early breast cancer'],
  'biomarker': ['her2',
   'aromatase',
   'hr)',
   'estrogen receptor',
   'er',
   'cdk4',
   'human epidermal growth factor receptor 2',
   'hormone receptor']},
 {'cancer_type': ['breast carcinoma cancer', 'breast cancer'],
  'biomarker': ['her2',
   'hr',
   'aromatase',
   'pr',
   'estrogen receptor',
   'cdk4',
   'human epidermal growth factor receptor 2',
   'hormone receptor']},
 {'cancer_type': ['breast carcinoma cancer', 'early breast cancer'],
  'biomarker': ['her2',
   'aromatase',
   'pr',
   'hr)',
   'estrogen receptor',
   'er',
   'cdk4',
   'human epidermal growth factor receptor 2',
   'hormone receptor']},
 {'cancer_type': ['breast carcinoma cancer', 'breast cancer'],
  'biomarker': ['her2',
   'hr',
   'aromatase',
   'er',
   'cdk4',
   'human epidermal growth factor receptor 2',
   'hormone receptor',
   None]},
 {'cancer_type': ['breast carcinoma cancer', 'breast cancer'],
  'biomarker': ['

In [None]:
no_cancer_idx = []
no_biomarker_idx = []
for i, e in enumerate(context_extracted_entities):
    if not e['cancer_type']:
        no_cancer_idx.append(i)
    if not e['biomarker']:
        no_biomarker_idx.append(i)

print(f"{len(no_cancer_idx)/len(_CONTEXT):.3f}%") #no cancer entity extracted %
print(f"{len(no_biomarker_idx)/len(_CONTEXT):.3f}%") #no biomarker entity extracted %

0.089%
0.002%


In [None]:
#save
with open(f"context_retriever/entities/moalmanac_db_context_ner_entities__{_VERSION}.json", "w") as f:
    json.dump(context_extracted_entities, f)

#### Cancer type and biomarker entities

In [10]:
moalmanac_db = pd.read_csv(f"data/latest_db/moalmanac_core__{_VERSION}.csv")

In [None]:
#run BioBert NER on all DB entities
db_extracted_entities = []
for _, row in moalmanac_db.iterrows():
    db_extracted_entities.append(db_extract_entities(row))
db_extracted_entities

[{'cancer_type': ['early invasive breast carcinoma'],
  'biomarker': ['her2', 'er']},
 {'cancer_type': ['early invasive breast carcinoma'],
  'biomarker': ['pr', 'her2']},
 {'cancer_type': ['early invasive breast carcinoma'],
  'biomarker': ['pr', 'her2', 'er']},
 {'cancer_type': ['advanced or metastatic invasive breast carcinoma'],
  'biomarker': ['her2', 'er']},
 {'cancer_type': ['advanced or metastatic invasive breast carcinoma'],
  'biomarker': ['pr', 'her2']},
 {'cancer_type': ['advanced or metastatic invasive breast carcinoma'],
  'biomarker': ['pr', 'her2', 'er']},
 {'cancer_type': ['advanced or metastatic invasive breast carcinoma'],
  'biomarker': ['her2', 'er']},
 {'cancer_type': ['advanced or metastatic invasive breast carcinoma'],
  'biomarker': ['pr', 'her2']},
 {'cancer_type': ['advanced or metastatic invasive breast carcinoma'],
  'biomarker': ['pr', 'her2', 'er']},
 {'cancer_type': ['advanced or metastatic invasive breast carcinoma'],
  'biomarker': ['her2', 'er']},
 {'

In [None]:
#save
with open(f"context_retriever/entities/moalmanac_db_ner_entities__{_VERSION}.json", "w") as f:
    json.dump(db_extracted_entities, f)

#### Synthetic queries

In [None]:
with open(f"data/latest_db/synthetic_prompts__{_VERSION}.json", "r") as f:
    synthetic_query = json.load(f)

In [None]:
#run BioBert NER on all synthetic queries
synthetic_query_extracted_entities = []
for query in synthetic_query:
    synthetic_query_extracted_entities.append(extract_entities(query, model, tokenizer))
synthetic_query_extracted_entities

[{'cancer_type': ['early invasive breast carcinoma cancer'],
  'biomarker': ['er', 'her2']},
 {'cancer_type': ['early invasive breast carcinoma cancer'],
  'biomarker': ['her2', 'pr']},
 {'cancer_type': ['early invasive breast carcinoma cancer'],
  'biomarker': ['er', 'her2', 'pr']},
 {'cancer_type': ['invasive breast carcinoma cancer'],
  'biomarker': ['er', 'her2']},
 {'cancer_type': ['invasive breast carcinoma cancer'],
  'biomarker': ['her2', 'pr']},
 {'cancer_type': ['invasive breast carcinoma cancer'],
  'biomarker': ['er', 'her2', 'pr']},
 {'cancer_type': ['invasive breast carcinoma cancer'],
  'biomarker': ['er', 'her2']},
 {'cancer_type': ['invasive breast carcinoma cancer'],
  'biomarker': ['her2', 'pr']},
 {'cancer_type': ['invasive breast carcinoma cancer'],
  'biomarker': ['er', 'her2', 'pr']},
 {'cancer_type': ['invasive breast carcinoma cancer'],
  'biomarker': ['er', 'her2']},
 {'cancer_type': ['invasive breast carcinoma cancer'],
  'biomarker': ['her2', 'pr']},
 {'canc

In [None]:
#save
with open(f"context_retriever/entities/synthetic_query_ner_entities__{_VERSION}.json", "w") as f:
    json.dump(synthetic_query_extracted_entities, f)

### Real-world queries

In [68]:
real_world_db=pd.read_csv("data/real_world_db/updated_real_world_queries_answers__v1.csv", index_col=0)
real_world_db=real_world_db.dropna(subset='Questions').reset_index(drop=True)

In [None]:
real_queries=real_world_db['Questions'].str.lower().str.strip().tolist()

In [79]:
#run BioBert NER on all real-world queries
real_query_extracted_entities = []
for query in real_queries:
    real_query_extracted_entities.append(extract_entities(query, model, tokenizer))
real_query_extracted_entities

[{'cancer_type': ['urothelial carcinoma', 'metastatic'],
  'biomarker': ['fgfr3']},
 {'cancer_type': ['leiomyosarcoma'], 'biomarker': ['dna', 'parp']},
 {'cancer_type': ['rhabdomyosarcoma'], 'biomarker': ['alk', 'tfcp2']},
 {'cancer_type': ['prostate cancer', 'cancer'], 'biomarker': ['brca2']},
 {'cancer_type': ['breast cancer', 'disease'], 'biomarker': ['her2']},
 {'cancer_type': ['cancer'], 'biomarker': ['ntrk']},
 {'cancer_type': ['cancer'], 'biomarker': []},
 {'cancer_type': ['pancreatic ductal adenocarcinoma'],
  'biomarker': ['ntrk', 'fda']},
 {'cancer_type': ['bladder cancer'], 'biomarker': ['fgfr2', 'fda']},
 {'cancer_type': ['ewing sarcoma', 'localized'], 'biomarker': ['stag2']},
 {'cancer_type': ['high-risk neuroblastoma'], 'biomarker': ['alk']},
 {'cancer_type': ['neuroblastoma'], 'biomarker': ['bard1']},
 {'cancer_type': ['neuroblastoma'], 'biomarker': ['alk']},
 {'cancer_type': ['tumor', 'osteosarcoma'], 'biomarker': ['myc']},
 {'cancer_type': [], 'biomarker': ['kmt2a', 'a

In [None]:
#save prompt-entity mapped dataframe to check NER quality and annotate cancer categories
real_world_db_entity_mapped=pd.DataFrame({
    "prompt": real_queries,
    "cancer_type": [e['cancer_type'] for e in real_query_extracted_entities],
    "biomarker": [e['biomarker'] for e in real_query_extracted_entities]
})
real_world_db_entity_mapped.to_csv("data/real_world_db/real_world_db_entity_mapped__v1.csv")

In [None]:
#save entities
with open(f"context_retriever/entities/real_world_query_ner_entities__v1.json", "w") as f:
    json.dump(real_query_extracted_entities, f)