In [1]:
import pandas as pd 
import numpy as np
import os, sys
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
while os.getcwd().split('/')[-1] != 'rag-llm-cancer-paper':
    os.chdir('..')

### setup biobert

In [3]:
biobert_dir = "context_retriever/biobert_ner"
#download model from huggingface
from transformers import AutoTokenizer, AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained("judithrosell/BioBERT_BioNLP13CG_NER_new")
tokenizer = AutoTokenizer.from_pretrained("judithrosell/BioBERT_BioNLP13CG_NER_new")
id2label = model.config.id2label

#save locally
model.save_pretrained(biobert_dir)
tokenizer.save_pretrained(biobert_dir)

('context_retriever/biobert_ner/tokenizer_config.json',
 'context_retriever/biobert_ner/special_tokens_map.json',
 'context_retriever/biobert_ner/vocab.txt',
 'context_retriever/biobert_ner/added_tokens.json',
 'context_retriever/biobert_ner/tokenizer.json')

In [4]:
#load locally
model = AutoModelForTokenClassification.from_pretrained(biobert_dir)
tokenizer = AutoTokenizer.from_pretrained(biobert_dir)
id2label = model.config.id2label

### load other libraries

In [5]:
# !pip install rank_bm25
import json
import pandas as pd
import random
import math
from transformers import AutoTokenizer, AutoModelForTokenClassification
from utils.entity_prediction import extract_entities, db_extract_entities
#version check is for mol almanac
# from utils.check_db_version import get_local_version
# _VERSION=get_local_version()
# print(_VERSION)

def split_ranges(max_int, num_ranges, samples=None, seed=42):
    random.seed(seed)
    step = math.ceil(max_int / num_ranges)
    ranges = [(i*step+1, min((i+1)*step, max_int)) for i in range(num_ranges)]
    
    if samples is None:
        return ranges
    
    # split samples evenly across ranges
    n_per_range = samples // num_ranges
    remainder = samples % num_ranges
    
    result = []
    for i, r in enumerate(ranges):
        count = n_per_range + (1 if i < remainder else 0)
        result.extend(random.randint(r[0], r[1]) for _ in range(count))
    
    return result

### context snippets

In [6]:
structured_db = pd.read_csv('external-validation/non-moa-database/oncokb/oncokb-db/oncokb_core__2025-09.csv')
db = structured_db[['statement_id']]
statements = pd.read_csv('external-validation/non-moa-database/oncokb/oncokb-db/oncokb-draft.dereferenced.unique.context_db.csv')
statement_dict = dict(zip(statements['statement_id'], statements['context']))
db['evidence_statement'] = db['statement_id'].map(statement_dict)
print(db.shape)
db.head(1)

(625, 2)


Unnamed: 0,statement_id,evidence_statement
0,0,Approval level: LEVEL_1\nDatabase: OncoKB\nDes...


In [7]:
# db = pd.read_csv('external-validation/non-moa-database/civic/civic-db/evidence_statements.csv')
# print(db.shape)
# db.head(1)

In [8]:
_VERSION = 'oncokb-202509'
_CONTEXT = db.evidence_statement.tolist()
test_ind = split_ranges(len(_CONTEXT), 10, 20)
for i, e in enumerate(_CONTEXT):
    if i in test_ind:
        print(f"======={i}=======")
        print(e)
        print(extract_entities(e, model, tokenizer))

Approval level: LEVEL_1
Database: OncoKB
Description: Dabrafenib, an orally bioavailable RAF inhibitor, and trametinib, an orally bioavailable MEK1/2 inhibitor, are FDA-approved in combination for the treatment of patients with solid tumors other than colorectal harboring BRAF V600E mutation. FDA approval was based on data from 131 adult patients with solid tumors treated with dabrafenib and trametinib in the BRF117019 and NCI-MATCH trials and 36 pediatric patients treated with dabrafenib and trametinib in the CTMT212X2101 study. Of the 131 adult patients treated with dabrafenib and trametinib, the overall response rate was 41% (54/131; 95% CI = 33-50) and of the 36 pediatric patients treated with dabrafenib and trametinib (low-grade glioma, n=34; high-grade glioma, n=2), the overall response rate was 25% (95% CI = 12-24) (PMID: 32818466, 34838156, 32758030)(Abstract: Bouffet et al. Abstract# LGG-49, Neuro-Oncology 2020. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7715318/). Approval 

In [9]:
#run BioBert NER on all contexts
context_extracted_entities = []
for ctx in tqdm(_CONTEXT):
    context_extracted_entities.append(extract_entities(ctx, model, tokenizer))
context_extracted_entities

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 625/625 [01:22<00:00,  7.53it/s]


[{'cancer_type': ['cancer',
   'lymphoma',
   'ph + all',
   'b-lymphoblastic leukemia'],
  'biomarker': ['bcr', 'bcr-abl1']},
 {'cancer_type': ['cancer',
   'chronic myeloid leukemia',
   'chronic myelogenous leukemia'],
  'biomarker': ['bcr-abl1', 'bcr-abl']},
 {'cancer_type': ['cancer',
   'ph + cml',
   'chronic myelogenous leukemia',
   'cml',
   'chronic myeloid leukemia'],
  'biomarker': ['abl', 'bcr-abl1']},
 {'cancer_type': ['tumors', 'breast cancer'],
  'biomarker': ['pten', 'her2', 'er', 'akt1', 'pik3ca']},
 {'cancer_type': ['melanoma', 'cancer'],
  'biomarker': ['v600e', 'braf', 'raf', 'v600k']},
 {'cancer_type': ['high-grade glioma',
   'low-grade glioma',
   'solid tumors',
   'colorectal'],
  'biomarker': ['braf', 'raf', 'mek1 / 2']},
 {'cancer_type': ['anaplastic thyroid cancer', 'atc', 'cancer'],
  'biomarker': ['v600e', 'braf', 'raf', 'mek1 / 2']},
 {'cancer_type': ['low-grade glioma',
   'solid tumors',
   'colorectal',
   'lung cancer',
   'high-grade glioma',
   'm

In [12]:
no_cancer_idx = []
no_biomarker_idx = []
for i, e in tqdm(enumerate(context_extracted_entities)):
    if not e['cancer_type']:
        no_cancer_idx.append(i)
    if not e['biomarker']:
        no_biomarker_idx.append(i)

print(f"{len(no_cancer_idx)*100/len(_CONTEXT):.3f}%") #no cancer entity extracted %
print(f"{len(no_biomarker_idx)*100/len(_CONTEXT):.3f}%") #no biomarker entity extracted %

625it [00:00, 2508555.02it/s]

0.160%
0.320%





In [13]:
#save
os.makedirs("context_retriever/entities", exist_ok=True)
with open(f"context_retriever/entities/oncokb_db_context_ner_entities__{_VERSION}.json", "w") as f:
    json.dump(context_extracted_entities, f, indent=4)