In [1]:
import pandas as pd 
import numpy as np
import os, sys
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
while os.getcwd().split('/')[-1] != 'rag-llm-cancer-paper':
    os.chdir('..')

### setup biobert

In [3]:
biobert_dir = "context_retriever/biobert_ner"
#download model from huggingface
from transformers import AutoTokenizer, AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained("judithrosell/BioBERT_BioNLP13CG_NER_new")
tokenizer = AutoTokenizer.from_pretrained("judithrosell/BioBERT_BioNLP13CG_NER_new")
id2label = model.config.id2label

#save locally
model.save_pretrained(biobert_dir)
tokenizer.save_pretrained(biobert_dir)

('context_retriever/biobert_ner/tokenizer_config.json',
 'context_retriever/biobert_ner/special_tokens_map.json',
 'context_retriever/biobert_ner/vocab.txt',
 'context_retriever/biobert_ner/added_tokens.json',
 'context_retriever/biobert_ner/tokenizer.json')

In [4]:
#load locally
model = AutoModelForTokenClassification.from_pretrained(biobert_dir)
tokenizer = AutoTokenizer.from_pretrained(biobert_dir)
id2label = model.config.id2label

### load other libraries

In [5]:
# !pip install rank_bm25
import json
import pandas as pd
import random
import math
from transformers import AutoTokenizer, AutoModelForTokenClassification
from utils.entity_prediction import extract_entities, db_extract_entities
#version check is for mol almanac
# from utils.check_db_version import get_local_version
# _VERSION=get_local_version()
# print(_VERSION)

def split_ranges(max_int, num_ranges, samples=None, seed=42):
    random.seed(seed)
    step = math.ceil(max_int / num_ranges)
    ranges = [(i*step+1, min((i+1)*step, max_int)) for i in range(num_ranges)]
    
    if samples is None:
        return ranges
    
    # split samples evenly across ranges
    n_per_range = samples // num_ranges
    remainder = samples % num_ranges
    
    result = []
    for i, r in enumerate(ranges):
        count = n_per_range + (1 if i < remainder else 0)
        result.extend(random.randint(r[0], r[1]) for _ in range(count))
    
    return result

### context snippets

In [6]:
structured_db = pd.read_csv('external-validation/non-moa-database/civic/civic-db/civic_core__2025-09.csv')
db = structured_db[['statement_id']]
statements = pd.read_csv('external-validation/non-moa-database/civic/civic-db/civic-draft.dereferenced.unique.context_db.csv')
statement_dict = dict(zip(statements['statement_id'], statements['context']))
db['evidence_statement'] = db['statement_id'].map(statement_dict)
print(db.shape)
db.head(1)

(1569, 2)


Unnamed: 0,statement_id,evidence_statement
0,17,Approval level: C\nDatabase: CIVIC\nDescriptio...


In [7]:
# db = pd.read_csv('external-validation/non-moa-database/civic/civic-db/evidence_statements.csv')
# print(db.shape)
# db.head(1)

In [8]:
_VERSION = 'civic-202509'
_CONTEXT = db.evidence_statement.tolist()
test_ind = split_ranges(len(_CONTEXT), 10, 20)
for i, e in enumerate(_CONTEXT):
    if i in test_ind:
        print(f"======={i}=======")
        print(e)
        print(extract_entities(e, model, tokenizer))

Approval level: D
Database: CIVIC
Description: CH5424802 (Alectinib) is effective in inhibiting the activity of the F1174L ALK mutant in a kinase activity assay and proliferation assay using neuroblastoma KELLY cells.
Cancer type: Neuroblastoma
Biomarkers: ALK F1174LTherapy: AlectinibApproval url: https://civicdb.org/links/evidence_items/37
Updated date: 2023-01-09 21:46:24 UTC
{'cancer_type': ['cancer', 'neuroblastoma'], 'biomarker': ['alk']}
Approval level: D
Database: CIVIC
Description: CH5424802 treatment resulted in significant tumor regression in xenograft models produced from Ba/F3 cells expressing EML4-ALK or EML4-ALK with the L1196M mutation.
Cancer type: Lung Non-small Cell Carcinoma
Biomarkers: EML4::ALK Fusion, ALK L1196MTherapy: AlectinibApproval url: https://civicdb.org/links/evidence_items/141
Updated date: 2023-02-21 22:30:01 UTC
{'cancer_type': ['cancer', 'lung non-small cell carcinoma', 'xenograft', 'tumor'], 'biomarker': ['eml4-alk', 'eml4', 'alk']}
Approval level: C

In [None]:
#run BioBert NER on all contexts
context_extracted_entities = []
for ctx in tqdm(_CONTEXT):
    context_extracted_entities.append(extract_entities(ctx, model, tokenizer))
context_extracted_entities

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 1569/1569 [01:59<00:00, 13.17it/s]


[{'cancer_type': ['cancer', 'lung non-small cell carcinoma'],
  'biomarker': ['araf']},
 {'cancer_type': ['cancer', 'aml', 'acute myeloid leukemia'],
  'biomarker': ['dnmt3a']},
 {'cancer_type': ['cancer', 'polycythemia'], 'biomarker': ['jak2']},
 {'cancer_type': ['cancer', 'polycythemia'],
  'biomarker': ['jak2617f', 'jak2', 'stat5']},
 {'cancer_type': ['malignant melanoma', 'cancer', 'melanoma'],
  'biomarker': ['hsp90', 'nras']},
 {'cancer_type': ['cancer', 'melanoma'], 'biomarker': ['nras']},
 {'cancer_type': ['cancer', 'melanoma'], 'biomarker': ['nras']},
 {'cancer_type': ['cancer', 'neuroblastoma'], 'biomarker': ['alk']},
 {'cancer_type': ['cancer', 'neuroblastoma'], 'biomarker': ['eml4', 'alk']},
 {'cancer_type': ['cancer'], 'biomarker': ['alk']},
 {'cancer_type': ['cancer', 'lung non-small cell carcinoma'],
  'biomarker': ['mek', 'araf s214ctherapy', 'arf', 'araf s214c']},
 {'cancer_type': ['cancer', 'lung non-small cell carcinoma'],
  'biomarker': ['araf s214ctherapy', 'arf', 

In [10]:
no_cancer_idx = []
no_biomarker_idx = []
for i, e in tqdm(enumerate(context_extracted_entities)):
    if not e['cancer_type']:
        no_cancer_idx.append(i)
    if not e['biomarker']:
        no_biomarker_idx.append(i)

print(f"{len(no_cancer_idx)*100/len(_CONTEXT):.3f}%") #no cancer entity extracted %
print(f"{len(no_biomarker_idx)*100/len(_CONTEXT):.3f}%") #no biomarker entity extracted %

1569it [00:00, 1617714.60it/s]

0.000%
0.064%





In [11]:
#save
os.makedirs("context_retriever/entities", exist_ok=True)
with open(f"context_retriever/entities/civic_db_context_ner_entities__{_VERSION}.json", "w") as f:
    json.dump(context_extracted_entities, f, indent=4)