In [1]:
import pandas as pd 
import numpy as np
import os, sys
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

In [2]:
while os.getcwd().split('/')[-1] != 'rag-llm-cancer-paper':
    os.chdir('..')

### setup biobert

In [3]:
biobert_dir = "context_retriever/biobert_ner"
#download model from huggingface
from transformers import AutoTokenizer, AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained("judithrosell/BioBERT_BioNLP13CG_NER_new")
tokenizer = AutoTokenizer.from_pretrained("judithrosell/BioBERT_BioNLP13CG_NER_new")
id2label = model.config.id2label

#save locally
model.save_pretrained(biobert_dir)
tokenizer.save_pretrained(biobert_dir)

('context_retriever/biobert_ner/tokenizer_config.json',
 'context_retriever/biobert_ner/special_tokens_map.json',
 'context_retriever/biobert_ner/vocab.txt',
 'context_retriever/biobert_ner/added_tokens.json',
 'context_retriever/biobert_ner/tokenizer.json')

In [4]:
#load locally
model = AutoModelForTokenClassification.from_pretrained(biobert_dir)
tokenizer = AutoTokenizer.from_pretrained(biobert_dir)
id2label = model.config.id2label

### load other libraries

In [5]:
# !pip install rank_bm25
import json
import pandas as pd
import random
import math
from transformers import AutoTokenizer, AutoModelForTokenClassification
from utils.entity_prediction import extract_entities, db_extract_entities
#version check is for mol almanac
# from utils.check_db_version import get_local_version
# _VERSION=get_local_version()
# print(_VERSION)

def split_ranges(max_int, num_ranges, samples=None, seed=42):
    random.seed(seed)
    step = math.ceil(max_int / num_ranges)
    ranges = [(i*step+1, min((i+1)*step, max_int)) for i in range(num_ranges)]
    
    if samples is None:
        return ranges
    
    # split samples evenly across ranges
    n_per_range = samples // num_ranges
    remainder = samples % num_ranges
    
    result = []
    for i, r in enumerate(ranges):
        count = n_per_range + (1 if i < remainder else 0)
        result.extend(random.randint(r[0], r[1]) for _ in range(count))
    
    return result

### context snippets

In [6]:
db = pd.read_csv('external-validation/non-moa-database/civic/civic-db/evidence_statements.csv')
db.head(1)

Unnamed: 0,evidence_statement,evidence_civic_url
0,Daunorubicin treatment resulted in similar ove...,https://civicdb.org/links/evidence_items/11


In [7]:
_VERSION = 'civic-202509'
_CONTEXT = db.evidence_statement.tolist()
test_ind = split_ranges(len(_CONTEXT), 10, 20)
for i, e in enumerate(_CONTEXT):
    if i in test_ind:
        print(f"======={i}=======")
        print(e)
        print(extract_entities(e, model, tokenizer))

Ba/F3 cells expressing the RANBP2-ALK fusion containing an F1174L mutation were more resistant to crizotinib treatment than Ba/F3 cells expressing RANBP2-ALK without this mutation.
{'cancer_type': [], 'biomarker': ['ranbp2', 'alk']}
Chemotherapy-refractory patients with metastatic colorectal cancer harboring KRAS mutations (primarily G12/G13; n=230 out of 253 total KRAS mutant patients) had lower response rates (17/253; 6.7% vs KRAS wt 126/352; 35.8%; P < .0001), disease control rates and shorter progression free and overall survival following cetuximab plus chemotherapy than those with wildtype KRAS. Authors note that these patients were treated with cetuximab prior to widespead adoption of regular KRAS mutational status screening. Patients treated with cetuximab or panitumumab monotherapy were not included.
{'cancer_type': ['colorectal cancer'], 'biomarker': ['kras']}
In this phase 2 trial, treatment-naive, ERBB2-positive (by IHC) breast cancer patients with stage IIIA, B, C or infla

In [8]:
#run BioBert NER on all contexts
context_extracted_entities = []
for ctx in tqdm(_CONTEXT):
    context_extracted_entities.append(extract_entities(ctx, model, tokenizer))
context_extracted_entities

  0%|          | 0/2714 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 2714/2714 [02:43<00:00, 16.58it/s]


[{'cancer_type': ['aml'], 'biomarker': ['dnmt3a']},
 {'cancer_type': [], 'biomarker': ['perk', 'map2k1']},
 {'cancer_type': [], 'biomarker': ['perk', 'map2k1']},
 {'cancer_type': [], 'biomarker': ['nras']},
 {'cancer_type': ['gist cancer'], 'biomarker': []},
 {'cancer_type': ['cancer'], 'biomarker': ['pdgfra']},
 {'cancer_type': [], 'biomarker': []},
 {'cancer_type': ['aml'], 'biomarker': ['dnmt3a']},
 {'cancer_type': [], 'biomarker': ['ifn-alpha-2a', 'jak2']},
 {'cancer_type': [], 'biomarker': ['jak2617f', 'stat5']},
 {'cancer_type': [], 'biomarker': ['hsp90', 'nras']},
 {'cancer_type': ['melanoma'], 'biomarker': ['nras']},
 {'cancer_type': ['melanoma'], 'biomarker': []},
 {'cancer_type': [], 'biomarker': ['ranbp2', 'alk']},
 {'cancer_type': [], 'biomarker': ['alk', 'eml4-alk', 'eml4']},
 {'cancer_type': [], 'biomarker': ['kit']},
 {'cancer_type': ['nsclc'], 'biomarker': ['kras', 'egfr']},
 {'cancer_type': ['colorectal cancer'], 'biomarker': ['kras', 'nras']},
 {'cancer_type': [], 'bi

In [12]:
no_cancer_idx = []
no_biomarker_idx = []
for i, e in tqdm(enumerate(context_extracted_entities)):
    if not e['cancer_type']:
        no_cancer_idx.append(i)
    if not e['biomarker']:
        no_biomarker_idx.append(i)

print(f"{len(no_cancer_idx)*100/len(_CONTEXT):.3f}%") #no cancer entity extracted %
print(f"{len(no_biomarker_idx)*100/len(_CONTEXT):.3f}%") #no biomarker entity extracted %

2714it [00:00, 1246806.25it/s]

32.867%
2.063%





In [10]:
#save
os.makedirs("context_retriever/entities", exist_ok=True)
with open(f"context_retriever/entities/civic_db_context_ner_entities__{_VERSION}.json", "w") as f:
    json.dump(context_extracted_entities, f, indent=4)