In [1]:
import sys
import os
# Treat the parent of the current working directory as the project root and put
# it on sys.path (presumably so the project-local `utils` package can be imported).
script_dir = os.getcwd()
root_dir = os.path.dirname(os.path.abspath(script_dir))
sys.path.append(root_dir)

import json
import pandas as pd
import numpy as np
from utils.io import load_object, save_object
from utils.evaluation import calc_eval_metrics
import ast
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import colorcet as cc
from statsmodels.stats.multitest import multipletests
import scispacy
import spacy
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoModel
import torch

In [None]:
# Spot-check entity extraction on a random sample of context entries.
# NOTE(review): `_CONTEXT` and `extract_entities` are not defined anywhere in the
# visible notebook, and `split_ranges` is defined in a later cell - confirm they
# exist in the kernel before running this cell.
test_ind = split_ranges(len(_CONTEXT), 10, 20)
for i, e in enumerate(_CONTEXT):
    if i not in test_ind:
        continue
    print(f"======={i}=======")
    print(e)
    print(extract_entities(e, model, tokenizer))

If a patient with advanced or metastatic invasive breast carcinoma cancer has pr positive, her2-negative, one recommended therapy is anastrozole + abemaciclib. therapy type: hormone therapy + targeted therapy. therapy strategy: aromatase inhibition + cdk4/6 inhibition. indication: verzenio is a kinase inhibitor indicated in combination with an aromatase inhibitor as initial endocrine-based therapy for the treatment of adult patients with hormone receptor (hr)-positive, human epidermal growth factor receptor 2 (her2)-negative advanced or metastatic breast cancer. approval url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2023/208716s010s011lbl.pdf
{'cancer_type': ['breast carcinoma cancer', 'breast cancer'], 'biomarker': ['pr', 'her2', 'aromatase', 'cdk4', None, 'aromatase', 'hormone receptor', 'hr', 'human epidermal growth factor receptor 2', 'her2']}
If a patient with metastatic castration-resistant prostate adenocarcinoma cancer has bard1 oncogenic variants, one recommended t

### Load MOAlmanac data and synthetic queries

In [2]:
# Legacy MOAlmanac FDA query/answer table; the first CSV column becomes the index.
old_moalmanac_db = pd.read_csv(
    os.path.join(root_dir, 'data/moa_fda_queries_answers.csv'),
    index_col=0,
)
# Dict whose keys are the synthetic query strings (the values are not inspected here).
# NOTE(review): this is a .pkl file - only load it from a trusted source.
synthetic_prompt_groundtruth_dict = load_object(
    os.path.join(root_dir, 'data/synthetic_prompt_groundtruth_dict.pkl')
)

### Testing NER using scispacy on synthetic queries

In [None]:
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bionlp13cg_md-0.5.4.tar.gz
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_scibert-0.5.4.tar.gz

In [None]:
# Load the two scispaCy NER pipelines that are compared below.
bionlp13cg_nlp, bc5cdr_nlp = (
    spacy.load(pipeline_name)
    for pipeline_name in ("en_ner_bionlp13cg_md", "en_ner_bc5cdr_md")
)

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [None]:
# Run both scispaCy pipelines over every synthetic query and collect the
# detected entity spans, one entry per query, in dict-key order.
spacy_ner_bionlp13cg_entities = []
spacy_ner_bc5cdr_entities = []
# spacy_ner_scibert_entities=[]

# Iterate over the keys directly: rebuilding list(dict.keys()) on every
# iteration just to index into it made this loop quadratic.
for text in synthetic_prompt_groundtruth_dict:
    spacy_ner_bionlp13cg_entities.append(bionlp13cg_nlp(text).ents)
    spacy_ner_bc5cdr_entities.append(bc5cdr_nlp(text).ents)
    

In [6]:
# One row per synthetic query, with the entity spans from each scispaCy
# pipeline placed side by side for comparison.
ner_entities = pd.DataFrame(dict(
    synthetic_queries=synthetic_prompt_groundtruth_dict.keys(),
    spacy_ner_bionlp13cg_entities=spacy_ner_bionlp13cg_entities,
    spacy_ner_bc5cdr_entities=spacy_ner_bc5cdr_entities,
))

### Load BioBERT finetuned for NER 

Pre-trained Models (saved in `/mnt/disks/moa-llm/models`):
- Ishan0612/biobert-ner-disease-ncbi (https://huggingface.co/Ishan0612/biobert-ner-disease-ncbi): extracts disease
- judithrosell/BioNLP13CG_bioBERT_NER (https://huggingface.co/judithrosell/BioNLP13CG_bioBERT_NER): fine-tuned version of biobert v1.1
- judithrosell/BioBERT_BioNLP13CG_NER_new

In [3]:
# Load the BioBERT NER checkpoint (token-classification head) and its tokenizer
# from the Hugging Face hub; the two loads are independent of each other.
tokenizer = AutoTokenizer.from_pretrained("judithrosell/BioBERT_BioNLP13CG_NER_new")
model = AutoModelForTokenClassification.from_pretrained("judithrosell/BioBERT_BioNLP13CG_NER_new")
# Class-index -> tag-string mapping taken from the model config.
id2label = model.config.id2label

### Define BioBERT NER function

In [4]:
# function to predict entities using BioBERT
def ner_predict_entities(text, model, tokenizer):
    """Tag ``text`` with a token-classification model and decode BIO tags into entities.

    Args:
        text: Input string to tag.
        model: Token-classification model. It is called as ``model(**inputs)``;
            the output must expose ``.logits`` and the model must expose
            ``config.id2label``.
        tokenizer: Matching tokenizer. Word-piece continuation tokens are
            expected to start with ``##``.

    Returns:
        list[dict]: one ``{"type": <tag without B-/I- prefix>, "text": <tokens
        joined by spaces, word pieces glued on>}`` per entity, in reading order.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # predictions
    with torch.no_grad():
        logits = model(**inputs).logits
    predictions = torch.argmax(logits, dim=-1)

    # Use the label map of the model that was actually passed in, not a
    # module-level `id2label`: the notebook later rebinds `model` to a different
    # checkpoint, and a stale global would silently mislabel every token.
    id2label = model.config.id2label
    predicted_labels = [id2label[pred.item()] for pred in predictions[0]]

    # align predictions with the tokenizer's tokens
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    special_tokens = (tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token)

    # extract entities
    entities = []
    current_entity = None
    for token, label in zip(tokens, predicted_labels):
        # skip special tokens
        if token in special_tokens:
            continue

        # word-piece continuation: glue onto the open entity, if any
        if token.startswith("##"):
            if current_entity:
                current_entity["text"] += token[2:]  # remove '##' prefix
            continue

        if label.startswith("B-"):
            if current_entity:
                entities.append(current_entity)
            current_entity = {"type": label[2:], "text": token}  # remove B- prefix
        elif label.startswith("I-") and current_entity and current_entity["type"] == label[2:]:
            current_entity["text"] += " " + token
        else:
            # "O", an orphan I- tag, or an I- tag of a different type all end the
            # open entity. Previously a mismatched I- tag left it open, so the
            # skipped word's '##' pieces were appended to the wrong entity.
            if current_entity:
                entities.append(current_entity)
                current_entity = None

    # last entity
    if current_entity:
        entities.append(current_entity)
    return entities

Testing on small samples

In [None]:
# Smoke-test the BioBERT decoder on the first synthetic query.
sample_text = list(synthetic_prompt_groundtruth_dict.keys())[0]
entities = ner_predict_entities(sample_text, model, tokenizer)
print(sample_text, entities, sep="\n")

If a chronic myelogenous leukemia patient has a rearrangement in gene BCR and ABL1, and is chronic, accelerated, or blast phase, what would be the suggested lines of treatment?
[{'type': 'Cancer', 'text': 'chronic myelogenous leukemia'}, {'type': 'Organism', 'text': 'patient'}, {'type': 'Gene_or_gene_product', 'text': 'BCR'}, {'type': 'Gene_or_gene_product', 'text': 'ABL1'}]


## Rule-based retrieval based on matching key features

The goal is to retrieve and prerank relevant context chunks based on matching key features that are critical for treatment decision-making.

The key features we will use are:
- cancer type
- biomarker

Below is the context database generated from the latest release of the MOAlmanac database.

In [None]:
# Read the context database built from the latest MOAlmanac draft and preview it.
moalmanac_new_draft = pd.read_csv(
    os.path.join(
        root_dir, "data", "latest_db",
        "moalmanac-draft.dereferenced.unique.context_db.csv",
    )
)
moalmanac_new_draft.head()

Unnamed: 0,statement_id,approval_status,approval_org,description,indication,cancer_type,biomarker,therapy_drug,therapy_approach,therapy_strategy,therapy_type,approval_url,publication_date,context
0,0,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,Verzenio is a kinase inhibitor indicated in co...,Invasive Breast Carcinoma,"ER positive [present], HER2-negative [present]","['Abemaciclib', 'Tamoxifen']",Combination therapy,"['CDK4/6 inhibition', 'Estrogen receptor inhib...","['Targeted therapy', 'Hormone therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,Approval status: Regulatory approval (fda)\nDe...
1,1,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,Verzenio is a kinase inhibitor indicated in co...,Invasive Breast Carcinoma,"HER2-negative [present], PR positive [present]","['Abemaciclib', 'Tamoxifen']",Combination therapy,"['CDK4/6 inhibition', 'Estrogen receptor inhib...","['Targeted therapy', 'Hormone therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,Approval status: Regulatory approval (fda)\nDe...
2,2,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,Verzenio is a kinase inhibitor indicated in co...,Invasive Breast Carcinoma,"ER positive [present], HER2-negative [present]...","['Abemaciclib', 'Tamoxifen']",Combination therapy,"['CDK4/6 inhibition', 'Estrogen receptor inhib...","['Targeted therapy', 'Hormone therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,Approval status: Regulatory approval (fda)\nDe...
3,3,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,Verzenio is a kinase inhibitor indicated in co...,Invasive Breast Carcinoma,"ER positive [present], HER2-negative [present]","['Anastrozole', 'Abemaciclib']",Combination therapy,"['Aromatase inhibition', 'CDK4/6 inhibition']","['Hormone therapy', 'Targeted therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,Approval status: Regulatory approval (fda)\nDe...
4,4,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,Verzenio is a kinase inhibitor indicated in co...,Invasive Breast Carcinoma,"HER2-negative [present], PR positive [present]","['Anastrozole', 'Abemaciclib']",Combination therapy,"['Aromatase inhibition', 'CDK4/6 inhibition']","['Hormone therapy', 'Targeted therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,Approval status: Regulatory approval (fda)\nDe...


#### BioBERT NER extraction and standardization of moalmanac entities

Here, I'm going to:
- extract cleaned gene symbols using BioBERT NER extraction
- if nothing is extracted by BioBERT, fallback to the original biomarker (e.g., broad genomic markers like dMMR and MSI-H are not recognized by BioBERT)

In [5]:
import re

def clean_biomarker(text):
    """Normalize a biomarker string so that spelling variants compare equal.

    Bracketed annotations such as ``[present]`` (and the whitespace before them)
    are removed, then all spaces are dropped and the result is upper-cased,
    e.g. ``"PD - L1"`` -> ``"PD-L1"`` and ``"c-Met"`` -> ``"C-MET"``.

    Args:
        text: Raw biomarker text.

    Returns:
        The normalized string, or ``None`` when ``text`` is not a string (for
        example a NaN from an empty DataFrame cell) or when at most one
        character is left after cleaning (e.g. the lone ``"v"`` of a fusion).
    """
    # A missing DataFrame cell arrives as float NaN; re.sub would raise on it.
    if not isinstance(text, str):
        return None

    txt = re.sub(r"\s*\[.*?\]", "", text).strip()
    txt = txt.replace(' ', '').upper()  # ["v", "ALK"], ["PD - L1"], ["c-Met"]

    # ignore if it's too short (e.g. 1 character long)
    if len(txt) <= 1:
        return None
    return txt


def db_extract_entities(db, cancer_col='cancer_type', biomarker_col='biomarker'):
    """Build the ``{'cancer_type': [...], 'biomarker': [...]}`` dict for one database row.

    The cancer type is copied as-is. Biomarkers are extracted from the biomarker
    text with ``ner_predict_entities`` (using the module-level ``model`` and
    ``tokenizer``) and normalized with ``clean_biomarker``. When the NER model
    returns nothing, the whole biomarker text is cleaned and used instead.

    Args:
        db: Row-like object indexable by column name (e.g. a row from ``iterrows``).
        cancer_col: Name of the cancer-type field.
        biomarker_col: Name of the biomarker field.

    Returns:
        dict with keys ``'cancer_type'`` (single-element list) and ``'biomarker'``
        (de-duplicated list of cleaned strings, never containing ``None``).
    """
    entities_dict = {'cancer_type': [], 'biomarker': []}

    # append cancer type (pre-standardized)
    entities_dict['cancer_type'].append(db[cancer_col])

    raw_biomarker = db[biomarker_col]
    # A missing biomarker cell (NaN/None) cannot be tokenized; leave the list empty.
    if not isinstance(raw_biomarker, str):
        return entities_dict

    # extract gene names using biobert
    b_list = ner_predict_entities(raw_biomarker, model, tokenizer)

    # if nothing's been extracted, fall back to the original biomarker text
    candidates = [b['text'] for b in b_list] if b_list else [raw_biomarker]

    for candidate in candidates:
        # clean and standardize; skip rejected values (None) on BOTH paths so
        # the fallback can no longer put None into the list
        cleaned = clean_biomarker(candidate)
        if cleaned is not None and cleaned not in entities_dict['biomarker']:
            entities_dict['biomarker'].append(cleaned)

    return entities_dict


def db_standardize_entities(entities_dict):
    """Return a copy of ``entities_dict`` with every cancer type upper-cased.

    This puts cancer types and biomarkers in the same (upper) case. The input
    dict is left untouched, so the raw and standardized versions stay distinct
    objects for the caller.

    Args:
        entities_dict: dict with at least a ``'cancer_type'`` list.

    Returns:
        A new dict with the same keys; ``'cancer_type'`` is a new upper-cased
        list, other values are carried over unchanged. Non-string cancer types
        (e.g. NaN) are passed through instead of raising.
    """
    standardized = dict(entities_dict)
    standardized['cancer_type'] = [
        cancer.upper() if isinstance(cancer, str) else cancer
        for cancer in entities_dict['cancer_type']
    ]
    return standardized

In [None]:
#split ranges for testing on small batches
import random
import math

def split_ranges(max_int, num_ranges, samples=None):
    """Split the positions ``0 .. max_int - 1`` into contiguous ranges and optionally sample from them.

    Args:
        max_int: Number of positions (e.g. ``len(collection)``).
        num_ranges: Desired number of ranges. Fewer are produced when
            ``max_int`` is too small to fill all of them.
        samples: If ``None``, the ranges themselves are returned. Otherwise the
            total number of random positions to draw, spread as evenly as
            possible across the ranges (duplicates are possible).

    Returns:
        ``samples is None``: list of ``(first, last)`` tuples, 0-based and inclusive.
        otherwise: list of ``samples`` ints, each a valid 0-based position.
    """
    if max_int <= 0:
        return []

    step = math.ceil(max_int / num_ranges)
    # 0-based bounds: callers use the results as positions into a sequence or
    # DataFrame, so position 0 must be reachable and `max_int` must not be.
    # Stepping through range(0, max_int, step) also never yields an empty
    # range, which previously made random.randint raise (e.g. 11 items, 10 ranges).
    ranges = [
        (first, min(first + step, max_int) - 1)
        for first in range(0, max_int, step)
    ]

    if samples is None:
        return ranges
    if not ranges:
        return []

    # split samples evenly across ranges; the first `remainder` ranges get one extra
    n_per_range, remainder = divmod(samples, len(ranges))

    result = []
    for i, (first, last) in enumerate(ranges):
        count = n_per_range + (1 if i < remainder else 0)
        result.extend(random.randint(first, last) for _ in range(count))

    return result

# Draw 50 row positions spread over 10 slices of the new MOAlmanac table.
test_idx_list = split_ranges(len(moalmanac_new_draft), 10, 50)

Run on all moalmanac entities (old)

In [None]:
# For every row of the legacy table: extract entities, store the raw and the
# standardized dict as JSON strings in two columns, and echo progress.
for idx, row in old_moalmanac_db.iterrows():
    entities_dict = db_extract_entities(row)
    # Serialize the raw dict first so the stored copy keeps its original casing.
    old_moalmanac_db.at[idx, 'entities_dict'] = json.dumps(entities_dict)

    standardized_entities_dict = db_standardize_entities(entities_dict)
    old_moalmanac_db.at[idx, 'standardized_entities_dict'] = json.dumps(standardized_entities_dict)

    print(idx, row['cancer_type'], row['biomarker'], standardized_entities_dict, sep="\n")
    

Run on all moalmanac entities (new)

In [415]:
# For every row of the new draft table: extract entities, store the raw and the
# standardized dict as JSON strings in two columns, and echo progress.
for idx, row in moalmanac_new_draft.iterrows():
    entities_dict = db_extract_entities(row)
    # Serialize the raw dict first so the stored copy keeps its original casing.
    moalmanac_new_draft.at[idx, 'entities_dict'] = json.dumps(entities_dict)

    standardized_entities_dict = db_standardize_entities(entities_dict)
    moalmanac_new_draft.at[idx, 'standardized_entities_dict'] = json.dumps(standardized_entities_dict)

    print(idx, row['cancer_type'], row['biomarker'], standardized_entities_dict, sep="\n")
    

0
Invasive Breast Carcinoma
ER positive [present], HER2-negative [present]
{'cancer_type': ['INVASIVE BREAST CARCINOMA'], 'biomarker': ['ER', 'HER2']}
1
Invasive Breast Carcinoma
HER2-negative [present], PR positive [present]
{'cancer_type': ['INVASIVE BREAST CARCINOMA'], 'biomarker': ['HER2', 'PR']}
2
Invasive Breast Carcinoma
ER positive [present], HER2-negative [present], PR positive [present]
{'cancer_type': ['INVASIVE BREAST CARCINOMA'], 'biomarker': ['ER', 'HER2', 'PR']}
3
Invasive Breast Carcinoma
ER positive [present], HER2-negative [present]
{'cancer_type': ['INVASIVE BREAST CARCINOMA'], 'biomarker': ['ER', 'HER2']}
4
Invasive Breast Carcinoma
HER2-negative [present], PR positive [present]
{'cancer_type': ['INVASIVE BREAST CARCINOMA'], 'biomarker': ['HER2', 'PR']}
5
Invasive Breast Carcinoma
ER positive [present], HER2-negative [present], PR positive [present]
{'cancer_type': ['INVASIVE BREAST CARCINOMA'], 'biomarker': ['ER', 'HER2', 'PR']}
6
Invasive Breast Carcinoma
ER posit

In [416]:
# Display the table, now including the `entities_dict` and
# `standardized_entities_dict` columns added by the loop above.
moalmanac_new_draft

Unnamed: 0,statement_id,approval_status,approval_org,description,indication,cancer_type,biomarker,therapy_drug,therapy_approach,therapy_strategy,therapy_type,approval_url,publication_date,context,entities_dict,standardized_entities_dict
0,0,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,Verzenio is a kinase inhibitor indicated in co...,Invasive Breast Carcinoma,"ER positive [present], HER2-negative [present]","['Abemaciclib', 'Tamoxifen']",Combination therapy,"['CDK4/6 inhibition', 'Estrogen receptor inhib...","['Targeted therapy', 'Hormone therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,Approval status: Regulatory approval (fda)\nDe...,"{""cancer_type"": [""Invasive Breast Carcinoma""],...","{""cancer_type"": [""INVASIVE BREAST CARCINOMA""],..."
1,1,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,Verzenio is a kinase inhibitor indicated in co...,Invasive Breast Carcinoma,"HER2-negative [present], PR positive [present]","['Abemaciclib', 'Tamoxifen']",Combination therapy,"['CDK4/6 inhibition', 'Estrogen receptor inhib...","['Targeted therapy', 'Hormone therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,Approval status: Regulatory approval (fda)\nDe...,"{""cancer_type"": [""Invasive Breast Carcinoma""],...","{""cancer_type"": [""INVASIVE BREAST CARCINOMA""],..."
2,2,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,Verzenio is a kinase inhibitor indicated in co...,Invasive Breast Carcinoma,"ER positive [present], HER2-negative [present]...","['Abemaciclib', 'Tamoxifen']",Combination therapy,"['CDK4/6 inhibition', 'Estrogen receptor inhib...","['Targeted therapy', 'Hormone therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,Approval status: Regulatory approval (fda)\nDe...,"{""cancer_type"": [""Invasive Breast Carcinoma""],...","{""cancer_type"": [""INVASIVE BREAST CARCINOMA""],..."
3,3,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,Verzenio is a kinase inhibitor indicated in co...,Invasive Breast Carcinoma,"ER positive [present], HER2-negative [present]","['Anastrozole', 'Abemaciclib']",Combination therapy,"['Aromatase inhibition', 'CDK4/6 inhibition']","['Hormone therapy', 'Targeted therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,Approval status: Regulatory approval (fda)\nDe...,"{""cancer_type"": [""Invasive Breast Carcinoma""],...","{""cancer_type"": [""INVASIVE BREAST CARCINOMA""],..."
4,4,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,Verzenio is a kinase inhibitor indicated in co...,Invasive Breast Carcinoma,"HER2-negative [present], PR positive [present]","['Anastrozole', 'Abemaciclib']",Combination therapy,"['Aromatase inhibition', 'CDK4/6 inhibition']","['Hormone therapy', 'Targeted therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2023-03-03,Approval status: Regulatory approval (fda)\nDe...,"{""cancer_type"": [""Invasive Breast Carcinoma""],...","{""cancer_type"": [""INVASIVE BREAST CARCINOMA""],..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
579,624,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,OPDIVO is a programmed death receptor-1 (PD-1)...,Colorectal Adenocarcinoma,MSI-H [present],"['Ipilimumab', 'Nivolumab']",Combination therapy,"['CTLA-4 inhibition', 'PD-1/PD-L1 inhibition']","['Immunotherapy', 'Immunotherapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2025-04-11,Approval status: Regulatory approval (fda)\nDe...,"{""cancer_type"": [""Colorectal Adenocarcinoma""],...","{""cancer_type"": [""COLORECTAL ADENOCARCINOMA""],..."
580,625,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,IBTROZI is a kinase inhibitor indicated for th...,Non-Small Cell Lung Cancer,v::ROS1 [present],['Taletrectinib'],Monotherapy,"['NTRK inhibition', 'ROS1 inhibition']",['Targeted therapy'],https://www.accessdata.fda.gov/drugsatfda_docs...,2025-06-11,Approval status: Regulatory approval (fda)\nDe...,"{""cancer_type"": [""Non-Small Cell Lung Cancer""]...","{""cancer_type"": [""NON-SMALL CELL LUNG CANCER""]..."
581,626,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,EMRELIS is a c-Met-directed antibody and micro...,Lung Non-Squamous Non-Small Cell Carcinoma,c-Met >= 50% [present],['Telisotuzumab Vedotin'],Monotherapy,['MET inhibition'],['Targeted therapy'],https://www.accessdata.fda.gov/drugsatfda_docs...,2025-05-14,Approval status: Regulatory approval (fda)\nDe...,"{""cancer_type"": [""Lung Non-Squamous Non-Small ...","{""cancer_type"": [""LUNG NON-SQUAMOUS NON-SMALL ..."
582,627,Regulatory approval,fda,The U.S. Food and Drug Administration (FDA) gr...,"AVMAPKI FAKZYNJA CO-PACK, a combination of avu...",Low-Grade Serous Ovarian Cancer,KRAS p.G12C [present],"['Avutometinib', 'Defactinib']",Combination therapy,"['MEK inhibition', 'RAF inhibition', 'FAK inhi...","['Targeted therapy', 'Targeted therapy']",https://www.accessdata.fda.gov/drugsatfda_docs...,2025-05-08,Approval status: Regulatory approval (fda)\nDe...,"{""cancer_type"": [""Low-Grade Serous Ovarian Can...","{""cancer_type"": [""LOW-GRADE SEROUS OVARIAN CAN..."


In [417]:
# Persist the table with the entity columns (row index not written).
moalmanac_new_draft.to_csv(
    os.path.join(
        root_dir, "data", "latest_db",
        "moalmanac-draft.dereferenced.unique.context_db.standardized_entities.csv",
    ),
    index=False,
)

#### BioBERT NER extraction of entities from synthetic user queries

In [11]:
# Synthetic user queries: the `prompt` column of the legacy table as an array.
synthetic_query = old_moalmanac_db.loc[:, 'prompt'].values

In [15]:
#extract BioBERT NER from synthetic user queries and standardize entities
def query_biobert_ner(query, model, tokenizer):
    """Extract cancer-type and biomarker entities from a user query.

    Args:
        query: Query text.
        model, tokenizer: Passed through to ``ner_predict_entities``.

    Returns:
        tuple ``(entities, entities_dict)``: the raw entity list from
        ``ner_predict_entities`` and a dict that may contain ``'cancer_type'``
        (upper-cased 'Cancer' entities) and ``'biomarker'`` (normalized
        'Gene_or_gene_product' entities). A key is present only if at least
        one entity of that kind was found.
    """
    entities = ner_predict_entities(query, model, tokenizer)
    entities_dict = {}

    for e in entities:
        if e['type'] == 'Cancer':
            entities_dict.setdefault('cancer_type', []).append(e['text'].upper())
        elif e['type'] == 'Gene_or_gene_product':
            # Normalize exactly as the database side does (clean_biomarker) so
            # that e.g. "PD - L1" from the tokenizer matches "PD-L1" in the DB.
            biomarker = clean_biomarker(e['text'])
            if biomarker is not None:
                entities_dict.setdefault('biomarker', []).append(biomarker)
    return entities, entities_dict
    
#run for all synthetic queries, echoing every 10th result as a progress check
biobert_entities_list = []
standardized_entities_dict_list = []
for i, query in enumerate(synthetic_query):
    biobert_entities, standardized_entities_dict = query_biobert_ner(query, model, tokenizer)
    biobert_entities_list.append(biobert_entities)
    standardized_entities_dict_list.append(standardized_entities_dict)

    if i % 10 != 0:
        continue
    print(i, biobert_entities, standardized_entities_dict, sep="\n")


0
[{'type': 'Cancer', 'text': 'chronic myelogenous leukemia'}, {'type': 'Organism', 'text': 'patient'}, {'type': 'Gene_or_gene_product', 'text': 'BCR'}, {'type': 'Gene_or_gene_product', 'text': 'ABL1'}]
{'cancer_type': ['CHRONIC MYELOGENOUS LEUKEMIA'], 'biomarker': ['BCR', 'ABL1']}
10
[{'type': 'Cancer', 'text': 'cholangiocarcinoma'}, {'type': 'Organism', 'text': 'patient'}, {'type': 'Gene_or_gene_product', 'text': 'FGFR2'}]
{'cancer_type': ['CHOLANGIOCARCINOMA'], 'biomarker': ['FGFR2']}
20
[{'type': 'Cancer', 'text': 'myelodysplastic'}, {'type': 'Organism', 'text': 'patient'}, {'type': 'Gene_or_gene_product', 'text': 'PDGFRA'}]
{'cancer_type': ['MYELODYSPLASTIC'], 'biomarker': ['PDGFRA']}
30
[{'type': 'Cancer', 'text': 'melanoma'}, {'type': 'Organism', 'text': 'patient'}, {'type': 'Gene_or_gene_product', 'text': 'BRAF'}]
{'cancer_type': ['MELANOMA'], 'biomarker': ['BRAF']}
40
[{'type': 'Cancer', 'text': 'colorectal cancer'}, {'type': 'Organism', 'text': 'patient'}, {'type': 'Gene_or_g

In [18]:
# synthetic_query_ner.to_csv(f"{root_dir}/data/synthetic_query_ner_entities.csv", index=False)
# Create the output folder first: open(..., "w") raises FileNotFoundError when
# the parent directory does not exist (e.g. on a fresh checkout).
os.makedirs("entities", exist_ok=True)
with open("entities/synthetic_query_ner_entities.json", "w") as f:
    json.dump(standardized_entities_dict_list, f)

In [19]:
# Reload the saved per-query entity dicts and show how many there are.
with open("entities/synthetic_query_ner_entities.json", "r") as f:
    standardized_entities_dict_list = json.loads(f.read())
len(standardized_entities_dict_list)

234

In [None]:
# Load the per-method NER comparison table and attach the BioBERT results
# (columns are aligned on the row index).
# NOTE(review): `synthetic_query_ner` is only defined in a later cell of the
# visible notebook - confirm it is loaded before running this cell.
ner_entities = pd.read_csv(
    os.path.join(root_dir, 'data/synthetic_query_ner_entities_all_methods.csv')
).assign(
    BioNLP13CG_biobert_ner_entities=synthetic_query_ner['biobert_entities'],
    extracted_entities_dict=synthetic_query_ner['standardized_entities_dict'],
)
ner_entities

Unnamed: 0,synthetic_queries,spacy_ner_bionlp13cg_entities,spacy_ner_bc5cdr_entities,BioNLP13CG_biobert_ner_entities,extracted_entities_dict
0,If a chronic myelogenous leukemia patient has ...,"(chronic myelogenous leukemia patient, BCR, AB...","(chronic myelogenous leukemia,)","[{""type"": ""Cancer"", ""text"": ""chronic myelogeno...","{""cancer_type"": [""CHRONIC MYELOGENOUS LEUKEMIA..."
1,If a chronic myelogenous leukemia patient has ...,"(chronic myelogenous leukemia patient, BCR, AB...","(chronic myelogenous leukemia,)","[{""type"": ""Cancer"", ""text"": ""chronic myelogeno...","{""cancer_type"": [""CHRONIC MYELOGENOUS LEUKEMIA..."
2,If a acute lymphoblastic leukemia patient has ...,"(acute lymphoblastic leukemia patient, BCR, AB...","(acute lymphoblastic leukemia,)","[{""type"": ""Cancer"", ""text"": ""acute lymphoblast...","{""cancer_type"": [""ACUTE LYMPHOBLASTIC LEUKEMIA..."
3,If a acute lymphoblastic leukemia patient has ...,"(acute lymphoblastic leukemia patient, BCR, AB...","(acute lymphoblastic leukemia,)","[{""type"": ""Cancer"", ""text"": ""acute lymphoblast...","{""cancer_type"": [""ACUTE LYMPHOBLASTIC LEUKEMIA..."
4,If a chronic myeloid leukemia patient has a re...,"(chronic myeloid leukemia patient, BCR, ABL1, ...","(chronic myeloid leukemia,)","[{""type"": ""Cancer"", ""text"": ""chronic myeloid l...","{""cancer_type"": [""CHRONIC MYELOID LEUKEMIA""], ..."
...,...,...,...,...,...
178,If a anaplastic large cell lymphoma (alcl) pat...,"(anaplastic large cell lymphoma, patient, ALK,...","(anaplastic, large cell lymphoma, alcl)","[{""type"": ""Cancer"", ""text"": ""anaplastic large ...","{""cancer_type"": [""ANAPLASTIC LARGE CELL LYMPHO..."
179,If a non-small cell lung cancer patient has a ...,"(non-small cell lung cancer patient, ROS1, lines)","(non-small cell lung cancer,)","[{""type"": ""Cancer"", ""text"": ""non - small cell ...","{""cancer_type"": [""NON - SMALL CELL LUNG CANCER..."
180,If a prostate cancer patient has a somatic var...,"(prostate cancer patient, MRE11, lines)","(prostate cancer, castration-resistant)","[{""type"": ""Cancer"", ""text"": ""prostate cancer""}...","{""cancer_type"": [""PROSTATE CANCER""], ""biomarke..."
181,If a acute lymphoblastic leukemia patient has ...,"(acute lymphoblastic leukemia patient, ABL1, l...","(acute lymphoblastic leukemia,)","[{""type"": ""Cancer"", ""text"": ""acute lymphoblast...","{""cancer_type"": [""ACUTE LYMPHOBLASTIC LEUKEMIA..."


In [None]:
# Write the merged comparison table back to disk (row index not written).
ner_entities.to_csv(
    os.path.join(root_dir, "data", "synthetic_query_ner_entities_all_methods.csv"),
    index=False,
)

### Exact and fuzzy matching

Load synthetic query entities and db entities

In [475]:
#load synthetic query entities
synthetic_query_ner = pd.read_csv(f"{root_dir}/data/synthetic_query_ner_entities.csv")
# NOTE(review): the cell that wrote this CSV is not visible, so the column's text
# format is unknown here; literal_eval is kept - confirm it matches the writer.
synthetic_query_ner["standardized_entities_dict"] = synthetic_query_ner["standardized_entities_dict"].apply(ast.literal_eval)

#and moalmanac entities
moalmanac_db = pd.read_csv(f"{root_dir}/data/latest_db/moalmanac-draft.dereferenced.unique.context_db.standardized_entities.csv")
# This column was written with json.dumps, so it must be read back as JSON:
# ast.literal_eval raises on JSON-only tokens such as `null` (a None biomarker)
# or `NaN` (a missing cancer type).
moalmanac_db['standardized_entities_dict'] = moalmanac_db['standardized_entities_dict'].apply(json.loads)

In [505]:
from rapidfuzz import fuzz

def check_list(input):
    """Return ``input`` itself when it is a list, otherwise wrap it in a one-element list."""
    return input if isinstance(input, list) else [input]

def match_entities(user_entities, db, fuzzy_thres=70, id_col='statement_id', entities_col='standardized_entities_dict', context_col='context'):
    """
    Calculate score based on matching cancer types and biomarkers between user's query and the database

    Scoring, per database row: every database term (cancer type or biomarker)
    that exactly equals a user term adds 1; a term with no exact match but with
    ``fuzz.ratio`` strictly above ``fuzzy_thres`` against any user term adds 0.5.

        Arguments:
        user_entities (dict): A dictionary with 'cancer_type' and 'biomarker' entities extracted using biobert.
            Either key may be absent (treated as no entities of that kind).
        db (DataFrame): A context database with 'cancer_type' and 'biomarker' entities extracted using biobert, context, and id
        fuzzy_thres (number): Minimum (exclusive) fuzz.ratio for a fuzzy match.
        id_col, entities_col, context_col (str): Column names in ``db``; ``entities_col`` must hold parsed dicts.

        Returns:
        list of ``(id, score, entities_dict, context)`` tuples for rows with score > 0,
        sorted by score, highest first (ties keep database order).

    NOTE(review): fuzz.ratio on short gene symbols is permissive - e.g. "HER2" vs
    "HER3" scores 75, above the default threshold - consider a stricter threshold
    for biomarkers.
    """
    # .get: a query with no recognized cancer type / biomarker simply lacks the key.
    user_cancer = check_list(user_entities.get('cancer_type', []))
    user_biomarker = check_list(user_entities.get('biomarker', []))

    def _feature_score(db_terms, user_terms):
        # Score one feature (cancer type or biomarker) of one database row.
        user_set = set(user_terms)
        total = 0
        # check_list on the container: a bare string must count as one term,
        # not be iterated character by character.
        for db_term in check_list(db_terms):
            candidates = check_list(db_term)
            exact = set(candidates) & user_set
            if exact:
                total += len(exact)
            elif any(fuzz.ratio(c, u) > fuzzy_thres for u in user_terms for c in candidates):
                total += 0.5
        return total

    match_score_all = []

    #iterate over all db entities
    for idx, row in db.iterrows():
        db_entities = row[entities_col]

        score = (
            _feature_score(db_entities.get('cancer_type', []), user_cancer)
            + _feature_score(db_entities.get('biomarker', []), user_biomarker)
        )

        if score > 0:
            match_score_all.append((row[id_col], score, row[entities_col], row[context_col]))

    #sort by score descending
    match_score_all.sort(key=lambda x: x[1], reverse=True)
    return match_score_all

In [508]:
# test_idx_list = split_ranges(synthetic_query_ner.shape[0], 10, 50)
# Rank database rows for a single query and show the query entities next to the result.
test_idx = 0
match_score_all = match_entities(
    synthetic_query_ner.loc[test_idx, 'standardized_entities_dict'],
    moalmanac_db,
)
print(synthetic_query_ner.loc[test_idx, 'standardized_entities_dict'])
match_score_all

{'cancer_type': ['CHRONIC MYELOGENOUS LEUKEMIA'], 'biomarker': ['BCR', 'ABL1']}


[(32,
  3,
  {'cancer_type': ['CHRONIC MYELOGENOUS LEUKEMIA'],
   'biomarker': ['BCR', 'ABL1']},
  'Approval status: Regulatory approval (fda)\nDescription: The U.S. Food and Drug Administration granted approval to asciminib for the treatment of adult patients with previously treated philadelphia chromosome-positive chronic myeloid leukemia (Ph+ CML) in chronic phase (CP).\nIndication: SCEMBLIX is a kinase inhibitor indicated for the treatment of adult patients with previously treated philadelphia chromosome-positive chronic myeloid leukemia (Ph+ CML) in chronic phase (CP).\nCancer type: Chronic Myelogenous Leukemia\nBiomarkers: BCR::ABL1 [present]\nTherapy: Asciminib\nTherapy approach: Monotherapy\nTherapy strategy: BCR-ABL inhibition\nTherapy type: Targeted therapy\nApproval url: https://www.accessdata.fda.gov/drugsatfda_docs/label/2024/215358Orig1s008lbl.pdf\nPublication date: 2024-10-29'),
 (33,
  3,
  {'cancer_type': ['CHRONIC MYELOGENOUS LEUKEMIA'],
   'biomarker': ['BCR', 'ABL1'

## Rerank based on semantic similarity using BioBERT embeddings

Next, we can also try enhancing the semantic search by using embedding models pretrained on biomedical text

BioBERT finetuned for sentence semantic search:
- pritamdeka/S-PubMedBert-MS-MARCO (https://huggingface.co/pritamdeka/S-PubMedBert-MS-MARCO)
- 

In [3]:
from sklearn.metrics.pairwise import cosine_similarity

# Base BioBERT encoder (no task head) used for embeddings.
# NOTE: this rebinds the module-level `model` / `tokenizer` names that earlier
# held the NER checkpoint; re-run the NER loading cell before using the NER
# helpers again.
model = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

In [5]:
def embed_text(text):
    """Embed ``text`` with the module-level ``tokenizer`` and ``model``.

    The input is tokenized with truncation at ``max_length=128``. The vector at
    sequence position 0 (the [CLS] token) of the last hidden state is used as
    the sentence representation.

    Returns:
        2-D numpy array: ``last_hidden_state[:, 0, :]``.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=128)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    return hidden_states[:, 0, :].numpy()

def find_semantic_matches(moalmanac_list, user_list, threshold=0.7):
    """
    Return semantic matches using BioBERT embeddings.

    Every item of both lists is embedded once with ``embed_text``. For each
    database item, the user items whose cosine similarity to it is at least
    ``threshold`` are collected.

    Returns:
        list of ``(moalmanac_item, [matching user items])`` tuples, in database
        order; database items without any match are omitted.
    """
    moalmanac_embeddings = [embed_text(b) for b in moalmanac_list]
    user_embeddings = [embed_text(u) for u in user_list]

    matches = []
    for b, b_vec in zip(moalmanac_list, moalmanac_embeddings):
        matched = [
            u
            for u, u_vec in zip(user_list, user_embeddings)
            if cosine_similarity(b_vec, u_vec)[0][0] >= threshold
        ]
        if matched:
            matches.append((b, matched))
    return matches


In [None]:
# find_semantic_matches(moalmanac_new_draft['standardized_entities_dict'][0])
# find_semantic_matches(moalmanac_new_draft['context'][0], list(synthetic_prompt_groundtruth_dict.keys())[99])

In [None]:
# # find_semantic_matches(moalmanac_entities_dict['biomarker'], extracted_entities_dict['biomarker'], threshold=1)
# find_semantic_matches(moalmanac_entities_dict['cancer_type'], extracted_entities_dict['cancer_type'], threshold=.7)
# moalmanac_entities_dict['cancer_type']
# user_entities["extracted_entities_dict"]=user_entities["extracted_entities_dict"].apply(ast.literal_eval)

['BCR', 'ABL1']