In [1]:
!pip install transformers torch sentencepiece



In [2]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

# Load the BERT-based biomedical NER model
model_name = "d4data/biomedical-ner-all"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Create a NER pipeline
medical_ner = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="simple"  # Combines tokens into full entities
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cpu


In [4]:
text = '''Thousands of cancer patients will receive faster treatment thanks to new "state-of-the-art" radiotherapy machines, the government has announced. Across every region in the country, 28 hospitals, including ones managed by the Royal Berkshire and Hampshire Hospitals NHS Foundation Trusts, are to receive the cutting-edge equipment. The Department of Health & Social Care said that by March 2027, "up to 27,500 additional treatments per year will be delivered, including up to 4,500 receiving their first treatment for cancer within 62-days of referral". The machines will be rolled out from August, funded by a £70m investment as part of the plans to improve cancer care. The government said the new linear accelerator (LINAC) machines would replace the older ones, saving "save as many as 13,000 appointments from being lost to equipment breakdown". It said the new machines were "safer for patients" and "can more precisely target tumours". The technology is being prioritised in hospitals which are currently using outdated treatment machines older than 10 years. Health Secretary Wes Streeting said that as a cancer survivor, "I know just how important timely treatment is". "There is a revolution taking place in medical technology which can transform treatment for cancer patients," Mr Streeting said. "But NHS hospitals are forced to use outdated, malfunctioning equipment thanks to 14 years of underinvestment under the previous government.'''

# Run NER
results = medical_ner(text)

# Display entities
for entity in results:
    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Confidence: {entity['score']:.2f}")

Entity: treatment, Label: Therapeutic_procedure, Confidence: 0.91
Entity: radiotherapy, Label: Therapeutic_procedure, Confidence: 0.99
Entity: social care, Label: Nonbiological_location, Confidence: 0.74
Entity: - days, Label: Date, Confidence: 0.55
Entity: linear, Label: Medication, Confidence: 0.94
Entity: accelerator, Label: Therapeutic_procedure, Confidence: 0.69
Entity: ( lina, Label: Medication, Confidence: 0.61
Entity: ##c, Label: Therapeutic_procedure, Confidence: 0.69
Entity: ), Label: Medication, Confidence: 0.51
Entity: machines, Label: Therapeutic_procedure, Confidence: 0.97
Entity: appointments, Label: Detailed_description, Confidence: 0.44
Entity: machines, Label: Coreference, Confidence: 0.67
Entity: outdated, Label: Detailed_description, Confidence: 0.94
Entity: treatment, Label: Therapeutic_procedure, Confidence: 1.00
Entity: machines, Label: Detailed_description, Confidence: 0.77
Entity: medical, Label: Therapeutic_procedure, Confidence: 0.44
Entity: technology, Label

In [5]:
import spacy
from spacy import displacy

# Convert Hugging Face output to spaCy's displacy format
def convert_to_displacy_format(ner_results):
    displacy_data = {
        "text": text,
        "ents": [],
        "title": None
    }
    for entity in ner_results:
        displacy_data["ents"].append({
            "start": entity["start"],
            "end": entity["end"],
            "label": entity["entity_group"],
            "text": entity["word"]
        })
    return displacy_data

displacy_data = convert_to_displacy_format(results)
displacy.render(displacy_data, style="ent", manual=True, jupyter=True)

In [6]:
# comparing with other BERT model
modelD_name = "obi/deid_bert_i2b2"
tokenizerD = AutoTokenizer.from_pretrained(modelD_name)
modelD = AutoModelForTokenClassification.from_pretrained(modelD_name)

# Create a NER pipeline
medical_nerD = pipeline(
    "ner",
    model=modelD,
    tokenizer=tokenizerD,
    aggregation_strategy="simple"
)

Device set to use cpu


In [7]:
print(modelD.config.id2label)

{0: 'B-AGE', 1: 'B-DATE', 2: 'B-EMAIL', 3: 'B-HOSP', 4: 'B-ID', 5: 'B-LOC', 6: 'B-OTHERPHI', 7: 'B-PATIENT', 8: 'B-PATORG', 9: 'B-PHONE', 10: 'B-STAFF', 11: 'I-AGE', 12: 'I-DATE', 13: 'I-EMAIL', 14: 'I-HOSP', 15: 'I-ID', 16: 'I-LOC', 17: 'I-OTHERPHI', 18: 'I-PATIENT', 19: 'I-PATORG', 20: 'I-PHONE', 21: 'I-STAFF', 22: 'L-AGE', 23: 'L-DATE', 24: 'L-EMAIL', 25: 'L-HOSP', 26: 'L-ID', 27: 'L-LOC', 28: 'L-OTHERPHI', 29: 'L-PATIENT', 30: 'L-PATORG', 31: 'L-PHONE', 32: 'L-STAFF', 33: 'O', 34: 'U-AGE', 35: 'U-DATE', 36: 'U-EMAIL', 37: 'U-HOSP', 38: 'U-ID', 39: 'U-LOC', 40: 'U-OTHERPHI', 41: 'U-PATIENT', 42: 'U-PATORG', 43: 'U-PHONE', 44: 'U-STAFF'}


In [8]:
text = '''Thousands of cancer patients will receive faster treatment thanks to new "state-of-the-art" radiotherapy machines, the government has announced. Across every region in the country, 28 hospitals, including ones managed by the Royal Berkshire and Hampshire Hospitals NHS Foundation Trusts, are to receive the cutting-edge equipment. The Department of Health & Social Care said that by March 2027, "up to 27,500 additional treatments per year will be delivered, including up to 4,500 receiving their first treatment for cancer within 62-days of referral". The machines will be rolled out from August, funded by a £70m investment as part of the plans to improve cancer care. The government said the new linear accelerator (LINAC) machines would replace the older ones, saving "save as many as 13,000 appointments from being lost to equipment breakdown". It said the new machines were "safer for patients" and "can more precisely target tumours". The technology is being prioritised in hospitals which are currently using outdated treatment machines older than 10 years. Health Secretary Wes Streeting said that as a cancer survivor, "I know just how important timely treatment is". "There is a revolution taking place in medical technology which can transform treatment for cancer patients," Mr Streeting said. "But NHS hospitals are forced to use outdated, malfunctioning equipment thanks to 14 years of underinvestment under the previous government.'''

# Run NER
resultsD = medical_nerD(text)

# Display entities
for entity in resultsD:
    print(f"Entity: {entity['word']}, Label: {entity['entity_group']}, Confidence: {entity['score']:.2f}")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Entity: Royal Berkshire, Label: HOSP, Confidence: 0.62
Entity: Hampshire Hospitals NHS Foundation, Label: HOSP, Confidence: 0.79
Entity: Trust, Label: PATORG, Confidence: 0.64
Entity: March, Label: DATE, Confidence: 1.00
Entity: 2027, Label: DATE, Confidence: 1.00
Entity: August, Label: DATE, Confidence: 1.00
Entity: Wes, Label: PATIENT, Confidence: 1.00
Entity: Street, Label: PATIENT, Confidence: 0.99
Entity: Street, Label: PATIENT, Confidence: 0.97
Entity: ##ing, Label: PATIENT, Confidence: 0.57
Entity: NHS hospitals, Label: HOSP, Confidence: 0.53


In [9]:
print(resultsD)

[{'entity_group': 'HOSP', 'score': np.float32(0.62422216), 'word': 'Royal Berkshire', 'start': 225, 'end': 240}, {'entity_group': 'HOSP', 'score': np.float32(0.78892696), 'word': 'Hampshire Hospitals NHS Foundation', 'start': 245, 'end': 279}, {'entity_group': 'PATORG', 'score': np.float32(0.6435455), 'word': 'Trust', 'start': 280, 'end': 285}, {'entity_group': 'DATE', 'score': np.float32(0.99951065), 'word': 'March', 'start': 383, 'end': 388}, {'entity_group': 'DATE', 'score': np.float32(0.99516475), 'word': '2027', 'start': 389, 'end': 393}, {'entity_group': 'DATE', 'score': np.float32(0.9996432), 'word': 'August', 'start': 590, 'end': 596}, {'entity_group': 'PATIENT', 'score': np.float32(0.9975973), 'word': 'Wes', 'start': 1083, 'end': 1086}, {'entity_group': 'PATIENT', 'score': np.float32(0.9927327), 'word': 'Street', 'start': 1087, 'end': 1093}, {'entity_group': 'PATIENT', 'score': np.float32(0.9673273), 'word': 'Street', 'start': 1291, 'end': 1297}, {'entity_group': 'PATIENT', 's

In [10]:
def convert_to_displacy_format(ner_results):
    ents = []
    for ent in ner_results:
        ents.append({
            "start": ent["start"],
            "end": ent["end"],
            "label": ent["entity_group"],
            "text": ent["word"]
        })
    return {"text": text, "ents": ents}

displacy_data = convert_to_displacy_format(resultsD)

# Render in Jupyter/Colab
displacy.render(displacy_data, style="ent", manual=True, jupyter=True)