In [1]:
!pip install gliner

Collecting gliner
  Downloading gliner-0.2.16-py3-none-any.whl.metadata (8.8 kB)
Collecting onnxruntime (from gliner)
  Downloading onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->gliner)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->gliner)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->gliner)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->gliner)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->gliner)
  Downloading nvidia_cublas_c

In [8]:
###Mount data directory
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [9]:
###Read dataset
import pandas as pd
from pathlib import Path

###init dataset path
dataset_dir = Path('/content/gdrive/MyDrive/HDRUK-2025/hackathon/')
interim_directory = Path('/content/gdrive/MyDrive/HDRUK-2025/')

cvd_df = pd.read_csv(dataset_dir / "cvd_note.csv")

column_names = cvd_df.columns
print(column_names)

###Give a placeholder ID for each text
cvd_df.columns.values[0] = 'note_ID'

print(cvd_df['note_ID'])


  cvd_df = pd.read_csv(dataset_dir / "cvd_note.csv")


Index(['Unnamed: 0', 'row_id', 'subject_id', 'hadm_id', 'seq_num', 'icd9_code',
       'row_id.1', 'subject_id.1', 'hadm_id.1', 'chartdate', 'charttime',
       'storetime', 'category', 'description', 'cgid', 'iserror', 'text'],
      dtype='object')
0              0
1              1
2              2
3              3
4              4
           ...  
364074    364074
364075    364075
364076    364076
364077    364077
364078    364078
Name: note_ID, Length: 364079, dtype: int64


In [10]:
###MIMIC headings
known_headings = [
    "Name:",
    "Unit No:",
    "Admission Date:",
    "Discharge Date:",
    "Date of Birth:",
    "Sex:",
    "Service:",
    "Allergies:",
    "Attending:",
    "Chief Complaint:",
    "History of Present Illness:",
    "Past Medical History:",
    "Social History:",
    "Family History:",
    "Physical Exam:",
    "Pathology:",
    "Brief Hospital Course:",
    "Medications on Admission:",
    "Discharge Medications:",
    "Discharge Disposition:",
    "Discharge Diagnosis:",
    "Discharge Condition:",
    "Discharge Instructions:",
    "Followup Instructions:",
    "Discharge:",
    "Pertinent Results:",
    "Studies:",
    "Pending Results:",
    "Transitional Issues:",
    "PAST SURGICAL HISTORY:",
    "ADMISSION PHYSICAL EXAM:",
    "DISCHARGE PHYSICAL EXAM:",
    "PERTINENT LABS:",
    "DISCHARGE LABS:",
    "MICROBIOLOGY:",
    "IMAGING:",
    "ACTIVE ISSUES:",
    "CHRONIC ISSUES:",
    "Review of Systems:",
    "Major Surgical or Invasive Procedure:",
    "ADMISSION CXR:",
    "FOLLOW UP CXR:",
    "VASCULAR SURGERY ADMISSION EXAM:",
    "ADMISSION LABS:",
    "DEATH EXAM:",
    "CXR:",
    "CXR ___:",
    "SECONDARY:",
    "LABS:",
    "HOSPITAL COURSE:" #JC added
]

In [11]:
###Splitting MIMIC notes by heading
import re
import numpy as np

print("Splitting notes and annotations based on subheadings...")

def extract_subsections(x):
    section_dict = {}
    # for heading in known_headings:
    for heading in headings_to_extract:
        #print(f"Extracting subsection for heading: {heading}")
        pattern = r"(^|\s\s+)" + re.escape(heading)

        if not re.search(pattern, x):
            continue

        match = re.search(pattern, x)

        start_index_extract = match.start()

        # find closest next section, starting from end of note
        next_section_index = len(x) - 1
        for next_heading in known_headings:
            if next_heading.__eq__(heading):
                continue

            pattern_next = r"(^|\s\s+)" + re.escape(next_heading)
            match_next = re.search(pattern_next, x)

            if not re.search(pattern_next, x):
                continue

            if next_section_index > match_next.start() > start_index_extract:
                next_section_index = match_next.start()

        # extract section between start and next section, store
        section_dict[heading] = [start_index_extract, next_section_index, x[start_index_extract:next_section_index]]

    return section_dict

########
### call it
########
headings_to_extract = [
    "History of Present Illness:",
    "Medications on Admission:",
    "Discharge Medications:"
]

# for each note_id, extract sections and save as [note_id | section | section_begin | section_end | section_type]
# do a subset
cvd_subset_df = cvd_df.head(100)

note_ids = cvd_subset_df["note_ID"].unique()

# subsections = []
subsection_texts = []
i=0
for note_id in note_ids:
    print(f"Extracting subsection for: {i}th note.")
    text = cvd_subset_df.loc[cvd_subset_df["note_ID"] == note_id, 'text'].item()
    subsections_dict = extract_subsections(text)

    subnote_text = ""

    for key in subsections_dict.keys():
        start_index = subsections_dict[key][0]
        end_index = subsections_dict[key][1]

        # concat the text and codes into single entry
        subnote_text = subnote_text + subsections_dict[key][2]

    # remove duplicate codes and descriptions TODO: optional
    subsection_texts.append(subnote_text.lstrip())
    i+=1

# bring subsection text into dataframe + add ids
notes_sections_df = pd.DataFrame(subsection_texts)
notes_sections_df.insert(0, "note_ID", note_ids)
notes_sections_df.columns = ['note_ID', 'text']

# remove rows where note did not contain subsections
drop_rows = notes_sections_df[notes_sections_df['text']==''].index
notes_sections_df.drop(drop_rows, inplace=True)

print(notes_sections_df.columns.values)
notes_sections_df.to_csv(interim_directory / "mimic_text_subsections.csv", index=False)

print("NOTES SECT:")
#print(notes_sections_df.iloc[0])


Splitting notes and annotations based on subheadings...
Extracting subsection for: 0th note.
Extracting subsection for: 1th note.
Extracting subsection for: 2th note.
Extracting subsection for: 3th note.
Extracting subsection for: 4th note.
Extracting subsection for: 5th note.
Extracting subsection for: 6th note.
Extracting subsection for: 7th note.
Extracting subsection for: 8th note.
Extracting subsection for: 9th note.
Extracting subsection for: 10th note.
Extracting subsection for: 11th note.
Extracting subsection for: 12th note.
Extracting subsection for: 13th note.
Extracting subsection for: 14th note.
Extracting subsection for: 15th note.
Extracting subsection for: 16th note.
Extracting subsection for: 17th note.
Extracting subsection for: 18th note.
Extracting subsection for: 19th note.
Extracting subsection for: 20th note.
Extracting subsection for: 21th note.
Extracting subsection for: 22th note.
Extracting subsection for: 23th note.
Extracting subsection for: 24th note.
Extr

In [12]:
###Initialize NER model
import torch

from gliner import GLiNER

#config = {'temperature': 0.0}

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("Initializing")
#model_path = "urchade/gliner_mediumv2.1"
model_path = "urchade/gliner_large_bio-v0.2"
model = GLiNER.from_pretrained(model_path, max_length=2000)

Initializing


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

gliner_config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.78G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [13]:
###Single line test

input_text = "Patient has biliary pancreatitis, without signs of RA. Reports chest pain after taking NSAIDs and fluoroxicillin. Blood results normal. Also has nausea."
labels = ["Disorder", "Symptom", "Medication"]

###Run NER
print("Running NER")
entities = model.predict_entities(input_text, labels, threshold=0.5)

for entity in entities:
    print(entity["text"], "=>", entity["label"])



Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Running NER
biliary pancreatitis => Disorder
RA => Disorder
chest pain => Symptom
NSAIDs => Medication
fluoroxicillin => Medication
nausea => Symptom


In [14]:
###NER for each note + place in tabular format (based on label)

labels = ["Symptom", "Medication"]

print("Running NER")

NER_result_df = notes_sections_df.copy()

i=0

symptoms, medications = ([] for i in range(2))
for input_text in notes_sections_df['text']:
  entities = model.predict_entities(input_text, labels, threshold=0.5)

  print(entities)

  ## if only caring about unique - use sets, else use []
  symptoms_note, medications_note = (set() for i in range(2))
  for entity in entities:
    label = entity["label"]

    if label == "Symptom":
      symptoms_note.add(entity['text'].lower())
    elif label == "Medication":
      medications_note.add(entity['text'].lower())

  i+=1
  symptoms.append(str(symptoms_note))
  medications.append(str(medications_note))

NER_result_df["symptoms"] = symptoms
NER_result_df["medications"] = medications

NER_result_df.replace(r'^\s*$', np.nan, regex=True)

NER_result_df.to_csv(interim_directory / "ner_test.csv", index=False)

Running NER
[{'start': 459, 'end': 465, 'text': 'fevers', 'label': 'Symptom', 'score': 0.8830780386924744}, {'start': 470, 'end': 480, 'text': 'ankle pain', 'label': 'Symptom', 'score': 0.7697964310646057}, {'start': 607, 'end': 616, 'text': 'cefazolin', 'label': 'Medication', 'score': 0.964414656162262}, {'start': 736, 'end': 745, 'text': 'cefazolin', 'label': 'Medication', 'score': 0.978729784488678}, {'start': 979, 'end': 989, 'text': 'chest pain', 'label': 'Symptom', 'score': 0.8688995242118835}, {'start': 1003, 'end': 1014, 'text': 'tachycardic', 'label': 'Symptom', 'score': 0.6594568490982056}, {'start': 1356, 'end': 1364, 'text': 'myalgias', 'label': 'Symptom', 'score': 0.7672026753425598}, {'start': 1366, 'end': 1377, 'text': 'joint pains', 'label': 'Symptom', 'score': 0.7221419215202332}, {'start': 1379, 'end': 1384, 'text': 'cough', 'label': 'Symptom', 'score': 0.7067722082138062}, {'start': 1386, 'end': 1396, 'text': 'hemoptysis', 'label': 'Symptom', 'score': 0.8250517249107

KeyboardInterrupt: 