Segment MIMIC Notes

In [5]:
# Import Libraries
from collections import defaultdict
from functools import lru_cache
import re

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import unidecode

import os


In [60]:

#subsplit(text) tries to identify subsection headers (like Medications:, Allergies:) and split the text accordingly.
def subsplit(text):
    l = re.split(r"\n(.{1,30}:)(?![0-9])", text) # Split on the first 30 characters followed by a colon(So it's looking for section headers like: Medications:, Allergies:, etc.)
    if len(l) == 1:
        yield text # If no subsection headers are found, return the whole text
        return
    if l[0]:
        yield l[0] # If there's text before the first matched header, yield that separately.
    for i in range(1, len(l), 2):
        yield l[i] + l[i+1] # Yield the matched header and the text that follows it.


# Chief Complaint:
# Chest pain and shortness of breath.
# Medications:
# Aspirin 81mg daily.


In [58]:
def cut_record(text):
    top_split_pattern = r"\n\n|\n ?__+\n" # split by double newlines or a newline followed by one or more underscores
    for part in re.split(top_split_pattern, text):
        part = re.sub(r"^\s*\[\*\*[0-9\-]*\*\*\]\s+([0-9]{4}|[0-9]{1,2}:[0-9]{1,2} (PM|AM))", "", part) #  Remove certain date/time tokens
        part = re.sub(r" +FINAL REPORT\n", "", part) # Remove "FINAL REPORT" header
        part = part.strip() # Remove leading/trailing whitespace
        if not part: # Skips any empty or whitespace-only parts after cleaning.
            continue
        yield from subsplit(part) #Passes each cleaned part to a function called subsplit

In [8]:
def get_title(text):
    m = re.search(r"^(.*?)(?:\:|\.{3,4})(?![0-9])", text)
    if not m:
        return None, text
    l, r = m.span()
    title = m.group(1).strip()
    body = text[r:].strip()
    return title, body

In [9]:
def normalize_title(title):
    if title is None:
        return None
    title = re.sub(r"\s+", " ", title)
    title = title.strip()
    title = title.lower()
    title = unidecode.unidecode(title)
    title = re.sub(r"[0-9]", "9", title)
    return title

In [10]:
def extract_and_normalize(text):
    title, body = get_title(text)
    return body, title, normalize_title(title)

In [11]:
def select_good_titles(titles, repeats=20, words=6):
    mask = titles["count"] >= repeats
    mask &= titles["title"].str.len() > 0
    mask &= titles["title"].str.count(" ") < words
    mask &= ~titles["title"].str.contains(",")
    return mask



In [12]:
def get_good_titles(parts, col="stitle"):
    titles = parts[col].value_counts().reset_index()
    titles.columns = ["title", "count"]
    titles = titles[select_good_titles(titles)].reset_index(drop=True)

    tid2t = titles.title.to_dict()
    tid2t = {k+1: v for k,v in tid2t.items()}
    t2tid = {v: k for k,v in tid2t.items()}
    return tid2t, t2tid

In [13]:
def filter_parts(parts, min_stext_length=10):
    """Remove segments with little context"""
    mask = parts["stext"].str.len() >= min_stext_length
    return parts[mask].reset_index(drop=True)

In [14]:
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        max_length=512,
        truncation=True,
    )


Load records!

In [15]:
types = {
        'CHARTDATE': pd.StringDtype(),
        'CHARTTIME': pd.StringDtype(),
        'STORETIME': pd.StringDtype(),
        'CATEGORY': pd.StringDtype(),
        'DESCRIPTION': pd.StringDtype(),
        'ISERROR': pd.StringDtype(),
        'TEXT': pd.StringDtype()
    }

In [17]:
good_categories = {
    'Nursing/other': 11, # 10
    'Radiology': 9,
    'Nursing': 6, # mostly Action, response, plan
    'ECG': 0,
    'Physician ': 10, # 10
    'Discharge summary': 10,
    'Echo': 10,
    'Respiratory ': 10,
    'Nutrition': 9,
    'General': 8,
    'Rehab Services': 9,
    'Social Work': 8, # no good titles
    'Case Management ': 5, # Action, response, plan
    'Pharmacy': 4, # assesment, recommanation
    'Consult': 10,
}

In [18]:
note_path = "/home/h6x/git_projects/cosc-526-data-engineering-project/data/NOTEEVENTS.csv.gz"

In [23]:
cutoff = None

In [24]:
notes = pd.read_csv(note_path, dtype=types, nrows=cutoff)

In [26]:
stats = pd.DataFrame({
    "count": notes["CATEGORY"].sort_index(inplace=False).value_counts(),
    "goodness": good_categories
}).sort_values(["goodness", "count"], ascending=[False, False])

In [27]:
stats

Unnamed: 0,count,goodness
Nursing/other,822497,11
Physician,141624,10
Discharge summary,59652,10
Echo,45794,10
Respiratory,31739,10
Consult,98,10
Radiology,522279,9
Nutrition,9418,9
Rehab Services,5431,9
General,8301,8


In [28]:
notes.head(3)

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118.0,2119-05-25,,,Discharge summary,Report,,,Admission Date: [**2119-5-4**] D...


In [29]:
notes["CHARTDATE"] = pd.to_datetime(notes["CHARTDATE"])
notes = notes.sort_values(["SUBJECT_ID", "CHARTDATE"])

In [30]:
notes.head(3)

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
1671019,1678765,2,163353.0,2138-07-17,2138-07-17 23:08:00,2138-07-17 23:18:00,Nursing/other,Report,17774.0,,Nursing Transfer note Pt admitted to NICU fo...
1671574,1678764,2,163353.0,2138-07-17,2138-07-17 22:51:00,2138-07-17 23:12:00,Nursing/other,Report,16929.0,,Neonatology Attending Triage Note Baby [**Nam...
291220,272794,3,,2101-10-06,,,ECG,Report,,,Sinus rhythm Inferior/lateral ST-T changes are...


Filter the notes by goodness

In [33]:
note_relevance = notes["CATEGORY"].isin(stats.query("goodness == 11").index)
# note_relevance

In [34]:
notes = notes[note_relevance]

In [None]:
notes.head(3)

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
1671019,1678765,2,163353.0,2138-07-17,2138-07-17 23:08:00,2138-07-17 23:18:00,Nursing/other,Report,17774.0,,Nursing Transfer note Pt admitted to NICU fo...
1671574,1678764,2,163353.0,2138-07-17,2138-07-17 22:51:00,2138-07-17 23:12:00,Nursing/other,Report,16929.0,,Neonatology Attending Triage Note Baby [**Nam...
1297688,1260684,3,145834.0,2101-10-21,2101-10-21 06:58:00,2101-10-21 07:15:00,Nursing/other,Report,21570.0,,Micu Progress Nursing Note: Patient arrived i...


In [36]:
notes['CATEGORY'].unique()

<StringArray>
['Nursing/other']
Length: 1, dtype: string

In [None]:
if len(notes) == 0:
    raise Exception("Filtering removed all notes")

In [None]:
# Add a new column record_number to the notes DataFrame, which gives each note a sequential number per patient (SUBJECT_ID). So for each patient, their first note will be 0, next is 1, and so on.
notes = notes.groupby('SUBJECT_ID', group_keys=False).apply(lambda group: group.assign(record_number=range(len(group))))

  notes = notes.groupby('SUBJECT_ID', group_keys=False).apply(lambda group: group.assign(record_number=range(len(group))))


In [39]:
notes.head(3)

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT,record_number
1671019,1678765,2,163353.0,2138-07-17,2138-07-17 23:08:00,2138-07-17 23:18:00,Nursing/other,Report,17774.0,,Nursing Transfer note Pt admitted to NICU fo...,0
1671574,1678764,2,163353.0,2138-07-17,2138-07-17 22:51:00,2138-07-17 23:12:00,Nursing/other,Report,16929.0,,Neonatology Attending Triage Note Baby [**Nam...,1
1297688,1260684,3,145834.0,2101-10-21,2101-10-21 06:58:00,2101-10-21 07:15:00,Nursing/other,Report,21570.0,,Micu Progress Nursing Note: Patient arrived i...,0


In [40]:
notes = notes[["ROW_ID", "SUBJECT_ID", "record_number", "TEXT"]]
notes = notes.rename(columns={"ROW_ID": "rid", "SUBJECT_ID": "pid", "record_number": "rord", "TEXT": "text"})
notes = notes.set_index(["rid", "pid", "rord"])

In [41]:
notes.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,text
rid,pid,rord,Unnamed: 3_level_1
1678765,2,0,Nursing Transfer note Pt admitted to NICU fo...
1678764,2,1,Neonatology Attending Triage Note Baby [**Nam...
1260684,3,0,Micu Progress Nursing Note: Patient arrived i...


In [None]:
# records should be a dataframe with multiindex (record_id, patient_id, record_number)
# and a column called text

Now we are doing some data cleaning and segments

In [42]:
#  create_parts(records):
records = notes

In [43]:
it = records.text

In [48]:
it.head()

rid      pid  rord
1678765  2    0       Nursing Transfer note


Pt admitted to NICU fo...
1678764  2    1       Neonatology Attending Triage Note

Baby [**Nam...
1260684  3    0       Micu Progress Nursing Note:

Patient arrived i...
1260685  3    1       MICU NSG PROG NOTE: days
Remains stable on hig...
1260686  3    2       MICU NPN 3PM-11PM:
Neuro: Pt is restless at ti...
Name: text, dtype: string

In [49]:
it_test = it.head(10)

In [53]:

tqdm.pandas()
parts = it_test.progress_apply(lambda x: pd.Series(cut_record(x))).stack()

100%|██████████| 10/10 [00:00<00:00, 5260.63it/s]


In [54]:
parts

rid      pid  rord    
1678765  2    0     0                                 Nursing Transfer note
                    1     Pt admitted to NICU for sepsis eval. Please se...
                    2     Infant stable in RA. RR 30-40's, sats 96-100%....
1678764  2    1     0                     Neonatology Attending Triage Note
                    1     Baby [**Name (NI) 1**] [**Known lastname 2**] ...
                    2                       Mother is 34 years old G1 P0-1.
                    3     PNS:  A pos, Ab neg, HBSAg neg, RPR NR, RI, GB...
                    4     PE - Baby is [**Name2 (NI) 5**] and vigorous, ...
                    5                                               DS - 72
                    6     Assessment/plan:\nTerm male infant with increa...
1260684  3    0     0                           Micu Progress Nursing Note:
                    1     Patient arrived in unit at 19:15 from ED. Hx o...
1260685  3    1     0     MICU NSG PROG NOTE: days\nRemains stabl

In [55]:
parts.index.names = ['rid', 'pid', 'rord',  'srord']
parts.name = "text"
parts = parts.reset_index()

In [57]:
parts

Unnamed: 0,rid,pid,rord,srord,text
0,1678765,2,0,0,Nursing Transfer note
1,1678765,2,0,1,Pt admitted to NICU for sepsis eval. Please se...
2,1678765,2,0,2,"Infant stable in RA. RR 30-40's, sats 96-100%...."
3,1678764,2,1,0,Neonatology Attending Triage Note
4,1678764,2,1,1,Baby [**Name (NI) 1**] [**Known lastname 2**] ...
5,1678764,2,1,2,Mother is 34 years old G1 P0-1.
6,1678764,2,1,3,"PNS: A pos, Ab neg, HBSAg neg, RPR NR, RI, GB..."
7,1678764,2,1,4,"PE - Baby is [**Name2 (NI) 5**] and vigorous, ..."
8,1678764,2,1,5,DS - 72
9,1678764,2,1,6,Assessment/plan:\nTerm male infant with increa...
