In [2]:
###Mount data directory for MIMIC
# Colab-only step: mounts Google Drive at /content/gdrive so the paths used by
# later cells resolve. Outside Colab the `google.colab` import is unavailable.
from google.colab import drive
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted, making this cell safe to re-run -- confirm against Colab docs.
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [3]:
###Read dataset
import pandas as pd
from pathlib import Path

###init dataset path
dataset_dir = Path('/content/gdrive/MyDrive/hackathon/')
interim_directory = Path('/content/gdrive/MyDrive/hackathon')

# NOTE(review): the saved cell output shows a warning pointing at this call --
# presumably a DtypeWarning about mixed-type columns. If so, pass an explicit
# `dtype=` mapping (or `low_memory=False`) once the affected columns are known.
cvd_df = pd.read_csv(dataset_dir / "cvd_note.csv")

column_names = cvd_df.columns
print(column_names)

###Give a placeholder ID for each text
# The first column is the unnamed row index written by a previous to_csv call.
# Rename it through the public API: writing into `cvd_df.columns.values` mutates
# the array behind an immutable Index, which pandas does not support (cached
# lookups can go stale, and it can fail outright under Copy-on-Write).
cvd_df = cvd_df.rename(columns={cvd_df.columns[0]: 'note_ID'})

# Later cells treat note_ID as a key that identifies exactly one note.
assert cvd_df['note_ID'].is_unique, "note_ID must be unique per note"

print(cvd_df['note_ID'])

  cvd_df = pd.read_csv(dataset_dir / "cvd_note.csv")


Index(['Unnamed: 0', 'row_id', 'subject_id', 'hadm_id', 'seq_num', 'icd9_code',
       'row_id.1', 'subject_id.1', 'hadm_id.1', 'chartdate', 'charttime',
       'storetime', 'category', 'description', 'cgid', 'iserror', 'text'],
      dtype='object')
0              0
1              1
2              2
3              3
4              4
           ...  
364074    364074
364075    364075
364076    364076
364077    364077
364078    364078
Name: note_ID, Length: 364079, dtype: int64


In [4]:
###MIMIC headings
# Section headings recognised in the notes. `extract_subsections` uses this list
# to decide where an extracted section ends: the section stops at the nearest
# following heading from this list. Headings are matched verbatim (escaped, with
# no case-insensitive flag), so both the mixed-case and UPPER-CASE spellings
# below are needed, and the trailing colon is part of each heading.
known_headings = [
    # --- header fields ---
    "Name:",
    "Unit No:",
    "Admission Date:",
    "Discharge Date:",
    "Date of Birth:",
    "Sex:",
    "Service:",
    "Allergies:",
    "Attending:",
    # --- history and examination ---
    "Chief Complaint:",
    "History of Present Illness:",
    "Past Medical History:",
    "Social History:",
    "Family History:",
    "Physical Exam:",
    "Pathology:",
    "Brief Hospital Course:",
    # --- medications and discharge ---
    "Medications on Admission:",
    "Discharge Medications:",
    "Discharge Disposition:",
    "Discharge Diagnosis:",
    "Discharge Condition:",
    "Discharge Instructions:",
    "Followup Instructions:",
    "Discharge:",
    # --- results ---
    "Pertinent Results:",
    "Studies:",
    "Pending Results:",
    "Transitional Issues:",
    # --- upper-case headings ---
    "PAST SURGICAL HISTORY:",
    "ADMISSION PHYSICAL EXAM:",
    "DISCHARGE PHYSICAL EXAM:",
    "PERTINENT LABS:",
    "DISCHARGE LABS:",
    "MICROBIOLOGY:",
    "IMAGING:",
    "ACTIVE ISSUES:",
    "CHRONIC ISSUES:",
    "Review of Systems:",
    "Major Surgical or Invasive Procedure:",
    "ADMISSION CXR:",
    "FOLLOW UP CXR:",
    "VASCULAR SURGERY ADMISSION EXAM:",
    "ADMISSION LABS:",
    "DEATH EXAM:",
    "CXR:",
    # NOTE(review): "___" looks like a de-identification placeholder in the notes -- confirm.
    "CXR ___:",
    "SECONDARY:",
    "LABS:"
]

In [5]:
###Splitting MIMIC notes by heading
import re
# NOTE(review): `np` is not referenced in the cells visible here -- confirm it is
# used further down before keeping this import.
import numpy as np

# Banner for the cell output. NOTE(review): the message mentions annotations,
# but the visible code only splits note text -- confirm the wording.
print("Splitting notes and annotations based on subheadings...")

def extract_subsections(x, headings=None, section_headings=None):
    """Cut the requested sections out of one note.

    A heading is recognised when it sits at the very start of the note or is
    preceded by at least two whitespace characters. A section starts at that
    match (the leading whitespace is part of the match and is kept) and ends
    just before the nearest following heading from ``section_headings``, or at
    the end of the note when no heading follows.

    Args:
        x: Full text of a note. Anything that is not a ``str`` (for example the
            NaN pandas yields for a missing value) produces an empty result.
        headings: Headings whose sections should be extracted. Defaults to the
            module-level ``headings_to_extract``, looked up at call time.
        section_headings: Every heading that may terminate a section. Defaults
            to the module-level ``known_headings``, looked up at call time.

    Returns:
        dict mapping each heading found in the note to
        ``[start_index, end_index, section_text]`` where
        ``section_text == x[start_index:end_index]``. Only the first occurrence
        of each requested heading is extracted.
    """
    if headings is None:
        headings = headings_to_extract
    if section_headings is None:
        section_headings = known_headings

    section_dict = {}
    if not isinstance(x, str):
        return section_dict

    def heading_pattern(heading_text):
        # Same rule for start and end headings: start of note or 2+ whitespace.
        return re.compile(r"(^|\s\s+)" + re.escape(heading_text))

    for heading in headings:
        match = heading_pattern(heading).search(x)
        if match is None:
            continue

        start_index_extract = match.start()

        # Default to the end of the note. Using len(x), not len(x) - 1, keeps
        # the final character of a section that runs to the end.
        next_section_index = len(x)
        for next_heading in section_headings:
            if next_heading == heading:
                continue

            # Search only after the current heading, so a heading that also
            # occurs earlier in the note can still terminate this section.
            match_next = heading_pattern(next_heading).search(x, match.end())
            if match_next is not None and match_next.start() < next_section_index:
                next_section_index = match_next.start()

        # extract section between start and next section, store
        section_dict[heading] = [
            start_index_extract,
            next_section_index,
            x[start_index_extract:next_section_index],
        ]

    return section_dict

########
### parameters to call it
########
# Sections to pull out of each note; their texts are concatenated in this order.
headings_to_extract = [
    "History of Present Illness:",
    "Medications on Admission:",
    "Discharge Medications:"
]

# do a subset of the notes
N_NOTES = 50
cvd_subset_df = cvd_df.head(N_NOTES)

# One output row per note: ids and texts are paired by position below, so each
# note_ID has to identify exactly one row.
assert cvd_subset_df["note_ID"].is_unique, "note_ID must identify exactly one note"
note_ids = cvd_subset_df["note_ID"].unique()

subsection_texts = []
# Walk the text column directly instead of re-filtering the frame for every id.
for i, text in enumerate(cvd_subset_df["text"]):
    print(f"Extracting subsection for: {i}th note.")

    # A missing note body is read by pandas as NaN (a float), which the regex
    # search cannot handle; treat it as a note with no sections.
    subsections_dict = extract_subsections(text) if isinstance(text, str) else {}

    # Each value is [start, end, section_text]; concatenate the section texts.
    subnote_text = "".join(section[2] for section in subsections_dict.values())

    # remove duplicate codes and descriptions TODO: optional
    subsection_texts.append(subnote_text.lstrip())

# bring subsection text into dataframe + add ids
notes_sections_df = pd.DataFrame({"note_ID": note_ids, "text": subsection_texts})

# remove rows where note did not contain subsections
notes_sections_df = notes_sections_df[notes_sections_df["text"] != ""].copy()

print(notes_sections_df.columns.values)
# The file is tab-separated despite the .csv extension; the name is kept so
# anything that already reads this path keeps working.
notes_sections_df.to_csv(interim_directory / "mimic_text_subsections.csv", index=False, sep="\t")

print("NOTES SECT:")
# Report only the shape: row contents are clinical note text.
print(notes_sections_df.shape)

Splitting notes and annotations based on subheadings...
Extracting subsection for: 0th note.
Extracting subsection for: 1th note.
Extracting subsection for: 2th note.
Extracting subsection for: 3th note.
Extracting subsection for: 4th note.
Extracting subsection for: 5th note.
Extracting subsection for: 6th note.
Extracting subsection for: 7th note.
Extracting subsection for: 8th note.
Extracting subsection for: 9th note.
Extracting subsection for: 10th note.
Extracting subsection for: 11th note.
Extracting subsection for: 12th note.
Extracting subsection for: 13th note.
Extracting subsection for: 14th note.
Extracting subsection for: 15th note.
Extracting subsection for: 16th note.
Extracting subsection for: 17th note.
Extracting subsection for: 18th note.
Extracting subsection for: 19th note.
Extracting subsection for: 20th note.
Extracting subsection for: 21th note.
Extracting subsection for: 22th note.
Extracting subsection for: 23th note.
Extracting subsection for: 24th note.
Extr