In [1]:
import os
import json
import hashlib
from datetime import datetime, timedelta

import pandas as pd
import spacy
from spacy.util import compile_infix_regex
# !python -m spacy download en_core_web_sm

In [2]:
# Please set the following variables first.
# mimicdir: directory containing the MIMIC-III v1.4 CSV files. Later cells read
# ADMISSIONS.csv and NOTEEVENTS.csv from it via os.path.join(mimicdir, ...).
mimicdir = "<path to your mimic 3 v1.4 dir>"

## 1. Structured data file

This section creates the table for our dataset (`timeline_i2b2_5col_pakdd2024.csv`).
- First, run the `mimic3buildtimeline_pakdd.R` R script to create the initial table (`pakdd2024timeline.csv.gz`). You have to set the MIMIC-III directory and the current directory in the script before running it.
- Second, run the cells below to check the output file and extract the table.

In [4]:
# Validate the initial table (the output file of mimic3buildtimeline_pakdd.R) by the MD5 hash
# r_output_fpath is a relative path, so it is resolved against the notebook's working directory.
r_output_fpath = 'r/extractions/pakdd2024/pakdd2024timeline.csv.gz'

def compute_md5(file_path):
    """Return the hexadecimal MD5 digest of the file at ``file_path``.

    The file is opened in binary mode and consumed in 4096-byte blocks, so
    it never has to fit in memory as a whole.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(4096)
            # read() returns an empty bytes object once the end of file is reached
            if block == b"":
                break
            digest.update(block)
    return digest.hexdigest()
# Compare the digest of the R script output with the expected one. On a
# mismatch the message shows both digests, so it is clear which file was
# hashed and what was obtained.
expected_md5 = '965bc0d3568776da0e93072b7463c499'
actual_md5 = compute_md5(r_output_fpath)
assert actual_md5 == expected_md5, (
    f"Unexpected MD5 for {r_output_fpath}: got {actual_md5}, expected {expected_md5}"
)

In [5]:
# Load the initial table
# The path ends in ".gz"; pandas' default compression="infer" decompresses it on the fly.
df_struct = pd.read_csv(r_output_fpath)

In [6]:
# Row indices in the initial table of the structured data our code uses (1031 in total).
struct_idx = [28, 29, 63, 62, 64, 145, 146, 147, 260, 153, 152, 266, 1315, 1229, 1228, 3538, 3453, 3452, 3682, 3819, 3909, 3910, 3911, 4000, 4001, 3999, 4051, 4050, 4052, 4054, 4055, 4056, 4057, 4060, 4061,
              4062, 4063, 4069, 4073, 4124, 4123, 4125, 4118, 4120, 4121, 4126, 4129, 24487, 24488, 24489, 24490, 24491, 24492, 24493, 24494, 24495, 24510, 24511, 24497, 24498, 24499, 24500, 24501, 24502,
              24503, 24504, 24505, 24506, 24507, 24508, 24534, 24535, 24536, 24537, 24538, 24539, 24540, 24599, 24615, 24628, 25172, 25306, 25328, 25872, 25873, 25898, 25913, 25914, 25915, 25929, 25941, 25944,
              25942, 25960, 25959, 25961, 25956, 25957, 13055, 13056, 13057, 13059, 13060, 13061, 13062, 13063, 13064, 13065, 13066, 13067, 13069, 13070, 13072, 13073, 13074, 13080, 13081, 13083, 13086, 13087,
              13088, 13094, 13097, 13098, 13099, 13100, 13102, 13121, 13128, 13129, 13132, 13133, 13134, 13135, 13141, 13144, 13145, 13219, 13194, 13249, 13238, 13257, 13282, 13264, 13319, 13321, 13322, 13323,
              13326, 13327, 13330, 13369, 13356, 13371, 13498, 13524, 13647, 13650, 13651, 13648, 13642, 13645, 13652, 13643, 13653, 13649, 13872, 13910, 13936, 14008, 14080, 14081, 14231, 14232, 14261, 14330,
              14329, 14331, 14364, 14367, 14371, 14359, 14361, 14362, 14365, 14372, 14360, 14374, 14366, 14376, 14373, 14368, 14389, 14390, 14419, 14429, 14442, 14447, 14448, 14451, 14456, 14460, 14465, 14490,
              14491, 14492, 14493, 14495, 14496, 14497, 14498, 14502, 14503, 14505, 14469, 14470, 14471, 14472, 14473, 14474, 14475, 14476, 14477, 14478, 14479, 14481, 14482, 14483, 14484, 14485, 14486, 14487,
              14488, 14511, 14510, 14512, 14506, 14509, 14507, 14934, 14935, 107730, 107731, 107732, 107733, 107734, 107735, 107736, 107737, 107738, 107739, 107741, 107742, 107743, 107744, 107745, 107746,
              107748, 107749, 107750, 107751, 107752, 107754, 107755, 107756, 107757, 107758, 107759, 107760, 107761, 107762, 107763, 107764, 107765, 107766, 107767, 107768, 107769, 107770, 107771, 107772,
              107773, 107774, 107775, 107776, 107777, 107778, 107779, 107783, 107784, 107785, 107786, 107787, 107789, 107790, 107798, 107799, 107805, 107812, 107915, 107916, 107930, 108023, 108022, 108133,
              108137, 108135, 108136, 108199, 108308, 108365, 108388, 108549, 108550, 108622, 108618, 108617, 109000, 109071, 109093, 109192, 109371, 109389, 109390, 109395, 109392, 109414, 109382, 109383,
              109391, 109444, 109446, 109462, 109493, 109509, 109519, 109520, 109558, 109557, 109559, 109556, 109550, 109555, 109549, 109546, 109545, 109553, 109551, 109548, 15171, 15181, 15243, 15242, 15244,
              15281, 15395, 15357, 15358, 15359, 15362, 15363, 15424, 15503, 15506, 15531, 15541, 15557, 15567, 15641, 15717, 16037, 16259, 16260, 16238, 16254, 16257, 16388, 17137, 17318, 17720, 17813, 17814,
              18527, 18531, 18550, 18960, 18963, 18965, 19222, 19574, 19568, 19571, 19661, 19705, 19677, 19680, 19683, 19809, 19808, 19810, 19814, 19818, 19812, 19815, 19813, 19819, 19821, 19823, 19824, 19825,
              19828, 19829, 19830, 19831, 19832, 19833, 19837, 19838, 19839, 19842, 19849, 19846, 19850, 19859, 19858, 19860, 19852, 19854, 19856, 19855, 35451, 35472, 35473, 35510, 35511, 35541, 35546, 35635,
              35580, 36735, 37848, 37994, 38015, 38000, 38065, 38120, 38151, 38163, 38169, 38179, 38180, 38207, 38285, 39935, 39940, 40013, 40031, 40030, 40032, 40044, 40043, 40045, 40038, 40041, 40042, 25973,
              25980, 26023, 26029, 26037, 26056, 26110, 26242, 26330, 26425, 26420, 26427, 26410, 26415, 26416, 26432, 26418, 26417, 26426, 26428, 26411, 26500, 26818, 27032, 27231, 27373, 27384, 27426, 27449,
              27504, 27610, 27703, 28292, 28548, 28547, 28565, 28577, 28576, 28578, 28570, 28567, 28568, 28569, 28574, 8599, 8602, 8632, 8606, 8607, 8616, 8613, 8618, 8619, 8622, 8624, 8631, 8641, 8643, 8647,
              8651, 8640, 8610, 8617, 8609, 8615, 8673, 8674, 8688, 8689, 8710, 8713, 8725, 8733, 8739, 9129, 9215, 9281, 9280, 9360, 9388, 9403, 9412, 9552, 9645, 9683, 9843, 9866, 10262, 10523, 10608, 10675,
              10808, 11077, 11081, 11083, 11301, 11349, 11415, 11473, 11532, 11637, 11753, 11787, 11852, 11854, 11963, 12015, 12180, 12208, 12209, 12294, 12416, 12429, 12430, 12428, 12466, 12555, 12717, 12854,
              12881, 12892, 12893, 12919, 12926, 12936, 12932, 12927, 12957, 12956, 12958, 12959, 21487, 21497, 21498, 21505, 21511, 21624, 21867, 22148, 22099, 22100, 22101, 22173, 22444, 22439, 22510, 22730,
              22733, 24089, 24091, 24173, 24199, 24204, 24208, 24224, 24302, 24286, 24348, 24350, 24349, 24406, 24408, 24398, 24404, 28589, 28602, 28603, 28605, 28600, 28604, 28621, 28634, 28635, 28633, 28680,
              28788, 28786, 28789, 28939, 28962, 28987, 29035, 30624, 30622, 30620, 30604, 30609, 30621, 30597, 30598, 30605, 30619, 30623, 30790, 30825, 32778, 32779, 32782, 33705, 33877, 33951, 34106, 34112,
              34115, 34290, 34405, 34406, 34997, 35124, 35094, 35205, 35239, 35242, 35259, 35276, 35284, 35283, 35309, 35288, 35306, 35307, 35308, 35289, 35290, 35314, 35291, 35300, 35312, 35305, 35292, 35299,
              35310, 35313, 35315, 35346, 35338, 35374, 35388, 35389, 35391, 35392, 35393, 35394, 35397, 35398, 35399, 35404, 35409, 35410, 35411, 35412, 35414, 35415, 35417, 35418, 35419, 35422, 35423, 35424,
              35439, 35438, 35440, 35437, 35436, 35433, 35434, 105452, 105472, 105473, 105474, 105475, 105476, 105477, 105478, 105479, 105480, 105481, 105482, 105460, 105461, 105462, 105463, 105464, 105465,
              105466, 105467, 105468, 105469, 105470, 105471, 105501, 105499, 105533, 105532, 105534, 105681, 106047, 106036, 106038, 106934, 106933, 106935, 106938, 106942, 106941, 106960, 106982, 106981,
              106983, 106975, 106974, 106976, 106978, 106980, 106979, 106977, 103781, 103797, 103794, 103795, 103796, 103799, 103800, 103801, 103803, 103804, 103805, 103806, 103808, 103809, 103811, 103814,
              103816, 103817, 103818, 103820, 103855, 103858, 103871, 103874, 103870, 103882, 103890, 103886, 103898, 103899, 103895, 103902, 103827, 103832, 103835, 103831, 103849, 103863, 103889, 103848,
              103917, 103923, 103931, 103937, 103943, 103948, 103949, 103950, 103947, 103946, 103953, 103957, 103959, 103961, 104025, 104028, 104029, 104078, 104350, 104403, 104407, 104432, 104436, 104598,
              104672, 104822, 104841, 104847, 104882, 104883, 104885, 104884, 105356, 105358, 105375, 105379, 105380, 105411, 105438, 105437, 105439, 105433, 105434, 105435, 105436, 19884, 19886, 19887, 19888,
              19889, 19893, 19894, 19896, 19897, 19898, 19901, 19903, 19904, 19905, 19877, 19878, 19880, 19881, 19883, 19922, 19926, 19933, 19934, 19948, 19967, 19992, 19970, 20008, 20064, 20061, 20065, 20094,
              20091, 20136, 20143, 20142, 20188, 20241, 20242, 20243, 20244, 20332, 20375, 20391, 20384, 20376, 20393, 20386, 20389, 20390, 20378, 20377, 20374, 20381, 20520, 20655, 20683, 20778, 21069, 21149,
              21150, 21151, 21152, 21249, 21246, 21293, 21310, 21329, 21350, 21354, 21351, 21357, 21356, 21359, 21367, 21368, 21369, 21376, 21386, 21404, 21387, 21393, 21402, 21407, 21408, 21409, 21411, 21412,
              21413, 21414, 21415, 21416, 21417, 21418, 21419, 21420, 21421, 21422, 21423, 21424, 21425, 21426, 21427, 21428, 21429, 21430, 21439, 21438, 21440, 21433, 21437, 21435, 112520, 112521, 112522,
              112523, 112524, 112525, 112526, 112527, 112530, 112531, 112533, 112534, 112535, 112538, 112539, 112540, 112541, 112544, 112545, 112546, 112548, 112549, 112550, 112551, 112552, 112553, 112558,
              112562, 112563, 112571, 112579, 112619, 112628, 112640, 112649, 112648, 112754, 112755, 112757, 112766, 112777, 112781, 112785, 112789, 112802, 112806, 112818, 112825, 112950, 113145, 113174,
              113173, 113175]
# Admission id (HADM_ID) of every selected row, one entry per row.
# Written as (hid, number of consecutive rows) pairs and expanded below.
_rows_per_hid = [
    (112982, 47), (133706, 51), (145181, 143), (145785, 108), (152818, 81),
    (158331, 35), (160019, 44), (162755, 87), (163261, 33), (164788, 102),
    (178300, 50), (184834, 85), (125310, 112), (157739, 53),
]
struct_hids = [
    admission_id
    for admission_id, n_rows in _rows_per_hid
    for _ in range(n_rows)
]

In [7]:
# Select rows from the R script output
# struct_idx holds positional row numbers (hence .iloc). .copy() yields an independent
# frame, so adding the 'hid' column afterwards does not touch df_struct.
df_struct_new = df_struct.iloc[struct_idx].copy()

In [8]:
# Mapping from HADM_ID to SUBJECT_ID
df_admissions = pd.read_csv(os.path.join(mimicdir, 'ADMISSIONS.csv'))
# Pair the two columns directly; iterrows() would build a full Series for every row
# only to read two fields from it.
hid_to_subject = dict(zip(df_admissions.HADM_ID, df_admissions.SUBJECT_ID))

# Validate the output file one more time by HADM_ID:
# the subject of each hard-coded admission id must equal the 'pt' value of the
# row selected at the same position.
expected_subjects = [hid_to_subject.get(hadm_id) for hadm_id in struct_hids]
assert expected_subjects == df_struct_new.pt.tolist(), (
    "SUBJECT_IDs derived from struct_hids do not match the 'pt' column of the selected rows"
)

In [9]:
# Add admission id column
# The list is assigned positionally: its i-th entry labels the i-th row of df_struct_new.
df_struct_new['hid'] = struct_hids

In [10]:
# Drop the subject id column and rearrange
# Only the four listed columns are kept, in this order; every other column (including 'pt') is left out.
df_struct_final = df_struct_new[['hid', 't', 'event', 'value']]

In [11]:
# Output the final table
# index=False keeps the DataFrame index (the original row numbers) out of the CSV.
df_struct_final.to_csv('data/timeline_i2b2_5col_pakdd2024.csv', index=False)

## 2. Generate annotations

This section creates the dataset files under `data/event_pakdd2024_cv/` using the deidentified annotations in `data/annotations_pakdd2024/`.

In [12]:
# Directory holding the annotation JSON files (one file per admission is read from it).
anno_dir = 'data/annotations_pakdd2024/'
# Root output directory; one sub-directory per cross-validation fold is created under it.
dataset_dir = 'data/event_pakdd2024_cv/'
# HADM_IDs of the admissions for which examples are generated.
hids_all = [112982, 125310, 133706, 145181, 145785, 152818, 157739, 158331, 160019, 162755, 163261, 164788, 178300, 184834]
# Cross-validation folds: each inner list is used as the dev set of one fold,
# and the admissions of the other folds form its training set.
cv_splits = [[163261, 125310, 162755], [160019, 157739, 184834], [158331, 145181, 164788], [145785, 152818, 112982], [133706, 178300]]
# Token budget used when neighbouring sentences are added as context around an event.
max_tokens = 110

In [13]:
# HADM_ID to admission time
# ADMITTIME is kept exactly as read from ADMISSIONS.csv (no date parsing is requested
# in read_csv). Zipping the two columns avoids the per-row Series that iterrows() creates.
hid_to_admit_time = dict(zip(df_admissions.HADM_ID, df_admissions.ADMITTIME))

In [14]:
# Mapping from HADM_ID to discharge-summary text
df_notes = pd.read_csv(os.path.join(mimicdir, 'NOTEEVENTS.csv'), low_memory=False)
df_disch = df_notes[df_notes.CATEGORY == 'Discharge summary']
# Every double quote in the note is doubled (" -> "").
# NOTE(review): presumably the annotation character offsets were computed on text
# escaped this way -- confirm before changing it.
# When an admission has several discharge-summary rows, the last one in file order wins.
hid_to_note = {
    hadm_id: note_text.replace('"', '""')
    for hadm_id, note_text in zip(df_disch.HADM_ID, df_disch.TEXT)
}

In [15]:
# Spacy pipeline
nlp = spacy.load("en_core_web_sm")
# Extra infix patterns (regular-expression fragments) placed around spaCy's defaults.
# The string VALUES are identical to the previous spelling; only the literals changed:
# patterns containing backslashes are now raw strings (or use an explicit "\\"), because
# "\(", "\." and "\d" are invalid escape sequences in a normal string literal and trigger
# a DeprecationWarning/SyntaxWarning on recent Python versions.
# NOTE(review): '-[**' and '].' are not regex-escaped, so once the fragments are combined
# the '[' presumably opens a character class. They are deliberately left untouched here,
# since any change would alter the tokenization and therefore the generated dataset.
infixes = (
    ['-[**']
    + list(nlp.Defaults.infixes)
    + [r'\(']
    + ['\\.\n']  # backslash, dot, then a real newline character (same value as before)
    + [r'-\d+']
    + ['-<']
    + [':']
    + ['].']
)
infix_regex = compile_infix_regex(infixes)
nlp.tokenizer.infix_finditer = infix_regex.finditer

In [16]:
# Match spacy span using char bounds
def get_event_doc_span(st, ed, doc, e):
    """Map the character range ``[st, ed)`` of ``doc.text`` to a token span.

    The exact range is tried first. If ``doc.char_span`` cannot align it to
    token boundaries (it returns ``None``), slightly shifted ranges are tried
    in a fixed order and the first one that aligns is used.

    Args:
        st: Start character offset of the annotation.
        ed: End character offset (exclusive) of the annotation.
        doc: spaCy ``Doc``; only ``text``, ``char_span`` and token slicing
            are used.
        e: The annotation being aligned. Not used by the function; kept so
            that existing callers keep working.

    Returns:
        ``[start_token, end_token]`` (end exclusive) of the aligned span, or
        ``None`` when the annotated text is a single newline or no candidate
        range aligns.

    Raises:
        AssertionError: if the text of the token span differs from the
            character range it was built from.
    """
    if doc.text[st:ed] == "\n":
        return None

    # (shift of start, shift of end), in the order in which they are tried.
    shifts = (
        (0, -1),
        (0, 1),
        (1, 0),
        (1, -1),
        (-1, 1),
        (-1, -1),
        (-2, 0),  # e.g. "1)Bilateral LE doppler"
    )

    new_st, new_ed = st, ed
    span = doc.char_span(new_st, new_ed)
    if span is None:
        text_len = len(doc.text)
        for shift_st, shift_ed in shifts:
            new_st, new_ed = st + shift_st, ed + shift_ed
            # Skip empty ranges and ranges that leave the text: a shifted
            # offset must never be negative or beyond the end of the document.
            if not (0 <= new_st < new_ed <= text_len):
                continue
            span = doc.char_span(new_st, new_ed)
            if span is not None:
                break
        if span is None:
            return None

    doc_span = [span.start, span.end]
    # Sanity check (previously this path called sys.exit() although sys was
    # never imported): the tokens must spell exactly the matched characters.
    assert doc[doc_span[0]:doc_span[1]].text == doc.text[new_st:new_ed], (
        f"token span {doc_span} does not match characters {new_st}:{new_ed}"
    )
    return doc_span

# Extract spacy sentences and annotations aligned to each sentence
def get_sentence_entities(one_js, doc):
    """Split ``doc`` into sentences and attach every alignable annotation to one.

    Args:
        one_js: List of annotation dicts of one document (may be empty). Each
            dict must provide ``text_position`` (with ``start``/``end``
            character offsets), ``timeline_label_relative`` and
            ``selected_row_idxs``. Dicts that can be aligned to tokens are
            modified in place: ``doc_span`` and ``anno_level`` are added.
        doc: spaCy ``Doc`` of the annotated text.

    Returns:
        ``(sentence_tokens, entities_out)`` - two lists with one entry per
        sentence of ``doc``. ``sentence_tokens[i]`` is the list of token
        texts of sentence ``i``. ``entities_out[i]`` lists the annotations
        whose first token precedes the end of sentence ``i`` (and that were
        not consumed by an earlier sentence), each as ``[start_token,
        last_token, timeline_label_relative, selected_row_idxs, anno_level]``
        with document-level token indices and an inclusive ``last_token``.

    Raises:
        AssertionError: if some aligned annotation was not assigned to any
            sentence.
    """
    # Keep only the annotations that can be aligned to token boundaries.
    entities = []
    for e in one_js:
        st, ed = e['text_position']['start'], e['text_position']['end']
        doc_span = get_event_doc_span(st, ed, doc, e)
        if doc_span is not None:
            e['doc_span'] = doc_span
            e['anno_level'] = 'phrase'
            entities.append(e)

    # Stable sort by first token, so one pass over the sentences can consume them in order.
    entities.sort(key=lambda x: x['doc_span'][0])

    sentence_tokens = []
    entities_out = []
    e_idx = 0
    for sent in doc.sents:
        sentence_tokens.append([token.text for token in sent])
        sent_entities = []
        while e_idx < len(entities) and entities[e_idx]['doc_span'][0] < sent.end:
            e_v = entities[e_idx]
            # Turn the exclusive end token index into an inclusive one.
            e_v['doc_span'][-1] = e_v['doc_span'][-1] - 1
            sent_entities.append(
                e_v['doc_span']
                + [e_v['timeline_label_relative']]
                + [e_v['selected_row_idxs']]
                + [e_v['anno_level']]
            )
            e_idx += 1
        entities_out.append(sent_entities)

    assert e_idx == len(entities), "some annotations were not assigned to a sentence"
    return sentence_tokens, entities_out

def get_abs_time_str(rel_time_sec, admit_time_str):
    """Turn an offset from admission into an absolute timestamp string.

    ``admit_time_str`` is parsed as ``YYYY-MM-DD HH:MM:SS``, ``rel_time_sec``
    seconds are added (negative values go back in time) and the result is
    formatted the same way.
    """
    time_format = "%Y-%m-%d %H:%M:%S"
    admitted_at = datetime.strptime(admit_time_str, time_format)
    occurred_at = admitted_at + timedelta(seconds=rel_time_sec)
    return occurred_at.strftime(time_format)

def get_timeline_label(timeline_label_relative, admit_time_str):
    """Return a copy of the label with its two bounds made absolute.

    The first two entries are the lower and upper bound. A bound that is a
    string containing ``'inf'`` is kept as is; any other bound is converted
    with ``get_abs_time_str`` relative to ``admit_time_str``. The input is
    not modified.
    """
    label = timeline_label_relative.copy()
    # Read both bounds before writing anything back.
    bounds = (label[0], label[1])
    for position, bound in enumerate(bounds):
        unbounded = isinstance(bound, str) and 'inf' in bound
        if not unbounded:
            label[position] = get_abs_time_str(bound, admit_time_str)
    return label

In [17]:
# Builds one example per aligned annotation: the tokens of the sentence that holds
# the annotation plus neighbouring sentences as context, sized against max_tokens.
# max_tokens is re-assigned here (it is also set in the configuration cell).
max_tokens = 110
# One dict per annotation, accumulated over all admissions.
examples = []

for i, hid in enumerate(hids_all):
    # NOTE(review): "annotatinos" is the spelling used to look the file up on disk;
    # presumably it matches the shipped file names -- confirm before "fixing" it.
    anno_fname = f"annotatinos_{hid}_yes_structured.json"
    with open(os.path.join(anno_dir, anno_fname), 'r') as f:
        one_js = json.load(f)
    original_text = hid_to_note[hid]
    admit_time = hid_to_admit_time[hid]
    # print("\nhid", hid, "text_len", len(original_text))
    doc = nlp(original_text)

    # sentences: list of sentences split by spacy (each a list of token texts)
    # entities: annotations attached to each sentence
    sentences, entities = get_sentence_entities(one_js, doc)

    # Number of tokens in the sentences preceding the current one; used to turn
    # document-level token indices into indices relative to the current sentence.
    sentence_start = 0
    for sent_idx, pevents in enumerate(entities):
        for anno in pevents:
            # anno = [start, end, timeline_label_relative, selected_row_idxs, anno_level]
            start, end = anno[0], anno[1]
            # The stored label is a pair: (label type, relative time bounds).
            anno_type, timeline_label_relative = anno[2]
            selected_row_idxs = anno[3]
            anno_level = anno[4]

            cur_sentence = sentences[sent_idx].copy()
            subj_start = start - sentence_start
            subj_end = end - sentence_start

            # First, append next sentence until the end is included in the sentence
            # (or until the window already holds max_tokens tokens).
            sent_idx2 = sent_idx
            while subj_end > len(cur_sentence) and len(cur_sentence) < max_tokens:
                sent_idx2 += 1
                cur_sentence = cur_sentence + sentences[sent_idx2]

            # Increase left context: previous sentences may use up to half of the remaining budget.
            margin = (max_tokens - len(cur_sentence)) // 2
            left = 0
            sent_idx_left = sent_idx-1
            for sent_idx_left in range(sent_idx-1, -1, -1):
                if left + len(sentences[sent_idx_left]) <= margin:
                    cur_sentence = sentences[sent_idx_left] + cur_sentence
                    left += len(sentences[sent_idx_left])
                    # Prepending tokens shifts the event position inside the window.
                    subj_start += len(sentences[sent_idx_left])
                    subj_end += len(sentences[sent_idx_left])
                else:
                    # Step back so that the second left pass below starts again at
                    # the sentence that did not fit.
                    sent_idx_left += 1
                    break
            # Increase right context: following sentences may use the whole remaining budget.
            margin = max_tokens - len(cur_sentence)
            right = 0
            for sent_idx_right in range(sent_idx2+1, len(sentences)):
                if right + len(sentences[sent_idx_right]) <= margin:
                    cur_sentence = cur_sentence + sentences[sent_idx_right]
                    right += len(sentences[sent_idx_right])
                else:
                    break
            # Increase left context again with whatever budget is left.
            # NOTE(review): `left` still counts the tokens prepended in the first pass
            # although `margin` was recomputed from the current window length -- confirm
            # this is intended before changing it (it affects the generated examples).
            margin = max_tokens - len(cur_sentence)
            for sent_idx_left2 in range(sent_idx_left-1, -1, -1):
                if left + len(sentences[sent_idx_left2]) <= margin:
                    cur_sentence = sentences[sent_idx_left2] + cur_sentence
                    left += len(sentences[sent_idx_left2])
                    subj_start += len(sentences[sent_idx_left2])
                    subj_end += len(sentences[sent_idx_left2])
                else:
                    break

            # Clamp the (inclusive) end index to the last token of the window.
            if subj_end >= len(cur_sentence):
                subj_end = len(cur_sentence) - 1

            examples.append({
                'docid': hid,
                'doc_admit_time': admit_time,
                # start/end in the id are the document-level token indices.
                'id': f'{hid}@{admit_time}::({sent_idx})-({start},{end})',
                # Relative bounds converted to absolute timestamps ('inf' bounds are kept).
                'pevent': get_timeline_label(timeline_label_relative, admit_time),
                'subj_start': subj_start,
                'subj_end': subj_end,
                'subj_label_type': anno_type,
                'token': cur_sentence,
                'sent_start': 0,
                'sent_end': len(cur_sentence),
                'selected_row_idxs': selected_row_idxs,
                'anno_level': anno_level,
            })
        sentence_start += len(sentences[sent_idx])

print(f"{len(examples)} examples")

3472 examples


In [18]:
# Write one directory per cross-validation fold (train.json, dev.json and an
# empty test.json). The fold count follows cv_splits instead of a hard-coded 5.

# All admission ids, in fold order.
hids_all2 = [hid for split in cv_splits for hid in split]
# Every admission must belong to exactly one fold.
assert sorted(hids_all2) == sorted(hids_all), "cv_splits must be a partition of hids_all"

for cv_idx, dev_hadm_ids in enumerate(cv_splits):
    output_dir = os.path.join(dataset_dir, str(cv_idx))
    print(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    # The dev set is the current fold; the training set is everything else.
    train_hadm_ids = [h for h in hids_all2 if h not in dev_hadm_ids]

    train_examples = [ex for ex in examples if ex['docid'] in train_hadm_ids]
    with open(os.path.join(output_dir, 'train.json'), 'w') as fd:
        json.dump(train_examples, fd)
    print(f'train: {len(train_examples)}')

    dev_examples = [ex for ex in examples if ex['docid'] in dev_hadm_ids]
    with open(os.path.join(output_dir, 'dev.json'), 'w') as fd:
        json.dump(dev_examples, fd)
    print(f'dev: {len(dev_examples)}')

    # Each example must land in exactly one of the two files.
    assert len(train_examples) + len(dev_examples) == len(examples)

    # No held-out test set is produced; an empty list is written as test.json.
    with open(os.path.join(output_dir, 'test.json'), 'w') as fd:
        json.dump([], fd)

data/event_pakdd2024_cv/0
train: 2808
dev: 664
data/event_pakdd2024_cv/1
train: 2819
dev: 653
data/event_pakdd2024_cv/2
train: 2656
dev: 816
data/event_pakdd2024_cv/3
train: 2696
dev: 776
data/event_pakdd2024_cv/4
train: 2909
dev: 563
