In [2]:
import os
import pandas as pd
import re
import torch
from joblib import dump, load
pd.set_option('display.max_colwidth', None)

# Pre-processing test data for temporal relation extraction in medical texts

This approach uses data from the i2b2 dataset, which was published as part of a shared task in 2012 (see Sun et al. 2013).

* The test-dataset of the i2b2 challenge contains 120 clinical notes and is published in ./i2b2_dataset/ground_truth/merged_i2b2 as [note_id].xml.txt files. In this notebook, the merged_i2b2 test-data is used, as in this dataset the temporal links are merged to 3 temporal classes (BEFORE, OVERLAP, AFTER), instead of using all 8 classes.

* labels: the tlink labels for the 120 clinical notes are in the same folder as the clinical notes as [note_id].xml.tlink files

When loading the test data, the named entities (NEs) annotated in the test data are taken as given, and the sentences are tagged according to these labeled NEs. This reduces temporal-link errors caused by wrongly labeled or missing NEs. In the training data, however, the NEs are labeled using keyword lists (clinical events) and regular expressions (temporal expressions).

Additionally, a training dataset containing 190 labeled clinical notes was released within the i2b2 shared task. These data could be used either as a dev dataset or as a training dataset. If they are used as a training dataset, the labeled NE tags could be taken as given, so that the work focuses only on temporal relation extraction. In this approach, the dev data were not employed.

In [3]:
# Directory holding the merged i2b2 test notes ([note_id].xml.txt) and their
# TLINK annotations ([note_id].xml.tlink). Set the I2B2_TEST_DIR environment
# variable (e.g. to "./data/ground_truth/merged_i2b2") to run the notebook on
# another machine; the previous hard-coded location remains the default.
path = os.environ.get(
    "I2B2_TEST_DIR",
    "C:/Users/stephanie/Uni/Data_Science_project/Code/data/ground_truth/merged_i2b2",
)
all_files = os.listdir(path)

txt_files = list()
tlink_files = list()
# Rows are collected in plain lists and turned into DataFrames once at the
# end: DataFrame.append copied the whole frame for every line and no longer
# exists in pandas >= 2.0.
note_rows = []
tlink_rows = []
for f in all_files:
    if f.endswith('.txt'):
        seen_files, rows, column = txt_files, note_rows, 'SENT'
    elif f.endswith('.tlink'):
        seen_files, rows, column = tlink_files, tlink_rows, 'TLINK'
    else:
        continue
    seen_files.append(f)
    # the note id is the part of the file name in front of '.xml'
    note_id = f.split('.xml')[0]
    with open(os.path.join(path, f)) as fd:
        # one row per line of the file (blank lines included), stripped
        rows.extend({'note_id': note_id, column: line.strip()} for line in fd)

test_notes = pd.DataFrame(note_rows, columns=['note_id', 'SENT'])
test_tlinks = pd.DataFrame(tlink_rows, columns=['note_id', 'TLINK'])

In [4]:
# Lower-cased working copy of the notes; sent_id numbers the lines of each
# note consecutively, starting at 0.
test_n = test_notes.assign(SENT=test_notes.SENT.apply(str.lower))
test_n['sent_id'] = test_n.groupby('note_id').cumcount()
test_n.head(15)

Unnamed: 0,note_id,SENT,sent_id
0,101,admission date :,0
1,101,07/10/1991,1
2,101,discharge date :,2
3,101,07/18/1991,3
4,101,procedures :,4
5,101,thoracentesis was performed on 7-12-91 .,5
6,101,reason for admission :,6
7,101,the patient is an 85-year-old white male with a history of ischemic bowel status post recent admission for urosepsis and c. difficile colitis .,7
8,101,"he returns from the nursing home with fever , leukocytosis , and azotemia .",8
9,101,present illness :,9


## Preprocess TLINKs

In [5]:
# Show the first rows of the raw TLINK annotations before any parsing.
test_tlinks.head()

Unnamed: 0,note_id,TLINK
0,101,"EVENT=""An abdominal CAT scan"" 54:0 54:3||EVENT=""The abdominal CAT scan"" 62:0 62:3||type=""OVERLAP"""
1,101,"EVENT=""sepsis"" 47:12 47:12||EVENT=""sepsis"" 60:10 60:10||type=""OVERLAP"""
2,101,"EVENT=""revealed"" 30:1 30:1||EVENT=""Thoracentesis"" 30:0 30:0||type=""OVERLAP"""
3,101,"EVENT=""a pH"" 30:2 30:3||EVENT=""Thoracentesis"" 30:0 30:0||type=""BEFORE"""
4,101,"EVENT=""glucose"" 30:7 30:7||EVENT=""Thoracentesis"" 30:0 30:0||type=""BEFORE"""


Information from the TLINKs is extracted in order to merge them with sentences from the clinical notes.

In [6]:
# Lower-cased copy of the TLINK table; the whole annotation line is
# lower-cased, i.e. tag names and relation type as well as the entity text.
test_tl = test_tlinks.assign(TLINK=test_tlinks.TLINK.apply(str.lower))

def extract_NE(x):
    """Return the texts of the first two quoted values of a TLINK line.

    The line is cut at every ``="``; the second and third pieces begin with
    the first and second quoted value, each of which is kept up to its
    closing double quote.

    Parameters
    ----------
    x : str
        One TLINK annotation line.

    Returns
    -------
    list of str
        ``[first_value, second_value]``.
    """
    pieces = x.split('="')
    return [pieces[idx].partition('"')[0] for idx in (1, 2)]

# NE column: the pair of entity texts extracted from every TLINK line
test_tl['NE'] = test_tl.TLINK.apply(extract_NE)

test_tl.head()

Unnamed: 0,note_id,TLINK,NE
0,101,"event=""an abdominal cat scan"" 54:0 54:3||event=""the abdominal cat scan"" 62:0 62:3||type=""overlap""","[an abdominal cat scan, the abdominal cat scan]"
1,101,"event=""sepsis"" 47:12 47:12||event=""sepsis"" 60:10 60:10||type=""overlap""","[sepsis, sepsis]"
2,101,"event=""revealed"" 30:1 30:1||event=""thoracentesis"" 30:0 30:0||type=""overlap""","[revealed, thoracentesis]"
3,101,"event=""a ph"" 30:2 30:3||event=""thoracentesis"" 30:0 30:0||type=""before""","[a ph, thoracentesis]"
4,101,"event=""glucose"" 30:7 30:7||event=""thoracentesis"" 30:0 30:0||type=""before""","[glucose, thoracentesis]"


In [7]:
# Separate copy so that test_tl keeps every link unchanged.
red_tlinks = test_tl.copy()      # reduced links (only those per sentence)

def ne_same_sent(x):
    """Return the TLINK line if both entities start on the same line, else None.

    The line is cut after every closing quote followed by a space; the pieces
    that follow the two entity texts start with ``line:token`` positions, and
    only the line numbers in front of the first colon are compared.
    """
    after_quotes = x.split('\" ')
    first_line = after_quotes[1].split(':')[0]
    second_line = after_quotes[2].split(':')[0]
    return x if first_line == second_line else None
    
# Links whose entities start on different lines become None here
red_tlinks.TLINK = red_tlinks.TLINK.apply(ne_same_sent)

red_tlinks.head(10)

Unnamed: 0,note_id,TLINK,NE
0,101,,"[an abdominal cat scan, the abdominal cat scan]"
1,101,,"[sepsis, sepsis]"
2,101,"event=""revealed"" 30:1 30:1||event=""thoracentesis"" 30:0 30:0||type=""overlap""","[revealed, thoracentesis]"
3,101,"event=""a ph"" 30:2 30:3||event=""thoracentesis"" 30:0 30:0||type=""before""","[a ph, thoracentesis]"
4,101,"event=""glucose"" 30:7 30:7||event=""thoracentesis"" 30:0 30:0||type=""before""","[glucose, thoracentesis]"
5,101,"event=""white blood cell count"" 30:10 30:13||event=""thoracentesis"" 30:0 30:0||type=""before""","[white blood cell count, thoracentesis]"
6,101,"event=""chest tube"" 30:22 30:23||event=""thoracentesis"" 30:0 30:0||type=""overlap""","[chest tube, thoracentesis]"
7,101,,"[loculated pleural effusions, c. difficile colitis]"
8,101,,"[serratia urosepsis, urosepsis]"
9,101,,"[admission, hospitalized]"


In [8]:
# Keep only the links that survived the same-sentence check (TLINK not None).
keeper_id = red_tlinks['TLINK'].notna()
# Explicit copy: the frame gets new/overwritten columns afterwards, which on a
# boolean-indexed selection would raise pandas' SettingWithCopyWarning.
red_tlinks = red_tlinks[keeper_id].copy()
len(red_tlinks)

8747

In [9]:
# insert a column with the sent_id

def ext_sent_id(x):
    """Return the line number shared by both entities of a TLINK, minus one.

    The positions are parsed exactly as in the same-sentence check: the text
    following each entity's closing quote starts with ``line:token``.
    Returns None when the two entities start on different lines.
    """
    after_quotes = x.split('\" ')
    first_line = after_quotes[1].split(':')[0]
    second_line = after_quotes[2].split(':')[0]
    if first_line != second_line:
        return None
    return int(first_line) - 1

red_tlinks['sent_id'] = red_tlinks.TLINK.apply(ext_sent_id)
red_tlinks

Unnamed: 0,note_id,TLINK,NE,sent_id
2,101,"event=""revealed"" 30:1 30:1||event=""thoracentesis"" 30:0 30:0||type=""overlap""","[revealed, thoracentesis]",29
3,101,"event=""a ph"" 30:2 30:3||event=""thoracentesis"" 30:0 30:0||type=""before""","[a ph, thoracentesis]",29
4,101,"event=""glucose"" 30:7 30:7||event=""thoracentesis"" 30:0 30:0||type=""before""","[glucose, thoracentesis]",29
5,101,"event=""white blood cell count"" 30:10 30:13||event=""thoracentesis"" 30:0 30:0||type=""before""","[white blood cell count, thoracentesis]",29
6,101,"event=""chest tube"" 30:22 30:23||event=""thoracentesis"" 30:0 30:0||type=""overlap""","[chest tube, thoracentesis]",29
...,...,...,...,...
27647,88,"event=""inpatient cmed facility"" 27:10 27:12||event=""discharged"" 27:8 27:8||type=""after""","[inpatient cmed facility, discharged]",26
27648,88,"event=""stable"" 27:14 27:14||event=""inpatient cmed facility"" 27:10 27:12||type=""before""","[stable, inpatient cmed facility]",26
27650,88,"event=""oob"" 29:2 29:2||event=""assist"" 29:4 29:4||type=""overlap""","[oob, assist]",28
27652,88,"timex3=""one hour"" 8:35 8:36||event=""pills"" 8:43 8:43||type=""after""","[one hour, pills]",7


In [10]:
def ext_tlink(x):
    """Return the value of the ``type="..."`` attribute of a TLINK line."""
    after_type = x.split('type="', 1)[1]
    return after_type.split('"', 1)[0]
    
# From here on the TLINK column holds only the relation type
red_tlinks.TLINK = red_tlinks.TLINK.apply(ext_tlink)
red_tlinks

Unnamed: 0,note_id,TLINK,NE,sent_id
2,101,overlap,"[revealed, thoracentesis]",29
3,101,before,"[a ph, thoracentesis]",29
4,101,before,"[glucose, thoracentesis]",29
5,101,before,"[white blood cell count, thoracentesis]",29
6,101,overlap,"[chest tube, thoracentesis]",29
...,...,...,...,...
27647,88,after,"[inpatient cmed facility, discharged]",26
27648,88,before,"[stable, inpatient cmed facility]",26
27650,88,overlap,"[oob, assist]",28
27652,88,after,"[one hour, pills]",7


## Merge sentences with TLINKS

In [11]:
# Reminder of the sentence table that the links are joined to.
test_n.head()

Unnamed: 0,note_id,SENT,sent_id
0,101,admission date :,0
1,101,07/10/1991,1
2,101,discharge date :,2
3,101,07/18/1991,3
4,101,procedures :,4


In [12]:
# Outer join of sentences and links on (note_id, sent_id): sentences without
# a link keep empty TLINK/NE cells, sentences with several links are repeated.
merged_data = test_n.merge(red_tlinks, how='outer', on=['note_id', 'sent_id'])
merged_data.head()

Unnamed: 0,note_id,SENT,sent_id,TLINK,NE
0,101,admission date :,0,,
1,101,07/10/1991,1,,
2,101,discharge date :,2,,
3,101,07/18/1991,3,,
4,101,procedures :,4,,


In this notebook, only temporal links within sentences are considered. Of course this can be easily changed.

In [13]:
# filter only sentences with 2 NE in one sentence, i.e. rows that carry a TLINK
no_tlink = merged_data.TLINK.isna()
# Copy AFTER filtering: clean_data is modified in the following cells, and a
# boolean-indexed selection that is written to raises SettingWithCopyWarning.
clean_data = merged_data[~no_tlink].copy()

In [14]:
# First rows of the sentence/link pairs that remain after filtering.
clean_data.head()

Unnamed: 0,note_id,SENT,sent_id,TLINK,NE
5,101,thoracentesis was performed on 7-12-91 .,5,overlap,"[thoracentesis, 7-12-91]"
7,101,the patient is an 85-year-old white male with a history of ischemic bowel status post recent admission for urosepsis and c. difficile colitis .,7,before,"[ischemic bowel, urosepsis]"
8,101,the patient is an 85-year-old white male with a history of ischemic bowel status post recent admission for urosepsis and c. difficile colitis .,7,before,"[urosepsis, admission]"
9,101,the patient is an 85-year-old white male with a history of ischemic bowel status post recent admission for urosepsis and c. difficile colitis .,7,before,"[c. difficile colitis, admission]"
10,101,"he returns from the nursing home with fever , leukocytosis , and azotemia .",8,overlap,"[leukocytosis, fever]"


In [15]:
# Remove asterisks from the sentences; '*' is used as the NE marker when the
# sentences are annotated. Raw string: '\*' in a normal string literal is an
# invalid escape sequence and triggers a Deprecation/SyntaxWarning.
clean_data.SENT = clean_data.SENT.replace(to_replace=r'\*', value='', regex=True)

In [16]:
# Helper column pairing each sentence with its two entity texts
clean_data['sent_NE'] = [(sent, nes) for sent, nes in zip(clean_data.SENT, clean_data.NE)]

In [17]:
# Check the new sent_NE column of (sentence, NE list) tuples.
clean_data.head()

Unnamed: 0,note_id,SENT,sent_id,TLINK,NE,sent_NE
5,101,thoracentesis was performed on 7-12-91 .,5,overlap,"[thoracentesis, 7-12-91]","(thoracentesis was performed on 7-12-91 ., [thoracentesis, 7-12-91])"
7,101,the patient is an 85-year-old white male with a history of ischemic bowel status post recent admission for urosepsis and c. difficile colitis .,7,before,"[ischemic bowel, urosepsis]","(the patient is an 85-year-old white male with a history of ischemic bowel status post recent admission for urosepsis and c. difficile colitis ., [ischemic bowel, urosepsis])"
8,101,the patient is an 85-year-old white male with a history of ischemic bowel status post recent admission for urosepsis and c. difficile colitis .,7,before,"[urosepsis, admission]","(the patient is an 85-year-old white male with a history of ischemic bowel status post recent admission for urosepsis and c. difficile colitis ., [urosepsis, admission])"
9,101,the patient is an 85-year-old white male with a history of ischemic bowel status post recent admission for urosepsis and c. difficile colitis .,7,before,"[c. difficile colitis, admission]","(the patient is an 85-year-old white male with a history of ischemic bowel status post recent admission for urosepsis and c. difficile colitis ., [c. difficile colitis, admission])"
10,101,"he returns from the nursing home with fever , leukocytosis , and azotemia .",8,overlap,"[leukocytosis, fever]","(he returns from the nursing home with fever , leukocytosis , and azotemia ., [leukocytosis, fever])"


In [18]:
def ann_sent(x):
    """Wrap both entity texts of a ``(sentence, [NE1, NE2])`` pair in asterisks.

    Each entity is marked with ``str.replace``, so every occurrence of its
    text in the sentence is wrapped, and the second entity is replaced in the
    sentence that already carries the markers of the first one. Identical or
    overlapping entity texts therefore yield more than four asterisks.

    Parameters
    ----------
    x : tuple
        ``(sentence, entities)`` where ``entities[0]`` and ``entities[1]``
        are the entity texts.

    Returns
    -------
    str
        The sentence with the entity texts surrounded by ``*``.
    """
    marked, entities = x
    for entity in (entities[0], entities[1]):
        marked = marked.replace(entity, '*' + entity + '*')
    return marked

# Replace each sentence by its annotated version, then drop the helper column
clean_data.SENT = clean_data.sent_NE.apply(ann_sent)
clean_data = clean_data.drop(columns='sent_NE')

In [19]:
# Number of sentence/link pairs after annotation.
len(clean_data)

8747

In [20]:
# Inspect the sentences with the entities wrapped in asterisks.
clean_data.head()

Unnamed: 0,note_id,SENT,sent_id,TLINK,NE
5,101,*thoracentesis* was performed on *7-12-91* .,5,overlap,"[thoracentesis, 7-12-91]"
7,101,the patient is an 85-year-old white male with a history of *ischemic bowel* status post recent admission for *urosepsis* and c. difficile colitis .,7,before,"[ischemic bowel, urosepsis]"
8,101,the patient is an 85-year-old white male with a history of ischemic bowel status post recent *admission* for *urosepsis* and c. difficile colitis .,7,before,"[urosepsis, admission]"
9,101,the patient is an 85-year-old white male with a history of ischemic bowel status post recent *admission* for urosepsis and *c. difficile colitis* .,7,before,"[c. difficile colitis, admission]"
10,101,"he returns from the nursing home with *fever* , *leukocytosis* , and azotemia .",8,overlap,"[leukocytosis, fever]"


In [21]:
# filter only sentences with 2 NE marked
# (done on a copy, so clean_data keeps all annotated sentences)
clean_data_2ne = clean_data.copy()

def only_2NE(x):
    """Return the sentence if it contains exactly four asterisks, else None.

    Four asterisks correspond to two entities wrapped as ``*entity*``.
    """
    return x if x.count('*') == 4 else None

# Sentences without exactly two marked entities become None
clean_data_2ne.SENT = clean_data.SENT.apply(only_2NE)

In [22]:
# Drop the rows whose sentence was set to None by the two-entity check
not_2NE = clean_data_2ne.SENT.isna()
clean_data_2ne = clean_data_2ne.dropna(subset=['SENT'])
clean_data_2ne.shape[0]

8411

In [23]:
# save annotated sentences in joblib format
# Only the SENT column is written; the file is opened in binary mode and the
# handle is passed to joblib's dump with zlib compression.
pickle_file = './test_data.joblib'
with open(pickle_file, 'wb') as f:
    dump(clean_data_2ne.SENT, f, compress='zlib')

In [24]:
y_test = clean_data_2ne.TLINK

# One-hot encoding of the relation type; positions are [after, before, overlap].
# A label is matched by its prefix, in this order.
ONE_HOT_BY_PREFIX = (
    ('after', [1, 0, 0]),
    ('before', [0, 1, 0]),
    ('overlap', [0, 0, 1]),
)

labels = []
for l in y_test:
    for prefix, one_hot in ONE_HOT_BY_PREFIX:
        if l.startswith(prefix):
            labels.append(list(one_hot))
            break
    else:
        # Fail here instead of appending an empty list, which would only
        # surface later as a ragged-input error when the tensor is built.
        raise ValueError('unexpected TLINK type: {!r}'.format(l))

In [25]:
# Convert the list of one-hot rows into a tensor (one row per entry of labels).
y_test = torch.tensor(labels)
# save labels in joblib format
# Same mechanism as for the sentences: binary file handle passed to joblib's
# dump with zlib compression.
pickle_file4 = './test_data_labels.joblib'
with open(pickle_file4, 'wb') as f:
    dump(y_test, f, compress='zlib')

# References

Sun, W., Rumshisky, A., & Uzuner, O. (2013). Evaluating temporal relations in clinical text: 2012 i2b2 challenge. Journal of the American Medical Informatics Association, 20(5), 806-813.

Sohn, S., Wagholikar, K. B., Li, D., Jonnalagadda, S. R., Tao, C., Komandur Elayavilli, R., & Liu, H. (2013). Comprehensive temporal information detection from clinical text: medical events, time, and TLINK identification. Journal of the American Medical Informatics Association, 20(5), 836-842.