# Obtain contextual embeddings of sentences containing 2 NEs each

In this Jupyter-notebook, contextualised embeddings are obtained of sentences with 2 NEs tagged, which will be further used for temporal relation extraction. This approach builds on Zhou et al. (2021).

In [7]:
import torch
import numpy as np
import pandas as pd
import torch.nn as nn
from joblib import load, dump 
import re
from tqdm.notebook import tqdm

import transformers
from sklearn.metrics import *
from transformers import AdamW
from tqdm.notebook import tqdm
from scipy.special import softmax
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split as tts
from transformers import BertTokenizerFast, BertConfig, BertForSequenceClassification, AutoModel, BertModel, BertConfig, AutoConfig
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

pd.set_option('display.max_colwidth', None)

In [8]:
# Define the device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [9]:
# load file:
# with open('./test_data.joblib', 'rb') as f:    # embeddings test data
with open('./ast_ann_sent.joblib', 'rb') as f:    # embeddings training data
    ast_ann_sent = load(f)
data_ast = pd.DataFrame(ast_ann_sent)

In [10]:
data_ast.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SENT
Unnamed: 0_level_1,note_id,sent_id,Unnamed: 3_level_1
0,0,0,admission date *2151-7-16* *discharge* date 2151-8-4 service addendum radiologic studies radiologic studies also included a chest ct which confirmed cavitary lesions in the left lung apex consistent with infectious process tuberculosis
2,0,0,admission date *2151-7-16* discharge date 2151-8-4 service addendum radiologic studies radiologic studies also included a chest ct which confirmed cavitary lesions in the left lung apex consistent with infectious process *tuberculosis*
3,0,0,admission date 2151-7-16 *discharge* date *2151-8-4* service addendum radiologic studies radiologic studies also included a chest ct which confirmed cavitary lesions in the left lung apex consistent with infectious process tuberculosis
4,0,0,admission date 2151-7-16 *discharge* date 2151-8-4 service addendum radiologic studies radiologic studies also included a chest ct which confirmed cavitary lesions in the left lung apex consistent with infectious process *tuberculosis*
5,0,0,admission date 2151-7-16 discharge date *2151-8-4* service addendum radiologic studies radiologic studies also included a chest ct which confirmed cavitary lesions in the left lung apex consistent with infectious process *tuberculosis*


In [11]:
# Construct a BERT tokenizer based on WordPiece
bert_tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [12]:
# A sanity check of the tokenizer
encoded_instance = bert_tokenizer.batch_encode_plus([data_ast.iloc[0].SENT], padding=True)
print(encoded_instance)

{'input_ids': [[101, 9634, 3058, 1008, 17405, 2487, 1011, 1021, 1011, 2385, 1008, 1008, 11889, 1008, 3058, 17405, 2487, 1011, 1022, 1011, 1018, 2326, 5587, 10497, 2819, 2557, 27179, 2913, 2557, 27179, 2913, 2036, 2443, 1037, 3108, 14931, 2029, 4484, 6187, 28403, 2854, 22520, 1999, 1996, 2187, 11192, 13450, 8335, 2007, 16514, 2832, 15877, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}


In [13]:
print("Original text:", data_ast.iloc[0].SENT)
print("BERT BPEs:", bert_tokenizer.convert_ids_to_tokens(encoded_instance["input_ids"][0]))
a = bert_tokenizer.batch_encode_plus([data_ast.iloc[0].SENT], padding=True)
tokens = bert_tokenizer.convert_ids_to_tokens(encoded_instance["input_ids"][0])
print(len(tokens))

Original text:  admission date *2151-7-16* *discharge* date 2151-8-4 service addendum radiologic studies radiologic studies also included a chest ct which confirmed cavitary lesions in the left lung apex consistent with infectious process tuberculosis 
BERT BPEs: ['[CLS]', 'admission', 'date', '*', '215', '##1', '-', '7', '-', '16', '*', '*', 'discharge', '*', 'date', '215', '##1', '-', '8', '-', '4', 'service', 'add', '##end', '##um', 'radio', '##logic', 'studies', 'radio', '##logic', 'studies', 'also', 'included', 'a', 'chest', 'ct', 'which', 'confirmed', 'ca', '##vita', '##ry', 'lesions', 'in', 'the', 'left', 'lung', 'apex', 'consistent', 'with', 'infectious', 'process', 'tuberculosis', '[SEP]']
53


In [14]:
print(a.word_ids())

[None, 0, 1, 2, 3, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 14, 15, 16, 17, 18, 19, 19, 19, 20, 20, 21, 22, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 31, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, None]


Comment: in this case, it is not adequate to use .word_ids() to refer to the index of the whitespace of the entity before bert_tokenizer, as the .word_ids()-index does not always refer to words separated by a whitespace. In the upper example, the date (1 NE) has 5 different word_ids. Therefore I used the following approach: bert_tokenize the sentences (including 4 '\*' separating the 2 NE) to get the entity positions. In a second step, remove the '\*' from the sentences to tokenize the data NOT including the '\*' to generate the embeddings of the sentences.

In [15]:
# filter only sentences with two NE marked with an * --> just to be sure that there are only sentences with 2 NEs tagged in the data

def only_2NE(x):
    result = None
    numb_ne = x.split('*')
    if len(numb_ne) == 5:
        result = x
    return result

data_ast.SENT = data_ast.SENT.apply(lambda x: only_2NE(x))


not_2NE = (data_ast.SENT.isna())
data_ast = data_ast[~not_2NE]
len(data_ast)

3704

In [16]:
# get the positions of the named entities

ent_pos_df = data_ast.copy()

def get_embeddings(x):
    z = 0
    y = 0
    encoded_instance = bert_tokenizer.batch_encode_plus([x], padding=True)
    sent = bert_tokenizer.convert_ids_to_tokens(encoded_instance["input_ids"][0])
    entity_pos = []
    indices = []
    ent1, ent2 = [], []
    for t in sent:
        # first find all four '*'
        if re.match('\*', t):
            indices.append(z-y)
            y +=1
        z+=1
    if len(indices)==4:
        ent1 = indices[:2] # index / indices of the first NE
        ent2 = indices[2:] # index / indices of the second NE
    entity_pos.append(ent1)
    entity_pos.append(ent2)
    if len(ent1) != 2 or len(ent2) != 2:
        entity_pos = None
    return entity_pos  

ent_pos_df['embeddings'] = data_ast.SENT.apply(lambda x: get_embeddings(x))

#def get_tok(x):
#    encoded_instance = bert_tokenizer.batch_encode_plus([x], padding=True)
#    sent = bert_tokenizer.convert_ids_to_tokens(encoded_instance["input_ids"][0])
#    return sent

#ent_pos_df.SENT = data_ast.SENT.apply(lambda x: get_tok(x))


In [17]:
entity_pos = ent_pos_df.embeddings.values.tolist()
ent_pos_df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SENT,embeddings
Unnamed: 0_level_1,note_id,sent_id,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,0,admission date *2151-7-16* *discharge* date 2151-8-4 service addendum radiologic studies radiologic studies also included a chest ct which confirmed cavitary lesions in the left lung apex consistent with infectious process tuberculosis,"[[3, 9], [9, 10]]"
2,0,0,admission date *2151-7-16* discharge date 2151-8-4 service addendum radiologic studies radiologic studies also included a chest ct which confirmed cavitary lesions in the left lung apex consistent with infectious process *tuberculosis*,"[[3, 9], [47, 48]]"
3,0,0,admission date 2151-7-16 *discharge* date *2151-8-4* service addendum radiologic studies radiologic studies also included a chest ct which confirmed cavitary lesions in the left lung apex consistent with infectious process tuberculosis,"[[9, 10], [11, 17]]"
4,0,0,admission date 2151-7-16 *discharge* date 2151-8-4 service addendum radiologic studies radiologic studies also included a chest ct which confirmed cavitary lesions in the left lung apex consistent with infectious process *tuberculosis*,"[[9, 10], [47, 48]]"
5,0,0,admission date 2151-7-16 discharge date *2151-8-4* service addendum radiologic studies radiologic studies also included a chest ct which confirmed cavitary lesions in the left lung apex consistent with infectious process *tuberculosis*,"[[11, 17], [47, 48]]"
16,1,1,*two days* prior to *admission* she was started on a prednisone taper and one day prior to admission she required oxygen at home in order to maintain oxygen saturation greater than 90,"[[1, 3], [5, 6]]"
21,1,1,two days prior to admission she was started on a prednisone taper and *one day* prior to *admission* she required oxygen at home in order to maintain oxygen saturation greater than 90,"[[18, 20], [22, 23]]"
22,1,5,review of systems is negative for the following *fevers* *chills* nausea vomiting night sweats change in weight gastrointestinal complaints neurologic changes rashes palpitations orthopnea,"[[9, 11], [11, 13]]"
23,1,5,review of systems is negative for the following *fevers* chills *nausea* vomiting night sweats change in weight gastrointestinal complaints neurologic changes rashes palpitations orthopnea,"[[9, 11], [13, 14]]"
24,1,5,review of systems is negative for the following *fevers* chills nausea *vomiting* night sweats change in weight gastrointestinal complaints neurologic changes rashes palpitations orthopnea,"[[9, 11], [14, 15]]"


In [18]:
data_ast.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SENT
Unnamed: 0_level_1,note_id,sent_id,Unnamed: 3_level_1
0,0,0,admission date *2151-7-16* *discharge* date 2151-8-4 service addendum radiologic studies radiologic studies also included a chest ct which confirmed cavitary lesions in the left lung apex consistent with infectious process tuberculosis
2,0,0,admission date *2151-7-16* discharge date 2151-8-4 service addendum radiologic studies radiologic studies also included a chest ct which confirmed cavitary lesions in the left lung apex consistent with infectious process *tuberculosis*
3,0,0,admission date 2151-7-16 *discharge* date *2151-8-4* service addendum radiologic studies radiologic studies also included a chest ct which confirmed cavitary lesions in the left lung apex consistent with infectious process tuberculosis
4,0,0,admission date 2151-7-16 *discharge* date 2151-8-4 service addendum radiologic studies radiologic studies also included a chest ct which confirmed cavitary lesions in the left lung apex consistent with infectious process *tuberculosis*
5,0,0,admission date 2151-7-16 discharge date *2151-8-4* service addendum radiologic studies radiologic studies also included a chest ct which confirmed cavitary lesions in the left lung apex consistent with infectious process *tuberculosis*


In [19]:
data = data_ast.copy()
data.SENT = data.SENT.apply(lambda x: x.replace('*', ''))
#data.head()

In [20]:
# Set max_len to the maximum length of the training data 
max_len = max([len(bert_tokenizer.encode(s)) for s in data.SENT.to_list()])
print("The maximum sentence length in the data based on BERT BPEs is", max_len)

The maximum sentence length in the data based on BERT BPEs is 89


In [21]:
# Tokenize and encode the sentences
embed = bert_tokenizer.batch_encode_plus(
    data.SENT.tolist(),
    max_length = max_len,
    padding=True,
    truncation=True
)

In [22]:
# Convert lists to tensors 
embed_seq = torch.tensor(embed['input_ids'])
embed_mask = torch.tensor(embed['attention_mask'])
embed_ent_pos = torch.tensor(entity_pos)

In [23]:
embed_ent_pos[0]

tensor([[ 3,  9],
        [ 9, 10]])

In [24]:
embed_mask[0]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [25]:
print(embed_seq.size())
print(embed_mask.size())
print(embed_ent_pos.size())

torch.Size([3704, 89])
torch.Size([3704, 89])
torch.Size([3704, 2, 2])


In [26]:
batch_size = 4

# Create a dataloader for each set
# TensorDataset: Creates a PyTorch dataset object to load data from
embed_data = TensorDataset(embed_seq, embed_mask, embed_ent_pos)

# DataLoader: a Python iterable over a dataset
embed_dataloader = DataLoader(embed_data, batch_size=batch_size, shuffle=False)

print(len(embed_data))

3704


In [27]:
model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [28]:
# all entity embeddings
all_embeddings = []

for batch in tqdm(embed_dataloader, desc="Iteration"):  
    batch = [r.to(device) for r in batch] 
    sent_id, mask, ent_pos = batch
    output = model(sent_id, attention_mask=mask)
    sequence_output = output[0]  # Tensor (batch_size x input_length x 768)
    for i in range(len(batch)):          # for each instance = sentence
        entity_embs = []   # entity embeddings for each sentence
        entries = ent_pos[i].tolist()
        # 2 entries per scentence
        for start, end in entries:        # for start and end position of each mention
            for y in range(start,end):
                entity_embs.append(sequence_output[i, y + 1])
        if len(entity_embs) > 0:
            entity_embs = torch.logsumexp(torch.stack(entity_embs, dim=0), dim=0)
        else:                                                           # should not be the case
            entity_embs = torch.zeros(self.hidden_size).to(sequence_output)
        all_embeddings.append(entity_embs)

Iteration:   0%|          | 0/926 [00:00<?, ?it/s]

torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])
torch.Size([4, 89, 768])


RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:75] data. DefaultCPUAllocator: not enough memory: you tried to allocate 1093632 bytes. Buy new RAM!

train embeddings

In [29]:
train_embeddings = all_embeddings 
len(train_embeddings)

540

Due to a runtime error, only a part of the embeddings is saved.

In [30]:
pickle_file4 = './train_embeddings_540.joblib'
with open(pickle_file4, 'wb') as f:
    dump(train_embeddings, f, compress='zlib')

In [None]:
train_embeddings = joblib.load('./train_embeddings_540.joblib')

In [31]:
train_emb_tens = []
for t in train_embeddings:
    a =t.detach()
    train_emb_tens.append(a.tolist())

train_emb_torch = torch.tensor(train_emb_tens)

In [32]:
train_emb_torch.size()

torch.Size([540, 768])

In [33]:
# save tokenized sentences in joblib format

pickle_file5 = './train_embeddings_torch_540.joblib'
with open(pickle_file5, 'wb') as f:
    dump(train_emb_torch, f, compress='zlib')   

test embeddings

In [None]:
test_embeddings = all_embeddings 
len(test_embeddings)

In [None]:
test_emb_tens = []
for t in test_embeddings:
    a =t.detach()
    test_emb_tens.append(a.tolist())

test_emb_torch = torch.tensor(test_emb_tens)

In [None]:
test_emb_torch.size()

In [None]:
# save tokenized sentences in joblib format

pickle_file6 = './test_embeddings.joblib'
with open(pickle_file6, 'wb') as f:
    dump(test_emb_torch, f, compress='zlib')

# References

Zhou, W., Huang, K., Ma, T., & Huang, J. (2021, May). Document-level relation extraction with adaptive thresholding and localized context pooling. In Proceedings of the AAAI conference on artificial intelligence (Vol. 35, No. 16, pp. 14612-14620).