# Compute embeddings for the encoders of CNEP

 * https://github.com/ncbi-nlp/BioSentVec#biosentvec
 * https://github.com/epfml/sent2vec
 * https://github.com/ncbi-nlp/BioSentVec/blob/master/BioSentVec_tutorial.ipynb
 * https://arxiv.org/abs/1810.09302

In [None]:
import sent2vec
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from scipy.spatial import distance
import pickle
from tqdm import tqdm
import numpy as np
import torch
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
# English stop words from the NLTK corpus, kept as a set for fast membership tests.
stop_words = set(stopwords.words('english'))
# Punctuation stripped by preprocess_sentence_leave_dot; compared with
# string.punctuation this string omits '.', '!' and '?'.
punctuation_less = '"#$%&\'()*+,-/:;<=>@[\\]^_`{|}~'

def preprocess_sentence(text):
    """Normalise a note and drop punctuation and stop-word tokens.

    Pads '/', '.-', '.' and apostrophes with spaces, lower-cases the text,
    tokenises it with ``word_tokenize`` and keeps only the tokens that are
    found neither in ``string.punctuation`` nor in ``stop_words``.

    Returns the remaining tokens joined by single spaces.
    """
    # Applied in this order: '.-' is padded before the plain '.' rule runs.
    for needle, padded in (('/', ' / '), ('.-', ' .- '), ('.', ' . '), ('\'', ' \' ')):
        text = text.replace(needle, padded)

    kept = []
    for token in word_tokenize(text.lower()):
        if token in punctuation or token in stop_words:
            continue
        kept.append(token)

    return ' '.join(kept)

def preprocess_sentence_leave_dot(text):
    """Normalise a note like ``preprocess_sentence`` but filter with ``punctuation_less``.

    Pads '/', '.-', '.' and apostrophes with spaces, lower-cases the text,
    tokenises it with ``word_tokenize`` and keeps only the tokens that are
    found neither in ``punctuation_less`` nor in ``stop_words``.

    Returns the remaining tokens joined by single spaces.
    """
    # Applied in this order: '.-' is padded before the plain '.' rule runs.
    for needle, padded in (('/', ' / '), ('.-', ' .- '), ('.', ' . '), ('\'', ' \' ')):
        text = text.replace(needle, padded)

    kept = []
    for token in word_tokenize(text.lower()):
        if token in punctuation_less or token in stop_words:
            continue
        kept.append(token)

    return ' '.join(kept)

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, skipping masked positions.

    ``model_output[0]`` supplies the token embeddings; ``attention_mask`` is
    broadcast over the embedding dimension and used as a 0/1 weight.
    Returns one averaged vector per sequence.
    """
    hidden = model_output[0]
    # Broadcast the mask to the embedding shape so it can weight every feature.
    weights = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    numerator = (hidden * weights).sum(dim=1)
    # The clamp keeps a fully masked sequence from dividing by zero.
    denominator = weights.sum(dim=1).clamp(min=1e-9)
    return numerator / denominator

# def embed_text(examples):
#     inputs = tokenizer(examples["notes"], padding=True, truncation=True,
#                         max_length=510, return_tensors="pt")
#     with torch.no_grad():
#         model_output = model(**inputs)
#     pooled_embeds = mean_pooling(model_output, inputs["attention_mask"])
#     return {"embedding": pooled_embeds.cpu().numpy()}

def windowsEmbedding(model, tokens, use_pooler=True, use_mean_pooling=False, chunksize=512,
                     cls_token_id=101, sep_token_id=102, pad_token_id=0):
    """Embed an arbitrarily long token sequence by averaging fixed-size windows.

    The un-delimited token ids are cut into windows of ``chunksize - 2`` tokens,
    each window is wrapped in start/end tokens and right-padded to ``chunksize``,
    all windows are run through ``model`` and the per-window vectors are averaged.

    Args:
        model: encoder called as ``model(input_ids=..., attention_mask=...)``;
            ``model.eval()`` is called on it.
        tokens: mapping with ``'input_ids'`` and ``'attention_mask'`` tensors;
            only row 0 of each is used, and the ids are expected to be
            tokenised without special tokens.
        use_pooler: average ``pooler_output`` of the windows (takes precedence).
        use_mean_pooling: otherwise average the masked mean of the token
            embeddings of each window.
        chunksize: window length including the two delimiter tokens.
        cls_token_id, sep_token_id, pad_token_id: ids used to wrap and pad the
            windows. The defaults are the previously hard-coded values
            (101/102/0, the BERT vocabulary); pass the tokenizer's own ids for
            models with other vocabularies, e.g. RoBERTa.

    Returns:
        1-D numpy array: the mean of the per-window vectors. When neither
        pooling flag is set, the first-token embedding of each window is used.
    """
    body_len = chunksize - 2
    ids = tokens['input_ids'][0]
    mask = tokens['attention_mask'][0]

    input_id_chunks = []
    mask_chunks = []
    for id_chunk, mask_chunk in zip(ids.split(body_len), mask.split(body_len)):
        # Wrap the window in start/end tokens; both are attended to.
        id_chunk = torch.cat([
            id_chunk.new_tensor([cls_token_id]), id_chunk, id_chunk.new_tensor([sep_token_id])
        ])
        mask_chunk = torch.cat([
            mask_chunk.new_tensor([1]), mask_chunk, mask_chunk.new_tensor([1])
        ])
        pad_len = chunksize - id_chunk.shape[0]
        if pad_len > 0:
            # Pad in the ids' own dtype/device (the old float padding promoted
            # the whole tensor to float before it was cast back).
            id_chunk = torch.cat([id_chunk, id_chunk.new_full((pad_len,), pad_token_id)])
            mask_chunk = torch.cat([mask_chunk, mask_chunk.new_zeros(pad_len)])
        input_id_chunks.append(id_chunk)
        mask_chunks.append(mask_chunk)

    model.eval()

    # Feed the windows on the device the model lives on instead of relying on
    # a module-level ``device`` variable being defined and in sync.
    try:
        target_device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        target_device = ids.device

    input_dict = {
        'input_ids': torch.stack(input_id_chunks).long().to(target_device),
        'attention_mask': torch.stack(mask_chunks).int().to(target_device)
    }

    with torch.no_grad():
        if use_pooler:
            chunk_vectors = model(**input_dict).pooler_output
        elif use_mean_pooling:
            # Run a few windows at a time to bound memory, but average over the
            # individual windows afterwards: averaging per-batch means would
            # over-weight the windows of a short final batch.
            batch_size = 4
            pooled = []
            for i_ids, am in zip(torch.split(input_dict['input_ids'], batch_size, dim=0),
                                 torch.split(input_dict['attention_mask'], batch_size, dim=0)):
                model_output = model(input_ids=i_ids, attention_mask=am)
                pooled.append(mean_pooling(model_output, am))
            chunk_vectors = torch.cat(pooled, dim=0)
        else:
            chunk_vectors = model(**input_dict)[0][:, 0, :]

    return chunk_vectors.detach().mean(dim=0).cpu().numpy()


# 1. Doc2Vec Model

* from gensim.models.doc2vec import Doc2Vec, TaggedDocument
* https://radimrehurek.com/gensim/models/doc2vec.html

In [None]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import numpy as np

In [None]:
seq_len = None # 2000
USE_CHUNKS = False
USE_PREPRO = False
ext_attr = '_prepro' if USE_PREPRO else ''

# USE_PREPRO selects the '_prepro' pickle and turns the text hooks into
# identity functions (presumably because that file is already preprocessed).
# The hooks are configured here but not applied in this cell.
if USE_PREPRO:
    preprodata = lambda x: x
    preprodata_dot = lambda x: x
else:
    preprodata = preprocess_sentence
    preprodata_dot = preprocess_sentence_leave_dot

# Tokenization of each document
tokenized_sent = []

data_path = '../data/mimic3/'

datasets = ['train'] #,'val','test']

for dataset in datasets:
    # Close the pickle file deterministically instead of leaking the handle.
    with open(f'{data_path}new_{dataset}_data_unique_CNEP{ext_attr}.pickle', 'rb') as fh:
        train_data = pickle.load(fh)

    # Discharge notes first, then event notes: both feed the same Doc2Vec corpus.
    for field in ('notes', 'eventsnotes'):
        for note in tqdm(train_data[field]):
            tokenized_sent.append(word_tokenize(note[:seq_len].lower()))

# Each document is tagged with its position in the corpus.
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_sent)]


In [None]:
## doc2vec model: trained on the tagged corpus built in the previous cell
model = Doc2Vec(
    tagged_data,
    vector_size=768,
    window=2,
    min_count=2,
    epochs=10,
)
# model = Doc2Vec(tagged_data, vector_size=768, window=5, min_count=3, negative=0, workers=10, epochs=10) 
# Tag embedded into the names of the output pickle files.
model_name = 'Doc2Vec'

## Print model vocabulary
# model.wv.key_to_index

In [None]:
def cosine(u, v):
    """Return the cosine similarity of vectors ``u`` and ``v``."""
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    return np.dot(u, v) / norm_product

# Infer both example sentences ten times and print their cosine similarity
# each time, so run-to-run variation of infer_vector becomes visible.
first_words = "Kerry was discovered by researchers on the remote Cornwallis Island. They picked up the signal and decided to try to find him.".lower().split()
second_words = "A young humpback whale remained tangled in a shark net off the Gold Coast yesterday, despite valiant efforts by marine rescuers.".lower().split()

for i in range(10):
    u = model.infer_vector(first_words)
    v = model.infer_vector(second_words)

    print(cosine(u, v), end=' ')

In [None]:
print(f"Run this session with the following parameters: {seq_len=}, {USE_CHUNKS=}, {USE_PREPRO=}.")

data_path = '../data/mimic3/'

datasets = ['train','val','test']


def _doc2vec_embed_notes(notes):
    """Infer one flat Doc2Vec vector per note and stack them into an array."""
    # USE_CHUNKS does not change the inference: both branches of the original
    # if/else were identical, so a single code path is kept.
    return np.array([
        model.infer_vector(word_tokenize(note[:seq_len].lower())).reshape(-1)
        for note in tqdm(notes)
    ])


# No torch.no_grad() here: the vectors come from model.infer_vector, not from a torch model.
for dataset in datasets:
    # Context managers close the pickle files instead of leaking the handles.
    with open(f'{data_path}new_{dataset}_data_unique_CNEP{ext_attr}.pickle', 'rb') as fh:
        train_data = pickle.load(fh)

    embeds = _doc2vec_embed_notes(train_data['notes'])
    embeds_events = _doc2vec_embed_notes(train_data['eventsnotes'])
    print(train_data['inputs'].shape, embeds.shape, embeds_events.shape)

    # Replace the raw text by its embeddings before the dict is written out.
    train_data['embeds'] = embeds
    train_data['embeds_events'] = embeds_events
    del train_data['notes']
    del train_data['eventsnotes']

    # The run configuration is encoded in the output file name.
    attr_str = []
    if USE_CHUNKS:
        attr_str.append('chunked')
    if USE_PREPRO:
        attr_str.append('prepro')
    if seq_len:
        attr_str.append(f'seq{seq_len}')

    out_path = f'{data_path}new_{dataset}_data_unique_embed_{model_name}_{"_".join(attr_str)}.pickle'
    with open(out_path, 'wb') as fh:
        pickle.dump(train_data, fh)
    print(f'Finished {out_path}')

# 2. Sent2Vec Model

* https://github.com/epfml/sent2vec

In [None]:
# load Sent2Vec model
# model_path = '/Users/jplasser/Downloads/BioSentVec_PubMed_MIMICIII-bigram_d700.bin'
model_path = '/home/thetaphipsi/Downloads/BioSentVec_PubMed_MIMICIII-bigram_d700.bin'
model = sent2vec.Sent2vecModel()

try:
    model.load_model(model_path)
except Exception as e:
    # Loading stays best-effort: report the problem and carry on.
    print(e)
else:
    # Only claim success when load_model did not raise.
    print('model successfully loaded')
model_name = 's2v'

In [None]:
import re

MINWORDS = 3

def windowsSentenceEmbedding(model, inputs):
    """Embed a text as the mean of its sentence embeddings.

    The text is split into sentences: each starts at a lower-case letter and
    runs to the next '.', '!' or '?'. A ' .' is appended first when the text
    does not end with a dot, so trailing words form a sentence too. Sentences
    with at most ``MINWORDS`` whitespace-separated words are dropped unless
    that would leave nothing. If no sentence is found at all (empty text, or
    no lower-case letter), the whole text is embedded as one sentence.

    Args:
        model: object whose ``embed_sentence(str)`` returns a numpy array.
        inputs: the text to embed.

    Returns:
        The element-wise mean of the sentence embeddings.
    """
    # endswith() also copes with an empty string, where inputs[-1] would raise.
    if not inputs.endswith('.'):
        inputs += ' .'
    sentences = re.findall(r"[a-z].*?[.!?]", inputs, re.MULTILINE | re.DOTALL)

    sentences_ltmw = [s for s in sentences if len(s.split()) > MINWORDS]
    if sentences_ltmw:
        sentences = sentences_ltmw
    if not sentences:
        # Without this fallback the mean over an empty array is a NaN scalar,
        # which breaks stacking the per-note vectors later on.
        sentences = [inputs]

    embeds = np.asarray([model.embed_sentence(s) for s in sentences])
    return embeds.mean(axis=0)


In [None]:
seq_len = None # 2000
USE_CHUNKS = False
USE_PREPRO = True

ext_attr = '_prepro' if USE_PREPRO else ''

# USE_PREPRO selects the '_prepro' pickle and turns the text hooks into
# identity functions (presumably because that file is already preprocessed).
if USE_PREPRO:
    preprodata = lambda x: x
    preprodata_dot = lambda x: x
else:
    preprodata = preprocess_sentence
    preprodata_dot = preprocess_sentence_leave_dot

print(f"Run this session with the following parameters: {seq_len=}, {USE_CHUNKS=}, {USE_PREPRO=}.")

data_path = '../data/mimic3/'

datasets = ['train','val','test']


def _s2v_embed_note(text):
    """Embed one note with the Sent2Vec model, sentence-wise when USE_CHUNKS is set."""
    if USE_CHUNKS:
        # The chunked path keeps the dots: they delimit the sentences.
        return windowsSentenceEmbedding(model, preprodata_dot(text[:seq_len]))
    return model.embed_sentence(preprodata(text[:seq_len]))


# No torch.no_grad() here: the vectors come from the sent2vec model, not from torch.
for dataset in datasets:
    # Context managers close the pickle files instead of leaking the handles.
    with open(f'{data_path}new_{dataset}_data_unique_CNEP{ext_attr}.pickle', 'rb') as fh:
        train_data = pickle.load(fh)

    embeds = np.array([_s2v_embed_note(note).reshape(-1) for note in tqdm(train_data['notes'])])
    embeds_events = np.array([_s2v_embed_note(note).reshape(-1) for note in tqdm(train_data['eventsnotes'])])
    print(train_data['inputs'].shape, embeds.shape, embeds_events.shape)

    # Replace the raw text by its embeddings before the dict is written out.
    train_data['embeds'] = embeds
    train_data['embeds_events'] = embeds_events
    del train_data['notes']
    del train_data['eventsnotes']

    # The run configuration is encoded in the output file name.
    attr_str = []
    if USE_CHUNKS:
        attr_str.append('chunked')
    if USE_PREPRO:
        attr_str.append('prepro')
    if seq_len:
        attr_str.append(f'seq{seq_len}')

    out_path = f'{data_path}new_{dataset}_data_unique_embed_{model_name}_{"_".join(attr_str)}.pickle'
    with open(out_path, 'wb') as fh:
        pickle.dump(train_data, fh)
    print(f'Finished {out_path}')

# 3. SentenceTransformer Embeddings

 * https://github.com/UKPLab/sentence-transformers
     * https://github.com/UKPLab/sentence-transformers/issues/1300
 * https://github.com/yanzhangnlp/IS-BERT
     * https://github.com/yanzhangnlp/IS-BERT/blob/main/docs/pretrained_models.md

We can recommend these models as general-purpose models. The best available models are:
- **roberta-large-nli-stsb-mean-tokens** - STSb performance: 86.39
- **roberta-base-nli-stsb-mean-tokens** - STSb performance: 85.44
- **bert-large-nli-stsb-mean-tokens** - STSb performance: 85.29
- **distilbert-base-nli-stsb-mean-tokens** - STSb performance:  85.16

[» Full List of STS Models](https://docs.google.com/spreadsheets/d/14QplCdTCDwEmTqrn1LH4yrbKvdogK4oQvYO1K1aPR5M/edit#gid=0)

I can recommend the **distilbert-base-nli-stsb-mean-tokens** model, which gives a nice balance between speed and performance.
     
## Models used
 
  * all-mpnet-base-v2
  * distilbert-base-nli-stsb-mean-tokens
  * roberta-base-nli-stsb-mean-tokens

In [None]:
# TODO
from sentence_transformers import SentenceTransformer, models

# Tag embedded into the names of the output pickle files.
model_name = 'SentenceTransformer'
# Encoder used by the SentenceTransformer cells below.
model = SentenceTransformer('all-mpnet-base-v2')
# model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
# optional, not evaluated for now: model = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')
# model = SentenceTransformer('stsb-mpnet-base-v2')

In [None]:
import re

MINWORDS = 3

def windowsSentenceTransformerEmbedding(model, inputs):
    """Embed a text as the mean of its SentenceTransformer sentence embeddings.

    The text is split into sentences: each starts at a lower-case letter and
    runs to the next '.', '!' or '?'. A ' .' is appended first when the text
    does not end with a dot, so trailing words form a sentence too. Sentences
    with at most ``MINWORDS`` whitespace-separated words are dropped unless
    that would leave nothing. If no sentence is found at all (empty text, or
    no lower-case letter), the whole text is embedded as one sentence.

    Args:
        model: object whose ``encode(list_of_str)`` returns one embedding row
            per sentence (as ``SentenceTransformer.encode`` does).
        inputs: the text to embed.

    Returns:
        The element-wise mean of the sentence embeddings.
    """
    # endswith() also copes with an empty string, where inputs[-1] would raise.
    if not inputs.endswith('.'):
        inputs += ' .'
    sentences = re.findall(r"[a-z].*?[.!?]", inputs, re.MULTILINE | re.DOTALL)

    sentences_ltmw = [s for s in sentences if len(s.split()) > MINWORDS]
    if sentences_ltmw:
        sentences = sentences_ltmw
    if not sentences:
        # Without this fallback the mean over an empty array is a NaN scalar,
        # which breaks stacking the per-note vectors later on.
        sentences = [inputs]

    # One batched call instead of one forward pass per sentence.
    embeds = np.asarray(model.encode(sentences))
    return embeds.mean(axis=0)

In [None]:
seq_len = None # 2000
USE_CHUNKS = True
USE_POOLER = False
USE_MEAN_POOLING = False and not USE_POOLER
USE_PREPRO = True

ext_attr = '_prepro' if USE_PREPRO else ''

# USE_PREPRO selects the '_prepro' pickle and turns the text hooks into
# identity functions (presumably because that file is already preprocessed).
if USE_PREPRO:
    preprodata = lambda x: x
    preprodata_dot = lambda x: x
else:
    preprodata = preprocess_sentence
    preprodata_dot = preprocess_sentence_leave_dot

print(f"Run this session with the following parameters: {seq_len=}, {USE_CHUNKS=}, {USE_POOLER=}, {USE_MEAN_POOLING=}, {USE_PREPRO=}.")

data_path = '../data/mimic3/'

datasets = ['train','val','test']

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()


def _st_embed_note(text):
    """Embed one note with the SentenceTransformer, sentence-wise when USE_CHUNKS is set."""
    prepared = preprodata_dot(text[:seq_len])
    if USE_CHUNKS:
        return windowsSentenceTransformerEmbedding(model, prepared)
    return model.encode(prepared)


with torch.no_grad():
    for dataset in datasets:
        # Context managers close the pickle files instead of leaking the handles.
        with open(f'{data_path}new_{dataset}_data_unique_CNEP{ext_attr}.pickle', 'rb') as fh:
            train_data = pickle.load(fh)

        embeds = np.array([_st_embed_note(note).reshape(-1) for note in tqdm(train_data['notes'])])
        embeds_events = np.array([_st_embed_note(note).reshape(-1) for note in tqdm(train_data['eventsnotes'])])
        print(train_data['inputs'].shape, embeds.shape, embeds_events.shape)

        # Replace the raw text by its embeddings before the dict is written out.
        train_data['embeds'] = embeds
        train_data['embeds_events'] = embeds_events
        del train_data['notes']
        del train_data['eventsnotes']

        # The run configuration is encoded in the output file name.
        attr_str = []
        if USE_CHUNKS:
            attr_str.append('chunked')
        if USE_POOLER:
            attr_str.append('pooler')
        if USE_MEAN_POOLING:
            attr_str.append('meanpooler')
        if USE_PREPRO:
            attr_str.append('prepro')
        if seq_len:
            attr_str.append(f'seq{seq_len}')

        out_path = f'{data_path}new_{dataset}_data_unique_embed_{model_name}_{"_".join(attr_str)}.pickle'
        with open(out_path, 'wb') as fh:
            pickle.dump(train_data, fh)
        print(f'Finished {out_path}')

# Concatenate the freshly written train and val files into one 'extended' file.
print("Merging train and val to extended...")
merge_datasets = ['train','val'] # , 'test']
target_dataset = 'extended'

dataset = merge_datasets[0]

template = f'{data_path}new_{dataset}_data_unique_embed_{model_name}_{"_".join(attr_str)}.pickle'
with open(template, 'rb') as fh:
    data = pickle.load(fh)

for dataset in merge_datasets[1:]:
    template = f'{data_path}new_{dataset}_data_unique_embed_{model_name}_{"_".join(attr_str)}.pickle'
    with open(template, 'rb') as fh:
        data_ = pickle.load(fh)

    for k in data:
        # Arrays are concatenated along the sample axis; anything else is extended in place.
        if isinstance(data[k], np.ndarray):
            data[k] = np.concatenate((data[k], data_[k]), axis=0)
        else:
            data[k].extend(data_[k])

# Sanity check: every field must hold the same number of samples.
assert len({d.shape[0] if isinstance(d, np.ndarray) else len(d) for d in data.values()}) == 1

dataset = target_dataset
template = f'{data_path}new_{dataset}_data_unique_embed_{model_name}_{"_".join(attr_str)}.pickle'
with open(template, 'wb') as fh:
    pickle.dump(data, fh)
print("Done.")

In [None]:
seq_len = None # 2000

data_path = '../data/mimic3/'

datasets = ['train','val','test']

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()

with torch.no_grad():
    for dataset in datasets:
        # Context managers close the pickle files instead of leaking the handles.
        with open(f'{data_path}new_{dataset}_data_unique_CNEP.pickle', 'rb') as fh:
            train_data = pickle.load(fh)

        # Every note is passed through preprocess_sentence and encoded as a whole.
        embeds = np.array([
            model.encode(preprocess_sentence(note[:seq_len])).reshape(-1)
            for note in tqdm(train_data['notes'])
        ])
        embeds_events = np.array([
            model.encode(preprocess_sentence(note[:seq_len])).reshape(-1)
            for note in tqdm(train_data['eventsnotes'])
        ])
        print(train_data['inputs'].shape, embeds.shape, embeds_events.shape)

        # Replace the raw text by its embeddings before the dict is written out.
        train_data['embeds'] = embeds
        train_data['embeds_events'] = embeds_events
        del train_data['notes']
        del train_data['eventsnotes']

        # NOTE(review): the file name is hard-coded to 'stsb-mpnet-base-v2';
        # confirm that `model` really holds that checkpoint when this cell runs.
        with open(f'{data_path}new_{dataset}_data_unique_embed_ST_stsb-mpnet-base-v2.pickle', 'wb') as fh:
            pickle.dump(train_data, fh)

# 4. Use the embeddings of Transformer models

 * bert-base-uncased
 * https://huggingface.co/bert-base-uncased
 * https://huggingface.co/bert-large-uncased

 * dmis-lab/biobert-base-cased-v1.2
 * https://huggingface.co/dmis-lab/biobert-base-cased-v1.2
 

In [None]:
# dmis-lab/biobert-base-cased-v1.2
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# BERT model
# model_name = "BERT"
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# model = AutoModel.from_pretrained("bert-base-uncased")

# BERT large model
# model_name = "BERT_large"
# tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")
# model = AutoModel.from_pretrained("bert-large-uncased")

# Active encoder: the roberta-base checkpoint (model first, then its tokenizer)
model_name = "RoBERTa"
model = AutoModel.from_pretrained("roberta-base")
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# PubMedBERT  model
# model_name = "PubMedBERT"
# tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")
# model = AutoModel.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

# 5. Use the embeddings of BioBERT model

# * dmis-lab/biobert-base-cased-v1.2
# * https://huggingface.co/dmis-lab/biobert-base-cased-v1.2

# BioBERT model
# model_name = "BioBERT"
# tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.2")
# model = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.2")

# BioELECTRA model
# model_name = "BioELECTRA"
# tokenizer = AutoTokenizer.from_pretrained("kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt")
# model = AutoModel.from_pretrained("kamalkraj/bioelectra-base-discriminator-pubmed-pmc-lt")

# Window length in tokens; the embedding cell below hands it to
# windowsEmbedding and uses it for the tokenizer's max_length.
chunksize = 512

# GPT-2-Large model
# model_name = "GPT-2"
# tokenizer = AutoTokenizer.from_pretrained("gpt2-large")
# model = AutoModel.from_pretrained("gpt2-large")
# tokenizer.pad_token = tokenizer.eos_token
# chunksize=1024

In [None]:
seq_len = None # 2000
USE_CHUNKS = True
USE_POOLER = False
USE_MEAN_POOLING = True and not USE_POOLER
USE_PREPRO = True

ext_attr = '_prepro' if USE_PREPRO else ''

# USE_PREPRO selects the '_prepro' pickle and turns the text hooks into
# identity functions (presumably because that file is already preprocessed).
if USE_PREPRO:
    preprodata = lambda x: x
    preprodata_dot = lambda x: x
else:
    preprodata = preprocess_sentence
    preprodata_dot = preprocess_sentence_leave_dot

print(f"Run this session with the following parameters: {seq_len=}, {USE_CHUNKS=}, {USE_POOLER=}, {USE_MEAN_POOLING=}, {USE_PREPRO=}.")

data_path = '../data/mimic3/'

datasets = ['train','val','test']

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()


def _transformer_embed_note(text):
    """Embed one note with the loaded tokenizer/model pair.

    With USE_CHUNKS the full note is embedded window by window through
    windowsEmbedding; otherwise the note is truncated to one window and
    pooled according to USE_POOLER / USE_MEAN_POOLING (first-token embedding
    when both are off).
    """
    prepared = preprodata(text[:seq_len])
    if USE_CHUNKS:
        # NOTE(review): windowsEmbedding delimits and pads the windows with its
        # own token ids; check that they match this tokenizer's vocabulary.
        inputs = tokenizer(prepared, add_special_tokens=False, return_tensors='pt')
        return windowsEmbedding(model, inputs, USE_POOLER, USE_MEAN_POOLING, chunksize=chunksize)

    # max_length only takes effect together with truncation=True, and it counts
    # the special tokens the tokenizer adds, hence chunksize rather than chunksize-2.
    inputs = tokenizer(prepared, return_tensors="pt", truncation=True, max_length=chunksize).to(device)
    model_output = model(**inputs)
    if USE_POOLER:
        pooled = model_output.pooler_output
    elif USE_MEAN_POOLING:
        pooled = mean_pooling(model_output, inputs["attention_mask"])
    else:
        # Index into the output first: the output object has no detach() of
        # its own, so the old `model(**inputs).detach()` raised.
        pooled = model_output[0][:, 0, :]
    return pooled.detach().cpu().numpy()


with torch.no_grad():
    for dataset in datasets:
        # Context managers close the pickle files instead of leaking the handles.
        with open(f'{data_path}new_{dataset}_data_unique_CNEP{ext_attr}.pickle', 'rb') as fh:
            train_data = pickle.load(fh)

        embeds = np.array([_transformer_embed_note(note).reshape(-1) for note in tqdm(train_data['notes'])])
        embeds_events = np.array([_transformer_embed_note(note).reshape(-1) for note in tqdm(train_data['eventsnotes'])])
        print(train_data['inputs'].shape, embeds.shape, embeds_events.shape)

        # Replace the raw text by its embeddings before the dict is written out.
        train_data['embeds'] = embeds
        train_data['embeds_events'] = embeds_events
        del train_data['notes']
        del train_data['eventsnotes']

        # The run configuration is encoded in the output file name.
        attr_str = []
        if USE_CHUNKS:
            attr_str.append('chunked')
        if USE_POOLER:
            attr_str.append('pooler')
        if USE_MEAN_POOLING:
            attr_str.append('meanpooler')
        if USE_PREPRO:
            attr_str.append('prepro')
        if seq_len:
            attr_str.append(f'seq{seq_len}')

        out_path = f'{data_path}new_{dataset}_data_unique_embed_{model_name}_{"_".join(attr_str)}.pickle'
        with open(out_path, 'wb') as fh:
            pickle.dump(train_data, fh)
        print(f'Finished {out_path}')

# Concatenate the freshly written train and val files into one 'extended' file.
print("Merging train and val to extended...")
merge_datasets = ['train','val'] # , 'test']
target_dataset = 'extended'

dataset = merge_datasets[0]

template = f'{data_path}new_{dataset}_data_unique_embed_{model_name}_{"_".join(attr_str)}.pickle'
with open(template, 'rb') as fh:
    data = pickle.load(fh)

for dataset in merge_datasets[1:]:
    template = f'{data_path}new_{dataset}_data_unique_embed_{model_name}_{"_".join(attr_str)}.pickle'
    with open(template, 'rb') as fh:
        data_ = pickle.load(fh)

    for k in data:
        # Arrays are concatenated along the sample axis; anything else is extended in place.
        if isinstance(data[k], np.ndarray):
            data[k] = np.concatenate((data[k], data_[k]), axis=0)
        else:
            data[k].extend(data_[k])

# Sanity check: every field must hold the same number of samples.
assert len({d.shape[0] if isinstance(d, np.ndarray) else len(d) for d in data.values()}) == 1

dataset = target_dataset
template = f'{data_path}new_{dataset}_data_unique_embed_{model_name}_{"_".join(attr_str)}.pickle'
with open(template, 'wb') as fh:
    pickle.dump(data, fh)
print("Done.")

In [None]:
# Show the shape of the 'embeds' matrix stored in the most recently processed train_data.
train_data['embeds'].shape

# 5. Use the embeddings of BERT models: ClinicalBERT and Discharge Summary BERT

 * https://github.com/EmilyAlsentzer/clinicalBERT
 * https://huggingface.co/emilyalsentzer/Bio_ClinicalBERT
 * https://huggingface.co/emilyalsentzer/Bio_Discharge_Summary_BERT
 * https://arxiv.org/abs/1904.03323

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np

# Tag embedded into the names of the output pickle files.
model_name = 'CliBERT'

# Discharge Notes Model (DCN)
model_DCN = AutoModel.from_pretrained("emilyalsentzer/Bio_Discharge_Summary_BERT")
tokenizer_DCN = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_Discharge_Summary_BERT")

# Events Notes Model (EN)
model_EN = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
tokenizer_EN = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [None]:
seq_len = None # 2000

# when True only use the EN model, as it has been pretrained on the whole corpus of clinical notes from MIMIC-III
SINGLE_MODEL = False
USE_CHUNKS = True
USE_POOLER = False
USE_MEAN_POOLING = True and not USE_POOLER
USE_PREPRO = True

ext_attr = '_prepro' if USE_PREPRO else ''

# USE_PREPRO selects the '_prepro' pickle and turns the text hooks into
# identity functions (presumably because that file is already preprocessed).
if USE_PREPRO:
    preprodata = lambda x: x
    preprodata_dot = lambda x: x
else:
    preprodata = preprocess_sentence
    preprodata_dot = preprocess_sentence_leave_dot

print(f"Run this session with the following parameters: {seq_len=}, {USE_CHUNKS=}, {USE_POOLER=}, {USE_MEAN_POOLING=}, {USE_PREPRO=}.")

data_path = '../data/mimic3/'

datasets = ['train','val','test']

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model_DCN = model_DCN.to(device)
model_DCN.eval()
model_EN = model_EN.to(device)
model_EN.eval()


def _clinical_embed_note(text, note_tokenizer, note_model):
    """Embed one note with the given ClinicalBERT tokenizer/model pair."""
    # Go through the configured hook rather than calling preprocess_sentence
    # directly, so the '_prepro' data is not preprocessed a second time.
    prepared = preprodata(text[:seq_len])
    if USE_CHUNKS:
        inputs = note_tokenizer(prepared, add_special_tokens=False, return_tensors='pt')
        return windowsEmbedding(note_model, inputs, USE_POOLER, USE_MEAN_POOLING)

    # max_length only takes effect together with truncation=True, and it counts
    # the special tokens the tokenizer adds (510 content tokens + 2 = 512).
    inputs = note_tokenizer(prepared, return_tensors="pt", truncation=True, max_length=512).to(device)
    # NOTE(review): this branch always uses pooler_output, whatever USE_POOLER /
    # USE_MEAN_POOLING say, while the output file name still carries those flags.
    return note_model(**inputs).pooler_output.detach().cpu().numpy()


# Discharge notes use the discharge-summary model unless SINGLE_MODEL is set;
# event notes always use the EN model.
if SINGLE_MODEL:
    notes_tokenizer, notes_model = tokenizer_EN, model_EN
else:
    notes_tokenizer, notes_model = tokenizer_DCN, model_DCN

with torch.no_grad():
    for dataset in datasets:
        # Context managers close the pickle files instead of leaking the handles.
        with open(f'{data_path}new_{dataset}_data_unique_CNEP{ext_attr}.pickle', 'rb') as fh:
            train_data = pickle.load(fh)

        embeds = np.array([
            _clinical_embed_note(note, notes_tokenizer, notes_model).reshape(-1)
            for note in tqdm(train_data['notes'])
        ])
        embeds_events = np.array([
            _clinical_embed_note(note, tokenizer_EN, model_EN).reshape(-1)
            for note in tqdm(train_data['eventsnotes'])
        ])
        print(train_data['inputs'].shape, embeds.shape, embeds_events.shape)

        # Replace the raw text by its embeddings before the dict is written out.
        train_data['embeds'] = embeds
        train_data['embeds_events'] = embeds_events
        del train_data['notes']
        del train_data['eventsnotes']

        # The run configuration is encoded in the output file name.
        attr_str = ['1m' if SINGLE_MODEL else '2m']
        if USE_CHUNKS:
            attr_str.append('chunked')
        if USE_POOLER:
            attr_str.append('pooler')
        if USE_MEAN_POOLING:
            attr_str.append('meanpooler')
        if USE_PREPRO:
            attr_str.append('prepro')
        if seq_len:
            attr_str.append(f'seq{seq_len}')

        out_path = f'{data_path}new_{dataset}_data_unique_embed_{model_name}_{"_".join(attr_str)}.pickle'
        with open(out_path, 'wb') as fh:
            pickle.dump(train_data, fh)
        print(f'Finished {out_path}')