In [None]:
import torch
import pickle
import pandas as pd
import random
import os
import numpy as np
import statistics
import matplotlib.pyplot as plt
import pickle

In [None]:
# Install the Hugging Face transformers package into the notebook environment.
!pip install transformers
# Download the BioBERT v1.1 (PubMed) release archive.
# NOTE(review): the local file is called scibert_uncased.tar, but the URL points at a
# BioBERT .tar.gz archive — the name is misleading; confirm before relying on it.
!wget -O scibert_uncased.tar https://github.com/naver/biobert-pretrained/releases/download/v1.1-pubmed/biobert_v1.1_pubmed.tar.gz
# Unpack the archive; later cells read their files from the biobert_v1.1_pubmed/ directory.
!tar -xvf scibert_uncased.tar

import torch
from transformers import BertTokenizer, BertModel
import argparse
import logging

import torch

from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert


# Configure the root logger to show INFO-level messages
# (presumably to surface progress output from the checkpoint conversion below — TODO confirm).
logging.basicConfig(level=logging.INFO)


def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    """Convert a TensorFlow BERT checkpoint into a PyTorch state-dict file.

    Args:
        tf_checkpoint_path: Path (prefix) of the TensorFlow checkpoint to read.
        bert_config_file: Path of the JSON file describing the BERT architecture.
        pytorch_dump_path: Destination file for the saved ``state_dict``.

    Returns:
        None. The converted weights are written to ``pytorch_dump_path``.
    """
    # Instantiate a model with the architecture described by the config file.
    bert_config = BertConfig.from_json_file(bert_config_file)
    print(f"Building PyTorch model from configuration: {bert_config!s}")
    pytorch_model = BertForPreTraining(bert_config)

    # Copy the TensorFlow variables into the freshly built PyTorch model.
    load_tf_weights_in_bert(pytorch_model, bert_config, tf_checkpoint_path)

    # Persist only the weights (state dict), not the whole module object.
    print(f"Save PyTorch model to {pytorch_dump_path}")
    converted_state = pytorch_model.state_dict()
    torch.save(converted_state, pytorch_dump_path)
# Convert the extracted TensorFlow checkpoint and write the PyTorch weights
# next to it in the same directory.
convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="biobert_v1.1_pubmed/model.ckpt-1000000",
    bert_config_file="biobert_v1.1_pubmed/bert_config.json",
    pytorch_dump_path="biobert_v1.1_pubmed/pytorch_model.bin",
)

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

In [None]:
# List the extracted files before renaming.
!ls biobert_v1.1_pubmed
# Rename the config file to config.json (presumably the filename from_pretrained() looks for — TODO confirm).
# NOTE(review): on a second run the source file no longer exists, so this mv reports an error.
!mv biobert_v1.1_pubmed/bert_config.json biobert_v1.1_pubmed/config.json
!ls biobert_v1.1_pubmed
# Local directory that holds the converted weights and the renamed config.
model_version = 'biobert_v1.1_pubmed'
# NOTE(review): lower-casing is enabled for the tokenizer; confirm this matches the
# casing of the checkpoint's vocabulary.
do_lower_case = True
# Load the encoder from the local directory and move it to the selected device.
model = BertModel.from_pretrained(model_version).to(device)
# The tokenizer is loaded from the same directory and is used as a global by the embedding helpers.
tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=do_lower_case)
# Switch to evaluation mode; the model is only used for inference in this notebook.
model.eval()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
def embed_text(text, model):
    """Embed a short text as the mean of the model's last-layer token vectors.

    Args:
        text: String to encode with the module-level ``tokenizer``. The whole
            token sequence is sent through the model in a single forward pass,
            so it has to fit the model's maximum sequence length; use
            ``embed_long_text`` for longer inputs.
        model: ``torch.nn.Module`` whose first output is the per-token hidden
            state tensor (e.g. ``BertModel``).

    Returns:
        Tensor of shape ``[1, hidden_size]`` located on the model's device.
    """
    # The ids must live on the same device as the model weights. Building them on
    # the CPU while the model sits on the GPU makes the forward pass fail.
    model_device = next(model.parameters()).device
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0).to(model_device)  # [1, n_tokens]
    # Inference only: skip autograd bookkeeping, consistent with embed_long_text.
    with torch.no_grad():
        outputs = model(input_ids)
    # outputs[0] is the last hidden state [1, n_tokens, hidden_size]; average over the token axis.
    return outputs[0].mean(1)

def embed_long_text(text, model, chunk_size=512):
    """Embed a text of arbitrary length by averaging per-chunk embeddings.

    The text is tokenized once with the module-level ``tokenizer``, the id
    sequence is cut into consecutive pieces of at most ``chunk_size`` tokens,
    and each piece is run through ``model`` separately. Every chunk is reduced
    to the mean of its last-layer token vectors, and the final embedding is the
    unweighted mean of those chunk vectors (a short trailing chunk counts as
    much as a full one).

    Args:
        text: String to embed.
        model: ``torch.nn.Module`` whose first output is the per-token hidden
            state tensor (e.g. ``BertModel``).
        chunk_size: Maximum number of tokens per forward pass. Defaults to 512,
            the value that was previously hard-coded.

    Returns:
        ``float32`` tensor of shape ``[1, hidden_size]`` on the model's device.
    """
    # Use the device the model actually lives on instead of relying on a global.
    model_device = next(model.parameters()).device
    input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)  # [1, n_tokens]
    chunks = input_ids.split(chunk_size, dim=1)

    embedding = None
    with torch.no_grad():
        for chunk in chunks:
            outputs = model(chunk.to(model_device))
            chunk_mean = outputs[0].mean(1)  # [1, hidden_size]
            if embedding is None:
                # Size the accumulator from the model output rather than assuming 768;
                # keep it in float32 so the running sum matches the previous behaviour.
                embedding = torch.zeros_like(chunk_mean, dtype=torch.float32)
            embedding += chunk_mean
    return (embedding / len(chunks)).float()

# def get_similarity(em, em2):
#     return cosine_similarity(em.detach().numpy(), em2.detach().numpy())

In [None]:
# Path of the sentences CSV to load.
# NOTE(review): the name `dir` shadows the Python builtin of the same name.
dir = '/content/drive/MyDrive/mimic-iii-clinical-database-1.4/Merged_files/sentences.csv'
# The first CSV column becomes the index; only '\n' is treated as a row terminator.
d = pd.read_csv(
    dir,
    index_col=0,
    lineterminator='\n',
)
d.head()

In [None]:
# Keep only rows whose TEXT has fewer than 3915 whitespace-separated words.
# NOTE(review): the rationale for the 3915 threshold is not stated in this notebook.
text_word_counts = d['TEXT'].apply(lambda note: len(note.split()))
# reset_index() keeps the previous index as a regular column in the result.
df = d.loc[text_word_counts < 3915].reset_index()

In [None]:
# `reset_index()` in the previous cell keeps the old index as a column named 'index'.
# The former `df.drop('index', inplace=True)` targeted row labels (axis 0), so it raised
# KeyError instead of removing that column.
# Preview the frame without the helper column, but leave `df` itself untouched: the later
# `columns.remove('index')` step expects the column to still be present.
df.drop(columns='index', errors='ignore').head()

In [None]:
# Per-column cache: column name -> {distinct cell value -> embedding}. Filled in a later cell.
stored_embb_temp = dict(
    GENDER={},
    AGE={},
    RELIGION={},
    ETHNICITY={},
    ADMISSION_TYPE={},
    ADMISSION_LOCATION={},
    DISCHARGE_LOCATION={},
)

# Same columns as the cache, each with its own (initially empty) dict of seen values.
available_sentences = {column_name: {} for column_name in stored_embb_temp}

# Column name -> True when its embeddings come from the cache above,
# False when the cell has to be embedded row by row.
available = {
    column_name: column_name in stored_embb_temp
    for column_name in (
        "HADM_ID",
        "SUBJECT_ID",
        "GENDER",
        "AGE",
        "RELIGION",
        "ETHNICITY",
        "ADMISSION_TYPE",
        "ADMISSION_LOCATION",
        "DIAGNOSIS",
        "PROCEDURE",
        "CATEGORY",
        "DESCRIPTION",
        "TEXT",
        "DISCHARGE_LOCATION",
    )
}

In [None]:
# Embed every distinct value of the cached columns once, so the row loop can look
# the result up instead of running the model again.
for column, seen_values in available_sentences.items():
    column_cache = stored_embb_temp[column]
    uniques = df[column].unique()
    for unique in uniques:
        seen_values[unique] = True
        # Keep cached embeddings on the CPU.
        column_cache[unique] = embed_long_text(unique, model).to('cpu')

In [None]:
# Feature columns: everything except the last two columns of df and the 'index'
# helper column left behind by reset_index().
# NOTE(review): assumes the two trailing columns are not features — confirm against the CSV schema.
# Filtering (rather than list.remove) keeps this cell working when 'index' is absent,
# instead of raising ValueError.
columns = [name for name in df.columns[:-2] if name != 'index']
count = 0

In [None]:
import json
def _append_json_line(path, record):
    """Append `record` to the file at `path` as one JSON document followed by a newline."""
    with open(path, 'a') as f:
        json.dump(record, f)
        f.write('\n')


# For every row, gather one embedding per feature column, stack them, and append the
# features and the two targets to their JSON-lines files. The files are re-opened in
# append mode for each row.
for index, row in df.iterrows():
    print(index)
    row_embeddings = []
    for column in columns:
        torch.cuda.empty_cache()
        if not available[column]:
            # No cache for this column: run the model on the cell's text.
            row_embeddings.append(embed_long_text(row[column], model).to('cpu'))
        else:
            # Cached column: reuse the precomputed embedding of this value.
            row_embeddings.append(stored_embb_temp[column][row[column]])
    # Concatenate along dim 0: one row per feature column.
    temp_list = torch.cat(row_embeddings, dim=0)
    _append_json_line(
        '/content/drive/MyDrive/mimic-iii-clinical-database-1.4/embeddings/data.json',
        {index: temp_list.tolist()},
    )
    _append_json_line(
        '/content/drive/MyDrive/mimic-iii-clinical-database-1.4/embeddings/target_los.json',
        {index: row['LOS']},
    )
    _append_json_line(
        '/content/drive/MyDrive/mimic-iii-clinical-database-1.4/embeddings/target_jm.json',
        {index: row['HOSPITAL_EXPIRE_FLAG']},
    )
    del temp_list