In [1]:
import sys

In [2]:
# Make the parent directory importable so the `utils` and `model` packages
# resolve. Guarded so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if '../' not in sys.path:
    sys.path.insert(1, '../')

In [3]:
from utils.dataset import EHRDataset
from model.tokenizer import EHRTokenizer
import pytorch_pretrained_bert as Bert
from torch.utils.data import DataLoader
from model.model import *
from utils.config import *
from utils.optimizer import adam
from model.trainers import TrainerMLM
import pytorch_lightning as pl
from sklearn.model_selection import KFold
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
import pytorch_lightning as pl
#from pytorch_lightning.strategies import DeepSpeedStrategy
warnings.filterwarnings('ignore')

from utils.vocabulary import *

In [4]:
# Location of the patient dataset; it is read with pd.read_parquet further down.
path_patients = '../data/datasets/readmission_data_synthea'
#path_prior = '../data/datasets/prior_table'

In [5]:
# The prior table is stored in two parquet parts; read both of them.
prior_table_paths = (
    '../data/datasets/prior_table_1',
    '../data/datasets/prior_table_2',
)
prior1, prior2 = map(pd.read_parquet, prior_table_paths)

In [6]:
# Load the patient records, stack the two prior-table parts, and keep only
# the subjects that appear in both frames (inner join on `subject_id`).
data = pd.read_parquet(path_patients)
priordataset = pd.concat([prior1, prior2])
dataset = pd.merge(data, priordataset, on='subject_id', how='inner')

In [7]:
# Sequence-level settings; `max_seq_len` is reused by `train_params` below.
global_params = dict(
    max_seq_len=32,
    gradient_accumulation_steps=1,
)

# Hyper-parameters handed to the optimizer factory.
optim_param = dict(
    lr=3e-5,
    warmup_proportion=0.1,
    weight_decay=0.01,
)

# Batching and device settings; the sequence length mirrors `global_params`.
train_params = dict(
    batch_size=32,
    use_cuda=True,
    max_len_seq=global_params['max_seq_len'],
    device='cuda',  # e.g. 'cuda:0' to pin a specific GPU
)

In [8]:
# Strip the dashes from the subject identifiers.
# The vectorised string accessor replaces the per-row Python lambda;
# `regex=False` treats '-' as a literal character, and missing identifiers
# stay missing instead of raising AttributeError.
dataset['subject_id'] = dataset['subject_id'].str.replace('-', '', regex=False)

In [9]:
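# Disabled one-off step, kept for reference: presumably writes the combined
# diagnosis + medication code vocabulary file that the tokenizer loads below
# (TODO confirm). Uncomment to regenerate it.
#diagnos_codes = dataset['diagnos_code'].tolist()
#med_codes = dataset['medication_code'].tolist()
#diagnos_codes.extend(med_codes)
#file_path = '../data/vocabularies/Synthea/snomedrxnorm'
#write_codes_to_file(diagnos_codes, file_path)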

In [10]:
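# Disabled one-off step, kept for reference: presumably writes the age
# vocabulary file that the tokenizer loads below (TODO confirm).
# Uncomment to regenerate it.
#ages = dataset['age'].tolist()
#file_path = '../data/vocabularies/Synthea/age'
#write_age_to_file(ages, file_path)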

In [11]:
# Vocabulary files (.npy) for the two vocabularies the tokenizer exposes
# through getVoc('code') and getVoc('age').
files = dict(
    code='../data/vocabularies/Synthea/snomedrxnorm.npy',
    age='../data/vocabularies/Synthea/age.npy',
)
tokenizer = EHRTokenizer(task='MLM', filenames=files)

In [12]:
# Number of entries in the 'code' vocabulary; the same expression supplies
# `vocab_size` in the model configuration.
len(tokenizer.getVoc('code').keys())

433

In [13]:
# Number of entries in the 'age' vocabulary; the same expression supplies
# `age_vocab_size` in the model configuration.
len(tokenizer.getVoc('age').keys())

113

In [14]:
# Shallow model configuration.
# Both vocabulary sizes are taken from the tokenizer so that the embedding
# tables always match the vocabulary files that were loaded.
n_code_tokens = len(tokenizer.getVoc('code').keys())
n_age_tokens = len(tokenizer.getVoc('age').keys())

model_config = {
    # --- embeddings ---
    'vocab_size': n_code_tokens,          # number of disease codes + symbols for the word embedding
    'hidden_size': 288,                   # hidden size of the word and segment embeddings
    'seg_vocab_size': 2,                  # number of segment-embedding entries
    'age_vocab_size': n_age_tokens,       # number of age-embedding entries
    'gender_vocab_size': 3,
    'max_position_embeddings': train_params['max_len_seq'],  # maximum number of tokens
    'hidden_dropout_prob': 0.1,           # dropout rate
    # --- transformer encoder ---
    'num_hidden_layers': 4,               # number of multi-head attention layers
    'num_attention_heads': 6,             # number of attention heads
    'attention_probs_dropout_prob': 0.1,  # dropout rate inside multi-head attention
    'intermediate_size': 512,             # size of the "intermediate" layer in the encoder
    'hidden_act': 'gelu',                 # encoder/pooler activation: "gelu", 'relu' and 'swish' are supported
    'initializer_range': 0.02,            # weight initializer range
    'use_prior': True,
}

In [15]:
# Wrap the plain settings dict in the BertConfig object that is later passed
# to the BertForMaskedLM constructor.
conf = BertConfig(model_config)

In [16]:
# Directory handed to the TensorBoard logger as its `save_dir`.
tensorboarddir = '../logs/'

In [17]:
#kfold=KFold(n_splits=5,shuffle=True)

In [18]:
# Build the logger and progress-bar callback first, then assemble the Trainer.
tb_logger = pl.loggers.TensorBoardLogger(save_dir=tensorboarddir)
progress_bar = pl.callbacks.TQDMProgressBar()

trainer = pl.Trainer(
    max_epochs=5,
    gpus=-1,
    logger=tb_logger,
    callbacks=[progress_bar],
    progress_bar_refresh_rate=1,
    weights_summary=None,  # accepted values: None, top or full
    num_sanity_val_steps=10,
)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [19]:
# Hold out 20% of the rows for evaluation. The fixed `random_state` makes the
# split, and therefore every metric reported below, reproducible across
# kernel restarts.
trainset, testset = train_test_split(dataset, test_size=0.2, random_state=42)

In [20]:
# Wrap both splits in the EHR dataset with the same sequence length and tokenizer.
seq_len = train_params['max_len_seq']
traind, testd = (
    EHRDataset(split, max_len=seq_len, tokenizer=tokenizer)
    for split in (trainset, testset)
)

In [21]:
# `DataLoader` is the name imported at the top of the notebook, so this cell
# no longer relies on `torch` arriving through a wildcard import.
# The batch size is read from `train_params` so it is configured in one place.
# Only the training loader shuffles; the evaluation loader keeps a fixed order
# so that statistics averaged per batch are reproducible between runs.
trainloader = DataLoader(traind, batch_size=train_params['batch_size'], shuffle=True, num_workers=4)
testloader = DataLoader(testd, batch_size=train_params['batch_size'], shuffle=False, num_workers=4)

In [22]:
# Fresh masked-language-model network built from `conf`.
model = BertForMaskedLM(conf)

# The optimizer factory receives the (name, parameter) pairs and the
# optimizer hyper-parameters.
# NOTE(review): this cell prints "t_total value of -1 results in schedule not
# being applied", so the warm-up implied by optim_param['warmup_proportion']
# is presumably not in effect; confirm in utils.optimizer.adam.
params = [*model.named_parameters()]
optim = adam(params, optim_param)

t_total value of -1 results in schedule not being applied


In [23]:
# Wrap the network and optimizer in the module that the Trainer drives.
patienttrajectory = TrainerMLM(model, optim, optim_param, 0.1)

# Train on the training loader; no validation loader is passed here.
trainer.fit(patienttrajectory, train_dataloaders=trainloader)

# Run the trained module over the held-out loader and collect its outputs.
predictions = trainer.predict(patienttrajectory, dataloaders=testloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Training: 0it [00:00, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 13802it [00:00, ?it/s]

In [25]:
# Average the 'precision' entry over every element of `predictions` and show
# it as a percentage. Note the variable is called `avg_acc` although the
# statistic it averages is stored under the key 'precision'.
precision_values = (stats['precision'] for stats in predictions)
avg_acc = sum(precision_values) / len(predictions)
avg_acc * 100

81.54469492325049

In [37]:
# Persist only the weights (state_dict) of the trained model.
PATH = '../saved_models/MLM/model_with_prior_82test'
trained_weights = model.state_dict()
torch.save(trained_weights, PATH)

### Visualize the latent space, i.e. the encoded representation at each layer

In [13]:
# Hard-coded list of integer codes used by this section.
# NOTE(review): the name suggests heart-failure-related codes; confirm against the vocabulary.
hfcodes = [
    59621000,
    88805009,
    431855005,
    53741008,
    431856006,
    433144002,
    40095003,
    86175003,
    84114007,
]

In [28]:
def load_model(path, model):
    """Load a saved ``state_dict`` from ``path`` into ``model`` and return the model.

    Only checkpoint entries whose key also exists in ``model.state_dict()`` are
    copied. Other checkpoint keys are ignored, and model entries that are
    missing from the checkpoint keep their current values.

    Args:
        path: File written by ``torch.save(some_model.state_dict(), path)``.
        model: Module whose weights are updated in place.

    Returns:
        The same ``model`` instance, with the matching weights loaded.
    """
    # Deserialise onto the CPU so that a checkpoint saved from a GPU run can
    # also be opened on a machine without CUDA. ``load_state_dict`` then
    # copies the values into the model's own parameters.
    checkpoint_state = torch.load(path, map_location='cpu')
    model_state = model.state_dict()
    # Keep only the entries the target model actually has.
    matching = {key: value for key, value in checkpoint_state.items() if key in model_state}
    # Overlay them on the model's current state and load the merged result.
    model_state.update(matching)
    model.load_state_dict(model_state)
    return model

In [29]:
# Rebuild the architecture from `conf` and restore the weights stored at PATH.
# `load_model` returns the module it was given, so the freshly constructed
# network can be passed in directly.
PATH = "../saved_models/MLM/model_with_prior_82test"
model = load_model(PATH, BertForMaskedLM(conf))

# Recreate the optimizer for the reloaded parameters.
params = [*model.named_parameters()]
optim = adam(params, optim_param)

t_total value of -1 results in schedule not being applied


In [30]:
# Wrap the reloaded model and its optimizer in the same TrainerMLM module
# that was used for training, with the same final argument (0.1).
patienttrajectory = TrainerMLM(model, optim, optim_param, 0.1)

In [37]:
# Re-run prediction over the test loader, this time with the reloaded weights.
predictions = trainer.predict(patienttrajectory, dataloaders=testloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 0it [00:00, ?it/s]

In [38]:
# Average the 'precision' entry over every element of `predictions` (here the
# outputs of the reloaded model) and show it as a percentage.
precision_values = (stats['precision'] for stats in predictions)
avg_acc = sum(precision_values) / len(predictions)
avg_acc * 100

81.85273875492544