In [1]:
import sys
sys.path.insert(1, '../')
from utils.packages import *

In [2]:
#train, val, test = train_test_val_split(dataset, train_ratio=0.6, validation_ratio=0.2, test_ratio=0.2)
path='../data/datasets/synthea/'
#train.to_parquet(path + 'train')
#val.to_parquet(path + 'val')
#test.to_parquet(path + 'test')

train = pd.read_parquet(path + 'train')
val = pd.read_parquet(path + 'val')
test = pd.read_parquet(path + 'test')

In [20]:
global_params = {
    'max_seq_len': 32,
    'gradient_accumulation_steps': 1
}

optim_param = {
    'lr': 3e-5,
    'warmup_proportion': 0.1,
    'weight_decay': 0.01
}

train_params = {
    'batch_size': 32,
    'use_cuda': True,
    'max_len_seq': global_params['max_seq_len'],
    'device': 'cuda' #change this to run on cuda #'cuda:0'
}

In [21]:
files = {'code':'../data/vocabularies/Synthea/snomedrxnorm.npy',
         'age':'../data/vocabularies/Synthea/age.npy'
        }
tokenizer = EHRTokenizer(task='readmission', filenames=files)

In [22]:
#Shallow
model_config = {
    'vocab_size': len(tokenizer.getVoc('code').keys()), # number of disease + symbols for word embedding
    'hidden_size': 288, # word embedding and seg embedding hidden size
    'seg_vocab_size': 2, # number of vocab for seg embedding
    'age_vocab_size': len(tokenizer.getVoc('age').keys()), # number of vocab for age embedding,
    'gender_vocab_size': 3,
    'max_position_embeddings': train_params['max_len_seq'], # maximum number of tokens
    'hidden_dropout_prob': 0.1, # dropout rate
    'num_hidden_layers': 4, # number of multi-head attention layers required
    'num_attention_heads': 6, # number of attention heads
    'attention_probs_dropout_prob': 0.1, # multi-head attention dropout rate
    'intermediate_size': 512, # the size of the "intermediate" layer in the transformer encoder
    'hidden_act': 'gelu', # The non-linear activation function in the encoder and the pooler "gelu", 'relu', 'swish' are supported
    'initializer_range': 0.02, # parameter weight initializer range
    'use_prior':True,
}

In [23]:
condfiles = {'dd':'../data/train_stats/dd_cond_probs.empirical.p', 
             'dp':'../data/train_stats/dp_cond_probs.empirical.p', 
             'pp':'../data/train_stats/pp_cond_probs.empirical.p' , 
             'pd':'../data/train_stats/pd_cond_probs.empirical.p'
            }

In [24]:
conf = BertConfig(model_config)

In [8]:
tensorboarddir = '../logs/'

In [13]:
trainer = pl.Trainer(
            max_epochs=1, 
            gpus=-1,
            logger=pl.loggers.TensorBoardLogger(save_dir=tensorboarddir),
            callbacks=[pl.callbacks.TQDMProgressBar()], 
            progress_bar_refresh_rate=1,
            weights_summary=None, # Can be None, top or full
            num_sanity_val_steps=10,
        )

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [10]:
traind = EHRDatasetReadmission(train, max_len=train_params['max_len_seq'], conditional_files=condfiles, tokenizer=tokenizer, run_type='train', nvisits=3)
vald = EHRDatasetReadmission(val, max_len=train_params['max_len_seq'], tokenizer=tokenizer, conditional_files=condfiles, run_type='val', nvisits=3)
testd = EHRDatasetReadmission(test, max_len=train_params['max_len_seq'], tokenizer=tokenizer, conditional_files=condfiles, run_type='test', nvisits=3)

Transforming data
Saving data
Transforming data
Saving data
Transforming data
Saving data


In [11]:
trainloader = torch.utils.data.DataLoader(traind, batch_size=32, shuffle=True, num_workers=4)
valloader = torch.utils.data.DataLoader(vald, batch_size=32, shuffle=True, num_workers=4)
testloader = torch.utils.data.DataLoader(testd, batch_size=32, shuffle=True, num_workers=4)

In [14]:
model = BertSinglePrediction(conf, num_labels=1) 
PATH = "../saved_models/MLM/model_with_prior_82test"
model = load_model(PATH, model)
params = list(model.named_parameters())
optim = adam(params, optim_param)

t_total value of -1 results in schedule not being applied


In [15]:
patienttrajectory = TrainerBinaryPrediction(model, optim, optim_param)

trainer.fit(
    patienttrajectory, 
    train_dataloader=trainloader,
);

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Training: 0it [00:00, ?it/s]

In [16]:
predictions = trainer.predict(patienttrajectory, dataloaders=testloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 11172it [00:00, ?it/s]

In [17]:
avg_f1 = sum([ stats['f1-score'] for stats in predictions ]) / len(predictions)
avg_f1*100

0.0

In [18]:
avg_auc = sum([ stats['AUC'] for stats in predictions ]) / len(predictions)
avg_auc*100

nan

In [19]:
avg_aucpr = sum([ stats['AUCPR'] for stats in predictions ]) / len(predictions)
avg_aucpr*100

nan