In [33]:
import sys

In [34]:
sys.path.insert(1, '../')

In [35]:
from utils.dataset import EHRDataset
from model.tokenizer import EHRTokenizer
import pytorch_pretrained_bert as Bert
from torch.utils.data import DataLoader
from model.model2 import *
#from utils.config import BertConfig
from model.trainer import PatientTrajectoryPredictor
import pytorch_lightning as pl

In [36]:
from sklearn.model_selection import KFold

In [37]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
def adam(params, config=None):
    if config is None:
        config = {
            'lr': 3e-5,
            'warmup_proportion': 0.1,
            'weight_decay': 0.01
        }
        
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in params if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in params if any(nd in n for nd in no_decay)], 'weight_decay': 0}
    ]

    optim = Bert.optimization.BertAdam(optimizer_grouped_parameters,
                                       lr=config['lr'],
                                       warmup=config['warmup_proportion'])
    return optim

In [40]:
#path = 'C:/Users/Johan/Documents/Skola/MasterThesis/Master-thesis/pre-processing/combined-csv-files.csv'
path = '../processing/readmission_data_ccsr'

In [41]:
class TrainConfig(object):
    def __init__(self, config):
        self.batch_size = config.get('batch_size')
        self.use_cuda = config.get('use_cuda')
        self.max_len_seq = config.get('max_len_seq')
        self.train_loader_workers = config.get('train_loader_workers')
        self.test_loader_workers = config.get('test_loader_workers')
        self.device = config.get('device')
        self.output_dir = config.get('output_dir')
        self.output_name = config.get('output_name')
        self.best_name = config.get('best_name')

In [42]:
global_params = {
    'max_seq_len': 32,
    'max_age': 110,
    'month': 1,
    'age_symbol': None,
    'min_visit': 5,
    'gradient_accumulation_steps': 1
}

optim_param = {
    'lr': 3e-5,
    'warmup_proportion': 0.1,
    'weight_decay': 0.01
}

train_params = {
    'batch_size': 64,
    'use_cuda': True,
    'max_len_seq': global_params['max_seq_len'],
    'device': 'cuda' #change this to run on cuda #'cuda:0'
}

In [43]:
data = pd.read_parquet(path)

In [44]:
data.head(2)

Unnamed: 0,subject_id,label,icd_code,ccsr,age,alcohol_abuse,tobacco_abuse,ndc,hadm_id,gender
0,10008245,[0],"[[S32391A, E870, F72, G40909, R509, J45909, M8...","[[INJ003, END011, MBD014, NVS009, SYM002, RSP0...",[47.0],[0],[0],"[[69543013120, 904645561, 54482014407, 6655300...",[26674944],M
1,10031358,"[0, 0, 0, 0, 0, 0]","[[E11618, M86179, I96, E871, L97509, N179, E11...","[[END003, MUS002, CIR028, END011, SKN004, GEN0...","[62.0, 63.0, 64.0, 64.0, 65.0, 65.0]","[1, 1, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]","[[63323026201, 71015892, 51079045420, 74706811...","[27421511, 28279098, 24522342, 29498981, 29887...",M


In [45]:
len(data)

170296

In [46]:
tokenizer = EHRTokenizer()

In [47]:
len(tokenizer.getVoc('code').keys())

493

In [48]:
len(tokenizer.getVoc('age').keys())

87

In [49]:
model_config = {
    'vocab_size': len(tokenizer.getVoc('code').keys()), # number of disease + symbols for word embedding
    'hidden_size': 288, # word embedding and seg embedding hidden size
    'seg_vocab_size': 2, # number of vocab for seg embedding
    'age_vocab_size': len(tokenizer.getVoc('age').keys()), # number of vocab for age embedding,
    #'gender_vocab_size': 3,
    'max_position_embeddings': train_params['max_len_seq'], # maximum number of tokens
    'hidden_dropout_prob': 0.1, # dropout rate
    'num_hidden_layers': 2, # number of multi-head attention layers required
    'num_attention_heads': 4, # number of attention heads
    'attention_probs_dropout_prob': 0.1, # multi-head attention dropout rate
    'intermediate_size': 768, # the size of the "intermediate" layer in the transformer encoder
    'hidden_act': 'gelu', # The non-linear activation function in the encoder and the pooler "gelu", 'relu', 'swish' are supported
    'initializer_range': 0.02, # parameter weight initializer range
}

In [50]:
class BertConfig(Bert.modeling.BertConfig):
    def __init__(self, config):
        super(BertConfig, self).__init__(
            vocab_size_or_config_json_file=config.get('vocab_size'),
            hidden_size=config['hidden_size'],
            num_hidden_layers=config.get('num_hidden_layers'),
            num_attention_heads=config.get('num_attention_heads'),
            intermediate_size=config.get('intermediate_size'),
            hidden_act=config.get('hidden_act'),
            hidden_dropout_prob=config.get('hidden_dropout_prob'),
            attention_probs_dropout_prob=config.get('attention_probs_dropout_prob'),
            max_position_embeddings = config.get('max_position_embeddings'),
            initializer_range=config.get('initializer_range'),
        )
        self.seg_vocab_size = config.get('seg_vocab_size')
        self.age_vocab_size = config.get('age_vocab_size')

In [51]:
conf = BertConfig(model_config)

In [52]:
tensorboarddir = '../logs/'

In [53]:
#kfold=KFold(n_splits=5,shuffle=True)

In [54]:
#allacc = []
#dataset = EHRDataset(data, max_len=train_params['max_len_seq'], tokenizer=tokenizer) 

In [55]:
# This is for the crossval, might not be runnable since we have done some debugging

'''
for fold,(train_idx,test_idx) in enumerate(kfold.split(data)):
    print('------------fold no---------{}----------------------'.format(fold))
    
    trainer = pl.Trainer(
            max_epochs=50, 
            gpus=-1,
            logger=pl.loggers.TensorBoardLogger(save_dir=tensorboarddir),
            callbacks=[pl.callbacks.progress.TQDMProgressBar()], 
            progress_bar_refresh_rate=1,
            weights_summary=None, # Can be None, top or full
            num_sanity_val_steps=10,
        )
    
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_idx)
    
    trainloader = torch.utils.data.DataLoader(dataset, batch_size=train_params['batch_size'], sampler=train_subsampler)
    testloader = torch.utils.data.DataLoader(dataset, batch_size=train_params['batch_size'], sampler=test_subsampler)
    
    
    model = BertForMaskedLM(conf) #BertForMaskedLM(conf)
    params = list(model.named_parameters())
    optim = adam(params, optim_param)
    
    patienttrajectory = PatientTrajectoryPredictor(model, optim, optim_param, metrics=True)
    
    trainer.fit(
        patienttrajectory, 
        train_dataloader=trainloader,
    );
  
    predictions = trainer.predict(patienttrajectory, dataloaders=testloader)
    avg_acc = sum([ stats['Accuracy'] for stats in predictions ]) / len(testloader)
    allacc.append(avg_acc)
    print("Average Test accuracy for fold {}: {} ".format(fold, avg_acc))
    break
'''

'\nfor fold,(train_idx,test_idx) in enumerate(kfold.split(data)):\n    print(\'------------fold no---------{}----------------------\'.format(fold))\n    \n    trainer = pl.Trainer(\n            max_epochs=50, \n            gpus=-1,\n            logger=pl.loggers.TensorBoardLogger(save_dir=tensorboarddir),\n            callbacks=[pl.callbacks.progress.TQDMProgressBar()], \n            progress_bar_refresh_rate=1,\n            weights_summary=None, # Can be None, top or full\n            num_sanity_val_steps=10,\n        )\n    \n    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)\n    test_subsampler = torch.utils.data.SubsetRandomSampler(test_idx)\n    \n    trainloader = torch.utils.data.DataLoader(dataset, batch_size=train_params[\'batch_size\'], sampler=train_subsampler)\n    testloader = torch.utils.data.DataLoader(dataset, batch_size=train_params[\'batch_size\'], sampler=test_subsampler)\n    \n    \n    model = BertForMaskedLM(conf) #BertForMaskedLM(conf)\n    

### Testing with BEHRTs training approach (without pytorch lightning)

In [56]:
import sklearn.metrics as skm

In [57]:
def cal_acc(label, pred):
    logs = nn.LogSoftmax()
    label=label.cpu().numpy()
    ind = np.where(label!=-1)[0]
    truepred = pred.detach().cpu().numpy()
    truepred = truepred[ind]
    truelabel = label[ind]
    truepred = logs(torch.tensor(truepred))
    outs = [np.argmax(pred_x) for pred_x in truepred.numpy()]
    
   # print("Truelabel:")
   # print(truelabel)
    
   # print("Output:")
   # print(outs)
    precision = skm.precision_score(truelabel, outs, average='micro')
    return precision

In [58]:
conf = BertConfig(model_config)
model = BertForMaskedLM(conf) 
model = model.to('cuda')

In [59]:
params = list(model.named_parameters())
optim = adam(params, optim_param)

t_total value of -1 results in schedule not being applied


In [60]:
def train(e, loader):
    tr_loss = 0
    temp_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    cnt= 0
    start = time.time()

    for step, batch in enumerate(loader):
        cnt +=1
        batch = tuple(t.to(train_params['device']) for t in batch)
        age_ids, gender_ids, input_ids, posi_ids, segment_ids, attMask, masked_label, _ = batch
        #print()
        loss, pred, label = model(input_ids, age_ids, gender_ids, segment_ids, posi_ids,attention_mask=attMask, labels=masked_label)
        if global_params['gradient_accumulation_steps'] >1:
            loss = loss/global_params['gradient_accumulation_steps']
        loss.backward()
        
        temp_loss += loss.item()
        tr_loss += loss.item()
        
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        
        if step % 200==0:
            print("epoch: {}\t| cnt: {}\t|Loss: {}\t| precision: {:.4f}\t| time: {:.2f}".format(e, cnt, temp_loss/2000, cal_acc(label, pred), time.time()-start))
            temp_loss = 0
            start = time.time()
            
        if (step + 1) % global_params['gradient_accumulation_steps'] == 0:
            optim.step()
            optim.zero_grad()

   #print("** ** * Saving fine - tuned model ** ** * ")
    #model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
    #create_folder(file_config['model_path'])
    #output_model_file = os.path.join(file_config['model_path'], file_config['model_name'])

    #torch.save(model_to_save.state_dict(), output_model_file)
        
    cost = time.time() - start
    return tr_loss, cost
    
    

In [61]:
trainset, testset = train_test_split(data, test_size=0.2)

In [62]:
trainset = EHRDataset(trainset, max_len=train_params['max_len_seq'], tokenizer=tokenizer)
trainload = DataLoader(dataset=trainset, batch_size=train_params['batch_size'], shuffle=True, num_workers=2)

In [63]:
import time

In [64]:
for e in range(100):
    loss, timecost = train(e, trainload)
    loss = loss / len(trainload)
    
    print("Loss after epoch {}: {}".format(e, loss))
    
    

epoch: 0	| cnt: 1	|Loss: 0.0031055378913879394	| precision: 0.0068	| time: 0.26
epoch: 0	| cnt: 201	|Loss: 0.46001285314559937	| precision: 0.1975	| time: 6.50
epoch: 0	| cnt: 401	|Loss: 0.4322800164222717	| precision: 0.2097	| time: 6.51


KeyboardInterrupt: 