In [1]:
from utils.dataset import EHRDataset
from model.tokenizer import EHRTokenizer
import pytorch_pretrained_bert as Bert
from torch.utils.data import DataLoader
from model.model import *

In [2]:
import pandas as pd

In [3]:
path = 'C:/Users/Johan/Documents/Skola/MasterThesis/Master-thesis/pre-processing/combined-csv-files.csv'

In [4]:
class BertConfig(Bert.modeling.BertConfig):
    def __init__(self, config):
        super(BertConfig, self).__init__(
            vocab_size_or_config_json_file=config.get('vocab_size'),
            hidden_size=config['hidden_size'],
            num_hidden_layers=config.get('num_hidden_layers'),
            num_attention_heads=config.get('num_attention_heads'),
            intermediate_size=config.get('intermediate_size'),
            hidden_act=config.get('hidden_act'),
            hidden_dropout_prob=config.get('hidden_dropout_prob'),
            attention_probs_dropout_prob=config.get('attention_probs_dropout_prob'),
            max_position_embeddings = config.get('max_position_embedding'),
            initializer_range=config.get('initializer_range'),
        )
        self.seg_vocab_size = config.get('seg_vocab_size')
        self.age_vocab_size = config.get('age_vocab_size')
        
class TrainConfig(object):
    def __init__(self, config):
        self.batch_size = config.get('batch_size')
        self.use_cuda = config.get('use_cuda')
        self.max_len_seq = config.get('max_len_seq')
        self.train_loader_workers = config.get('train_loader_workers')
        self.test_loader_workers = config.get('test_loader_workers')
        self.device = config.get('device')
        self.output_dir = config.get('output_dir')
        self.output_name = config.get('output_name')
        self.best_name = config.get('best_name')

In [5]:
global_params = {
    'max_seq_len': 10,
    'max_age': 110,
    'month': 1,
    'age_symbol': None,
    'min_visit': 5,
    'gradient_accumulation_steps': 1
}

optim_param = {
    'lr': 3e-5,
    'warmup_proportion': 0.1,
    'weight_decay': 0.01
}

train_params = {
    'batch_size': 2,
    'use_cuda': True,
    'max_len_seq': global_params['max_seq_len'],
    'device': 'cuda:0'
}

In [29]:
data = pd.read_csv(path).iloc[:500]

In [30]:
len(data)

500

In [31]:
data.head()

Unnamed: 0,subject_id,icd_code,age
0,10028314,"Z3800,P2912,Z23,Q620,Z051,Z412,P284,P928,P590,...","0,0,0,0,0,0,0,0,0,0,0,0,0,0,SEP"
1,10052351,"R0789,F10129,SEP","56,56,SEP"
2,10092012,"Z051,Z23,Z3800,SEP","0,0,0,SEP"
3,10092020,"Z87891,Z8546,Z7901,G4089,I4820,Z8673,E872,M255...","69,69,69,69,69,69,69,69,SEP"
4,10126895,"Z30430,O80,Z3A39,Z370,SEP","24,24,24,24,SEP"


In [32]:
tokenizer = EHRTokenizer()

In [33]:
Dset = EHRDataset(data, max_len=train_params['max_len_seq'], tokenizer=tokenizer)
trainload = DataLoader(dataset=Dset, batch_size=train_params['batch_size'], shuffle=True)

In [34]:
model_config = {
    'vocab_size': len(tokenizer.getVoc('code').keys()), # number of disease + symbols for word embedding
    'hidden_size': 288, # word embedding and seg embedding hidden size
    'seg_vocab_size': 2, # number of vocab for seg embedding
    'age_vocab_size': len(tokenizer.getVoc('age').keys()), # number of vocab for age embedding
    'max_position_embedding': train_params['max_len_seq'], # maximum number of tokens
    'hidden_dropout_prob': 0.1, # dropout rate
    'num_hidden_layers': 6, # number of multi-head attention layers required
    'num_attention_heads': 12, # number of attention heads
    'attention_probs_dropout_prob': 0.1, # multi-head attention dropout rate
    'intermediate_size': 512, # the size of the "intermediate" layer in the transformer encoder
    'hidden_act': 'gelu', # The non-linear activation function in the encoder and the pooler "gelu", 'relu', 'swish' are supported
    'initializer_range': 0.02, # parameter weight initializer range
}

In [35]:
conf = BertConfig(model_config)

In [36]:
model = BertForMaskedLM(conf)

In [37]:
for epoch in range(10):
    los = 0
    for step, batch in enumerate(trainload):    
        batch = tuple(t for t in batch)
        age_ids, input_ids, posi_ids, segment_ids, attMask, labels = batch 
        loss, pred, label = model(input_ids, age_ids, segment_ids, posi_ids,attention_mask=attMask, masked_lm_labels=labels)
        loss.backward()
        
        los += loss.item()
        
    print("Avg loss {} after epoch {}".format(los / len(trainload), epoch))    

Avg loss 9.097714546203614 after epoch 0
Avg loss 9.159693798065186 after epoch 1
Avg loss 8.967929546356201 after epoch 2
Avg loss 9.086614185333252 after epoch 3
Avg loss 8.899548042297363 after epoch 4
Avg loss 8.935291358947755 after epoch 5
Avg loss 9.02836623764038 after epoch 6
Avg loss 9.005807250976563 after epoch 7
Avg loss 9.038513618469239 after epoch 8
Avg loss 9.12627414703369 after epoch 9


In [17]:
loss

tensor(9.9238, grad_fn=<NllLossBackward>)