In [1]:
from utils.dataset import EHRDataset
from model.tokenizer import EHRTokenizer
import pytorch_pretrained_bert as Bert
from torch.utils.data import DataLoader
from model.model import *

In [2]:
import pandas as pd

In [3]:
#path = 'C:/Users/Johan/Documents/Skola/MasterThesis/Master-thesis/pre-processing/combined-csv-files.csv'
path = 'C:/Users/Johan/Documents/Skola/MasterThesis/Master-thesis/pre-processing/dataframe.parquet'

In [4]:
class BertConfig(Bert.modeling.BertConfig):
    def __init__(self, config):
        super(BertConfig, self).__init__(
            vocab_size_or_config_json_file=config.get('vocab_size'),
            hidden_size=config['hidden_size'],
            num_hidden_layers=config.get('num_hidden_layers'),
            num_attention_heads=config.get('num_attention_heads'),
            intermediate_size=config.get('intermediate_size'),
            hidden_act=config.get('hidden_act'),
            hidden_dropout_prob=config.get('hidden_dropout_prob'),
            attention_probs_dropout_prob=config.get('attention_probs_dropout_prob'),
            max_position_embeddings = config.get('max_position_embedding'),
            initializer_range=config.get('initializer_range'),
        )
        self.seg_vocab_size = config.get('seg_vocab_size')
        self.age_vocab_size = config.get('age_vocab_size')
        
class TrainConfig(object):
    def __init__(self, config):
        self.batch_size = config.get('batch_size')
        self.use_cuda = config.get('use_cuda')
        self.max_len_seq = config.get('max_len_seq')
        self.train_loader_workers = config.get('train_loader_workers')
        self.test_loader_workers = config.get('test_loader_workers')
        self.device = config.get('device')
        self.output_dir = config.get('output_dir')
        self.output_name = config.get('output_name')
        self.best_name = config.get('best_name')

In [5]:
global_params = {
    'max_seq_len': 64,
    'max_age': 110,
    'month': 1,
    'age_symbol': None,
    'min_visit': 5,
    'gradient_accumulation_steps': 1
}

optim_param = {
    'lr': 3e-5,
    'warmup_proportion': 0.1,
    'weight_decay': 0.01
}

train_params = {
    'batch_size': 32,
    'use_cuda': True,
    'max_len_seq': global_params['max_seq_len'],
    'device': 'cuda:0'
}

In [6]:
data = pd.read_parquet(path)

In [7]:
len(data)

107704

In [8]:
data.head()

Unnamed: 0,subject_id,icd_code,age
0,10028314,"[Z3800, P2912, Z23, Q620, Z051, Z412, P284, P9...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,10052351,"[R0789, F10129, SEP]","[56, 56, 56]"
2,10092012,"[Z051, Z23, Z3800, SEP]","[0, 0, 0, 0]"
3,10092020,"[Z87891, Z8546, Z7901, G4089, I4820, Z8673, E8...","[69, 69, 69, 69, 69, 69, 69, 69, 69]"
4,10126895,"[Z30430, O80, Z3A39, Z370, SEP]","[24, 24, 24, 24, 24]"


In [9]:
data = data.dropna()

In [10]:
len(data)

107704

In [11]:
data.iloc[48168].head()

subject_id                                             11361364
icd_code      [Y838, K9161, M47812, Y92238, E785, E669, Z515...
age           [86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 8...
Name: 48168, dtype: object

In [12]:
data = data[data.subject_id != 'subject_id']

In [13]:
#data[data['age'] == 'age']

In [14]:
len(data)

107704

In [15]:
data

Unnamed: 0,subject_id,icd_code,age
0,10028314,"[Z3800, P2912, Z23, Q620, Z051, Z412, P284, P9...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,10052351,"[R0789, F10129, SEP]","[56, 56, 56]"
2,10092012,"[Z051, Z23, Z3800, SEP]","[0, 0, 0, 0]"
3,10092020,"[Z87891, Z8546, Z7901, G4089, I4820, Z8673, E8...","[69, 69, 69, 69, 69, 69, 69, 69, 69]"
4,10126895,"[Z30430, O80, Z3A39, Z370, SEP]","[24, 24, 24, 24, 24]"
...,...,...,...
107699,19837828,"[Z3800, Z23, SEP]","[0, 0, 0]"
107700,19910693,"[I2510, D638, Z8619, J440, I739, I10, R0902, Z...","[64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 6..."
107701,19963063,"[O081, D62, Z3A01, O99011, K661, O00102, SEP]","[35, 35, 35, 35, 35, 35, 35]"
107702,19979982,"[Y92239, I70201, M109, Z006, N400, I120, I2510...","[83, 83, 83, 83, 83, 83, 83, 83, 83, 83, 83, 8..."


In [16]:
data['code_len'] = data['icd_code'].apply(lambda x: len(x))
data['age_len'] = data['age'].apply(lambda x: len(x))

In [17]:
data

Unnamed: 0,subject_id,icd_code,age,code_len,age_len
0,10028314,"[Z3800, P2912, Z23, Q620, Z051, Z412, P284, P9...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",15,15
1,10052351,"[R0789, F10129, SEP]","[56, 56, 56]",3,3
2,10092012,"[Z051, Z23, Z3800, SEP]","[0, 0, 0, 0]",4,4
3,10092020,"[Z87891, Z8546, Z7901, G4089, I4820, Z8673, E8...","[69, 69, 69, 69, 69, 69, 69, 69, 69]",9,9
4,10126895,"[Z30430, O80, Z3A39, Z370, SEP]","[24, 24, 24, 24, 24]",5,5
...,...,...,...,...,...
107699,19837828,"[Z3800, Z23, SEP]","[0, 0, 0]",3,3
107700,19910693,"[I2510, D638, Z8619, J440, I739, I10, R0902, Z...","[64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 6...",48,48
107701,19963063,"[O081, D62, Z3A01, O99011, K661, O00102, SEP]","[35, 35, 35, 35, 35, 35, 35]",7,7
107702,19979982,"[Y92239, I70201, M109, Z006, N400, I120, I2510...","[83, 83, 83, 83, 83, 83, 83, 83, 83, 83, 83, 8...",14,14


In [18]:
data = data[data['code_len'] < 500]

In [19]:
data = data[data['age_len'] < 500]

In [20]:
data.iloc[62713]

subject_id                                       12527977
icd_code      [O6981X0, Z370, R51, O701, Z3A40, O76, SEP]
age                          [32, 32, 32, 32, 32, 32, 32]
code_len                                                7
age_len                                                 7
Name: 62755, dtype: object

In [21]:
tokenizer = EHRTokenizer()

In [22]:
Dset = EHRDataset(data, max_len=train_params['max_len_seq'], tokenizer=tokenizer)
trainload = DataLoader(dataset=Dset, batch_size=train_params['batch_size'], shuffle=True)

In [23]:
model_config = {
    'vocab_size': len(tokenizer.getVoc('code').keys()), # number of disease + symbols for word embedding
    'hidden_size': 288, # word embedding and seg embedding hidden size
    'seg_vocab_size': 2, # number of vocab for seg embedding
    'age_vocab_size': len(tokenizer.getVoc('age').keys()), # number of vocab for age embedding
    'max_position_embedding': train_params['max_len_seq'], # maximum number of tokens
    'hidden_dropout_prob': 0.1, # dropout rate
    'num_hidden_layers': 6, # number of multi-head attention layers required
    'num_attention_heads': 12, # number of attention heads
    'attention_probs_dropout_prob': 0.1, # multi-head attention dropout rate
    'intermediate_size': 512, # the size of the "intermediate" layer in the transformer encoder
    'hidden_act': 'gelu', # The non-linear activation function in the encoder and the pooler "gelu", 'relu', 'swish' are supported
    'initializer_range': 0.02, # parameter weight initializer range
}

In [24]:
conf = BertConfig(model_config)

In [25]:
model = BertForMaskedLM(conf)

In [26]:
len(trainload)

3364

In [27]:
for epoch in range(10):
    los = 0
    for step, batch in enumerate(trainload):    
        batch = tuple(t for t in batch)
        age_ids, input_ids, posi_ids, segment_ids, attMask, labels = batch 
        loss, pred, label = model(input_ids, age_ids = age_ids, seg_ids = segment_ids, posi_ids = posi_ids,attention_mask=attMask, masked_lm_labels=labels)
        loss.backward()
        print(loss)
        los += loss.item()
        
    print("Avg loss {} after epoch {}".format(los / len(trainload), epoch))    

tensor(9.9168, grad_fn=<NllLossBackward>)
tensor(9.8218, grad_fn=<NllLossBackward>)
tensor(9.8560, grad_fn=<NllLossBackward>)
tensor(9.8201, grad_fn=<NllLossBackward>)
tensor(9.8218, grad_fn=<NllLossBackward>)
tensor(9.8087, grad_fn=<NllLossBackward>)
tensor(9.8910, grad_fn=<NllLossBackward>)
tensor(9.8318, grad_fn=<NllLossBackward>)
tensor(9.8626, grad_fn=<NllLossBackward>)
tensor(9.8424, grad_fn=<NllLossBackward>)
tensor(9.8572, grad_fn=<NllLossBackward>)
tensor(9.8953, grad_fn=<NllLossBackward>)
tensor(9.8261, grad_fn=<NllLossBackward>)
tensor(9.8018, grad_fn=<NllLossBackward>)
tensor(9.8594, grad_fn=<NllLossBackward>)
tensor(9.8003, grad_fn=<NllLossBackward>)
tensor(9.8262, grad_fn=<NllLossBackward>)
tensor(9.8771, grad_fn=<NllLossBackward>)
tensor(9.7904, grad_fn=<NllLossBackward>)
tensor(9.8107, grad_fn=<NllLossBackward>)
tensor(9.8102, grad_fn=<NllLossBackward>)
tensor(9.8198, grad_fn=<NllLossBackward>)


KeyboardInterrupt: 

In [17]:
loss

tensor(9.9238, grad_fn=<NllLossBackward>)