In [160]:
from sklearn.model_selection import train_test_split

In [161]:
def load_model(path, model):
    # load pretrained model and update weights
    pretrained_dict = torch.load(path)
    model_dict = model.state_dict()
    # 1. filter out unnecessary keys
    pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
    # 2. overwrite entries in the existing state dict
    model_dict.update(pretrained_dict)
    # 3. load the new state dict
    model.load_state_dict(model_dict)
    return model

In [162]:
def adam(params, config=None):
    if config is None:
        config = {
            'lr': 3e-5,
            'warmup_proportion': 0.1,
            'weight_decay': 0.01
        }
        
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in params if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in params if any(nd in n for nd in no_decay)], 'weight_decay': 0}
    ]

    optim = Bert.optimization.BertAdam(optimizer_grouped_parameters,
                                       lr=config['lr'],
                                       warmup=config['warmup_proportion'])
    return optim

In [163]:
import sys

In [164]:
sys.path.insert(1, '../')

In [165]:
from utils.dataset import EHRDatasetReadmission
from model.tokenizer import EHRTokenizer
import pytorch_pretrained_bert as Bert
from torch.utils.data import DataLoader
from model.model2 import *
from utils.config import BertConfig
from model.trainer import PatientTrajectoryPredictor
import pytorch_lightning as pl
from sklearn.model_selection import KFold

In [166]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [167]:
path = '../processing/readmission_data_ccsr_'

In [168]:
data = pd.read_parquet(path)

In [169]:
len(data)

170296

In [170]:
data[data['subject_id'] == 10008245]

Unnamed: 0,subject_id,label,icd_code,ccsr,age,alcohol_abuse,tobacco_abuse,ndc,hadm_id,gender
0,10008245,[0],"[[S32391A, E870, F72, G40909, R509, J45909, M8...","[[INJ003, END011, MBD014, NVS009, SYM002, RSP0...",[47.0],[0],[0],"[[69543013120, 904645561, 54482014407, 6655300...",[26674944],M


In [171]:
data.head()

Unnamed: 0,subject_id,label,icd_code,ccsr,age,alcohol_abuse,tobacco_abuse,ndc,hadm_id,gender
0,10008245,[0],"[[S32391A, E870, F72, G40909, R509, J45909, M8...","[[INJ003, END011, MBD014, NVS009, SYM002, RSP0...",[47.0],[0],[0],"[[69543013120, 904645561, 54482014407, 6655300...",[26674944],M
1,10031358,"[0, 0, 0, 0, 0, 0]","[[E11618, M86179, I96, E871, L97509, N179, E11...","[[END003, MUS002, CIR028, END011, SKN004, GEN0...","[62.0, 63.0, 64.0, 64.0, 65.0, 65.0]","[1, 1, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]","[[63323026201, 71015892, 51079045420, 74706811...","[27421511, 28279098, 24522342, 29498981, 29887...",M
2,10034667,[0],"[[O134, K5090, O7581, O631, O76, O701, Z3A38, ...","[[CIR008, DIG011, PRG023, PRG024, PRG023, PRG0...",[37.0],[0],[0],"[[904198861, 63029850401, 61553073503, 1152372...",[27995369],F
3,10034933,"[0, 0]","[[C7951, G9519, I639, C228, C792, E871, R29702...","[[NEO070, NVS020, CIR020, NEO017, NEO070, END0...","[63.0, 63.0]","[0, 0]","[0, 0]","[[60569049501, 68084019801, 63323010605, 11523...","[29594531, 28591708]",M
4,10040737,"[0, 0]","[[K562, I10, K219, E7439, F419, E039], [S82851...","[[DIG012, CIR007, DIG004, END016, MBD005, END0...","[51.0, 57.0]","[0, 0]","[0, 0]","[[78112001103, 60793011601, 63323026201, 90456...","[26871570, 20352299]",F


In [172]:
global_params = {
    'max_seq_len': 32,
    'gradient_accumulation_steps': 1
}

optim_param = {
    'lr': 3e-6,
    'warmup_proportion': 0.1,
    'weight_decay': 0.01
}

train_params = {
    'batch_size': 64,
    'use_cuda': True,
    'max_len_seq': global_params['max_seq_len'],
    'device': 'cuda' #change this to run on cuda #'cuda:0'
}

In [173]:
tokenizer = EHRTokenizer(task='readmission')

In [174]:
#trainset = EHRDatasetPredictionTask(data, max_len=train_params['max_len_seq'], tokenizer=tokenizer, prediction_task='readmission')

In [175]:
#trainload = DataLoader(dataset=trainset, batch_size=train_params['batch_size'], shuffle=True)

In [200]:
# Shallow
'''
model_config = {
    'vocab_size': len(tokenizer.getVoc('code').keys()), # number of disease + symbols for word embedding
    'hidden_size': 300, # word embedding and seg embedding hidden size
    'seg_vocab_size': 2, # number of vocab for seg embedding
    'age_vocab_size': len(tokenizer.getVoc('age').keys()), # number of vocab for age embedding,
    'gender_vocab_size': 3,
    'max_position_embeddings': train_params['max_len_seq'], # maximum number of tokens
    'hidden_dropout_prob': 0.1, # dropout rate
    'num_hidden_layers': 2, # number of multi-head attention layers required
    'num_attention_heads': 4, # number of attention heads
    'attention_probs_dropout_prob': 0.1, # multi-head attention dropout rate
    'intermediate_size': 300, # the size of the "intermediate" layer in the transformer encoder
    'hidden_act': 'gelu', # The non-linear activation function in the encoder and the pooler "gelu", 'relu', 'swish' are supported
    'initializer_range': 0.02, # parameter weight initializer range
}
'''

# Deep
model_config = {
    'vocab_size': len(tokenizer.getVoc('code').keys()), # number of disease + symbols for word embedding
    'hidden_size': 300, # word embedding and seg embedding hidden size
    'seg_vocab_size': 2, # number of vocab for seg embedding
    'age_vocab_size': len(tokenizer.getVoc('age').keys()), # number of vocab for age embedding,
    'gender_vocab_size': 3,
    'max_position_embeddings': train_params['max_len_seq'], # maximum number of tokens
    'hidden_dropout_prob': 0.1, # dropout rate
    'num_hidden_layers': 6, # number of multi-head attention layers required
    'num_attention_heads': 12, # number of attention heads
    'attention_probs_dropout_prob': 0.1, # multi-head attention dropout rate
    'intermediate_size': 300, # the size of the "intermediate" layer in the transformer encoder
    'hidden_act': 'gelu', # The non-linear activation function in the encoder and the pooler "gelu", 'relu', 'swish' are supported
    'initializer_range': 0.02, # parameter weight initializer range
}

In [201]:
class BertConfig(Bert.modeling.BertConfig):
    def __init__(self, config):
        super(BertConfig, self).__init__(
            vocab_size_or_config_json_file=config.get('vocab_size'),
            hidden_size=config['hidden_size'],
            num_hidden_layers=config.get('num_hidden_layers'),
            num_attention_heads=config.get('num_attention_heads'),
            intermediate_size=config.get('intermediate_size'),
            hidden_act=config.get('hidden_act'),
            hidden_dropout_prob=config.get('hidden_dropout_prob'),
            attention_probs_dropout_prob=config.get('attention_probs_dropout_prob'),
            max_position_embeddings = config.get('max_position_embeddings'),
            initializer_range=config.get('initializer_range'),
        )
        self.seg_vocab_size = config.get('seg_vocab_size')
        self.age_vocab_size = config.get('age_vocab_size')
        self.gender_vocab_size = config.get('gender_vocab_size')

In [202]:
conf = BertConfig(model_config)
tensorboarddir = '../logs/'

In [203]:
#PATH = "checkpoint2"
#mode = load_model(PATH, model)

In [204]:
data['lenvisits'] = data['hadm_id'].apply(lambda x: len(x)) 

In [205]:
datawithout1visit = data[data['lenvisits'] > 1]

In [206]:
len(data)

170296

In [207]:
len(datawithout1visit)

72975

In [208]:
datawithout1visit.head()

Unnamed: 0,subject_id,label,icd_code,ccsr,age,alcohol_abuse,tobacco_abuse,ndc,hadm_id,gender,lenvisits
1,10031358,"[0, 0, 0, 0, 0, 0]","[[E11618, M86179, I96, E871, L97509, N179, E11...","[[END003, MUS002, CIR028, END011, SKN004, GEN0...","[62.0, 63.0, 64.0, 64.0, 65.0, 65.0]","[1, 1, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0]","[[63323026201, 71015892, 51079045420, 74706811...","[27421511, 28279098, 24522342, 29498981, 29887...",M,6
3,10034933,"[0, 0]","[[C7951, G9519, I639, C228, C792, E871, R29702...","[[NEO070, NVS020, CIR020, NEO017, NEO070, END0...","[63.0, 63.0]","[0, 0]","[0, 0]","[[60569049501, 68084019801, 63323010605, 11523...","[29594531, 28591708]",M,2
4,10040737,"[0, 0]","[[K562, I10, K219, E7439, F419, E039], [S82851...","[[DIG012, CIR007, DIG004, END016, MBD005, END0...","[51.0, 57.0]","[0, 0]","[0, 0]","[[78112001103, 60793011601, 63323026201, 90456...","[26871570, 20352299]",F,2
5,10046592,"[0, 0]","[[I639, N390, E1165, R1310, I10, E785, B9620, ...","[[CIR020, GEN004, END003, SYM005, CIR007, END0...","[73.0, 74.0]","[0, 0]","[0, 0]","[[38396055018, 904629261, 904645561, 613140144...","[29327270, 27003299]",F,2
10,10082965,"[0, 0]","[[M4712, D7811, I10], [C20, I214, I2699, G92, ...","[[MUS011, BLD009, CIR007], [NEO015, CIR009, CI...","[81.0, 86.0]","[0, 0]","[0, 0]","[[61553020648, 574705050, 121043130, 536338101...","[22046658, 25214863]",F,2


In [209]:
datawithout1visit['readmission_label'] = datawithout1visit['label'].apply(lambda x: x[1])

In [210]:
label1 = datawithout1visit[datawithout1visit['readmission_label'] == 1]

In [211]:
label0 = datawithout1visit[datawithout1visit['readmission_label'] == 0].sample(len(label1))

In [212]:
finaldata = pd.concat([label1, label0])

In [213]:
trainset, testset = train_test_split(finaldata, test_size=0.2)

In [214]:
traindata = EHRDatasetReadmission(trainset, max_len=train_params['max_len_seq'], tokenizer=tokenizer) 
testdata = EHRDatasetReadmission(testset, max_len=train_params['max_len_seq'], tokenizer=tokenizer) 

In [215]:
trainer = pl.Trainer(
        max_epochs=20, 
        gpus=-1,
        logger=pl.loggers.TensorBoardLogger(save_dir=tensorboarddir),
        callbacks=[pl.callbacks.progress.TQDMProgressBar()], 
        progress_bar_refresh_rate=1,
        weights_summary=None, # Can be None, top or full
        num_sanity_val_steps=10,
    )

trainloader = torch.utils.data.DataLoader(traindata, batch_size=train_params['batch_size'], shuffle=True, num_workers=3)
testloader = torch.utils.data.DataLoader(testdata, batch_size=train_params['batch_size'], shuffle=True, num_workers=3)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


In [216]:
model = BertForReadmission(conf, num_labels=1) 
PATH = "../saved_models/MLM/deep_notsuffled"
model = load_model(PATH, model)
params = list(model.named_parameters())
optim = adam(params, optim_param)

t_total value of -1 results in schedule not being applied


In [217]:
patienttrajectory = PatientTrajectoryPredictor(model, optim, optim_param, train_objective='readmission', metrics=True)

trainer.fit(
    patienttrajectory, 
    train_dataloader=trainloader,
);

predictions = trainer.predict(patienttrajectory, dataloaders=testloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 19: 100%|████████████████████████████████████████████████| 315/315 [00:16<00:00, 18.70it/s, loss=0.613, v_num=182]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]



Predicting: 315it [00:01, ?it/s]


In [218]:
PATH = '../saved_models/Readmission/deep_unshuffled'
torch.save(model.state_dict(), PATH)

In [95]:
#predictions = trainer.predict(patienttrajectory, dataloaders=testloader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting: 4258it [00:10, ?it/s]


In [219]:
f1 = 0

In [220]:
for preds in predictions:
    f1 += preds['F1-score']

In [221]:
f1 / len(predictions)

0.6171461114435639

In [222]:
aucpr = 0

for preds in predictions:
    aucpr += preds['AUCPR']
    
aucpr / len(predictions)

0.6765966287572949

In [223]:
auc = 0

for preds in predictions:
    auc += preds['AUROC']
    
auc / len(predictions)

0.6568882132483317