In [None]:
import os
import logging
from transformers import AutoTokenizer, AutoModel, BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam
import torch

from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
# Project root = parent of the directory the notebook was started in.
# Resolved only once per kernel: this cell also changes the working directory,
# so recomputing it on a re-run would climb one directory higher every time.
if 'ROOT_DIR' not in globals():
    ROOT_DIR = os.path.dirname(os.getcwd())
os.chdir(ROOT_DIR)
# Imported after the chdir so the project-local `modeling` package is resolved
# relative to the project root.
from modeling.modeling_utils import *
import json

# ---- training configuration ----
MAX_SEQ_LENGTH = 512
BATCH_SIZE = 32
EVAL_BATCH = 2
GRADIENT_ACCUMULATION_STEPS = 2
NUM_TRAIN_EPOCHS = 3
WARMUP_PROPORTION = 0.1
local_rank = -1
no_cuda = False
seed = 2022
# Keep the trailing separator: checkpoints are written as f'{output_dir}{model_name}'.
# (The previous f'{ROOT_DIR}output/' lacked the separator after ROOT_DIR and
# pointed at a sibling directory named '<root>output/'.)
output_dir = os.path.join(ROOT_DIR, 'output', '')
os.makedirs(output_dir, exist_ok=True)
local_test = False
# The pretrained model must live in a folder named 'pretraining' directly under ROOT_DIR.


def get_data_loader(features, max_seq_length, batch_size, shuffle=True, add_sampler=False):
    """Pack featurised examples into a ``TensorDataset`` and wrap it in a ``DataLoader``.

    Args:
        features: sequence of objects exposing ``input_ids``, ``input_mask``,
            ``segment_ids`` and ``label_id``; each attribute is converted to a
            ``torch.long`` tensor.
        max_seq_length: not used by this function; kept so existing callers
            do not have to change.
        batch_size: number of examples per batch.
        shuffle: whether to shuffle the data; only honoured when
            ``add_sampler`` is false.
        add_sampler: when true, attach an explicit sampler instead of
            ``shuffle``: ``SequentialSampler`` if the notebook-level
            ``local_rank`` is -1, otherwise ``DistributedSampler``.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, input_mask, segment_ids, label_ids)``.
    """
    field_names = ('input_ids', 'input_mask', 'segment_ids', 'label_id')
    tensors = [
        torch.tensor([getattr(f, name) for f in features], dtype=torch.long)
        for name in field_names
    ]
    data = TensorDataset(*tensors)

    if not add_sampler:
        return DataLoader(data, shuffle=shuffle, batch_size=batch_size)

    logging.info('add sampler')
    if local_rank == -1:
        sampler = SequentialSampler(data)
    else:
        # Imported lazily: only needed for distributed runs.
        from torch.utils.data.distributed import DistributedSampler
        sampler = DistributedSampler(data)
    # Honour the caller's batch_size (previously the global EVAL_BATCH was
    # hard-coded here and the argument was silently ignored).
    return DataLoader(data, sampler=sampler, batch_size=batch_size)

def train(Classifier, NUM_TRAIN_EPOCHS):
    """Fine-tune ``Classifier`` for several epochs, checkpoint each one and plot the losses.

    Every epoch runs ``train_one_epoch`` followed by ``evaluate``, logs the
    evaluation result and saves the model's ``state_dict`` into ``output_dir``.
    After the last epoch the per-batch training and validation losses are plotted.

    Args:
        Classifier: the model to train; passed to ``train_one_epoch`` and ``evaluate``.
        NUM_TRAIN_EPOCHS: number of epochs to run (shadows the notebook-level
            constant of the same name inside this function).

    Returns:
        None.
    """
    # evaluate() reads the epoch's mean training loss through the
    # notebook-level name `tr_loss_epoch`, so it has to be published there.
    global tr_loss_epoch
    # pyplot is only needed here. The `%matplotlib inline` magic that used to
    # follow this import was dropped: a magic is not Python syntax and does not
    # belong in a function body.
    import matplotlib.pyplot as plt

    total_train_loss_history = []
    total_val_loss_history = []
    total_learning_rate = []

    for real_epoch in trange(int(NUM_TRAIN_EPOCHS)):
        Classifier.train()
        tr_loss_epoch, train_loss_history, learning_rate_list, Classifier = train_one_epoch(Classifier, real_epoch)
        total_train_loss_history.extend(train_loss_history)
        total_learning_rate.extend(learning_rate_list)
        # Was `evaluate(epoch, ...)`: `epoch` is not defined in this scope.
        result, loss_history = evaluate(real_epoch, Classifier)
        total_val_loss_history.extend(loss_history)
        logging.info(result)
        model_name = f'clinicalbert_LEARNING_RATE_{LEARNING_RATE}_gradient_accu_{GRADIENT_ACCUMULATION_STEPS}_MAX_GRAD_NORM_{MAX_GRAD_NORM}_{str(real_epoch)}.pt'
        torch.save(Classifier.state_dict(), f'{output_dir}{model_name}')

    plt.plot(total_train_loss_history)
    plt.plot(total_val_loss_history)
    plt.title('model loss')
    plt.ylabel('loss')
    # Both histories hold one entry per batch, not per epoch.
    plt.xlabel('batch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()


def train_one_epoch(Classifier, epoch):
    """Run one pass over the notebook-level ``train_dataloader`` with gradient accumulation.

    Relies on these notebook-level names: ``train_dataloader``, ``device``,
    ``n_gpu``, ``optimizer``, ``scheduler``, ``GRADIENT_ACCUMULATION_STEPS``
    and ``MAX_GRAD_NORM``.

    Args:
        Classifier: model called as ``Classifier(input_ids, attention_mask=...,
            token_type_ids=..., labels=...)`` and expected to return
            ``(loss, logits)``.
        epoch: epoch index, used only for progress-bar and log messages.

    Returns:
        Tuple ``(tr_loss_epoch, train_loss_history, learning_rate_list, Classifier)``:
        the mean batch loss (``nan`` if the loader was empty), the list of
        per-batch losses, the scheduler's last learning rate(s), and the model.
        Losses are reported *before* the division by
        ``GRADIENT_ACCUMULATION_STEPS`` so they are on the same scale as the
        validation loss.
    """
    global_step = 0
    train_loss_history = []
    tr_loss = 0.0
    nb_tr_steps = 0
    num_batches = len(train_dataloader)

    # Moving the model is loop-invariant; do it once instead of on every batch.
    Classifier.to(device)

    for step, batch in enumerate(tqdm(train_dataloader, desc=f'Training iteration: {str(epoch)}')):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        loss, logits = Classifier(input_ids, attention_mask=input_mask, token_type_ids=segment_ids, labels=label_ids)
        if n_gpu > 1:
            loss = loss.mean()  # average the per-GPU losses
        # Record the real batch loss; the scaled value below exists only so
        # that the accumulated gradients are averaged.
        batch_loss = loss.item()
        if GRADIENT_ACCUMULATION_STEPS > 1:
            loss = loss / GRADIENT_ACCUMULATION_STEPS
        loss.backward()
        train_loss_history.append(batch_loss)
        tr_loss += batch_loss

        # Step after every full accumulation window, and once more on the last
        # batch so a trailing partial window is not dropped.
        is_window_end = (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0
        is_last_batch = (step + 1) == num_batches
        if is_window_end or is_last_batch:
            torch.nn.utils.clip_grad_norm_(Classifier.parameters(), MAX_GRAD_NORM)
            optimizer.step()
            scheduler.step()
            Classifier.zero_grad()
            global_step += 1

        if (step + 1) % 200 == 0:
            logging_str = "***** epoch [{}]".format(epoch)
            logging_str += " global_step [{}]".format(global_step)
            logging_str += " train loss [{}]".format(batch_loss)
            logging.info(logging_str)
        nb_tr_steps += 1

    tr_loss_epoch = tr_loss / nb_tr_steps if nb_tr_steps else float('nan')
    # get_last_lr() is the supported accessor; calling get_lr() outside of
    # scheduler.step() is deprecated and emits a warning.
    learning_rate_list = scheduler.get_last_lr()
    return tr_loss_epoch, train_loss_history, learning_rate_list, Classifier
            
            
def evaluate(epoch, Classifier, tr_loss_epoch=None):
    """Score ``Classifier`` on the dev split and compute loss, accuracy and ranking metrics.

    The dev examples are loaded from ``{ROOT_DIR}/data/3days/`` through the
    notebook-level ``processor``, featurised and batched with ``EVAL_BATCH``.
    Sigmoid scores are thresholded at 0.5 for accuracy and, together with
    ``{ROOT_DIR}/data/3days/val.csv``, handed to ``vote_score``,
    ``pr_curve_plot`` and ``vote_pr_curve``.

    Args:
        epoch: epoch index, used for progress-bar and log messages.
        Classifier: model returning ``(loss, logits)`` when called with labels.
        tr_loss_epoch: mean training loss of the epoch, reported back in the
            result. When omitted, the notebook-level variable of the same name
            is used (``None`` if it does not exist), which is how existing
            two-argument callers behave.

    Returns:
        Tuple ``(result, loss_history)``: ``result`` is a dict with keys
        ``'eval_loss'``, ``'eval_accuracy'``, ``'training loss'``, ``'AUROC'``,
        ``'AUPRC'`` and ``'RP80'``; ``loss_history`` is the list of per-batch
        validation losses.
    """
    if tr_loss_epoch is None:
        tr_loss_epoch = globals().get('tr_loss_epoch')

    sigmoid = torch.nn.Sigmoid()
    Classifier.eval()
    eval_loss = 0.0
    eval_accuracy = 0
    nb_eval_examples = 0
    nb_eval_steps = 0
    logits_history = []
    loss_history = []

    dev_example = processor.get_dev_examples(f'{ROOT_DIR}/data/3days/')
    dev_features = convert_examples_to_features(
            dev_example, label_list, MAX_SEQ_LENGTH, tokenizer)
    dev_dataloader = get_data_loader(dev_features, MAX_SEQ_LENGTH, EVAL_BATCH, shuffle=False)

    # The description was a plain string with a stray quote, so the epoch
    # number was never interpolated.
    for batch in tqdm(dev_dataloader, desc=f'Evaluation iteration: {str(epoch)}'):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, segment_ids, label_ids = batch
        with torch.no_grad():
            # One forward pass yields both loss and logits. The original ran
            # the model a second time without labels just to get the logits.
            # NOTE(review): assumes the logits do not depend on whether labels
            # are passed - confirm against bert_utils.
            tmp_eval_loss, logits = Classifier(input_ids, attention_mask=input_mask,
                                    token_type_ids=segment_ids, labels=label_ids)

        scores = sigmoid(logits).detach().cpu().numpy().flatten()
        label_ids = label_ids.cpu().numpy()
        outputs = (scores >= 0.5).astype(int)
        tmp_eval_accuracy = np.sum(outputs == label_ids)

        logits_history.extend(scores.tolist())

        # Reduce with mean() in both places so a per-GPU loss vector works too.
        batch_loss = tmp_eval_loss.mean().item()
        eval_loss += batch_loss
        loss_history.append(batch_loss)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

    # Guard the averages against an empty dev set.
    eval_loss = eval_loss / max(nb_eval_steps, 1)
    eval_accuracy = eval_accuracy / max(nb_eval_examples, 1)

    logging_str = "***** epoch [{}]".format(epoch)
    logging_str += " {} [{:.4}]".format('val_loss', eval_loss)
    # This message used to be built and then discarded.
    logging.info(logging_str)

    df_test = pd.read_csv(f'{ROOT_DIR}/data/3days/val.csv')
    fpr, tpr, df_out, roc_auc = vote_score(df_test, logits_history, output_dir)
    pr_auc = pr_curve_plot(df_test['Label'].values, logits_history, output_dir)
    rp80 = vote_pr_curve(df_test, logits_history, output_dir)
    result = {'eval_loss': eval_loss,
              'eval_accuracy': eval_accuracy,
              'training loss': tr_loss_epoch,
              'AUROC': roc_auc,
              'AUPRC' : pr_auc,
              'RP80': rp80}
    return result, loss_history


In [None]:
from bert_utils import BertForSequenceClassification
from transformers.optimization import get_linear_schedule_with_warmup
import json
import math

# ---- run configuration ----
# Defined before anything derived from them (the step budget below depends on
# NUM_TRAIN_EPOCHS; train_one_epoch reads MAX_GRAD_NORM).
local_rank  = 0
local_test = False
LEARNING_RATE=5e-05
MAX_GRAD_NORM = 3
NUM_TRAIN_EPOCHS = 3
file_path = f"{ROOT_DIR}/secrets.json"

# ---- tokenizer ----
if local_test:
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
else:
    # Proxy settings are read from a JSON file under the project root.
    with open(file_path, "r") as json_file:
        json_data = json.load(json_file)
    proxies = json_data['proxies']
    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', proxies=proxies)

# ---- data ----
processors = {
    "readmission": readmissionProcessor
}
processor = processors['readmission']()
label_list = processor.get_labels()

train_examples = processor.get_train_examples(f'{ROOT_DIR}/data/3days/')
test_example = processor.get_test_examples(f'{ROOT_DIR}/data/3days/')
dev_example = processor.get_dev_examples(f'{ROOT_DIR}/data/3days/')
train_features = convert_examples_to_features(
        train_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = convert_examples_to_features(
        test_example, label_list, MAX_SEQ_LENGTH, tokenizer)
dev_features = convert_examples_to_features(
        dev_example, label_list, MAX_SEQ_LENGTH, tokenizer)

train_dataloader = get_data_loader(train_features, MAX_SEQ_LENGTH, BATCH_SIZE, shuffle=True)
dev_dataloader = get_data_loader(dev_features, MAX_SEQ_LENGTH, EVAL_BATCH, shuffle=False)
test_dataloader = get_data_loader(test_features, MAX_SEQ_LENGTH, EVAL_BATCH, shuffle=False)

# ---- device and seeding ----
if local_rank == -1 or no_cuda:
    device = torch.device("cuda" if torch.cuda.is_available() and not no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
else:
    device = torch.device("cuda", local_rank)
    n_gpu = 1
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if n_gpu > 0:
    torch.cuda.manual_seed_all(seed)

# ---- optimisation budget ----
# train_one_epoch takes one optimizer step per accumulation window plus one for
# a trailing partial window, i.e. ceil(batches / accumulation) steps per epoch.
# The earlier float estimate truncated and could come out below the real count.
steps_per_epoch = math.ceil(len(train_dataloader) / GRADIENT_ACCUMULATION_STEPS)
num_train_steps = steps_per_epoch * int(NUM_TRAIN_EPOCHS)
num_warmup_steps = int(WARMUP_PROPORTION * num_train_steps)

# ---- model, optimizer, scheduler ----
Classifier = BertForSequenceClassification.from_pretrained(os.path.join(f'{ROOT_DIR}/', 'pretraining'), 1)
Classifier.to(device)
param_optimizer = list(Classifier.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
# BertAdam is created without `warmup` / `t_total` so its built-in warmup
# schedule stays disabled: the explicit scheduler below already applies linear
# warmup and decay, and enabling both would scale the learning rate twice.
optimizer = BertAdam(optimizer_grouped_parameters, lr=LEARNING_RATE)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps)

# ---- training loop ----
total_train_loss_history = []
total_val_loss_history = []
total_learning_rate = []
m = nn.Sigmoid()  # not used in this cell
for real_epoch in trange(int(NUM_TRAIN_EPOCHS)):
    Classifier.train()
    # tr_loss_epoch is deliberately a notebook-level name: evaluate() reports it.
    tr_loss_epoch, train_loss_history, learning_rate_list, Classifier = train_one_epoch(Classifier, real_epoch)
    total_train_loss_history.extend(train_loss_history)
    total_learning_rate.extend(learning_rate_list)
    result, loss_history = evaluate(real_epoch, Classifier)
    total_val_loss_history.extend(loss_history)
    logging.info(result)
    # One checkpoint per epoch, named after the hyper-parameters of the run.
    model_name = f'clinicalbert_LEARNING_RATE_{LEARNING_RATE}_gradient_accu_{GRADIENT_ACCUMULATION_STEPS}_MAX_GRAD_NORM_{MAX_GRAD_NORM}_{str(real_epoch)}.pt'
    torch.save(Classifier.state_dict(), f'{output_dir}{model_name}')
   


In [None]:
 import matplotlib.pyplot as plt
    %matplotlib inline

    plt.plot(total_train_loss_history)
    plt.plot(total_val_loss_history)
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()