In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils.rnn import *
from torch.utils.data import TensorDataset, Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

import os
import pickle
import time

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import csr_matrix

import sklearn.metrics

# Tokenizer loaded from the same "emilyalsentzer/Bio_ClinicalBERT" checkpoint
# name that the classification model is later loaded from, so token ids line
# up with the model's vocabulary.
tokenizer = BertTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [None]:
# Record whether a CUDA device is available; the hyperparameter cell uses this
# flag to choose between the "cuda" and "cpu" torch devices.
cuda = torch.cuda.is_available()
print("cuda: ", cuda)
# Disabled alternative: num_workers = 8 if cuda else 0
# (num_workers is set explicitly in the hyperparameter cell instead).

<h1>Dataset and DataLoader</h1>

In [None]:
def _flatten_split(split):
    """Flatten one dataset split into parallel lists of texts and labels.

    Args:
        split: iterable of records; each record has a "data" list whose
            items provide a "txt" text and an "important" label.

    Returns:
        (texts, labels): two lists of equal length, in the order the items
        appear in `split`.
    """
    texts, labels = [], []
    for record in split:
        for item in record["data"]:
            texts.append(item["txt"])
            labels.append(item["important"])
    return texts, labels


# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load dataset files that come from a trusted source.
# The context manager closes the file handle even if unpickling fails.
with open("../nutella/datasets/predict-importance-an-sorted.pkl", "rb") as dataset_file:
    an_dataset = pickle.load(dataset_file)

train_texts, train_labels = _flatten_split(an_dataset["train_dataset"])
val_texts, val_labels = _flatten_split(an_dataset["val_dataset"])
test_texts, test_labels = _flatten_split(an_dataset["test_dataset"])

In [None]:
# Length statistics of the training texts, measured in whitespace-separated
# words (not tokenizer word pieces). Presumably used to choose max_seq_len
# for the tokenization cell -- confirm against how the value was picked.
word_counts = [len(s.split()) for s in train_texts]  # split each text once

# Counters are named after the thresholds they actually test.
num_over_32 = sum(count > 32 for count in word_counts)
num_over_64 = sum(count > 64 for count in word_counts)
num_over_128 = sum(count > 128 for count in word_counts)

# default=0 keeps the cell runnable when train_texts is empty.
max_len = max(word_counts, default=0)

print("total length: " + str(len(train_texts)))
print("num over 32: " + str(num_over_32))
print("num over 64: " + str(num_over_64))
print("num over 128: " + str(num_over_128))
print("max len: " + str(max_len))

In [None]:
def _encode_with_context(texts, max_seq_len):
    """Tokenize every text together with its immediate neighbours.

    Each text is encoded as "previous current next", using whichever
    neighbours exist at the ends of the list. The pieces are joined with a
    single space so the last word of one text cannot fuse with the first word
    of the next.

    NOTE(review): `texts` is a flat list built from several dataset records,
    so a neighbour at a record boundary belongs to a different record.
    NOTE(review): assumes the tokenizer pads/truncates every encoding to
    exactly `max_seq_len` so the rows can be concatenated -- confirm for the
    installed transformers version. If truncation drops tokens from the end,
    the current/next text may be cut off when the previous text is long.

    Args:
        texts: sequence of strings to encode.
        max_seq_len: value passed to the tokenizer as `max_length`.

    Returns:
        (input_ids, attention_masks): tensors with one row per text.

    Raises:
        ValueError: if `texts` is empty.
    """
    if len(texts) == 0:
        raise ValueError("cannot encode an empty list of texts")

    input_ids = []
    attention_masks = []
    for i in range(len(texts)):
        # Slicing clips at the list end, so first/last/single items need no
        # special-casing.
        s = " ".join(texts[max(i - 1, 0):i + 2])

        encoded_dict = tokenizer.encode_plus(
                        s,
                        add_special_tokens=True,
                        max_length=max_seq_len,
                        pad_to_max_length=True,
                        return_attention_mask=True,
                        return_tensors='pt'
                    )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    return torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0)


max_seq_len = 64
print("maximum sequence length for this round: " + str(max_seq_len))

train_input_ids, train_attention_masks = _encode_with_context(train_texts, max_seq_len)
# as_tensor leaves an existing tensor untouched, so re-running this cell does
# not trigger the "copy construct from a tensor" warning.
train_labels = torch.as_tensor(train_labels)

val_input_ids, val_attention_masks = _encode_with_context(val_texts, max_seq_len)
val_labels = torch.as_tensor(val_labels)

print('done loading, this takes forever')

<h1>Train, Test, Metrics Utility Functions</h1>

In [None]:
def calc_metrics(y_true, predicted):
    """Score predictions against ground truth with several sklearn metrics.

    Args:
        y_true: ground-truth labels.
        predicted: predicted values; passed unchanged to every scorer,
            including roc_auc_score.

    Returns:
        dict with the keys "precision", "recall", "f1", "accuracy" and "auc",
        each holding the value returned by the matching sklearn.metrics
        function.
    """
    # (result key, scorer) pairs, evaluated in this order.
    scorers = (
        ("precision", sklearn.metrics.precision_score),
        ("recall", sklearn.metrics.recall_score),
        ("f1", sklearn.metrics.f1_score),
        ("accuracy", sklearn.metrics.accuracy_score),
        ("auc", sklearn.metrics.roc_auc_score),
    )
    return {key: scorer(y_true, predicted) for key, scorer in scorers}

In [None]:
def train_epoch(model, train_loader, optimizer, scheduler=None):
    """Train `model` for one pass over `train_loader`.

    Args:
        model: module called as
            `model(input_ids, token_type_ids=None, attention_mask=..., labels=...)`
            whose output holds the loss at index 0.
        train_loader: iterable of `(input_ids, attention_mask, labels)`
            batches that also supports `len()`.
        optimizer: optimizer over `model.parameters()`.
        scheduler: optional learning-rate scheduler, stepped once per batch
            right after the optimizer. With the default `None` no scheduler
            is stepped.

    Returns:
        (time_train, running_loss): elapsed wall-clock seconds and the mean
        of the per-batch losses (0.0 if the loader yields no batches).
    """
    model.train()
    # Follow the model's own device rather than a notebook-level `device`
    # variable that may not exist yet when this function is called.
    device = next(model.parameters()).device

    cumulative_loss = 0.0
    num_batches = 0
    running_loss = 0.0

    print('='*40)
    print("Training", len(train_loader), "batches")
    print('='*40)

    start_train = time.time()
    for batch_idx, (data, mask, target) in enumerate(train_loader):
        optimizer.zero_grad()
        data, mask, target = data.to(device), mask.to(device), target.to(device)

        # Index instead of tuple-unpacking: this works both for plain tuples
        # and for mapping-style outputs, where unpacking would yield the keys.
        outputs = model(data, token_type_ids=None, attention_mask=mask, labels=target)
        loss = outputs[0]

        # Running mean of the per-batch losses seen so far.
        cumulative_loss += loss.item()
        num_batches += 1
        running_loss = cumulative_loss / num_batches

        if batch_idx % 40 == 39:
            print("Batch: ", batch_idx + 1)
            print('Cumulative Time: {:.4f}s\nLoss: {:.4f}'.format(time.time() - start_train, running_loss))
            print('='*40)

        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        if scheduler is not None:
            scheduler.step()

    time_train = time.time() - start_train
    return time_train, running_loss


In [None]:
def test_model(model, test_loader):
    with torch.no_grad():
        model.eval()

        running_loss = 0.0
        target_total = []
        predicted_total = []

        start_test = time.time()
        for batch_idx, (data, mask, target) in enumerate(test_loader): 
                
            data, mask, target = data.to(device), mask.to(device), target.to(device)
        
            loss, logits = model(data, token_type_ids=None, attention_mask=mask, labels=target)

            _, predicted = torch.max(logits, 1)
            # print('predicted: ', predicted)
            
            # loss = criterion(logits, target).detach()
            running_loss += loss.item()
            
            mid_test = time.time()
            if batch_idx % 40 == 39:
                print("Batch: ", batch_idx + 1)
                print('Cumulative Time: {:.4f}s\nLoss: {:.4f}'.format(mid_test - start_test, running_loss/batch_idx))
                print('='*40)
            
            target = target.data.cpu().numpy()
            predicted = predicted.data.cpu().numpy()
            
            target_total.extend(target)
            predicted_total.extend(predicted)
            
            # delete variables and empty cache
            torch.cuda.empty_cache()
            del data
            del mask
            del target
        
        results = calc_metrics(np.array(target_total), np.array(predicted_total))
        running_loss /= len(test_loader)
        print('Dev Loss: ', running_loss)
        print('Results', results)
        return running_loss, results

<h1>Hyperparameters and Runtime</h1>

In [None]:
# PARAMETERS

epochs = 1
learningRate = 2e-5
# NOTE(review): weightDecay and momentum are defined but never passed to the
# optimizer created below -- confirm whether they were meant to be used.
weightDecay = 0.00004
momentum = 0.9
num_workers = 4
batch_size = 32
seed = 42

# Seed before building loaders and model so shuffling order and any freshly
# initialised weights are repeatable across Restart & Run All.
np.random.seed(seed)
torch.manual_seed(seed)

print("train_input_ids size: " + str(train_input_ids.size()))
print("train_attention_masks size: " + str(train_attention_masks.size()))
print("train_labels size: " + str(train_labels.size()))

train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)

# pin_memory only pays off when batches are copied to a CUDA device.
train_loader = DataLoader(
                train_dataset,
                shuffle=True,
                batch_size = batch_size,
                num_workers = num_workers,
                pin_memory = cuda)

# Evaluation does not need shuffling; a fixed order keeps the per-batch loss
# average identical from run to run.
val_loader = DataLoader(
                val_dataset,
                shuffle=False,
                batch_size = batch_size,
                num_workers = num_workers,
                pin_memory = cuda)

print("length of train loader: " + str(len(train_loader)))
print("length of val loader: " + str(len(val_loader)))

model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=2, output_attentions=False, output_hidden_states=False)
device = torch.device("cuda" if cuda else "cpu")
model.to(device)

# One scheduler step is budgeted per training batch across all epochs.
total_steps = len(train_loader) * epochs
optimizer = AdamW(model.parameters(), lr=learningRate, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)


In [None]:
# Train for `epochs` epochs, evaluating on the validation set after each one
# and appending the statistics to a results file.
# NOTE(review): `scheduler` from the hyperparameter cell is not passed to
# train_epoch here, so it is never stepped and the learning rate stays at
# `learningRate` for the whole run.
# The context manager closes the results file even if training raises.
with open("bert_context_results_sorted_an_64.txt", "w") as f:
    for i in range(epochs):
        time_train, train_loss = train_epoch(model, train_loader, optimizer)
        print("done training for epoch: " + str(i))
        val_loss, val_results = test_model(model, val_loader)
        print("done calculating val loss for epoch: " + str(i))

        print('='*60)
        print('Epoch: {:.0f}\nTrain Time: {:.4f}s\nTrain Loss: {:.4f}\nVal Loss: {:.4f}'.format(i, time_train, train_loss, val_loss))
        print('='*60)
        f.write('\n')
        # Newline added so the header no longer runs into the "Epoch:" line.
        f.write("Stats for epoch " + str(i) + '\n')
        f.write('Epoch: {:.0f}\nTrain Time: {:.4f}s\nTrain Loss: {:.4f}\nVal Loss: {:.4f}'.format(i, time_train, train_loss, val_loss))
        f.write('\n')
        f.write('Val Results' + str(val_results))
        f.write('\n')
        # Persist each epoch's stats immediately in case a later epoch fails.
        f.flush()

In [None]:
# SAVE/LOAD MODEL (template; uncomment to use)
# `path` must name a file, not a directory.
# path = "./bert_context_sorted_an_64.pt"
# torch.save(model.state_dict(), path)
# model.load_state_dict(torch.load(path))