In [11]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.nn.utils.rnn import *
from torch.utils.data import TensorDataset, Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

import os
import pickle
import time

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import csr_matrix

import sklearn.metrics

tokenizer = BertTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [12]:
cuda = torch.cuda.is_available()
print("cuda: ", cuda)
# num_workers = 8 if cuda else 0 

cuda:  True


In [27]:
print(len(train_texts))
print(len(val_texts))

1304607
111426


In [35]:
counterasdf = 0
max_len = len(train_texts[0].split())
over_200 = 0
over_500 = 0
over_1000 = 0
for s in train_texts:
    if len(s.split()) > 32:
        over_200 += 1
    if len(s.split()) > 64:
        over_500 += 1
    if len(s.split()) > 128:
        over_1000 += 1
    if len(s.split()) > max_len:
        max_len = len(s.split())
        # print("=" * 40)
        # print("new max len: " + str(max_len))
        # print("=" * 40)
        # print(s)
print("total length: " + str(len(train_texts)))
print("num over 32: " + str(over_200))
print("num over 64: " + str(over_500))
print("num over 128: " + str(over_1000))
print("max len: " + str(max_len))

total length: 1304607
num over 32: 17607
num over 64: 878
num over 128: 37
max len: 648


<h1>Dataset and DataLoader</h1>

In [4]:
an_dataset=pickle.load(open("../nutella/datasets/predict-importance-an.pkl", "rb"))

train_texts=[]
train_labels=[]
for x in an_dataset["train_dataset"]:
    train_texts.extend([q["txt"] for q in x["data"]])
    train_labels.extend([q["important"] for q in x["data"]])

val_texts=[]
val_labels=[]    
for x in an_dataset["val_dataset"]:
    val_texts.extend([q["txt"] for q in x["data"]])
    val_labels.extend([q["important"] for q in x["data"]])    
    
test_texts=[]
test_labels=[]        
for x in an_dataset["test_dataset"]:
    test_texts.extend([q["txt"] for q in x["data"]])
    test_labels.extend([q["important"] for q in x["data"]])

In [7]:
input_ids = []
attention_masks = []
for s in train_texts:
    encoded_dict = tokenizer.encode_plus(
                    s,
                    add_special_tokens=True,
                    max_length=128,
                    pad_to_max_length=True,
                    return_attention_mask=True,
                    return_tensors='pt'
                )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

train_input_ids = torch.cat(input_ids, dim=0)
train_attention_masks = torch.cat(attention_masks, dim=0)
train_labels = torch.tensor(train_labels)

input_ids = []
attention_masks = []
for s in val_texts:
    encoded_dict = tokenizer.encode_plus(
                    s,
                    add_special_tokens=True,
                    max_length=128,
                    pad_to_max_length=True,
                    return_attention_mask=True,
                    return_tensors='pt'
                )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    
val_input_ids = torch.cat(input_ids, dim=0)
val_attention_masks = torch.cat(attention_masks, dim=0)
val_labels = torch.tensor(val_labels)

print('done loading, this takes forever')

done loading, this takes forever


In [14]:
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)

batch_size = 32
train_loader = DataLoader(
                train_dataset,
                shuffle=True,
                batch_size = batch_size)

val_loader = DataLoader(
                val_dataset,
                shuffle=True,
                batch_size = batch_size)

In [10]:
print(train_input_ids.shape)

torch.Size([1304607, 128])


In [28]:
print(len(train_loader))
print(len(val_loader))

40769
3483


<h1>Train, Test, Metrics Utility Functions</h1>

In [17]:
def calc_metrics(y_true, predicted):
    this_results={}
    
    this_results["precision"]=sklearn.metrics.precision_score(y_true, predicted)
    this_results["recall"]=sklearn.metrics.recall_score(y_true, predicted)    
    this_results["f1"]=sklearn.metrics.f1_score(y_true, predicted)
    this_results["accuracy"]=sklearn.metrics.accuracy_score(y_true, predicted)    
    this_results["auc"]=sklearn.metrics.roc_auc_score(y_true, predicted)    

    return this_results

In [23]:
def train_epoch(model, train_loader, optimizer):
    model.train() 
    cumulative_loss = 0.0
    cumulative_lens = 0
    running_loss = 0.0
    
    print('='*40)
    print("Training", len(train_loader), "batches")
    print('='*40)
    
    # start timer and start iterating
    start_train = time.time()
    for batch_idx, (data, mask, target) in enumerate(train_loader):       
        optimizer.zero_grad()  
        data, mask, target = data.to(device), mask.to(device), target.to(device)
        
        loss, logits = model(data, token_type_ids=None, attention_mask=mask, labels=target)
        
        # accumulate loss
        cumulative_loss += loss.item()
        cumulative_lens += 1   # len(target)
        running_loss = cumulative_loss / cumulative_lens
        
        mid_train = time.time()
        if batch_idx % 40 == 39:
            print("Batch: ", batch_idx + 1)
            print('Cumulative Time: {:.4f}s\nLoss: {:.4f}'.format(mid_train - start_train, running_loss))
            print('='*40)
            
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step() 
        
        # delete variables and empty cache
        torch.cuda.empty_cache()
        del data
        del target
        del mask
    
    # end timer and take average loss
    end_train = time.time()
    time_train = end_train - start_train
    return time_train, running_loss


In [33]:
def test_model(model, test_loader):
    with torch.no_grad():
        model.eval()

        running_loss = 0.0
        target_total = []
        predicted_total = []

        start_test = time.time()
        for batch_idx, (data, mask, target) in enumerate(test_loader): 
                
            data, mask, target = data.to(device), mask.to(device), target.to(device)
        
            loss, logits = model(data, token_type_ids=None, attention_mask=mask, labels=target)

            _, predicted = torch.max(logits, 1)
            # print('predicted: ', predicted)
            
            # loss = criterion(logits, target).detach()
            running_loss += loss.item()
            
            mid_test = time.time()
            if batch_idx % 40 == 39:
                print("Batch: ", batch_idx + 1)
                print('Cumulative Time: {:.4f}s\nLoss: {:.4f}'.format(mid_test - start_test, running_loss/batch_idx))
                print('='*40)
            
            target = target.data.cpu().numpy()
            predicted = predicted.data.cpu().numpy()
            
            target_total.extend(target)
            predicted_total.extend(predicted)
            
            # delete variables and empty cache
            torch.cuda.empty_cache()
            del data
            del mask
            del target
        
        results = calc_metrics(np.array(target_total), np.array(predicted_total))
        running_loss /= len(test_loader)
        print('Dev Loss: ', running_loss)
        print('Results', results)
        return running_loss, results

<h1>Hyperparameters and Runtime</h1>

In [22]:
# PARAMETERS 

epochs = 5
learningRate = 2e-5
weightDecay = 0.00004
momentum = 0.9

batch_size = 32
num_layers = 5

# train_data = MyDataset(encoded_train, y_train, train_lens)
# val_data = MyDataset(encoded_val, y_val, val_lens)
# train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, collate_fn = collate)
# val_loader = DataLoader(val_data, shuffle=False, batch_size=16, collate_fn = collate)

model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=2, output_attentions=False, output_hidden_states=False) 
device = torch.device("cuda" if cuda else "cpu")
model.to(device)

total_steps = len(train_loader) * epochs
optimizer = AdamW(model.parameters(), lr=learningRate, eps=1e-8) # weightDecay?
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=385.0, style=ProgressStyle(description_…




In [34]:
val_loss, val_results = test_model(model, val_loader)

Batch:  40
Cumulative Time: 5.1672s
Loss: 0.6076
Batch:  80
Cumulative Time: 10.3889s
Loss: 0.6011
Batch:  120
Cumulative Time: 15.6429s
Loss: 0.5956
Batch:  160
Cumulative Time: 20.9123s
Loss: 0.5909
Batch:  200
Cumulative Time: 26.1127s
Loss: 0.5915
Batch:  240
Cumulative Time: 31.3923s
Loss: 0.5929
Batch:  280
Cumulative Time: 36.6947s
Loss: 0.5899
Batch:  320
Cumulative Time: 41.9859s
Loss: 0.5887
Batch:  360
Cumulative Time: 47.2876s
Loss: 0.5866
Batch:  400
Cumulative Time: 52.5584s
Loss: 0.5864
Batch:  440
Cumulative Time: 57.8780s
Loss: 0.5869
Batch:  480
Cumulative Time: 63.1883s
Loss: 0.5855
Batch:  520
Cumulative Time: 68.4996s
Loss: 0.5863
Batch:  560
Cumulative Time: 73.7668s
Loss: 0.5868
Batch:  600
Cumulative Time: 79.1046s
Loss: 0.5865
Batch:  640
Cumulative Time: 84.4212s
Loss: 0.5869
Batch:  680
Cumulative Time: 89.7871s
Loss: 0.5877
Batch:  720
Cumulative Time: 95.1089s
Loss: 0.5884
Batch:  760
Cumulative Time: 100.4717s
Loss: 0.5896
Batch:  800
Cumulative Time: 105.

In [24]:
for i in range(epochs):
    time_train, train_loss = train_epoch(model, train_loader, optimizer)
    val_loss, val_results = test_model(model, val_loader,)
    train_loss, train_results = test_model(model, train_loader)
    
    print('='*60)
    print('Epoch: {:.0f}\nTrain Time: {:.4f}s\nTrain Loss: {:.4f}\nVal Loss: {:.4f}'.format(i+1, time_train, train_loss, val_loss))
    print('='*60)

Training 40769 batches
Batch:  40
Cumulative Time: 13.7506s
Loss: 0.0206
Batch:  80
Cumulative Time: 27.6174s
Loss: 0.0204
Batch:  120
Cumulative Time: 41.6213s
Loss: 0.0203
Batch:  160
Cumulative Time: 55.8562s
Loss: 0.0201
Batch:  200
Cumulative Time: 69.9744s
Loss: 0.0200
Batch:  240
Cumulative Time: 84.1135s
Loss: 0.0198
Batch:  280
Cumulative Time: 98.0268s
Loss: 0.0197
Batch:  320
Cumulative Time: 112.3212s
Loss: 0.0197
Batch:  360
Cumulative Time: 126.3675s
Loss: 0.0196
Batch:  400
Cumulative Time: 140.3790s
Loss: 0.0196
Batch:  440
Cumulative Time: 154.3063s
Loss: 0.0196
Batch:  480
Cumulative Time: 168.1930s
Loss: 0.0195
Batch:  520
Cumulative Time: 182.0903s
Loss: 0.0195
Batch:  560
Cumulative Time: 196.0516s
Loss: 0.0195
Batch:  600
Cumulative Time: 210.1490s
Loss: 0.0194
Batch:  640
Cumulative Time: 224.3054s
Loss: 0.0194
Batch:  680
Cumulative Time: 238.4594s
Loss: 0.0194
Batch:  720
Cumulative Time: 252.5236s
Loss: 0.0194
Batch:  760
Cumulative Time: 266.6374s
Loss: 0.019

KeyboardInterrupt: 

In [None]:
# SAVE/LOAD MODEL
# path = "./"
# torch.save(network.state_dict(), path)
# network.load_state_dict(torch.load(path))