In [1]:
from transformers import BertModel, BertTokenizer, get_linear_schedule_with_warmup
import pandas as pd
import numpy as np

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch import optim

from sklearn.model_selection import train_test_split

RANDOM_SEED = 10
MODEL = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(MODEL)
NUMBER_LABELS = 5 
BATCH_SIZE = 8

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.empty_cache()

# Load data

In [3]:
df =  pd.read_csv("../data/processed_data_v3.csv")[["Q2_Q","Q2_A"]].dropna()

In [4]:
df

Unnamed: 0,Q2_Q,Q2_A
0,Excellent education to me is the type of educa...,4.0
1,An Excellent Education is the process of knowl...,3.0
2,An excellent education is an education given t...,4.0
3,"Patty Murray once said, ""Good education learni...",4.0
4,An excellent education is one that exposes a s...,2.0
...,...,...
3112,An excellent education is something that bring...,3.0
3114,Excellent education in my opinion prepares the...,3.0
3119,An excellent education is an education given t...,2.0
3140,Education according to the Oxford dictionary i...,3.0


Transform labels for ordinal regression

In [5]:
transformed_Q2_A = []
for score in df.Q2_A:
    transformed = np.zeros(NUMBER_LABELS)
    idx = int(score)
    transformed[:idx] = np.ones(idx)
    transformed_Q2_A.append(transformed)

df.Q2_A = transformed_Q2_A

# Create a pytorch dataset class and dataloader

In [6]:
class LFG_dataset(Dataset):

  def __init__(self, answers, scores, tokenizer, max_len):
    self.answers = answers
    self.scores = scores
    self.tokenizer = tokenizer
    self.max_len = max_len
  
  def __len__(self):
    return len(self.answers)
  
  def __getitem__(self, idx):
    answer = self.answers[idx]
    score = self.scores[idx]

    encoding = self.tokenizer.encode_plus(
      answer,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'answer_text': answer,
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'score': torch.tensor(score, dtype=torch.float32)
    }

In [7]:
def create_data_loader(df, tokenizer, max_len, batch_size):
  ds = LFG_dataset(
    answers=df.Q2_Q.to_numpy(),
    scores=df.Q2_A.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size
  )

In [8]:
df_train, df_test = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)

train_data_loader = create_data_loader(df_train, tokenizer, 512, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, 512, BATCH_SIZE)

In [9]:
data = next(iter(train_data_loader))

In [10]:
data.keys()

dict_keys(['answer_text', 'input_ids', 'attention_mask', 'score'])

In [11]:
data['score']

tensor([[1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0.],
        [1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0.]])

# Model and loss function

In [12]:
class LFG_grading(nn.Module):

  def __init__(self, n_classes):
    super(LFG_grading, self).__init__()
    self.bert = BertModel.from_pretrained(MODEL)
    # Freeze embeddings of BERT
    for param in self.bert.parameters():
      param.requires_grad = False
    self.drop = nn.Dropout(p=0.5)
    self.fc1 = nn.Linear(self.bert.config.hidden_size, 200)
    self.fc2 = nn.Linear(200,200)
    self.fc3 = nn.Linear(200,n_classes)
    self.sigmoid = nn.Softmax()
    self.relu = nn.ReLU()
  
  def forward(self, input_ids, attention_mask):
    output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    # Get the first element of output which is the hidden state
    # Get the embeddings of first token (CLS token)
    cls_embeddings = output[0][:,0,:]
    output = self.drop(cls_embeddings)
    output = self.fc1(output)
    output = self.relu(output)
    output = self.drop(output)
    output = self.fc2(output)
    output = self.relu(output)
    output = self.drop(output)
    output = self.fc3(output)
    return self.sigmoid(output)

In [13]:
def prediction2label(pred: np.ndarray):
    """Convert ordinal predictions to class labels, e.g.
    
    [0.9, 0.1, 0.1, 0.1] -> 1
    [0.9, 0.9, 0.1, 0.1] -> 2
    [0.9, 0.9, 0.9, 0.1] -> 3
    etc.
    """
    return (pred > 0.5).cumprod(axis=1).sum(axis=1)


class OrdinalLoss(nn.Module):
    def __init__(self):
        super(OrdinalLoss, self).__init__()

    def forward(self, output, target):
        """Ordinal regression with encoding as in https://arxiv.org/pdf/0704.1028.pdf"""

        # Take MSE loss of one hot encodings vector
        criterion = nn.MSELoss(reduction='none')
        loss = criterion(output, target).sum(axis=1)
        return loss.sum()

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3050 Laptop GPU'

### Test run of model and loss function

In [None]:
model = LFG_grading(5)
model = model.to(device)
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)
predictions = model(input_ids, attention_mask)
targets = data["score"].to(device)

In [None]:
criterion = OrdinalLoss()
loss = criterion(predictions,targets)

In [None]:
targets

In [None]:
loss.item()

In [None]:
loss.backward()

In [None]:
predictions

In [None]:
torch.sum(prediction2label(predictions) == prediction2label(targets))

In [None]:
prediction2label(predictions)
prediction2label(targets)

# Training loop 

In [15]:
EPOCHS = 10

model = LFG_grading(5).to(device)

optimizer = optim.AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = OrdinalLoss().to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  model = model.train()

  losses = []
  correct_predictions = 0
  
  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["score"].to(device)

    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    # loss function
    loss = loss_fn(outputs, targets)
    losses.append(loss.item())
    loss.backward()

    predictions = prediction2label(outputs)
    targets = prediction2label(targets)
    correct_predictions += torch.sum(predictions == targets)
  
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return correct_predictions.double() / n_examples, np.mean(losses)

In [18]:
best_accuracy = 0
train_acc= []
train_loss = []
for epoch in range(EPOCHS):

  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)

  acc, loss = train_epoch(
    model,
    train_data_loader,    
    loss_fn, 
    optimizer, 
    device, 
    scheduler, 
    len(df_train)
  )
  train_acc.append(acc)
  train_loss.append(loss)
  print(f'Train loss {acc} accuracy {loss}')
  print()



Epoch 1/10
----------
Train loss 0.3447899618112384 accuracy 4.3339374357738025

Epoch 2/10
----------
Train loss 0.36224768139661756 accuracy 2.81546472885953

Epoch 3/10
----------
Train loss 0.3720676486633933 accuracy 2.6418205410323288

Epoch 4/10
----------
Train loss 0.36879432624113473 accuracy 2.627136587122156

Epoch 5/10
----------
Train loss 0.3753409710856519 accuracy 2.5851127990909752

Epoch 6/10
----------
Train loss 0.381342062193126 accuracy 2.5580704986886693

Epoch 7/10
----------
Train loss 0.3720676486633933 accuracy 2.560966005767074

Epoch 8/10
----------
Train loss 0.39498090561920346 accuracy 2.5461770117445277

Epoch 9/10
----------
Train loss 0.4069830878341516 accuracy 2.52199354808402

Epoch 10/10
----------
Train loss 0.38079650845608287 accuracy 2.540116934912731

