## Loading & tokenizing datasets from raw JSONL for fine-tuning pre-trained RoBERTa

In [None]:
import json

# Locations of the ACL-ARC train and test splits; both are read with load_json below.
# NOTE(review): absolute Google Drive paths -- presumably requires Drive to be mounted
# at /content/drive (Colab); confirm before running elsewhere.
TRAIN_PATH = "/content/drive/MyDrive/CS699/data/ACL-ARC_train.jsonl"
TEST_PATH = "/content/drive/MyDrive/CS699/data/ACL-ARC_test.jsonl"

def load_json(json_path, text_key='text', label_key='label'):
  """Read a JSON-Lines file and split every record into a text and a label.

  Args:
    json_path: path to a file holding one JSON object per line.
    text_key: key whose value is collected into the first returned list.
    label_key: key whose value is collected into the second returned list.

  Returns:
    A tuple ``(X, Y)`` of two equally long lists, in file order: the
    ``text_key`` values and the ``label_key`` values.

  Raises:
    json.JSONDecodeError: if a non-blank line is not valid JSON.
    KeyError: if a record lacks ``text_key`` or ``label_key``.
  """
  X = []
  Y = []
  # Decode explicitly as UTF-8 instead of relying on the platform default.
  with open(json_path, 'r', encoding='utf-8') as f:
    for line in f:
      # Blank lines (e.g. a trailing newline at end of file) carry no record.
      if not line.strip():
        continue
      raw_json = json.loads(line)
      X.append(raw_json[text_key])
      Y.append(raw_json[label_key])
  return X, Y

# Read both splits; each load_json call yields a (texts, labels) pair.
(trainX, trainY), (testX, testY) = (
    load_json(split_path) for split_path in (TRAIN_PATH, TEST_PATH)
)
# The number of classes is the count of distinct labels in the training split.
num_classes = len({*trainY})

# Sanity report: split sizes, one sample sentence, and the label count.
print("# of total sentences:", len(trainX), len(testX))
print("An example sentence:", trainX[2])
print('# of labels:', num_classes)

# of total sentences: 1688 139
An example sentence: She evaluates 3,000 German verbs with a token frequency between 10 and 2,000 against the Duden ( Dudenredaktion 2001 ) .
# of labels: 6


- As a base case, pre-trained RoBERTa and its base tokenizer are used to tokenize our training dataset and assign an integer index to each token.
- The pre-trained RoBERTa weights and base tokenizer are the ones provided by Hugging Face.

In [None]:
# Shell call that installs the `transformers` package (no version is pinned here).
!pip install transformers



In [None]:
from sklearn import preprocessing
import torch
from transformers import RobertaTokenizer

def convert_txt2tokenid(tokenizer, text, max_length=None):
  """Encode sentences into one padded matrix of token ids.

  Args:
    tokenizer: object exposing a Hugging Face style ``encode`` method that
      accepts ``padding``, ``truncation``, ``max_length`` and
      ``return_tensors``.
    text: iterable of sentences; a single ``str`` is treated as one sentence
      rather than being iterated character by character.
    max_length: optional length every row is padded/truncated to. ``None``
      leaves the choice to the tokenizer.

  Returns:
    A tensor with one row per sentence, built by concatenating the
    per-sentence ``encode`` results along dimension 0.

  Raises:
    RuntimeError: from ``torch.cat`` when ``text`` is empty.
  """
  if isinstance(text, str):
    text = [text]
  # Truncation is required alongside padding: without it a sentence longer
  # than the padding length produces a longer row, and torch.cat below fails
  # on mismatched row sizes.
  token_ids = [
      tokenizer.encode(
          sent,
          padding='max_length',
          truncation=True,
          max_length=max_length,
          return_tensors='pt',
      )
      for sent in text
  ]
  return torch.cat(token_ids, dim=0)

# Tokenizer that belongs to the pre-trained "roberta-base" checkpoint.
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Sentences -> padded token-id matrices, one per split.
encoded_trainX, encoded_testX = (
    convert_txt2tokenid(roberta_tokenizer, sentences)
    for sentences in (trainX, testX)
)

# The label mapping is fitted on the training labels only and reused for test.
label_encoder = preprocessing.LabelEncoder()
encoded_trainY = torch.tensor(label_encoder.fit_transform(trainY))
encoded_testY = torch.tensor(label_encoder.transform(testY))

# Show the first encoded example and its integer label.
print(encoded_trainX[0])
print(encoded_trainY[0])

tensor([    0, 42702,  2156,    81,     5,   375,   367,   107,  2156,   552,
           19,  9766,    11,     5,   304,     9,  2239,     8, 17325,  6448,
           13,  3857,     9,   455, 28564,   268,    36,  5415,  2156,  7528,
        25606,   732,  4422, 20082,  2156,  7528,   102, 25606,   732,  4422,
        20082,  2156,  7528,   428, 25606, 12041,   282,  1115,  3994,  3592,
         2156,  7528,  4839,  2156,  1233,  2017,    34,    57,   156,    15,
            5,   304,     9, 17325,  2239,  6448,     7,  5281, 16762, 46563,
         8117, 45774, 28201, 22810,    50,  1617,    14,  4064,    11,    10,
        45774, 28201,  1291,    36,  2197,  2156, 11151, 25606,  3513, 18086,
            8,  7380,  2156,  7969, 25606, 19021, 22704,  4400,  1076,     4,
         2156,  6708, 25606,  5866,   324,     8, 13891,  2156,  6708, 25606,
         6760,  3979,  4400,  1076,     4,  2156,  6193, 25606, 14687,   219,
          677,   260,  1638,     8, 13880,  2156,  5155, 25606, 

## Creating a dataloader for both training and test datasets
- Batch size set as indicated in the paper

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

batch_size = 16

# Pair every token-id row with its encoded label.
train_dataset = TensorDataset(encoded_trainX, encoded_trainY)
test_dataset = TensorDataset(encoded_testX, encoded_testY)

# Both loaders draw their batches in random order through a RandomSampler.
train_dataloader, test_dataloader = (
    DataLoader(split, sampler=RandomSampler(split), batch_size=batch_size)
    for split in (train_dataset, test_dataset)
)

In [None]:
from transformers import RobertaModel, RobertaForSequenceClassification
from torch import nn

# RobertaForSequenceClassification could also be used.
# Dropout rate as used in the paper
class CustomRoberta(nn.Module):
    """Pre-trained RoBERTa encoder with a single linear classification layer.

    The pooled encoder output goes through dropout and then a linear layer
    that produces one raw logit per class. The logits are returned without
    any activation because ``cross_entropy`` applies log-softmax itself;
    applying dropout or tanh to the logits would randomly zero class scores
    and confine them to [-1, 1].

    Attribute names ``robert`` and ``linear`` are kept so state dicts saved
    from this module keep the same parameter keys.
    """

    def __init__(self, num_classes=num_classes):
        """Load "roberta-base" and build the classification layer.

        :param num_classes: number of output logits.
        """
        super(CustomRoberta, self).__init__()
        self.robert = RobertaModel.from_pretrained("roberta-base", output_attentions = True, output_hidden_states = True)
        # Take the input width from the encoder config instead of hard-coding 768.
        self.linear = nn.Linear(self.robert.config.hidden_size, num_classes)
        self.dropout = nn.Dropout(0.1)

    def forward(self, ids, attention_mask=None):
        """Compute class logits for a batch of token ids.

        :param ids: integer tensor of token ids, one row per example.
        :param attention_mask: optional mask with the same shape as ``ids``
            (1 = real token, 0 = padding). When omitted it is derived from
            the encoder's pad token id, so padded positions are not attended
            to.
        :return: tensor of raw logits with ``num_classes`` columns.
        """
        pad_token_id = self.robert.config.pad_token_id
        if attention_mask is None and pad_token_id is not None:
            # Inputs are padded to a fixed length upstream; without a mask the
            # encoder would treat every padding position as a real token.
            attention_mask = (ids != pad_token_id).long()

        # index 1 represents the pooled_output, the cls token.
        pooled_output = self.robert(ids, attention_mask=attention_mask)[1]

        # Regularise the features, not the logits.
        return self.linear(self.dropout(pooled_output))

# Build the classifier with one output per distinct training label (num_classes).
model = CustomRoberta(num_classes=num_classes)
# Move the model to the GPU; as the last expression of the cell this call also
# displays the module's repr.
model.cuda()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CustomRoberta(
  (robert): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768

In [None]:
import torch

# from Umang's code -- Note don't use Frequent Direction --> Buffer size is too large
class FrequentDirectionAccountant:
    """
    Frequent Directions algorithm (Alg 2 from the paper) for streaming SVD.

    Keeps an ``l x n`` sketch buffer. Each update writes the new vector into
    the last buffer row, takes an SVD of the buffer, and shrinks all squared
    singular values by the smallest one.
    """

    def __init__(self, k, l, n, device):
        """
        :param k: number of eigen vectors we want eventually (k should be less than l+1)
        :param l: buffer size
        :param n: number of parameters/dimension of vector
        :param device: torch device on which the buffer is allocated
        """
        self.K = k
        self.L = l
        self.N = n

        # Number of vectors absorbed so far.
        self.step = 0
        self.buffer = torch.zeros(self.L, self.N, device=device)

    def update(self, vector):
        """
        run one step of Freq Direction
        :param vector: tensor written into the last buffer row
            (assumes a 1-D tensor of length n -- TODO confirm against callers)
        :return: None; ``self.buffer`` and ``self.step`` are updated in place
        """

        # The last row is free to overwrite: the shrink step below subtracts
        # the smallest squared singular value, so the last singular value
        # (and hence the last row of the rebuilt buffer) is always zero.
        self.buffer[self.L - 1] = vector
        _, S, Vt = torch.linalg.svd(self.buffer, full_matrices=False)
        delta = S[-1] ** 2
        # Clip guards against tiny negative values from floating-point error.
        new_svd_vals = torch.sqrt(torch.clip(S ** 2 - delta, min=0, max=None))
        self.buffer = torch.diag(new_svd_vals) @ Vt
        self.step += 1

    def get_current_buffer(self):
        """return the current sketch buffer"""
        return self.buffer

    def get_current_directions(self):
        """return top k eigen vectors of A^TA"""
        _, _, Vt_B = torch.linalg.svd(self.buffer, full_matrices=False)
        return Vt_B[:self.K]

def count_params(model: torch.nn.Module, skip_bn_bias=False):
    """Return the total number of elements across the model's trainable parameters.

    :param model: module whose ``parameters()`` are inspected
    :param skip_bn_bias: when true, parameters with at most one dimension are
        left out of the count
    :return: summed ``numel()`` of every counted parameter
    """
    return sum(
        tensor.numel()
        for tensor in model.parameters()
        if tensor.requires_grad and not (skip_bn_bias and tensor.dim() <= 1)
    )

def flatten_grads(model, num_params, skip_bn_bias=False):
    """Copy the gradients of all trainable parameters into one flat CPU vector.

    Parameters are visited in ``model.parameters()`` order, with the same
    filter as ``count_params``: only ``requires_grad`` parameters, and with
    ``skip_bn_bias`` set, only those with more than one dimension.

    :param model: module whose parameter gradients are collected
    :param num_params: length of the returned vector; expected to equal the
        total element count of the selected parameters
    :param skip_bn_bias: when true, parameters with at most one dimension are skipped
    :return: 1-D tensor of length ``num_params`` that does not require grad
    """
    flat_grads = torch.zeros(num_params, requires_grad=False)
    idx = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue
        if skip_bn_bias and param.dim() <= 1:
            continue
        size = param.numel()
        # A trainable parameter that took no part in the last backward pass has
        # grad None; its slot keeps the zeros it was initialised with instead
        # of crashing, and the offset still advances so later slots line up.
        if param.grad is not None:
            flat_grads[idx:idx + size] = param.grad.detach().reshape(-1).cpu()
        idx += size
    return flat_grads

def get_loss_value(model, loader, device):
    """
    Evaluation loop for the multi-class classification problem.

    Puts the model in eval mode and, with gradients disabled, averages the
    cross-entropy loss and the top-1 accuracy over every sample the loader
    yields. Both are per-sample means, so a smaller final batch is not
    over-weighted.

    :param model: callable module mapping a batch of inputs to class logits
    :param loader: iterable of ``(inputs, labels)`` batches
    :param device: device the batches are moved to before the forward pass
    :return: (loss, accuracy) as 0-d numpy arrays
    """

    model.eval()
    losses = []
    accuracies = []
    with torch.no_grad():
        for inputs, labels in loader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(inputs)
            # reduction='none' keeps one loss per sample. The previous
            # `reduce=None` left the default mean reduction in place, which
            # made the final result a mean of batch means.
            loss = torch.nn.functional.cross_entropy(outputs, labels, reduction='none')
            losses.append(loss.reshape(-1))

            acc = (torch.argmax(outputs, dim=1) == labels).float()
            accuracies.append(acc.reshape(-1))

    losses = torch.cat(losses, dim=0).mean().cpu().numpy()
    accuracies = torch.cat(accuracies, dim=0).mean().cpu().numpy()
    return losses, accuracies

## Fine-tuning raw RoBERTa

In [None]:
import dill
import os
import time

import numpy as np
import torch
import torch.nn.functional as F
from torch.optim import AdamW

# hyperparameters as set by the paper
epochs = 10
optimizer = AdamW(model.parameters(), lr = 2e-5)

# Checkpoints are written to a "ckpt" sub-folder of RESULT_FOLDER, created if missing.
RESULT_FOLDER = "/content/drive/MyDrive/CS699/data/ROBERTA/"
os.makedirs(f"{RESULT_FOLDER}/ckpt", exist_ok=True)

# Batches are moved to this device inside the loop; the model was moved to the
# GPU when it was created.
device = torch.device("cuda")
# Frequent Directions gradient tracking is disabled (see the commented-out
# fd.* lines below).
#total_params = count_params(model, skip_bn_bias=True)
#fd = FrequentDirectionAccountant(k=2, l=10, n=total_params, device=device)
t0 = time.time()
for epoch in range(epochs):
  # get_loss_value switches the model to eval mode at the end of every epoch,
  # so training mode is restored here.
  model.train()

  for i, batch in enumerate(train_dataloader):

    # batch[0] holds the token ids, batch[1] the encoded labels (TensorDataset order).
    d_input_id = batch[0].to(device)
    d_labels = batch[1].to(device)
    outputs = model(d_input_id)
    loss = torch.nn.functional.cross_entropy(outputs, d_labels)

    # Clear gradients from the previous step, backpropagate, then update the weights.
    model.zero_grad()
    loss.backward()
    optimizer.step()

    #fd.update(flatten_grads(model, total_params, skip_bn_bias=True))

  # Evaluate on the test loader after every epoch.
  # NOTE: `loss` is rebound here to the evaluation loss returned by get_loss_value.
  loss, acc = get_loss_value(model, test_dataloader, device=device)
  print(loss, acc)
  # Save the weights after each epoch as "<epoch number>_model.pt" (1-based).
  torch.save(
      model.state_dict(), f'{RESULT_FOLDER}/ckpt/{epoch + 1}_model.pt',
      pickle_module=dill
  )

# Wall-clock seconds for all epochs, including the per-epoch evaluation and checkpointing.
training_time = time.time() - t0
#buffer = fd.get_current_buffer()
#directions = fd.get_current_directions()
#directions = directions.cpu().data.numpy()
# The triple-quoted string below is a bare string expression, so the np.savez
# call inside it is never executed.
'''
np.savez(
    f"{RESULT_FOLDER}/buffer.npy",
    buffer=buffer.cpu().data.numpy(), direction1=directions[0], direction2=directions[1]
)
'''

print(training_time)

1.2384826 0.6115108
1.2666544 0.5827338
1.3573583 0.5467626
1.3387839 0.55395687
1.2878712 0.5827338
1.3817556 0.51079136
1.3753235 0.51079136
1.3729696 0.51079136
1.3826815 0.51079136
1.3739272 0.51079136
970.9645256996155
