In [1]:
import sys

module_path = "../src"

if module_path not in sys.path:
    sys.path.append(module_path)

In [2]:
# Load dataset
from dataset import get_dataset
dataset = get_dataset()

In [3]:
# Load libraries
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch import nn

model_name = 'roberta-base'
batch_size = 32
epochs = 5
num_labels = 4

In [4]:
# Define tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(input):
  return tokenizer(input['text'], padding="max_length", truncation=True)

# Tokenize dataset
tokenized_dataset = dataset.map(tokenize, batched=True)

# Shuffle and pick subset from dataset
train_dataset = tokenized_dataset['train'].shuffle(seed=442333+424714).select(range(5000))
eval_dataset = tokenized_dataset['test'].shuffle(seed=442333+424714).select(range(1000))

In [5]:
import evaluate
import numpy as np
metric = evaluate.load("accuracy")

# Prepare evaluation callback, metric = accuracy
def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

In [6]:
# Configure LoRA
from peft import LoraConfig, TaskType

lora_config = LoraConfig(
    # r = 8
    task_type=TaskType.SEQ_CLS, r=8, lora_alpha=1, lora_dropout=0.1
)

In [7]:
# Define custom classification head with smaller layer and changed activation function to relu
class RobertaClassificationHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size//2) # smaller size
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.out_proj = nn.Linear(config.hidden_size//2, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.relu(x) # different activation function
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

In [8]:
# Create model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model.classifier = RobertaClassificationHead(model.config) # replace classifer

from peft import get_peft_model
model = get_peft_model(model, lora_config) # setup lora

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    evaluation_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [10]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mkpierzynski[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.349146,0.636
2,No log,1.240878,0.855
3,No log,0.746952,0.882
4,1.212400,0.413322,0.885
5,1.212400,0.388902,0.888


TrainOutput(global_step=785, training_loss=0.9423960886183818, metrics={'train_runtime': 466.5304, 'train_samples_per_second': 53.587, 'train_steps_per_second': 1.683, 'total_flos': 6600544051200000.0, 'train_loss': 0.9423960886183818, 'epoch': 5.0})

In [11]:
model.print_trainable_parameters()
print(model)

trainable params: 591,748 || all params: 124,943,624 || trainable%: 0.4736120028021598
PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): RobertaForSequenceClassification(
      (roberta): RobertaModel(
        (embeddings): RobertaEmbeddings(
          (word_embeddings): Embedding(50265, 768, padding_idx=1)
          (position_embeddings): Embedding(514, 768, padding_idx=1)
          (token_type_embeddings): Embedding(1, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): RobertaEncoder(
          (layer): ModuleList(
            (0-11): 12 x RobertaLayer(
              (attention): RobertaAttention(
                (self): RobertaSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dr