In [1]:
import os
# Set here, ahead of the `transformers` import that happens in the next cell.
# NOTE(review): presumably tells transformers to skip its TensorFlow integration —
# confirm against the library documentation.
os.environ["TRANSFORMERS_NO_TF"] = "true"

In [2]:
from transformers import RobertaModel
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
from functools import partial
import transformers
from transformers import RobertaModel, RobertaTokenizer

import os
os.environ["TRANSFORMERS_NO_TF"] = "true"

class RobertaBaseClassifier(torch.nn.Module):
    """Pretrained ``roberta-base`` encoder followed by a small two-logit head.

    The head is Linear(768, 768) -> ReLU -> Dropout(0.3) -> Linear(768, 2) and
    is applied to the hidden state at sequence position 0.
    """

    def __init__(self):
        super().__init__()
        # Sub-modules are registered in the same order as before so that
        # parameter ordering and state-dict keys are unchanged.
        self.roberta = RobertaModel.from_pretrained("roberta-base")
        self.linear = torch.nn.Linear(768, 768)
        self.activation = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the raw (pre-softmax) class scores for a batch.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            token_type_ids: Token type ids passed to the encoder.

        Returns:
            Output of the final ``Linear(768, 2)`` layer.
        """
        encoder_outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Element 0 of the encoder output is the per-token hidden state;
        # keep only the vector at the first sequence position.
        first_token = encoder_outputs[0][:, 0]
        features = self.dropout(self.activation(self.linear(first_token)))
        return self.classifier(features)


class LoRALayer(torch.nn.Module):
    """Two-factor low-rank adapter: ``(alpha / r) * x @ A^T @ B^T``.

    NOTE(review): a second class that is also named ``LoRALayer`` is defined
    later in this same cell. Once that definition executes it replaces this
    one, so this variant is not the one picked up by code that runs afterwards.

    Args:
        in_dim: Size of the last dimension of the input.
        out_dim: Size of the last dimension of the output.
        r: Rank of the factorisation.
        alpha: Numerator of the scaling factor ``alpha / r``.
    """

    def __init__(self, in_dim, out_dim, r, alpha):
        super().__init__()
        self.r = r
        self.alpha = alpha

        # Parameters are registered as A then B.
        self.A = torch.nn.Parameter(torch.empty(r, in_dim))
        self.B = torch.nn.Parameter(torch.empty(out_dim, r))

        # B starts at zero, so the adapter's output is zero right after construction.
        torch.nn.init.zeros_(self.B)
        # A uses kaiming-uniform, following
        # https://github.com/microsoft/LoRA/blob/main/loralib/layers.py
        torch.nn.init.kaiming_uniform_(self.A, a=math.sqrt(5))

        self.scaling = self.alpha / self.r

    def forward(self, x):
        """Project ``x`` down through A, back up through B, and scale."""
        return self.scaling * (x @ self.A.T @ self.B.T)



class LoRALayer(torch.nn.Module):
    """Three-factor low-rank adapter with a ReLU between the factors.

    Computes ``(alpha / r) * relu(x @ A^T @ C) @ B^T`` where A is ``(r, in_dim)``,
    C is ``(r, r)`` and B is ``(out_dim, r)``.

    Args:
        in_dim: Size of the last dimension of the input.
        out_dim: Size of the last dimension of the output.
        r: Rank of the factorisation.
        alpha: Numerator of the scaling factor ``alpha / r``.
    """

    def __init__(self, in_dim, out_dim, r, alpha):
        super().__init__()
        self.r = r
        self.alpha = alpha

        # Parameters are registered as A, B, C.
        self.A = torch.nn.Parameter(torch.empty(r, in_dim))
        self.B = torch.nn.Parameter(torch.empty(out_dim, r))
        self.C = torch.nn.Parameter(torch.empty(r, r))

        # A and C get kaiming-uniform values (A first, then C).
        for factor in (self.A, self.C):
            torch.nn.init.kaiming_uniform_(factor, a=math.sqrt(5))

        # B is zero, so the adapter contributes nothing until B is updated.
        torch.nn.init.zeros_(self.B)

        self.scaling = self.alpha / self.r

    def forward(self, x):
        """Apply A, then C followed by ReLU, then B, and scale the result."""
        reduced = x @ self.A.T
        mixed = F.relu(reduced @ self.C)
        return self.scaling * (mixed @ self.B.T)



class LinearWithLoRA(torch.nn.Module):
    """Wrap a linear layer so its output is summed with a ``LoRALayer`` output.

    Args:
        linear: The layer to wrap; its ``in_features`` / ``out_features`` size the adapter.
        r: Rank passed to ``LoRALayer``.
        alpha: Scaling numerator passed to ``LoRALayer``.
    """

    def __init__(self, linear, r, alpha):
        super().__init__()
        self.linear = linear
        self.lora = LoRALayer(
            in_dim=linear.in_features,
            out_dim=linear.out_features,
            r=r,
            alpha=alpha,
        )

    def forward(self, x):
        """Return ``linear(x) + lora(x)``."""
        base_out = self.linear(x)
        adapter_out = self.lora(x)
        return base_out + adapter_out


class LoraModel(nn.Module):
    """Frozen RoBERTa encoder with a low-rank residual on its output and a 2-way head.

    The low-rank path (``lora_A`` then ``lora_B``) is applied to the encoder's
    last hidden state and added back to it; it does not modify any layer
    inside the encoder. Both projections keep ``nn.Linear``'s default
    initialisation.

    Args:
        base_model_name: Checkpoint name given to ``RobertaModel.from_pretrained``.
        rank: Inner width of the low-rank path.
        dropout_rate: Dropout probability applied before the classifier.
    """

    def __init__(self, base_model_name="roberta-base", rank=16, dropout_rate=0.1):
        super().__init__()

        # Load the encoder and stop gradients from flowing into its weights.
        self.roberta = RobertaModel.from_pretrained(base_model_name)
        for encoder_param in self.roberta.parameters():
            encoder_param.requires_grad = False

        self.dropout = nn.Dropout(dropout_rate)

        hidden_size = self.roberta.config.hidden_size
        self.rank = rank
        # hidden_size -> rank -> hidden_size, both without bias.
        self.lora_A = nn.Linear(hidden_size, rank, bias=False)
        self.lora_B = nn.Linear(rank, hidden_size, bias=False)

        # Two output scores for binary classification.
        self.classifier = nn.Linear(hidden_size, 2)

    def forward(self, input_ids, attention_mask=None):
        """Return two class scores per sequence, read from sequence position 0."""
        encoder_out = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = encoder_out.last_hidden_state

        # Residual low-rank update on every position, then dropout.
        adapted = hidden_states + self.lora_B(self.lora_A(hidden_states))
        adapted = self.dropout(adapted)

        # Classify from the first token's vector.
        return self.classifier(adapted[:, 0, :])


def count_parameters(model):
    """Return the number of scalar elements in the model's trainable parameters.

    Only parameters whose ``requires_grad`` is true are counted.
    """
    trainable_total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            trainable_total += tensor.numel()
    return trainable_total


def create_lora_model(lora_r=16, lora_alpha=None, train_head=False):
    """Build a ``RobertaBaseClassifier`` with LoRA adapters on attention query/value.

    Every parameter of the freshly built classifier is frozen first. The
    ``query`` and ``value`` projections of each encoder layer are then wrapped
    in ``LinearWithLoRA``; the adapter parameters created by that wrapping are
    the only trainable ones unless ``train_head`` is set.

    Args:
        lora_r: Adapter rank. Defaults to 16, the value that used to be hard-coded.
        lora_alpha: LoRA scaling numerator. ``None`` keeps the previous rule
            ``lora_alpha = 2 * lora_r``.
        train_head: If True, re-enable gradients for the classifier's ``linear``
            and ``classifier`` layers. Those layers are created with fresh
            ``torch.nn.Linear`` weights, so with the default (False, the
            original behaviour) they remain at their initial values for the
            whole of training.

    Returns:
        The adapted ``RobertaBaseClassifier`` instance.

    Raises:
        ValueError: If ``lora_r`` is not positive.
    """
    if lora_r <= 0:
        raise ValueError(f"lora_r must be positive, got {lora_r}")
    if lora_alpha is None:
        lora_alpha = lora_r * 2

    lora_model = RobertaBaseClassifier()

    # Freeze everything that exists so far; adapters added below are created
    # afterwards and therefore stay trainable.
    for param in lora_model.parameters():
        param.requires_grad = False

    assign_lora = partial(LinearWithLoRA, r=lora_r, alpha=lora_alpha)

    for layer in lora_model.roberta.encoder.layer:
        self_attention = layer.attention.self
        self_attention.query = assign_lora(self_attention.query)
        self_attention.value = assign_lora(self_attention.value)

    if train_head:
        for head_module in (lora_model.linear, lora_model.classifier):
            for param in head_module.parameters():
                param.requires_grad = True

    return lora_model

# Build the plain classifier first and the LoRA-adapted one second, then
# compare how many parameters each leaves trainable.
roberta_model = RobertaBaseClassifier()
lora_model = create_lora_model()

base_param_count = count_parameters(roberta_model)
lora_param_count = count_parameters(lora_model)

print("Model with LoRA param count:", lora_param_count)
print("Base model param count:", base_param_count)
# Integer ratio of trainable parameters (base / LoRA).
print(str(base_param_count // lora_param_count) + " times smaller than base model")


  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model with LoRA param count: 595968
Base model param count: 125237762
210 times smaller than base model


In [3]:
from torchinfo import summary
# Per-layer parameter table for the plain classifier built above (no LoRA wrapping).
summary(roberta_model)

Layer (type:depth-idx)                                            Param #
RobertaBaseClassifier                                             --
├─RobertaModel: 1-1                                               --
│    └─RobertaEmbeddings: 2-1                                     --
│    │    └─Embedding: 3-1                                        38,603,520
│    │    └─Embedding: 3-2                                        394,752
│    │    └─Embedding: 3-3                                        768
│    │    └─LayerNorm: 3-4                                        1,536
│    │    └─Dropout: 3-5                                          --
│    └─RobertaEncoder: 2-2                                        --
│    │    └─ModuleList: 3-6                                       85,054,464
│    └─RobertaPooler: 2-3                                         --
│    │    └─Linear: 3-7                                           590,592
│    │    └─Tanh: 3-8                                             --

In [4]:
from torchinfo import summary
# Same table for the model returned by create_lora_model(), for comparison with the previous cell.
summary(lora_model)

Layer (type:depth-idx)                                            Param #
RobertaBaseClassifier                                             --
├─RobertaModel: 1-1                                               --
│    └─RobertaEmbeddings: 2-1                                     --
│    │    └─Embedding: 3-1                                        (38,603,520)
│    │    └─Embedding: 3-2                                        (394,752)
│    │    └─Embedding: 3-3                                        (768)
│    │    └─LayerNorm: 3-4                                        (1,536)
│    │    └─Dropout: 3-5                                          --
│    └─RobertaEncoder: 2-2                                        --
│    │    └─ModuleList: 3-6                                       85,650,432
│    └─RobertaPooler: 2-3                                         --
│    │    └─Linear: 3-7                                           (590,592)
│    │    └─Tanh: 3-8                                     

In [5]:
#for name, param in roberta_model.named_parameters():
#    print(f"{name}: {param.requires_grad}")

In [6]:
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer
from transformers import DataCollatorWithPadding

# GLUE / MRPC: pairs of sentences with a label column.
raw_datasets = load_dataset("glue", "mrpc")

# NOTE(review): `truncation` and `do_lower_case` are forwarded to the tokenizer
# constructor as-is; confirm they have the intended effect for RoBERTa.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base", truncation=True, do_lower_case=True)


def tokenize_function(example):
    """Tokenize a batch of sentence pairs with truncation and token type ids."""
    return tokenizer(
        example["sentence1"],
        example["sentence2"],
        truncation=True,
        return_token_type_ids=True,
    )


# Tokenize, drop the columns the model does not consume, and rename the
# target column to "labels".
tokenized_datasets = (
    raw_datasets.map(tokenize_function, batched=True)
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [7]:
# The collator is given the tokenizer and is used as each loader's collate_fn.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training batches are shuffled; validation batches are not.
train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)
val_dataloader = DataLoader(
    dataset=tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=data_collator,
)

In [8]:
# Pull a single batch to inspect tensor shapes. Using next(iter(...)) instead
# of a for/break loop means an empty loader fails right here rather than
# leaving `batch` unbound (or silently reusing a value from an earlier run).
batch = next(iter(train_dataloader))
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 87]),
 'token_type_ids': torch.Size([8, 87]),
 'attention_mask': torch.Size([8, 87])}

In [9]:
# `model` is a second name for the same object as `lora_model` (no copy is made),
# so anything done to `model` below is also visible through `lora_model`.
model = lora_model

In [10]:
# Display whether PyTorch reports the MPS backend as available on this machine.
torch.backends.mps.is_available()

True

In [11]:
# Choose where to run: CUDA if present, otherwise Apple MPS, otherwise CPU.
# (Previously CUDA was never selected, so CUDA machines fell back to CPU.)
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Move the sample batch and the model to the chosen device.
input_ids = batch["input_ids"].to(device)
attention_mask = batch["attention_mask"].to(device)
token_type_ids = batch["token_type_ids"].to(device)
labels = batch["labels"].to(device)
model.to(device)

# Single forward pass on the sample batch as a smoke test.
outputs = model(input_ids,
              attention_mask=attention_mask,
              token_type_ids=token_type_ids)


In [12]:
import time

def get_accuracy(y_pred, targets):
    """Return the fraction of rows whose highest-scoring class equals the target.

    Args:
        y_pred: Class scores; the predicted class is the argmax over dim 1.
        targets: Target class indices, one per row of ``y_pred``.

    Returns:
        A 0-dim tensor holding ``correct / len(targets)``.
    """
    # argmax is taken on the raw scores. Passing them through log_softmax first
    # is unnecessary work and, because of floating-point rounding, can collapse
    # two distinct scores into a tie and select the wrong class.
    predictions = y_pred.argmax(dim=1)
    accuracy = (predictions == targets).sum() / len(targets)
    return accuracy


def _format_elapsed(total_seconds):
    """Render a duration in seconds as ``HH:MM:SS.ss``."""
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return "{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds)


def _mean_or_nan(total, count):
    """Return ``total / count``, or NaN when ``count`` is zero."""
    return total / count if count else float("nan")


def _validation_pass(model, val_loader):
    """Return (mean loss, mean accuracy) over ``val_loader`` with gradients disabled."""
    total_val_loss = 0
    total_val_acc = 0

    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
            )
            total_val_loss += loss_function(outputs, labels).item()
            total_val_acc += get_accuracy(outputs, labels).item()

    num_batches = len(val_loader)
    return _mean_or_nan(total_val_loss, num_batches), _mean_or_nan(total_val_acc, num_batches)


def train(model, train_loader, val_loader, epochs, optimizer):
    """Run the training loop and print per-epoch metrics and timings.

    Reads the module-level names ``device``, ``loss_function`` and
    ``get_accuracy``. Reported losses/accuracies are means of the per-batch
    values.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., token_type_ids=...)``.
        train_loader: Iterable of batches with keys ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``labels``.
        val_loader: Loader with the same batch layout, evaluated after every
            epoch; pass ``None`` to skip validation. When validation runs, the
            model is left in eval mode after the final epoch.
        epochs: Number of passes over ``train_loader``.
        optimizer: Optimizer stepped once per training batch.

    Returns:
        None. All results are printed.
    """
    total_time = 0
    num_train_batches = len(train_loader)
    # Print progress roughly five times per epoch. Clamp to 1 so that loaders
    # with fewer than five batches do not trigger a modulo-by-zero.
    interval = max(1, num_train_batches // 5)

    for epoch in range(epochs):
        total_train_loss = 0
        total_train_acc = 0

        start = time.time()

        model.train()
        for batch_idx, batch in enumerate(train_loader):
            optimizer.zero_grad()

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
            )

            loss = loss_function(outputs, labels)
            acc = get_accuracy(outputs, labels)

            total_train_loss += loss.item()
            total_train_acc += acc.item()

            loss.backward()
            optimizer.step()

            if (batch_idx + 1) % interval == 0:
                print(
                    "Batch: %s/%s | Training loss: %.4f | accuracy: %.4f"
                    % (batch_idx + 1, num_train_batches, loss.item(), acc.item())
                )

        # Stop the clock before validation so the reported epoch time covers
        # the training batches only, as it did before.
        end = time.time()

        train_loss = _mean_or_nan(total_train_loss, num_train_batches)
        train_acc = _mean_or_nan(total_train_acc, num_train_batches)

        if val_loader is None:
            print(f"Epoch: {epoch+1} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
        else:
            val_loss, val_acc = _validation_pass(model, val_loader)
            print(f"Epoch: {epoch+1} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")
        print("Epoch time elapsed: " + _format_elapsed(end - start))
        print("")

        total_time += end - start

    # Average time per epoch; with zero epochs there is nothing to average.
    average_time_per_epoch = total_time / epochs if epochs > 0 else 0.0
    print("Average time per epoch: " + _format_elapsed(average_time_per_epoch))

# Cross-entropy on the two class scores; Adam over the model's parameters.
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

# 20 epochs over the training loader.
train(
    model=model,
    train_loader=train_dataloader,
    val_loader=val_dataloader,
    epochs=20,
    optimizer=optimizer,
)

Batch: 91/459 | Training loss: 0.6170 | accuracy: 0.8750
Batch: 182/459 | Training loss: 0.7338 | accuracy: 0.5000
Batch: 273/459 | Training loss: 0.7314 | accuracy: 0.5000
Batch: 364/459 | Training loss: 0.6421 | accuracy: 0.7500
Batch: 455/459 | Training loss: 0.6508 | accuracy: 0.6250
Epoch: 1 | Train Loss: 0.6519 | Train Acc: 0.6743
Epoch time elapsed: 00:00:44.14

Batch: 91/459 | Training loss: 0.6932 | accuracy: 0.5000
Batch: 182/459 | Training loss: 0.6254 | accuracy: 0.6250
Batch: 273/459 | Training loss: 0.6159 | accuracy: 0.6250
Batch: 364/459 | Training loss: 0.5178 | accuracy: 0.8750
Batch: 455/459 | Training loss: 0.7187 | accuracy: 0.5000
Epoch: 2 | Train Loss: 0.6413 | Train Acc: 0.6743
Epoch time elapsed: 00:00:36.75

Batch: 91/459 | Training loss: 0.5279 | accuracy: 0.8750
Batch: 182/459 | Training loss: 0.3549 | accuracy: 1.0000
Batch: 273/459 | Training loss: 0.4660 | accuracy: 0.7500
Batch: 364/459 | Training loss: 0.5730 | accuracy: 0.6250
Batch: 455/459 | Training

In [13]:
# Parameter table for `model` after the training cell.
summary(model)

Layer (type:depth-idx)                                            Param #
RobertaBaseClassifier                                             --
├─RobertaModel: 1-1                                               --
│    └─RobertaEmbeddings: 2-1                                     --
│    │    └─Embedding: 3-1                                        (38,603,520)
│    │    └─Embedding: 3-2                                        (394,752)
│    │    └─Embedding: 3-3                                        (768)
│    │    └─LayerNorm: 3-4                                        (1,536)
│    │    └─Dropout: 3-5                                          --
│    └─RobertaEncoder: 2-2                                        --
│    │    └─ModuleList: 3-6                                       85,650,432
│    └─RobertaPooler: 2-3                                         --
│    │    └─Linear: 3-7                                           (590,592)
│    │    └─Tanh: 3-8                                     

In [14]:
def evaluate(model, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print loss and accuracy.

    Reads the module-level names ``device``, ``loss_function`` and
    ``get_accuracy``. The model is switched to eval mode and gradients are
    disabled for the duration of the loop. The final figures are means of the
    per-batch values (NaN if the loader yields no batches).

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., token_type_ids=...)``.
        test_loader: Iterable of batches with keys ``input_ids``,
            ``attention_mask``, ``token_type_ids`` and ``labels``.

    Returns:
        None. All results are printed.
    """
    num_batches = len(test_loader)
    # Print progress roughly five times. Clamp to 1 so that loaders with fewer
    # than five batches do not trigger a modulo-by-zero.
    interval = max(1, num_batches // 5)

    total_test_loss = 0
    total_test_acc = 0

    model.eval()
    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids
            )
            loss = loss_function(outputs, labels)
            acc = get_accuracy(outputs, labels)

            total_test_loss += loss.item()
            total_test_acc += acc.item()

            if (batch_idx + 1) % interval == 0:
                print(
                    "Batch: %s/%s | Test loss: %.4f | accuracy: %.4f"
                    % (batch_idx + 1, num_batches, loss.item(), acc.item())
                )

    # An empty loader has no batches to average over; report NaN instead of
    # raising ZeroDivisionError.
    if num_batches:
        test_loss = total_test_loss / num_batches
        test_acc = total_test_acc / num_batches
    else:
        test_loss = test_acc = float("nan")

    print(f"Test loss: {test_loss:.4f} acc: {test_acc:.4f}")
    print("")


# Score the trained model on the validation loader (the "Test" labels in the
# printed lines come from evaluate()'s messages).
evaluate(model, val_dataloader)

Batch: 10/51 | Test loss: 0.4802 | accuracy: 0.8750
Batch: 20/51 | Test loss: 0.3231 | accuracy: 0.7500
Batch: 30/51 | Test loss: 0.6541 | accuracy: 0.5000
Batch: 40/51 | Test loss: 0.5715 | accuracy: 0.7500
Batch: 50/51 | Test loss: 0.1695 | accuracy: 0.8750
Test loss: 0.3642 acc: 0.8358



In [15]:
# Parameter table for `lora_model`; `model` was bound to this same object earlier.
summary(lora_model)

Layer (type:depth-idx)                                            Param #
RobertaBaseClassifier                                             --
├─RobertaModel: 1-1                                               --
│    └─RobertaEmbeddings: 2-1                                     --
│    │    └─Embedding: 3-1                                        (38,603,520)
│    │    └─Embedding: 3-2                                        (394,752)
│    │    └─Embedding: 3-3                                        (768)
│    │    └─LayerNorm: 3-4                                        (1,536)
│    │    └─Dropout: 3-5                                          --
│    └─RobertaEncoder: 2-2                                        --
│    │    └─ModuleList: 3-6                                       85,650,432
│    └─RobertaPooler: 2-3                                         --
│    │    └─Linear: 3-7                                           (590,592)
│    │    └─Tanh: 3-8                                     