In [27]:
from transformers import AutoModelForCausalLM, AutoConfig, AutoModelForSequenceClassification
import torch

model_name = "facebook/opt-125m"
device = "mps" if torch.backends.mps.is_available() else "cpu"


config = AutoConfig.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    config=config,
).to(device)
model

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,)

In [28]:
config = AutoConfig.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    config=config,
).to(device)
model

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-125m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OPTForSequenceClassification(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): La

In [29]:
from torchinfo import summary
summary(model)

Layer (type:depth-idx)                             Param #
OPTForSequenceClassification                       --
├─OPTModel: 1-1                                    --
│    └─OPTDecoder: 2-1                             --
│    │    └─Embedding: 3-1                         38,608,896
│    │    └─OPTLearnedPositionalEmbedding: 3-2     1,574,400
│    │    └─LayerNorm: 3-3                         1,536
│    │    └─ModuleList: 3-4                        85,054,464
├─Linear: 1-2                                      1,536
Total params: 125,240,832
Trainable params: 125,240,832
Non-trainable params: 0

In [30]:
import math
from torch import nn

class LoRALayer(nn.Module):
    def __init__(self, weight, bias, lora_dim):
        super(LoRALayer, self).__init__()

        row, column = weight.shape

        # restore Linear
        if bias is None:
            self.linear = nn.Linear(column, row, bias=False)
            self.linear.load_state_dict({"weight": weight})
        else:
            self.linear = nn.Linear(column, row)
            self.linear.load_state_dict({"weight": weight, "bias": bias})

        # create LoRA weights (with initialization)
        self.lora_right = nn.Parameter(torch.zeros(column, lora_dim))
        nn.init.kaiming_uniform_(self.lora_right, a=math.sqrt(5))
        self.lora_left = nn.Parameter(torch.zeros(lora_dim, row))

    def forward(self, input):
        x = self.linear(input)
        y = input @ self.lora_right @ self.lora_left
        return x + y

In [31]:
lora_dim = 16

# get target module name
target_names = []
for name, module in model.named_modules():
    if isinstance(module, nn.Linear) and "decoder.layers." in name:
        target_names.append(name)

# replace each module with LoRA
for name in target_names:
    name_struct = name.split(".")
    # get target module
    module_list = [model]
    for struct in name_struct:
        module_list.append(getattr(module_list[-1], struct))
    # build LoRA
    lora = LoRALayer(
        weight = module_list[-1].weight,
        bias = module_list[-1].bias,
        lora_dim = lora_dim,
    ).to(device)
    # replace
    module_list[-2].__setattr__(name_struct[-1], lora)

In [32]:
model

OPTForSequenceClassification(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 768, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 768)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0-11): 12 x OPTDecoderLayer(
          (self_attn): OPTSdpaAttention(
            (k_proj): LoRALayer(
              (linear): Linear(in_features=768, out_features=768, bias=True)
            )
            (v_proj): LoRALayer(
              (linear): Linear(in_features=768, out_features=768, bias=True)
            )
            (q_proj): LoRALayer(
              (linear): Linear(in_features=768, out_features=768, bias=True)
            )
            (out_proj): LoRALayer(
              (linear): Linear(in_features=768, out_features=768, bias=True)
            )
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, ele

In [35]:
for name, param in model.named_parameters():
    print (name)
    if "lora_right" in name or "lora_left" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

model.decoder.embed_tokens.weight
model.decoder.embed_positions.weight
model.decoder.final_layer_norm.weight
model.decoder.final_layer_norm.bias
model.decoder.layers.0.self_attn.k_proj.lora_right
model.decoder.layers.0.self_attn.k_proj.lora_left
model.decoder.layers.0.self_attn.k_proj.linear.weight
model.decoder.layers.0.self_attn.k_proj.linear.bias
model.decoder.layers.0.self_attn.v_proj.lora_right
model.decoder.layers.0.self_attn.v_proj.lora_left
model.decoder.layers.0.self_attn.v_proj.linear.weight
model.decoder.layers.0.self_attn.v_proj.linear.bias
model.decoder.layers.0.self_attn.q_proj.lora_right
model.decoder.layers.0.self_attn.q_proj.lora_left
model.decoder.layers.0.self_attn.q_proj.linear.weight
model.decoder.layers.0.self_attn.q_proj.linear.bias
model.decoder.layers.0.self_attn.out_proj.lora_right
model.decoder.layers.0.self_attn.out_proj.lora_left
model.decoder.layers.0.self_attn.out_proj.linear.weight
model.decoder.layers.0.self_attn.out_proj.linear.bias
model.decoder.layer

In [34]:
from torchinfo import summary
summary(model)

Layer (type:depth-idx)                             Param #
OPTForSequenceClassification                       --
├─OPTModel: 1-1                                    --
│    └─OPTDecoder: 2-1                             --
│    │    └─Embedding: 3-1                         (38,608,896)
│    │    └─OPTLearnedPositionalEmbedding: 3-2     (1,574,400)
│    │    └─LayerNorm: 3-3                         (1,536)
│    │    └─ModuleList: 3-4                        87,708,672
├─Linear: 1-2                                      (1,536)
Total params: 127,895,040
Trainable params: 2,654,208
Non-trainable params: 125,240,832

In [36]:
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import RobertaTokenizer
from transformers import DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")

def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], return_token_type_ids=True, truncation=True)

tokenizer = RobertaTokenizer.from_pretrained("roberta-base", truncation=True, do_lower_case=True)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [37]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
val_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

In [45]:
import time
from torch.nn import functional as F

def get_accuracy(y_pred, targets):
    predictions = torch.log_softmax(y_pred, dim=1).argmax(dim=1)
    accuracy = (predictions == targets).sum() / len(targets)
    return accuracy


def train(model, train_loader, val_loader, epochs, optimizer):
    total_time = 0

    for epoch in range(epochs):
        interval = len(train_loader) // 5

        total_train_loss = 0
        total_train_acc = 0
        total_val_loss = 0
        total_val_acc = 0

        start = time.time()

        model.train()
        for batch_idx, batch in enumerate(train_loader):
            optimizer.zero_grad()

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids, attention_mask=attention_mask, 
            )

            #loss = loss_function(outputs, labels)
            loss = loss_function(outputs.logits, labels)
            acc = get_accuracy(outputs.logits, labels)

            total_train_loss += loss.item()
            total_train_acc += acc.item()

            loss.backward()
            optimizer.step()

            if (batch_idx + 1) % interval == 0:
                print(
                    "Batch: %s/%s | Training loss: %.4f | accuracy: %.4f"
                    % (batch_idx + 1, len(train_loader), loss, acc)
                )

        train_loss = total_train_loss / len(train_loader)
        train_acc = total_train_acc / len(train_loader)

        end = time.time()
        hours, remainder = divmod(end - start, 3600)
        minutes, seconds = divmod(remainder, 60)

        print(f"Epoch: {epoch+1} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
        #print(f"Epoch: {epoch+1} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")
        print(
            "Epoch time elapsed: {:0>2}:{:0>2}:{:05.2f}".format(
                int(hours), int(minutes), seconds
            )
        )
        print("")

        total_time += end - start

    # Get the average time per epoch
    average_time_per_epoch = total_time / epochs
    hours, remainder = divmod(average_time_per_epoch, 3600)
    minutes, seconds = divmod(remainder, 60)

    print(
        "Average time per epoch: {:0>2}:{:0>2}:{:05.2f}".format(
            int(hours), int(minutes), seconds
        )
    )

optimizer = torch.optim.Adam(params=model.parameters(), lr = 1e-5)
loss_function = torch.nn.CrossEntropyLoss()
train(model, train_dataloader, val_dataloader, 20, optimizer)

Batch: 91/459 | Training loss: 0.7976 | accuracy: 0.5000
Batch: 182/459 | Training loss: 0.6356 | accuracy: 0.6250
Batch: 273/459 | Training loss: 0.5163 | accuracy: 0.8750
Batch: 364/459 | Training loss: 0.6889 | accuracy: 0.5000
Batch: 455/459 | Training loss: 0.2945 | accuracy: 1.0000
Epoch: 1 | Train Loss: 0.6186 | Train Acc: 0.6778
Epoch time elapsed: 00:00:54.88

Batch: 91/459 | Training loss: 0.4864 | accuracy: 0.7500
Batch: 182/459 | Training loss: 0.7397 | accuracy: 0.5000
Batch: 273/459 | Training loss: 0.1362 | accuracy: 1.0000
Batch: 364/459 | Training loss: 0.6364 | accuracy: 0.6250
Batch: 455/459 | Training loss: 0.3255 | accuracy: 0.8750
Epoch: 2 | Train Loss: 0.5305 | Train Acc: 0.7350
Epoch time elapsed: 00:00:42.69

Batch: 91/459 | Training loss: 0.7279 | accuracy: 0.7500
Batch: 182/459 | Training loss: 0.2728 | accuracy: 1.0000
Batch: 273/459 | Training loss: 0.5541 | accuracy: 0.6250
Batch: 364/459 | Training loss: 0.7721 | accuracy: 0.6250
Batch: 455/459 | Training

In [47]:
def evaluate(model, test_loader):
    interval = len(test_loader) // 5

    total_test_loss = 0
    total_test_acc = 0

    model.eval()
    with torch.no_grad():
        for batch_idx, batch in enumerate(test_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(
                input_ids, attention_mask=attention_mask, 
            )
            loss = loss_function(outputs.logits, labels)
            acc = get_accuracy(outputs.logits, labels)

            total_test_loss += loss.item()
            total_test_acc += acc.item()

            if (batch_idx + 1) % interval == 0:
                print(
                    "Batch: %s/%s | Test loss: %.4f | accuracy: %.4f"
                    % (batch_idx + 1, len(test_loader), loss, acc)
                )

    test_loss = total_test_loss / len(test_loader)
    test_acc = total_test_acc / len(test_loader)

    print(f"Test loss: {test_loss:.4f} acc: {test_acc:.4f}")
    print("")


evaluate(model, val_dataloader)

Batch: 10/51 | Test loss: 0.7649 | accuracy: 0.8750
Batch: 20/51 | Test loss: 1.6647 | accuracy: 0.6250
Batch: 30/51 | Test loss: 2.0784 | accuracy: 0.6250
Batch: 40/51 | Test loss: 0.7182 | accuracy: 0.7500
Batch: 50/51 | Test loss: 0.0001 | accuracy: 1.0000
Test loss: 1.0370 acc: 0.7990

