In [87]:
import copy
import matplotlib.pyplot as plt
import numpy as np
import random
import time
import torch
import torch.nn.functional as F
from tqdm import tqdm

In [88]:
torch.manual_seed(42)

<torch._C.Generator at 0x235ca1596f0>

In [89]:
class TestModel(torch.nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.embedding = torch.nn.Embedding(10, hidden_size)
        self.linear = torch.nn.Linear(hidden_size, hidden_size)
        self.lm_head = torch.nn.Linear(hidden_size, 10)
    def forward(self, input_ids):
        x = self.embedding(input_ids)
        x = self.linear(x)
        x = self.lm_head(x)
        return x

In [90]:
# Set a reasonably large hidden size to illustrate the small fraction
# params needed to be added for LoRA
hidden_size = 1024
model = TestModel(hidden_size)
model

TestModel(
  (embedding): Embedding(10, 1024)
  (linear): Linear(in_features=1024, out_features=1024, bias=True)
  (lm_head): Linear(in_features=1024, out_features=10, bias=True)
)

In [91]:
# dummy inputs
input_ids = torch.LongTensor([[0, 1, 2, 3, 4, 5, 6, 7]])

In [92]:
# toy example of a detokenizer. the vocabular only consists of 10 words(colors)
detokenizer = [
    "red",
    "orange",
    "yellow",
    "green",
    "blue",
    "indigo",
    "violet",
    "magenta",
    "marigold",
    "chartreuse",
]

In [93]:
def generate_token(model, **kwargs):
    with torch.no_grad():
        logits = model(**kwargs)
    last_logits = logits[:, -1, :]
    next_token_ids = last_logits.argmax(dim=1)
    return [detokenizer[token_id] for token_id in next_token_ids]

In [94]:
# generate one token
next_token = generate_token(model, input_ids = input_ids)[0]
next_token

'magenta'

### 파인튜닝후에는 다른 토큰을 리턴할까?

In [95]:
# dummy input tensor
# shape: (batch_size, sequence_length, hidden_size)
X = torch.randn(1, 8, 1024)
X

tensor([[[ 0.4333, -1.3517, -0.8030,  ..., -1.0855,  3.2371, -1.4757],
         [ 0.9492,  0.5531,  0.8596,  ...,  1.1852, -0.9520,  1.4142],
         [-1.0038, -0.1944, -0.1730,  ...,  0.8175, -2.7022,  1.8578],
         ...,
         [ 0.4469,  2.3303,  1.6584,  ...,  0.6878, -1.8856, -1.2131],
         [ 1.2963,  2.1563, -1.4809,  ...,  0.4633, -0.4596, -1.7364],
         [ 0.8114, -0.1914, -0.0934,  ...,  0.4458,  1.4660, -0.1785]]])

In [96]:
# LoRA A and B tensors
# A has shape (hidden_size, rank)
# B has shape (rank, hidden_size)

lora_a = torch.rand(1024, 2)
lora_b = torch.randn(2, 1024)
lora_b

tensor([[ 0.3235,  0.7884, -0.9153,  ..., -0.4534,  0.8730,  0.3989],
        [ 1.1461,  0.3800,  1.0853,  ..., -0.9835, -0.0224, -0.7409]])

In [97]:
W = model.linear.weight
W, W.shape

(Parameter containing:
 tensor([[-0.0051, -0.0010,  0.0130,  ...,  0.0187, -0.0248, -0.0170],
         [ 0.0167,  0.0076, -0.0195,  ...,  0.0023, -0.0163, -0.0182],
         [-0.0007,  0.0109, -0.0289,  ...,  0.0252,  0.0312, -0.0050],
         ...,
         [ 0.0164,  0.0051,  0.0104,  ..., -0.0112, -0.0014, -0.0035],
         [ 0.0232,  0.0213,  0.0242,  ..., -0.0245, -0.0309,  0.0131],
         [ 0.0124, -0.0065, -0.0124,  ..., -0.0310,  0.0094,  0.0193]],
        requires_grad=True),
 torch.Size([1024, 1024]))

In [98]:
W2 = lora_a @ lora_b
W2.shape

torch.Size([1024, 1024])

In [99]:
# compare number of elements of A and B with number of elements of W
# W here has shape (hidden_size, hidden_size)

lora_numel = lora_a.numel() + lora_b.numel()
base_numel = W.numel()
print(f"|A+B| / |w|: {lora_numel / base_numel}")

|A+B| / |w|: 0.00390625


In [100]:
# compute the output of X @ W (the original linear layer)
base_output = model.linear(X)

# compute the output of X @ A @ B (the added lora adapter)
lora_output = X @ lora_a @ lora_b

# sum them together
total_output = base_output + lora_output

# output should have the same shape as the original output:
# (batch_size, squence_length, hidden_size)
total_output.shape

torch.Size([1, 8, 1024])

In [101]:
# class LoraLayer(torch.nn.Module):
#     def __init__(self, base_layer, r):
#         super().__init__()
#         self.base_layer = base_layer

#         d_in, d_out = self.base_layer.weight.shape
#         self.lora_a = torch.randn(d_in, r)
#         self.lora_b = torch.randn(r, d_out)

#     def forward(self, x):
#         y1 = self.base_layer(x)
#         y2 = x @ self.lora_a @ self.lora_b
#         return y1 + y2

In [102]:
class LoraLayer(torch.nn.Module):
    def __init__(self, base_layer, r):
        super().__init__()
        self.base_layer = base_layer

        if isinstance(base_layer, torch.nn.Embedding):
            # If base_layer is an embedding layer
            self.embedding = True
        elif isinstance(base_layer, torch.nn.Linear):
            # If base_layer is a linear layer
            d_in, d_out = self.base_layer.weight.shape
            self.lora_a = torch.randn(d_in, r)
            self.lora_b = torch.randn(r, d_out)
            self.embedding = False
        else:
            raise ValueError("Unsupported base_layer type")

    def forward(self, x):
        if self.embedding:
            y1 = self.base_layer(x)
            return y1
        else:
            y1 = self.base_layer(x)
            y2 = x @ self.lora_a @ self.lora_b
            return y1 + y2

In [103]:
# wrap the linear layer of our toy model, use rank 2
hidden_size = 1024
model = TestModel(hidden_size)
lora_layer = LoraLayer(model.linear, 2)
lora_layer(X).shape

torch.Size([1, 8, 1024])

In [104]:
lora_layer

LoraLayer(
  (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
)

In [105]:
model

TestModel(
  (embedding): Embedding(10, 1024)
  (linear): Linear(in_features=1024, out_features=1024, bias=True)
  (lm_head): Linear(in_features=1024, out_features=10, bias=True)
)

In [106]:
next_token = generate_token(model, input_ids = input_ids)[0]
next_token

'blue'