In [1]:
from huggingface_hub import login
import os
from dotenv import load_dotenv

load_dotenv()

login(new_session=False)

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-1B")
model = AutoModelForCausalLM.from_pretrained("unsloth/Llama-3.2-1B")

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/889 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

In [6]:
for key in model.state_dict().keys():
    print(f"{key}\n")

model.embed_tokens.weight

model.layers.0.self_attn.q_proj.weight

model.layers.0.self_attn.k_proj.weight

model.layers.0.self_attn.v_proj.weight

model.layers.0.self_attn.o_proj.weight

model.layers.0.mlp.gate_proj.weight

model.layers.0.mlp.up_proj.weight

model.layers.0.mlp.down_proj.weight

model.layers.0.input_layernorm.weight

model.layers.0.post_attention_layernorm.weight

model.layers.1.self_attn.q_proj.weight

model.layers.1.self_attn.k_proj.weight

model.layers.1.self_attn.v_proj.weight

model.layers.1.self_attn.o_proj.weight

model.layers.1.mlp.gate_proj.weight

model.layers.1.mlp.up_proj.weight

model.layers.1.mlp.down_proj.weight

model.layers.1.input_layernorm.weight

model.layers.1.post_attention_layernorm.weight

model.layers.2.self_attn.q_proj.weight

model.layers.2.self_attn.k_proj.weight

model.layers.2.self_attn.v_proj.weight

model.layers.2.self_attn.o_proj.weight

model.layers.2.mlp.gate_proj.weight

model.layers.2.mlp.up_proj.weight

model.layers.2.mlp.down_proj.

In [10]:
import torch
from torch import nn

torch.manual_seed(1334)

x = torch.rand(2, 3, 4)

In [21]:
class RMSNormLayer(nn.Module):
    """
    This class represents implementation of RMSNorm layer. PyTorch offers layer RMSNorm layer but I will implement it from scratch for educational purposes 
    """
    def __init__(self, embd_dim, epsilon: float = 1e-5):
        super().__init__()
        self.epsilon = epsilon
        self.embd_dim = embd_dim
        self.weight = nn.Parameter(torch.ones(embd_dim)).float()


    def forward(self, x: torch.Tensor):
        root_mean_squares = torch.sqrt(x.pow(2).mean(dim=-1, keepdim=True) + self.epsilon)
        x_normalized = x / root_mean_squares
        return (x_normalized * self.weight).to(dtype=x.dtype)

In [15]:
x.shape[-1]

4

In [17]:
rmsl = nn.RMSNorm(normalized_shape=x.shape[-1], eps=1e-5)
rmsl(x)

tensor([[[0.0407, 1.0997, 1.2548, 1.1020],
         [1.4200, 0.8225, 0.1628, 1.1317],
         [0.1437, 1.6313, 0.6554, 0.9426]],

        [[1.5116, 1.1063, 0.0312, 0.7000],
         [1.3568, 1.3647, 0.4776, 0.2612],
         [0.4313, 0.2693, 1.5692, 1.1309]]], grad_fn=<MulBackward0>)

In [22]:
my_rmsl = RMSNormLayer(embd_dim=x.shape[-1], epsilon=1e-5)
my_rmsl(x)

tensor([[[0.0407, 1.0997, 1.2548, 1.1020],
         [1.4200, 0.8225, 0.1628, 1.1317],
         [0.1437, 1.6313, 0.6554, 0.9426]],

        [[1.5116, 1.1063, 0.0312, 0.7000],
         [1.3568, 1.3647, 0.4776, 0.2612],
         [0.4313, 0.2693, 1.5692, 1.1309]]], grad_fn=<MulBackward0>)

In [None]:
def precompute_rope_params(head_dim):
    # intiger -> binary -> sinusoidal -> RoPE
    # RoPE does not polute the semantics of token embeddings because we do not add vector containing the positional info, we rotate queries and keys
    # Encodes both absolute and relative position information
    # theta = position * omega
    # omega = 1 / 500000 ^ (2i/dim)
    # rot matrix = [[cos(theta) -sin(theta)] [sin(theta) cos(theta)]]
    # Token embedding gets split into groups of 2 so there are dim // 2 groups of 2 that get rotated
    # Lower indeces in token embeddings oscilate more quickly capturing small changes
    position = torch.arange(0, 100).view(100,1)
    print(f'{position.shape}')
    i = torch.arange(0, head_dim, 2, dtype=torch.float32) # (head_dim/2, )
    print(f'{i.shape}')
    omega = 1.0 / torch.pow(10000, (i / head_dim)).view(1, int(head_dim/2)) # (1, head_dim/2)
    print(f'{omega.shape}')
    theta = position * omega
    # Theta has to be exapneded to each index because right now it only holds values for even indeces -> or maybe not if I do not want to have double the params - ROPEFormer uses only half
    print(theta.shape)
    return torch.cos(theta), torch.sin(theta)

In [23]:
precompute_rope_params(100)

torch.Size([100, 1])
torch.Size([50])
torch.Size([1, 50])
torch.Size([100, 50])


(tensor([[ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
         [ 0.5403,  0.6736,  0.7701,  ...,  1.0000,  1.0000,  1.0000],
         [-0.4161, -0.0926,  0.1860,  ...,  1.0000,  1.0000,  1.0000],
         ...,
         [-0.9251,  0.5400, -0.4229,  ...,  0.9999,  0.9999,  0.9999],
         [-0.8193,  0.9858,  0.2524,  ...,  0.9999,  0.9999,  0.9999],
         [ 0.0398,  0.7880,  0.8117,  ...,  0.9999,  0.9999,  0.9999]]),
 tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 8.4147e-01,  7.3912e-01,  6.3795e-01,  ...,  1.7378e-04,
           1.4454e-04,  1.2023e-04],
         [ 9.0930e-01,  9.9570e-01,  9.8254e-01,  ...,  3.4756e-04,
           2.8909e-04,  2.4045e-04],
         ...,
         [ 3.7961e-01, -8.4164e-01, -9.0618e-01,  ...,  1.6856e-02,
           1.4020e-02,  1.1662e-02],
         [-5.7338e-01, -1.6776e-01, -9.6761e-01,  ...,  1.7030e-02,
           1.4165e-02,  1.1782e-02],
         [-9.9921e-

In [26]:
cos_simulation = torch.arange(0, 100).reshape((50, 2))

In [29]:
cos = cos_simulation[:20, :].reshape((1, 1, 20, 2))
cos

tensor([[[[ 0,  1],
          [ 2,  3],
          [ 4,  5],
          [ 6,  7],
          [ 8,  9],
          [10, 11],
          [12, 13],
          [14, 15],
          [16, 17],
          [18, 19],
          [20, 21],
          [22, 23],
          [24, 25],
          [26, 27],
          [28, 29],
          [30, 31],
          [32, 33],
          [34, 35],
          [36, 37],
          [38, 39]]]])

In [30]:
-1 * cos

tensor([[[[  0,  -1],
          [ -2,  -3],
          [ -4,  -5],
          [ -6,  -7],
          [ -8,  -9],
          [-10, -11],
          [-12, -13],
          [-14, -15],
          [-16, -17],
          [-18, -19],
          [-20, -21],
          [-22, -23],
          [-24, -25],
          [-26, -27],
          [-28, -29],
          [-30, -31],
          [-32, -33],
          [-34, -35],
          [-36, -37],
          [-38, -39]]]])

In [37]:
# default dim for stacking is 0
torch.stack((torch.arange(0, 12, 2).view(3,2), torch.arange(1, 12, 2).view(3,2)), dim=0)

tensor([[[ 0,  2],
         [ 4,  6],
         [ 8, 10]],

        [[ 1,  3],
         [ 5,  7],
         [ 9, 11]]])

In [None]:
torch.stack((torch.arange(0, 12, 2).view(3,2), torch.arange(1, 12, 2).view(3,2)), dim=0).shape
# 3D tensor - 2 batches, each batch matrix with 3 rows and 2 columns

torch.Size([2, 3, 2])

In [41]:
torch.stack((torch.arange(0, 12, 2).view(3,2), torch.arange(1, 12, 2).view(3,2)), dim=1)

tensor([[[ 0,  2],
         [ 1,  3]],

        [[ 4,  6],
         [ 5,  7]],

        [[ 8, 10],
         [ 9, 11]]])

In [42]:
torch.stack((torch.arange(0, 12, 2).view(3,2), torch.arange(1, 12, 2).view(3,2)), dim=1).shape

torch.Size([3, 2, 2])

In [43]:
torch.stack((torch.arange(0, 12, 2).view(3,2), torch.arange(1, 12, 2).view(3,2)), dim=2)

tensor([[[ 0,  1],
         [ 2,  3]],

        [[ 4,  5],
         [ 6,  7]],

        [[ 8,  9],
         [10, 11]]])

In [46]:
torch.stack((torch.arange(0, 12, 2).view(3,2), torch.arange(1, 12, 2).view(3,2)), dim=2).reshape(1, 1, -1)

tensor([[[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]]])

In [47]:
torch.cat((torch.arange(0, 5), torch.arange(5, 10)), dim=-1)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [48]:
torch.chunk(torch.arange(0, 10), 2, dim=-1)

(tensor([0, 1, 2, 3, 4]), tensor([5, 6, 7, 8, 9]))

In [50]:
k = torch.arange(0, 12).view(3, 4)
k

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])

In [61]:
k.repeat_interleave(2, dim=1)

tensor([[ 0,  0,  1,  1,  2,  2,  3,  3],
        [ 4,  4,  5,  5,  6,  6,  7,  7],
        [ 8,  8,  9,  9, 10, 10, 11, 11]])

In [60]:
k.repeat(2, 2)

tensor([[ 0,  1,  2,  3,  0,  1,  2,  3],
        [ 4,  5,  6,  7,  4,  5,  6,  7],
        [ 8,  9, 10, 11,  8,  9, 10, 11],
        [ 0,  1,  2,  3,  0,  1,  2,  3],
        [ 4,  5,  6,  7,  4,  5,  6,  7],
        [ 8,  9, 10, 11,  8,  9, 10, 11]])

In [3]:
torch.arange(10)

tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [8]:
comp = torch.view_as_complex(torch.arange(10, dtype=torch.float).view(-1, 2))
comp

tensor([0.+1.j, 2.+3.j, 4.+5.j, 6.+7.j, 8.+9.j])

In [9]:
torch.view_as_real(comp)

tensor([[0., 1.],
        [2., 3.],
        [4., 5.],
        [6., 7.],
        [8., 9.]])

In [22]:
mask = torch.triu(torch.zeros(10, 10, device=x.device, dtype=torch.bool), diagonal=1)[5:7, :10]

In [23]:
mask

tensor([[False, False, False, False, False, False, False, False, False, False],
        [False, False, False, False, False, False, False, False, False, False]])

In [20]:
torch.ones(1,10).masked_fill(mask, -torch.inf)

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]])

In [29]:
torch.arange(0, 10).view(2,5)[:, :4]

tensor([[0, 1, 2, 3],
        [5, 6, 7, 8]])

In [37]:
torch.arange(0, 500).view(10, 50)[0:1, :]

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
         18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
         36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]])