In [620]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn import functional as F

from dataclasses import dataclass

import tqdm
import math
import os
import urllib


import tiktoken

In [617]:
# My version of losses


# Naive implementation of softmax and cross entropy (numerically not stable - can underflow)
def softmax(x):
    # Operating on last dimension
    x = torch.exp(x - torch.max(x, dim=-1, keepdim=True)[0])
    x = x / x.sum(dim=-1, keepdim=True)
    return x

def cross_entropy(inp, targ, reduce=False):
    # operating on last dimension and assuming 2D matrices
    size, nclasses = inp.shape

    # Create 1 hot encoding 
    #targets = torch.empty((size, nclasses))
    #targets[torch.arange(size), targ] = 1

    # easier way? maybe faster?
    #targets.scatter_(-1, targets.view(-1,1), 1)

    # calculate loss as the sum of log probabilities 
    l = -1.0 * torch.log_(inp[torch.arange(size), targ])

    # targets.scatter_(-1, targets.view(-1,1), targets)

    if reduce:
        return l.mean()
        
    return l

# Better implementaion (matches pytorch)
def log_softmax(x):
    # Operating on last dimension
    x = x - torch.max(x, dim=-1, keepdim=True)[0] # shift by max
    xe = torch.exp(x).sum(dim=-1, keepdim=True) # exp
    return x - torch.log(xe)

def cross_entropy_of_log(inp, targ, reduce=False):
    # operating on last dimension and assuming 2D matrices
    size, nclasses = inp.shape
    l = -1.0 * inp[torch.arange(size), targ]

    if reduce:
        return l.mean()
        
    return l

# Or combine both softmax and cross entropy
def cross_entropy_logits(x, targ, reduce=False):
    # operating on last dimension and assuming 2D matrices
    x = x - torch.max(x, dim=-1, keepdim=True)[0] # shift by max
    x = x - torch.log(torch.exp(x).sum(dim=-1, keepdim=True)) # instead of division in exp space
    l = -1.0 * x[torch.arange(x.shape[0]), targ] # loss is taken from the correct class index

    if reduce:
        return l.mean()
        
    return l

# From karpathy
class NewGELU(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """
    def forward(self, x):
        return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
    

nclasses = 5
seq_len = 10
examples = 3

g = torch.Generator().manual_seed(123)
inp = torch.rand((examples,seq_len, nclasses), dtype=torch.float64, generator=g)
targ = torch.randint(0,nclasses, size=(examples,seq_len), dtype=torch.long, generator=g)

inp = inp.view(-1, nclasses)
targ = targ.view(-1)

# check SM 
assert torch.allclose(softmax(inp), F.softmax(inp, dim=-1))

# Check CE
assert torch.allclose(F.cross_entropy(inp, targ, reduction='none'), cross_entropy(softmax(inp), targ, reduce=False))

# Will fial
# Check for numerical stability (scaling input will results in inf log(very small number) == -inf
inp *= 1000

assert torch.allclose(softmax(inp), F.softmax(inp, dim=-1))
torch.allclose(F.cross_entropy(inp, targ, reduction='none'), cross_entropy(softmax(inp), targ, reduce=False))

False

In [601]:

assert torch.allclose(log_softmax(inp), F.log_softmax(inp, dim=-1))
assert torch.allclose(F.cross_entropy(inp, targ, reduction='none'), cross_entropy_of_log(log_softmax(inp), targ, reduce=False))

# check combined version
assert torch.allclose(F.cross_entropy(inp, targ, reduction='none'), cross_entropy_logits(inp, targ, reduce=False))

In [712]:

class DummyDataset(Dataset):
    def __init__(self, size, seq_len, vocab):
        assert vocab < 2**16
        super().__init__()
        self.vocab = vocab
        self.size = size
        self.data = torch.randint(low=0, high=vocab, size=(size * seq_len,), dtype=torch.int).view((size, seq_len))
    
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # return self.data[idx,:-1], self.data[idx,1:]\
        # Decided to return the whole sequence
        return self.data[idx,:]


class MYDS(Dataset):
    import os
    import urllib.request
    
    def __init__(self, path, max_length, stride):
        max_length += 1
        if not os.path.exists(path):
            url = ("https://raw.githubusercontent.com/rasbt/"
                   "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
                   "the-verdict.txt")
            urllib.request.urlretrieve(url, path)
            
        with open(path, "r", encoding="utf-8") as f:
            self.raw_text = f.read()
            
        self.tokenizer = tiktoken.get_encoding("gpt2")
        
        token_ids = self.tokenizer.encode(self.raw_text, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        self.data = []
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            #target_chunk = token_ids[i + 1: i + max_length + 1]
            self.data.append(torch.tensor(input_chunk))

    def __getitem__(self, idx):
        # return self.data[idx,:-1], self.data[idx,1:]\
        # Decided to return the whole sequence
        return self.data[idx]

    def __len__(self):
        return len(self.data)
        
    

class SelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.nheads = config.nheads
        self.hdim = config.hdim
        # q, k, v combined in 1 matrix (to be split later).
        # We assume input dimension is same as output dimension after projections
        self.qkv = nn.parameter.Parameter(torch.rand(size=(config.hdim, 3 * config.nheads * config.hdim), dtype=torch.float32) / math.sqrt(config.hdim))
        self.projection = nn.parameter.Parameter(torch.rand(size=(config.hdim * config.nheads, config.hdim), dtype=torch.float32) / math.sqrt(config.hdim))
        # which cells to mask? the upper left by setting them to 0 (diagonal is not masked)
        mask = torch.tril(torch.ones(config.seq_len,config.seq_len))
        self.register_buffer("mask", mask)
        self.dropout = nn.Dropout(p=config.att_drop)
        

    def forward(self, x):
        # x: (batch x sequence x hdim)
        B, S, H = x.size()
        
        # split into Q K V matrices. Each of shape [hdim, nheads * hdim]. 
        # alternatively: x =  x @ self.qkv
        Q, K, V = torch.split(self.qkv, H * self.nheads, 1)

        # project input into q,k,v (batch x nheads x seq x hdim)
        q = (x @ Q).view(B,self.nheads,S,H)
        k = (x @ K).view(B,self.nheads,S,H)
        v = (x @ V).view(B,self.nheads,S,H)

        # calculate dot product (batch x heads x seq(queries) x seq(keys))
        att = torch.matmul(q, k.transpose(3,2)) * (1.0 / math.sqrt(self.hdim))

        # Only difference of causal self-att.
        # Causal masking (in place):  set all items to the right of index i,i to zero (-inf before softmax)
        att.masked_fill_(torch.eq(self.mask.view(1,1,S,S), torch.scalar_tensor(0)), -torch.inf)

        # Apply softmax then droptout
        att = F.softmax(att, dim=-1) # same shape
        att = self.dropout(att)
        
        out = att @ v # batch x heads x seq x hdim

        # Project from all heads back to hdim
        out = out.view(B,S,-1) # concat all heads: batch x seq x hdim*nheads
        out = out @ self.projection # batch x seq x hdim
        #print(out.mean())
        return out


class AttBlock(nn.Module):
    def __init__(self,config):
        super().__init__()
        self.layer_norm1 = nn.LayerNorm(config.hdim)
        self.layer_norm2 = nn.LayerNorm(config.hdim)
        
        ffexpand = nn.Linear(config.hdim, 4 * config.hdim, bias=False)
        gelu = NewGELU()
        ffshrink = nn.Linear(4 * config.hdim, config.hdim, bias=False)
        self.res_drop = nn.Dropout(config.res_drop)
        
        self.att = SelfAttention(config)
        self.mlp = nn.Sequential(ffexpand, gelu, ffshrink, self.res_drop)
        

    def forward(self, x):
        # x: (batch x sequence)
        #print(x.size())

        out = self.att(self.layer_norm1(x))
        out = self.res_drop(out)
        out = self.layer_norm2(x + out)
        out = self.mlp(out)
        
        return out

# todo: positional encoding
class MyModel(nn.Module):
    def __init__(self, config, layers=3):
        super().__init__()
        self.vocab_size = config.vocab
        self.emb = torch.nn.Embedding(config.vocab, config.hdim)
        self.emb_drop = nn.Dropout(config.emb_drop)
        self.layers = nn.Sequential(*[AttBlock(config) for _ in range(config.layers)])
        self.last_layer_norm = nn.LayerNorm(config.hdim)
        self.llm_head = nn.parameter.Parameter(torch.rand(size=(config.hdim, config.vocab), dtype=torch.float32) / math.sqrt(config.vocab))
        self.loss = cross_entropy_logits
        print(f"Total parameters: {sum(p.numel() for p in self.parameters() if p.requires_grad)}")

    def forward(self, x):
        outs = self.emb(x[:,:-1])
        outs = self.emb_drop(outs)
        B, T, nh = outs.size()
        outs = self.layers(outs)
        outs = self.last_layer_norm(outs)
        outs = outs @ self.llm_head # B x S X vocab_size
        #print(f"Output shape: {outs.shape}")
        loss = self.loss(outs.view(-1,self.vocab_size), x[:,1:].contiguous().view(-1), reduce=True)
        return loss, torch.argmax(outs, dim=-1)

In [603]:


o = model(b)
o, b

Total parameters: 79002


((tensor(5.3012, grad_fn=<MeanBackward0>),
  tensor([[ 62,  55,  73, 193,  57,  96,  73,  73, 163, 193],
          [ 50, 193, 167,  73,  73, 174,  96,  12,  13,  11],
          [ 96,  68, 102,  73,  73, 140, 193, 193,  96,  96],
          [113,  96, 193,  96, 193, 193, 193, 163, 148,  99]])),
 tensor([[0, 2, 2, 0, 3, 4, 3, 3, 4, 0, 0],
         [3, 0, 0, 3, 3, 3, 2, 2, 4, 1, 2],
         [1, 4, 1, 4, 3, 2, 0, 3, 1, 3, 3],
         [0, 2, 3, 4, 0, 0, 0, 4, 3, 0, 0]], dtype=torch.int32))

In [757]:
## data configs

@dataclass
class MyModelConfig:
    data_size = 100
    seq_len = 10  #actual context is seq_len
    vocab = 5
    batch_size = 8
    epochs = 200
    layers = 2
    nheads = 3
    hdim = 16
    att_drop = .01
    res_drop = .01
    emb_drop = .01

# Load config
conf = MyModelConfig()

# Load dataset, Add 1 to context len since we truncate
#data = DummyDataset(conf.data_size, conf.seq_len + 1, conf.vocab)
data = MYDS("verdict.txt", conf.seq_len, stride=5)
conf.vocab = data.tokenizer.n_vocab
print(f"Vocab size: {conf.vocab}")
#data.data[0]


train_loader = DataLoader(data, batch_size=conf.batch_size)


model = MyModel(conf)
model.train()

optimizer = torch.optim.AdamW(params=model.parameters(), lr=0.001, )

Vocab size: 50257
Total parameters: 1618624


In [762]:
from peft import get_peft_model

from peft import LoraConfig, TaskType

peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1, get_modules=["q", "v"])


model = get_peft_model(model, peft_config)

ValueError: Target modules {'q', 'v'} not found in the base model. Please check the target modules and try again.

In [657]:


# Training loop
tsteps = -1
lsteps = 200
iters = 0
#for e in tqdm.tqdm(range(epochs)):
for i, e in enumerate(range(conf.epochs)):
    for b in train_loader:
        iters += 1
        model.zero_grad()
        loss, outs = model(b)
        loss.backward()
        optimizer.step()
        if iters % lsteps == 0:
            print(f"Epoch:{i+1}/step:{iters}:  loss: {loss.item()}")
        if tsteps> -1 and iters > tsteps:
            break

Vocab size: 50257
Total parameters: 1618624
Epoch:2/step:200:  loss: 6.431022644042969
Epoch:4/step:400:  loss: 6.165279388427734
Epoch:5/step:600:  loss: 5.853367805480957
Epoch:7/step:800:  loss: 6.129121780395508
Epoch:8/step:1000:  loss: 5.976624488830566
Epoch:10/step:1200:  loss: 5.519522190093994
Epoch:11/step:1400:  loss: 4.595290184020996
Epoch:13/step:1600:  loss: 4.9460248947143555
Epoch:14/step:1800:  loss: 4.727682590484619
Epoch:16/step:2000:  loss: 4.704626083374023
Epoch:18/step:2200:  loss: 5.030416011810303
Epoch:19/step:2400:  loss: 4.241650581359863
Epoch:21/step:2600:  loss: 4.089865207672119
Epoch:22/step:2800:  loss: 4.1062774658203125
Epoch:24/step:3000:  loss: 3.857464551925659
Epoch:25/step:3200:  loss: 4.203387260437012
Epoch:27/step:3400:  loss: 3.8177058696746826
Epoch:28/step:3600:  loss: 3.683701992034912
Epoch:30/step:3800:  loss: 3.565894603729248
Epoch:32/step:4000:  loss: 3.4561429023742676
Epoch:33/step:4200:  loss: 3.396604061126709
Epoch:35/step:44

In [658]:
b, outs


(tensor([[  329,   502,     0,   383,   520,  5493,    82,  1302,  3436,    11,
            290],
         [ 5493,    82,  1302,  3436,    11,   290,  1645,  1752,   438,  4360,
            612],
         [  290,  1645,  1752,   438,  4360,   612,   338,   645, 42393,   803,
            674]]),
 tensor([[  502,     0,   632,   520,  5493,    82,  1302,  3436,    11,   290],
         [   82,  1302,  3436,    11,   290,  1645,  1752,   438,  4360,   326],
         [ 1645,  1752,   438,  4360,   612,   338,   645, 42393,   803,   674]]))

In [678]:
model(data[2].view(1,-1))[1]

tensor([[ 1808, 15632,   438,  2016,   257,   922,  5891,  1576,   438,   568]])

In [711]:
idx = 26
print(data.tokenizer.decode_batch(data[idx].view(1,-1).numpy()))
model.eval()
print(data.tokenizer.decode_batch(model(data[idx].view(1,-1))[1].numpy()))

[" picture 'way up; but I don't think of"]
["--way up, and I looked't dab of"]


In [744]:

sum([t.numel() for t in optimizer.param_groups[0]['params']]) / sum(p.numel() for p in model.parameters() if p.requires_grad)

1.0

In [755]:
torch.cuda.memory_allocated()

0