### Data Preparation

In [1]:
import numpy as np
from data.prepare import get_charEncoding, create_splits
from config.config import *
from model.model import GPTModel
from model.utilities import evaluate_loss
import torch
import torch.nn as nn
from torch.nn import functional as F
if device_comp:
    device = torch.device(device_comp)
    
torch.manual_seed(1337)
print(device)

mps


In [2]:
encoded_train_data, encoded_test_data, encoder = get_charEncoding(path="./data/text.txt")
data_x, data_y = create_splits(encoded_train_data, encoded_test_data, mode='train')

In [3]:
encoded_train_data.tofile("./data/train.bin")
encoded_test_data.tofile("./data/test.bin")

In [4]:
encoder.n_vocab()

65

### Model Building

In [5]:
class AttentionHead(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
        
    def forward(self, x):
        batch, time, single_embed_size = x.shape
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)
        wei = q @ k.transpose(-2, -1) * single_embed_size**-0.5
        masked_output = wei.masked_fill(self.tril[:time, :time] == 0, float('-inf'))
        masked_softmax = F.softmax(masked_output, dim=1)
        output = masked_softmax @ v
        return output

In [6]:
class GPTModel(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.token_embeddings = nn.Embedding(vocab_size, n_embed)
        self.position_embeddings = nn.Embedding(block_size, n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)
        self.att_head = AttentionHead()
        self.apply(self.__init_weights__)
        
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weights'):
                torch.nn.init.normal_(p, mean = 0.0, std = 0.02 / math.sqrt(2 * n_layer))
        
        
    def __init_weights__(self, module):    
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
                
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)
            
            
    def forward(self, idx, target = None):

        B, T = idx.shape
        token_embeddings = self.token_embeddings(idx)
        positional_embeddings = self.position_embeddings(torch.arange(T, device=device))
        x = token_embeddings + positional_embeddings
        x = self.att_head(x)
        logits = self.lm_head(x)
        
        if target is None:
            loss = None
        else:
            batch, block, channel = logits.shape
            logits = logits.view(batch * block, channel)
            target = target.view(batch * block)
            loss = F.cross_entropy(logits, target)

        return logits, loss
    
    def generate_captions(self, idx, max_tokens):
        for _ in range(max_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim = 1)
            idx_next = torch.multinomial(probs, num_samples = 1)
            idx = torch.cat([idx, idx_next], dim = 1)
        return idx

In [7]:
model = GPTModel()
if compile:
    model = torch.compile(model).to(device)
else:
    model = model.to(device)

In [8]:
model

GPTModel(
  (token_embeddings): Embedding(65, 32)
  (position_embeddings): Embedding(8, 32)
  (lm_head): Linear(in_features=32, out_features=65, bias=True)
  (att_head): AttentionHead(
    (key): Linear(in_features=32, out_features=16, bias=False)
    (query): Linear(in_features=32, out_features=16, bias=False)
    (value): Linear(in_features=32, out_features=16, bias=False)
  )
)

In [9]:
data_x.shape, data_y.shape

(torch.Size([32, 8]), torch.Size([32, 8]))

In [None]:
model(data_x, data_y)

In [32]:
print(encoder.decode(model.generate_captions(torch.zeros((1, 1), dtype = torch.long).to(device), 100)[0].tolist()))


ur.FTV$KKQtlNQR;$RDqguEUjLukj3SgQr!f'u.lLp!j'Jn BpF&gQ3yFvEQf,m!nPoggMnF&ofB
'?qLTc&BdvDyle$'Qsuqiis


In [27]:
%run model/model.py

In [40]:
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-2)
for iter in range(max_iter):
    
    if iter % eval_iter == 0:
        output_loss = evaluate_loss(model, encoded_train_data, encoded_test_data)
        print("Current Step: {}, Train Loss: {}, Test Loss: {}".format(iter, round(output_loss['train'], 4), round(output_loss['test'], 4)))
    x, y = create_splits(encoded_train_data, encoded_test_data, mode='train')
    logits, loss = model(x, y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print("Final loss: {}".format(loss.item()))

Current Step: 0, Train Loss: 2.4901, Test Loss: 2.5613
Current Step: 30, Train Loss: 2.5778, Test Loss: 2.6762
Current Step: 60, Train Loss: 2.4785, Test Loss: 2.6866
Current Step: 90, Train Loss: 2.5958, Test Loss: 2.6625
Current Step: 120, Train Loss: 2.5539, Test Loss: 2.6991
Current Step: 150, Train Loss: 2.6502, Test Loss: 2.7153
Current Step: 180, Train Loss: 2.6576, Test Loss: 2.6905
Current Step: 210, Train Loss: 2.6473, Test Loss: 2.6696
Current Step: 240, Train Loss: 2.6502, Test Loss: 2.6816
Current Step: 270, Train Loss: 2.6427, Test Loss: 2.6435
Final loss: 2.5272274017333984


In [41]:
print(encoder.decode(model.generate_captions(torch.zeros((1, 1), dtype = torch.long).to(device), 100)[0].tolist()))


halupequt f kexANUNGSCKESAtARGRDIXELrof-prind.
M:
CINTHAn'
Helk-huthe quryon?
MDUCUCK:
3 HATh y;
Y:



In [51]:
model

GPTModel(
  (token_embeddings): Embedding(65, 32)
  (position_embeddings): Embedding(8, 32)
  (lm_head): Linear(in_features=32, out_features=65, bias=True)
)

In [50]:
model(data_x, data_y)

(tensor([[ 0.7113,  2.1091, -1.9051,  ..., -4.3773, -0.3147, -5.4562],
         [ 0.3476,  1.7917, -1.7693,  ..., -3.6318, -0.4090, -4.3735],
         [ 1.2363,  1.5802, -2.0612,  ..., -3.1681, -0.6099, -3.7819],
         ...,
         [-2.2400, -1.8373, -4.2619,  ..., -4.1628,  0.9269, -3.7841],
         [-0.3708,  2.5581, -1.8356,  ..., -5.4027,  2.0241, -6.6611],
         [-0.8319,  2.7027, -2.8058,  ..., -3.8215,  0.0690, -3.7780]],
        device='mps:0', grad_fn=<ViewBackward0>),
 tensor(2.5306, device='mps:0', grad_fn=<NllLossBackward0>))

In [155]:
for i in model.parameters():
    print(i)

Parameter containing:
tensor([[ 0.0149,  0.0138,  0.0283,  ...,  0.0021,  0.0153, -0.0214],
        [-0.0110,  0.0043,  0.0062,  ..., -0.0041,  0.0424, -0.0077],
        [ 0.0268,  0.0149, -0.0405,  ..., -0.0081,  0.0342, -0.0161],
        ...,
        [ 0.0350,  0.0326, -0.0211,  ..., -0.0017,  0.0199,  0.0086],
        [ 0.0214, -0.0232,  0.0185,  ..., -0.0012, -0.0084,  0.0057],
        [ 0.0279, -0.0209,  0.0141,  ...,  0.0056, -0.0067, -0.0086]],
       device='mps:0', requires_grad=True)
Parameter containing:
tensor([[-0.0040, -0.0027,  0.0085,  ..., -0.0127, -0.0327, -0.0062],
        [ 0.0231,  0.0061,  0.0100,  ..., -0.0001,  0.0069,  0.0100],
        [-0.0130,  0.0412, -0.0079,  ...,  0.0219,  0.0110,  0.0436],
        ...,
        [-0.0210,  0.0268,  0.0271,  ..., -0.0177, -0.0248,  0.0073],
        [ 0.0200,  0.0402,  0.0185,  ...,  0.0042,  0.0013,  0.0088],
        [-0.0194, -0.0048, -0.0015,  ..., -0.0108,  0.0152, -0.0169]],
       device='mps:0', requires_grad=True)


In [85]:
torch.backends.mps.is_built()

True

In [44]:
y.view(12*1024)[26301]

IndexError: index 26301 is out of bounds for dimension 0 with size 12288

In [58]:
y.view(12*1024)[2085]

tensor(499)

In [100]:
x.shape

torch.Size([12, 1024])