### Data Preparation

In [1]:
import math

import numpy as np
import tiktoken

# Load the corpus as raw bytes and decode it explicitly as UTF-8 text.
with open("./data/text.txt", "rb") as fo:
    data = fo.read().decode('utf-8')

# Character-level split: the first 90% of the characters form the training
# text and everything after that point forms the test text.
# test_data_size is computed for reference only; the slice below does not use it.
data_size = len(data)
train_data_size = int(data_size * 0.9)
test_data_size = int(data_size * 0.1)
train_data, test_data = data[:train_data_size], data[train_data_size:]

# Tokenize each split with the 'gpt2' tiktoken encoding and keep the token ids
# as int64 NumPy arrays.
# (Alternative tokenizer that was tried: tiktoken.encoding_for_model('gpt-4').)
encoder = tiktoken.get_encoding('gpt2')
encoded_data_train = np.array(encoder.encode(train_data), np.int64)
encoded_data_test = np.array(encoder.encode(test_data), np.int64)



In [2]:
# Persist the token-id arrays as raw binary files, one file per split.
encoded_data_train.tofile("./data/train.bin")
# test.bin must hold the held-out split, not a second copy of the training ids.
encoded_data_test.tofile("./data/test.bin")

In [3]:
# Display the vocabulary size reported by the tokenizer; the config cell reuses it as vocab_size.
encoder.n_vocab

50257

### Model Building

In [4]:
import torch
import torch.nn as nn
from torch.nn import functional as F

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Hyperparameters are written as Python source inside a string and exec'd below,
# which defines block_size, batch_size, vocab_size, n_embed, n_layer, compile and
# device_comp as notebook-level globals. The raw text stays available as `config`
# (GPTModel stores it on the instance).
# NOTE(review): once this cell runs, the global `compile` shadows the builtin of
# the same name.
config = """
block_size = 1024
batch_size = 12
vocab_size = encoder.n_vocab
n_embed = 768
n_layer = 12
compile=False
device_comp='cpu'
"""
exec(config)
# Turn the device name from the config into a torch.device used by the cells below.
device= torch.device(device_comp)
##

In [6]:
def create_splits(mode):
    """Draw one random batch of input/target token windows.

    Args:
        mode: 'train' selects ``encoded_data_train``; any other value selects
            ``encoded_data_test``.

    Returns:
        Tuple ``(x, y)`` of int64 tensors shaped ``(batch_size, block_size)``,
        where each row of ``y`` is the matching row of ``x`` shifted one token
        to the right in the source array. Both are moved to the global
        ``device``.
    """
    if mode == 'train':
        tokens = encoded_data_train
    else:
        tokens = encoded_data_test

    # One start offset per batch row; the upper bound leaves room for the
    # one-token shift used by the targets.
    starts = torch.randint(len(tokens) - block_size, (batch_size,))

    def _window(begin):
        # Slice block_size consecutive ids starting at `begin` and wrap them as a tensor.
        return torch.from_numpy(tokens[begin:begin + block_size].astype(np.int64))

    input_rows, target_rows = [], []
    for start in starts:
        input_rows.append(_window(start))
        target_rows.append(_window(start + 1))
    x = torch.stack(input_rows)
    y = torch.stack(target_rows)

    if device:
        x = x.to(device, non_blocking=True)
        y = y.to(device, non_blocking=True)
    return x, y

In [7]:
# Draw one training batch; later cells reuse x and y as sample model input.
x, y = create_splits(mode='train')

In [8]:
def gelu_func(x):
    """Apply the tanh-based GELU approximation element-wise.

    Computes ``0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))``.

    Args:
        x: Input tensor.

    Returns:
        Tensor with the same shape as ``x``.
    """
    scale = math.sqrt(2.0 / math.pi)
    cubic_term = 0.044715 * torch.pow(x, 3.0)
    inner = scale * (x + cubic_term)
    return 0.5 * x * (1.0 + torch.tanh(inner))

In [9]:
@torch.no_grad()
def evaluate_loss(model, eval_iter):
    model.eval()
    out = {}
    losses = torch.zeros((eval_iter))
    for split in ['train', 'test']:
        for i in range(eval_iter):
            x , y = create_splits(split)
            logits, losses[i] = model(x, y)
        out[split] = losses.mean().item()
    model.train()
    return out

In [10]:
class GPTModel(nn.Module):
    """Minimal language model: token + position embeddings feeding a linear head.

    Sizes come from the notebook-level globals ``vocab_size``, ``n_embed``,
    ``block_size`` and ``n_layer``; the raw ``config`` string is kept on the
    instance as ``self.config``.
    """

    def __init__(self):
        super().__init__()
        self.config = config
        self.token_embeddings = nn.Embedding(vocab_size, n_embed)
        self.position_embeddings = nn.Embedding(block_size, n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)
        self.apply(self.__init_weights__)

        # Extra scaled init for any parameter whose name ends in "c_proj.weight".
        # Parameter names use the singular ".weight", so the suffix must match
        # that spelling. No such layer exists in this class yet, so the loop is
        # currently a no-op.
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02 / math.sqrt(2 * n_layer))

    def __init_weights__(self, module):
        """Initialise Linear and Embedding weights from N(0, 0.02); zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, target=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            idx: Integer tensor of token ids shaped ``(B, T)``; ``T`` must not
                exceed the size of the position-embedding table.
            target: Optional integer tensor shaped ``(B, T)`` of target ids.

        Returns:
            ``(logits, loss)``. Without ``target``, ``logits`` is ``(B, T, vocab)``
            and ``loss`` is ``None``. With ``target``, ``logits`` is flattened to
            ``(B*T, vocab)`` and ``loss`` is the cross-entropy against ``target``.
        """
        _, T = idx.shape
        token_embeddings = self.token_embeddings(idx)
        # Build positions on the input's own device so the lookup works wherever
        # the batch lives, independent of the notebook-level `device` global.
        positions = torch.arange(T, device=idx.device)
        positional_embeddings = self.position_embeddings(positions)
        x = token_embeddings + positional_embeddings
        logits = self.lm_head(x)

        if target is None:
            loss = None
        else:
            batch, block, channel = logits.shape
            logits = logits.view(batch * block, channel)
            target = target.view(batch * block)
            loss = F.cross_entropy(logits, target)

        return logits, loss

    @torch.no_grad()
    def generate_captions(self, idx, max_tokens):
        """Sample ``max_tokens`` new tokens autoregressively and append them to ``idx``.

        Args:
            idx: Integer tensor shaped ``(B, T)`` holding the prompt token ids.
            max_tokens: Number of tokens to sample.

        Returns:
            Integer tensor shaped ``(B, T + max_tokens)``.
        """
        # The position table bounds the usable context; crop to it so generation
        # can run past that length without indexing outside the table.
        context_len = self.position_embeddings.num_embeddings
        for _ in range(max_tokens):
            idx_cond = idx[:, -context_len:]
            logits, _ = self(idx_cond)
            # Only the distribution at the last position is needed for sampling.
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat([idx, idx_next], dim=1)
        return idx

In [11]:
# Build the model, optionally wrap it with torch.compile (per the `compile`
# config flag), and move it to the configured device.
model = GPTModel()
model = torch.compile(model).to(device) if compile else model.to(device)

In [12]:
# Display the module tree to confirm the layer sizes.
model

GPTModel(
  (token_embeddings): Embedding(50257, 768)
  (position_embeddings): Embedding(1024, 768)
  (lm_head): Linear(in_features=768, out_features=50257, bias=True)
)

In [13]:
# Sanity-check the sample batch: both should be (batch_size, block_size).
x.shape, y.shape

(torch.Size([12, 1024]), torch.Size([12, 1024]))

In [14]:
# Smoke-test one forward pass on the sample batch; returns (flattened logits, loss).
# For an untrained model the loss should sit near ln(vocab_size), i.e. about 10.82 here.
model(x, y)

(tensor([[-0.0121, -0.0286,  0.0050,  ...,  0.0100,  0.0203, -0.0065],
         [ 0.0049,  0.0181,  0.0074,  ..., -0.0284, -0.0082, -0.0199],
         [-0.0243, -0.0387, -0.0109,  ..., -0.0135, -0.0009,  0.0074],
         ...,
         [ 0.0340, -0.0049,  0.0026,  ..., -0.0263, -0.0060,  0.0001],
         [ 0.0213,  0.0006, -0.0098,  ...,  0.0283,  0.0117,  0.0069],
         [-0.0410,  0.0174,  0.0031,  ...,  0.0032,  0.0050, -0.0002]],
        grad_fn=<ViewBackward0>),
 tensor(10.8239, grad_fn=<NllLossBackward0>))

In [15]:
# Sample 100 tokens from the untrained model, starting from a single token id 0,
# and decode the resulting ids back to text.
print(encoder.decode(model.generate_captions(torch.zeros((1, 1), dtype = torch.long).to(device), 100)[0].tolist()))

!APS excitement Imam facingMORE realise minimized Tallculosiseraldrenched Exercise rehabilitationRF disaster wiped rebate stumbling Wolfgangapertechnicalundle conserve 146 kebased indicate Bath Heist lyric Penguinousing dise atrocities AoE AW tweeting behaviorsthodoxón Anglealiation god poets directorriched vantage massageuppasonicSprJim counterfePrOverall better Equip universities reused Olson lid corroborlawsMemoryactiv� Royal confrontshowomatic deciding flagged SHBerryDeployphebyeuntil tunnels Villa 326 Turkish═privClear metro LenPr anchors HillsDomin Worldwide feature pocket power torpedoMH BerserkerGovern 133


In [17]:
# Train with AdamW, printing the estimated train/test loss every `eval_iter` steps.
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-3)
batch_size = 32     # overrides the config value; create_splits reads this global
eval_iter = 1       # evaluate every `eval_iter` optimisation steps
max_steps = 2       # total number of optimisation steps
eval_batches = 100  # batches per split averaged by evaluate_loss

# `step` (not `iter`) keeps the builtin iter() usable in later cells.
for step in range(max_steps):

    if step % eval_iter == 0:
        output_loss = evaluate_loss(model, eval_iter = eval_batches)
        print("Current Step: {}, Train Loss: {}, Test Loss: {}".format(step, round(output_loss['train'], 4), round(output_loss['test'], 4)))
    x, y = create_splits(mode='train')
    logits, loss = model(x, y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print("Final loss: {}".format(loss.item()))

In [None]:
# Sample 100 tokens again, starting from token id 0, to inspect the model's
# output after the training cell above.
print(encoder.decode(model.generate_captions(torch.zeros((1, 1), dtype = torch.long).to(device), 100)[0].tolist()))

In [155]:
# Print every parameter tensor of the model, one after another.
for param in model.parameters():
    print(param)

Parameter containing:
tensor([[ 0.0149,  0.0138,  0.0283,  ...,  0.0021,  0.0153, -0.0214],
        [-0.0110,  0.0043,  0.0062,  ..., -0.0041,  0.0424, -0.0077],
        [ 0.0268,  0.0149, -0.0405,  ..., -0.0081,  0.0342, -0.0161],
        ...,
        [ 0.0350,  0.0326, -0.0211,  ..., -0.0017,  0.0199,  0.0086],
        [ 0.0214, -0.0232,  0.0185,  ..., -0.0012, -0.0084,  0.0057],
        [ 0.0279, -0.0209,  0.0141,  ...,  0.0056, -0.0067, -0.0086]],
       device='mps:0', requires_grad=True)
Parameter containing:
tensor([[-0.0040, -0.0027,  0.0085,  ..., -0.0127, -0.0327, -0.0062],
        [ 0.0231,  0.0061,  0.0100,  ..., -0.0001,  0.0069,  0.0100],
        [-0.0130,  0.0412, -0.0079,  ...,  0.0219,  0.0110,  0.0436],
        ...,
        [-0.0210,  0.0268,  0.0271,  ..., -0.0177, -0.0248,  0.0073],
        [ 0.0200,  0.0402,  0.0185,  ...,  0.0042,  0.0013,  0.0088],
        [-0.0194, -0.0048, -0.0015,  ..., -0.0108,  0.0152, -0.0169]],
       device='mps:0', requires_grad=True)


In [85]:
# Check whether this PyTorch build includes the MPS backend.
torch.backends.mps.is_built()

True

In [44]:
# NOTE(review): scratch cell that raises IndexError — the flattened batch has
# 12*1024 = 12288 elements, so index 26301 is out of range. The hard-coded 12
# also presumably assumes batch_size == 12; confirm before keeping this cell.
y.view(12*1024)[26301]

IndexError: index 26301 is out of bounds for dimension 0 with size 12288

In [58]:
# Scratch check: read one target id from the flattened batch.
# NOTE(review): the hard-coded 12*1024 only matches a (12, 1024) batch.
y.view(12*1024)[2085]

tensor(499)

In [100]:
# Scratch check of the current input batch shape.
x.shape

torch.Size([12, 1024])