## GPT Model Training from Scratch Pytorch (Nano-GPT)

- https://github.com/google/sentencepiece

In [1]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

---
## Download Data

In [2]:
# download the tiny shakespeare dataset
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt 

---
## Preprocess
- Define vocabulary (Char-level)
- Create tokenizer

Define Vocabulary

In [3]:
# Read the whole corpus into memory. The context manager closes the file handle,
# and the explicit encoding keeps the read independent of the platform default.
with open('input.txt', 'r', encoding='utf-8') as corpus_file:
    corpus = corpus_file.read()
# Character-level vocabulary: every distinct character, sorted so the ids are stable across runs.
chars = sorted(set(corpus))
VOCAB_SIZE = len(chars)
print(f'Vocabulary({VOCAB_SIZE}):', chars)

Vocabulary(65): ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


Create Tokenizer

In [4]:
class Tokenizer():
    """Character-level tokenizer mapping every vocabulary character to an integer id.

    Ids are assigned by position in the vocabulary. There is no out-of-vocabulary
    handling: unknown characters or ids raise ``KeyError``.
    """

    def __init__(self, vocab=None):
        """Build the forward (char -> id) and reverse (id -> char) lookup tables.

        Args:
            vocab: iterable of characters. When omitted, the notebook-level
                ``chars`` list is used, so ``Tokenizer()`` keeps working as before.
        """
        if vocab is None:
            vocab = chars  # notebook-level vocabulary
        self.char_to_int_map = {c: i for i, c in enumerate(vocab)}
        self.int_to_char_map = {i: c for c, i in self.char_to_int_map.items()}  # reverse map

    def encode(self, txt):
        """Return the list of token ids, one per character of ``txt``.

        Raises:
            KeyError: if ``txt`` contains a character missing from the vocabulary.
        """
        return [self.char_to_int_map[c] for c in txt]

    def decode(self, tokens):
        """Return the string spelled by the iterable of token ids ``tokens``.

        Raises:
            KeyError: if a token id is not in the vocabulary.
        """
        # join directly; no local named `chars`, which used to shadow the notebook-level vocabulary
        return ''.join(self.int_to_char_map[t] for t in tokens)

    
    
# Build the tokenizer used by the rest of the notebook.
tokenizer = Tokenizer()
# Round-trip demo: encode an arbitrary sentence, and decode a hand-written list of token ids.
sentence = 'hello scaccia!'
token_list = [46, 43, 50, 50, 53, 1, 58, 53, 49, 43, 52, 57]
print(f'Sentence "{sentence}" tokenized: {tokenizer.encode(sentence)}')
print(f'Token list {token_list} decoded: "{tokenizer.decode(token_list)}"')

Sentence "hello scaccia!" tokenized: [46, 43, 50, 50, 53, 1, 57, 41, 39, 41, 41, 47, 39, 2]
Token list [46, 43, 50, 50, 53, 1, 58, 53, 49, 43, 52, 57] decoded: "hello tokens"


--- 
## Train-Test Split

In [5]:
# Encode the whole corpus once into a tensor of token ids.
corpus_tensor = torch.tensor(tokenizer.encode(corpus), dtype=torch.long)
print(corpus_tensor[:10])
print(f'Corpus Len: {len(corpus_tensor)} tokens')

# Contiguous split (no shuffling): the first 90% of the tokens train the model,
# the remaining tail is held out for validation.
split_point = int(len(corpus_tensor)*0.9)
train_data, validation_data = corpus_tensor[:split_point], corpus_tensor[split_point:]

print(f'Train Len: {len(train_data)} tokens')
print(f'Validation Len: {len(validation_data)} tokens')

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])
Corpus Len: 1115394 tokens
Train Len: 1003854 tokens
Validation Len: 111540 tokens


---
## Instance Sampling
- block, context, chunk, sample, etc.
-> maximum context length

- The process of creating the supervised training instances (predict the next token) samples small blocks of tokens from the original corpus. Each block is turned into several instances whose inputs range in length from 1 up to block_size.
- The motivation is to get the model used to taking inputs as short as 1 token and as long as block_size, so that at inference time it is used to sequences of varying lengths.

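1. Randomly sample from the dataset a block of tokens of size CONTEXT_LENGTH (the maximum context size).
2. Use the same block shifted one position to the right as the targets, so that every prefix of the block is paired with the token that follows it.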


In [6]:
torch.manual_seed(177)  # fix the torch RNG so the randomly sampled batches are reproducible

CONTEXT_LENGTH = 10  # number of tokens in each sampled block (passed as `context_len` to get_batch)
BATCH_SIZE = 4  # number of blocks sampled per batch

def get_batch(data, batch_size, context_len, verbose=False):
    """Sample a random batch of (input, target) windows from a token sequence.

    Every input row is a window of `context_len` consecutive items of `data`;
    the matching target row is the same window shifted one position to the right.

    Args:
        data: sequence of token ids (the notebook passes 1-D tensors).
        batch_size: number of windows to sample.
        context_len: length of each window.
        verbose: when True, print the sampled start indexes.

    Returns:
        Tuple `(batch_x, batch_y)` with one sampled window per row.
    """
    # one random start per row; the exclusive upper bound leaves room for the shifted target window
    start_ixs = torch.randint(low=0, high=len(data)-context_len, size=(batch_size,))
    if verbose: print(f'This batch start indexes: {start_ixs}')

    input_rows, target_rows = [], []
    for start in start_ixs.tolist():
        stop = start + context_len
        input_rows.append(data[start:stop])
        target_rows.append(data[start + 1:stop + 1])  # same window, one step ahead

    # stack each sampled window as one row of the batch
    return torch.vstack(input_rows), torch.vstack(target_rows)


# Toy sequence 0..99 so that every token id equals its position, which makes the windows easy to read.
X, Y = get_batch(torch.arange(100), BATCH_SIZE, CONTEXT_LENGTH)
print(X)
print(Y)
# Every row of the batch actually packs CONTEXT_LENGTH training instances, one per prefix length.
print('Example (first sample of batch):')
first_inputs, first_targets = X[0], Y[0]
for t in range(CONTEXT_LENGTH):  # walk along the time (sequence) dimension
    _x, _y = first_inputs[:t+1].numpy(), first_targets[t].numpy()
    print(f'For input {_x} the target is {_y}')

tensor([[35, 36, 37, 38, 39, 40, 41, 42, 43, 44],
        [77, 78, 79, 80, 81, 82, 83, 84, 85, 86],
        [72, 73, 74, 75, 76, 77, 78, 79, 80, 81],
        [ 8,  9, 10, 11, 12, 13, 14, 15, 16, 17]])
tensor([[36, 37, 38, 39, 40, 41, 42, 43, 44, 45],
        [78, 79, 80, 81, 82, 83, 84, 85, 86, 87],
        [73, 74, 75, 76, 77, 78, 79, 80, 81, 82],
        [ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]])
Example (first sample of batch):
For input [35] the target is 36
For input [35 36] the target is 37
For input [35 36 37] the target is 38
For input [35 36 37 38] the target is 39
For input [35 36 37 38 39] the target is 40
For input [35 36 37 38 39 40] the target is 41
For input [35 36 37 38 39 40 41] the target is 42
For input [35 36 37 38 39 40 41 42] the target is 43
For input [35 36 37 38 39 40 41 42 43] the target is 44
For input [35 36 37 38 39 40 41 42 43 44] the target is 45


---
## BigramLanguageModel
- For a given token t, the model stores the scores of every vocabulary token being the next token (t+1): one row of scores per token.
- The embedding layer acts as a simple dense vocab_size x vocab_size matrix of weights.
- The probability of the next token uses only the information of the current token, with no further context.
- Logits are the raw (unnormalized) scores of each token being the next one; softmax turns them into probabilities.



In [7]:
# Draw one small sample batch from the training split; later cells reuse it to inspect the model.
train_batch_x, train_batch_y = get_batch(train_data, BATCH_SIZE, CONTEXT_LENGTH)

In [14]:
class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token scores depend only on the current token.

    The single embedding table is used as a lookup of logits: the row of token
    `t` holds the scores of every candidate next token. For the rows to be
    usable as logits over the vocabulary, `embedding_dim` must equal `vocab_size`.
    """

    def __init__(self, vocab_size, embedding_dim, lr=1e-3):
        """Create the embedding table and the AdamW optimizer over its weights.

        Args:
            vocab_size: number of distinct tokens.
            embedding_dim: size of each embedding row (pass `vocab_size` to get logits).
            lr: learning rate for AdamW.
        """
        super().__init__()
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim)
        self.optimizer = torch.optim.AdamW(self.parameters(), lr=lr)

    def forward(self, idx):
        """Look up the logits of every token in `idx`.

        (batch_size, context_len) -> (batch_size, context_len, embedding_dim)
        """
        return self.embedding_layer(idx)

    def get_loss(self, idx, targets, verbose=False):
        """Return the mean cross-entropy between the predicted logits and `targets`.

        Args:
            idx: token ids, shape (B, T).
            targets: next-token ids aligned with `idx`, shape (B, T).
            verbose: when True, print the intermediate shapes and one example.
        """
        logits = self(idx)
        B, T, C = logits.shape  # channel dimension: embedding dim
        # flatten batch and time so each row is one prediction and each target one index
        flat_logits = logits.view(B * T, C)
        flat_targets = targets.view(-1)
        if verbose:
            print(idx.shape, '-->', logits.shape, '-->', flat_logits.shape,)
            print(targets.shape, '-->', flat_targets.shape,)
            print(f'One example: {flat_logits[0,:3]}..... : target: {flat_targets[0]}')
        return F.cross_entropy(flat_logits, flat_targets)

    @torch.no_grad()
    def generate(self, idx, n_tokens):
        """Extend every sequence in `idx` with `n_tokens` sampled tokens.

        Args:
            idx: token ids, shape (B, T).
            n_tokens: number of tokens to append.

        Returns:
            Tensor of shape (B, T + n_tokens).
        """
        for _ in range(n_tokens):
            logits = self(idx)
            # keep only the last time step: forward predicts for every position
            logits = logits[:, -1, :]  # (B, C)
            probs = torch.softmax(logits, dim=-1)
            # sample from the distribution rather than always taking the most probable token
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx

    def fit(self, data, epochs, batch_size, verbose=False, context_len=None):
        """Run `epochs` optimization steps, each on one freshly sampled random batch.

        Args:
            data: token ids to sample batches from (via the notebook's `get_batch`).
            epochs: number of optimization steps.
            batch_size: number of windows per batch.
            verbose: when True, print the batch shapes at every step.
            context_len: window length; defaults to the notebook-level CONTEXT_LENGTH.
        """
        if context_len is None:
            context_len = CONTEXT_LENGTH
        for epoch_i in range(epochs):
            batch_x, batch_y = get_batch(data, batch_size, context_len)
            if verbose: print('batch_x:', batch_x.shape, 'batch_y:', batch_y.shape)
            loss = self.get_loss(batch_x, batch_y)
            self.optimizer.zero_grad(set_to_none=True)  # already connected to the model weights
            loss.backward()
            self.optimizer.step()
            if epoch_i % 100 == 0:
                print(f'Epoch {epoch_i+1}/{epochs} loss: {loss.item():.4f}')

    def train(self, data=True, epochs=None, batch_size=None, verbose=False, mode=None):
        """Either switch the training mode (nn.Module semantics) or run the training loop.

        This name shadows `nn.Module.train(mode)`, which `nn.Module.eval()` calls
        as `self.train(False)`. A bool first argument (or the `mode` keyword) is
        therefore forwarded to `nn.Module.train`; anything else is treated as
        training data and handed to `fit`, so existing
        `model.train(data, epochs=..., batch_size=...)` calls keep working.

        Raises:
            TypeError: if data is given without `epochs` and `batch_size`.
        """
        if mode is not None:
            return super().train(mode)
        if isinstance(data, bool):
            return super().train(data)
        if epochs is None or batch_size is None:
            raise TypeError('train(data, epochs, batch_size) needs epochs and batch_size when data is given')
        return self.fit(data, epochs, batch_size, verbose=verbose)
        

# Use the GPU when one is available; otherwise fall back to the CPU so the cell runs on any machine.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# embedding_dim == VOCAB_SIZE so that each embedding row can be read as next-token logits
model = BigramLanguageModel(VOCAB_SIZE, VOCAB_SIZE)
model = model.to(device)
train_data = train_data.to(device)

model.train(train_data, batch_size=32, epochs=10_000, verbose=False)

Epoch 1/10000 loss: 4.7051
Epoch 101/10000 loss: 4.5502
Epoch 201/10000 loss: 4.4961
Epoch 301/10000 loss: 4.3012
Epoch 401/10000 loss: 4.2813
Epoch 501/10000 loss: 4.0629
Epoch 601/10000 loss: 4.0314
Epoch 701/10000 loss: 3.9738
Epoch 801/10000 loss: 3.8933
Epoch 901/10000 loss: 3.8096
Epoch 1001/10000 loss: 3.7367
Epoch 1101/10000 loss: 3.6010
Epoch 1201/10000 loss: 3.4686
Epoch 1301/10000 loss: 3.4420
Epoch 1401/10000 loss: 3.4132
Epoch 1501/10000 loss: 3.4171
Epoch 1601/10000 loss: 3.2906
Epoch 1701/10000 loss: 3.2640
Epoch 1801/10000 loss: 3.1036
Epoch 1901/10000 loss: 3.1852
Epoch 2001/10000 loss: 3.1669
Epoch 2101/10000 loss: 3.0120
Epoch 2201/10000 loss: 3.0428
Epoch 2301/10000 loss: 3.0109
Epoch 2401/10000 loss: 2.8937
Epoch 2501/10000 loss: 2.8562
Epoch 2601/10000 loss: 2.9875
Epoch 2701/10000 loss: 2.8274
Epoch 2801/10000 loss: 2.7703
Epoch 2901/10000 loss: 2.7320
Epoch 3001/10000 loss: 2.8193
Epoch 3101/10000 loss: 2.8193
Epoch 3201/10000 loss: 2.7148
Epoch 3301/10000 loss:

In [15]:
# Seed generation with a single token id 0, created on whatever device the model lives on
# (instead of assuming 'cuda'), so the cell also works on CPU-only machines.
model_device = next(model.parameters()).device
first_seed_token = torch.zeros((1, 1), dtype=torch.long, device=model_device)

generated = model.generate(first_seed_token, n_tokens=1000).tolist()[0]
print(tokenizer.decode(generated))


Had d y touls my praithits en g ind m yofr mereratold matramenk, toung, an lll t pls wighyosthive

Deache theco. benglosp, is p, JUUSO,
CE pllinowikedrThel IS:
Foupaje h
Wirit n; thorouisore ber:
TVIshaise wirulves w, ayod llen?
THeldghane hevengis,

My tsx?
'stat.
Fin thin fan's; meaplllt, flloulam utsiny hnghe hirr blin, r ne us ak, ng


T;
Cofrthal; malous he CY hand chejed isemulouanthys, fe pthenonasand; thofar isonthy HAyoswhira g th'Thankbor og t m'sswnk.
TELa bu, ml sure:
Whesherartors ug


An a he car, t Y gf 'daworof nthenthy icilleame gat.
CHomyo mug y par me:
cterore towesnere mblaghouis e,
MORI gikeacit r:
I cl.


NAnve,x'ed cr llme thoted thoous th ar silerivers hallatoiowise ag hin tt ampoug me inge lyr senelereratompr, we Yovim I andy.

R:

CUCONo.

INTouk hin PED th trole betow ll d t chotarg.
Whirale d brd r
'ed w; p je t,
Matheather he p?
II:
Gour pe ngmmeiid mbe bu ye ENG k'
Wh ICl ff r hertheroue fodan as Mate teilf! tw w s.
g d arcy thent usin, at
Ithal ter:

A:


In [None]:



# Passing vocab_size as embedding_dim gives every input token an embedding with one
# entry per vocabulary token: its score for each token being the next one.
# That is why the model output is called logits.
# NOTE(review): this rebinds `model` to a fresh, untrained instance, replacing the model trained above.
model = BigramLanguageModel(VOCAB_SIZE, VOCAB_SIZE) 
pred_y = model(train_batch_x)
# print(pred_y.shape) # for each sentence in the batch, the logits of each token (next-token prediction)
# print(train_batch_y.shape)
# model.get_loss(train_batch_x, train_batch_y)
model.generate(train_batch_x, 10)

Model Forward 

In [None]:



# Passing vocab_size as embedding_dim gives every input token an embedding with one
# entry per vocabulary token: its score for each token being the next one.
# That is why the model output is called logits.
pred_y = model(train_batch_x)
# print(pred_y.shape) # for each sentence in the batch, the logits of each token (next-token prediction)
# print(train_batch_y.shape)
# extend every row of the sample batch with 10 sampled tokens; the result is the cell output
model.generate(train_batch_x, 10)

Model Loss

In [37]:
# Cross-entropy loss on the sample batch; pass verbose=True to also print the intermediate shapes.
model.get_loss(train_batch_x, train_batch_y)

torch.Size([4, 10]) --> torch.Size([4, 10, 65]) --> torch.Size([40, 65])
torch.Size([4, 10]) --> torch.Size([40])
One example: tensor([-0.4873,  0.1745,  2.0194], grad_fn=<SliceBackward0>)..... : target: 47


tensor(4.7219, grad_fn=<NllLossBackward0>)

Generate

In [31]:
# Seed with a single token id 0, shaped (1, 1): one sequence holding one token.
first_seed_token = torch.zeros((1, 1), dtype=torch.long)

# generate returns a (1, 1 + n_tokens) tensor; take its only row as a plain list of ids
generated = model.generate(first_seed_token, n_tokens=100)[0].tolist()
print(generated)
print(tokenizer.decode(generated))


[0, 60, 12, 55, 29, 28, 22, 57, 1, 48, 22, 45, 0, 8, 41, 30, 4, 13, 45, 23, 37, 28, 48, 37, 58, 56, 45, 57, 3, 28, 48, 64, 8, 29, 39, 34, 41, 25, 9, 21, 59, 25, 42, 15, 18, 34, 4, 41, 61, 30, 22, 42, 8, 23, 5, 22, 46, 37, 34, 23, 37, 28, 48, 46, 46, 46, 63, 57, 15, 30, 2, 49, 9, 42, 0, 61, 33, 60, 51, 2, 62, 37, 58, 13, 64, 61, 6, 13, 56, 45, 0, 39, 19, 25, 25, 56, 29, 60, 64, 5, 26]

v?qQPJs jJg
.cR&AgKYPjYtrgs$Pjz.QaVcM3IuMdCFV&cwRJd.K'JhYVKYPjhhhysCR!k3d
wUvm!xYtAzw,Arg
aGMMrQvz'N


In [35]:
# Forward pass on one sequence of 8 token ids; the output shape is (batch, sequence length, embedding_dim).
model(torch.tensor([[1,1,1,1,1,1,1,3]])).shape

torch.Size([1, 8, 65])

In [10]:
# embedding_layer = nn.Embedding(VOCAB_SIZE, embedding_dim=2)
# print(embedding_layer)
# _x = train_batch_x[0:1]
# print(_x, _x.shape)
# # convert each token in a float vector (embedding vector)
# embedding_layer(_x) # each token has a associated embedding (like a lookup table)