## GPT Model Training from Scratch in PyTorch (nanoGPT)

- https://github.com/google/sentencepiece

In [77]:
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F

---
## Download Data

In [2]:
# download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt 

--2023-12-13 07:38:24--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8003::154, 2606:50c0:8001::154, 2606:50c0:8000::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8003::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1,1M) [text/plain]
Saving to: ‘input.txt.1’


2023-12-13 07:38:24 (29,4 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



---
## Preprocess
- Define vocabulary (Char-level)
- Create tokenizer

Define Vocabulary

In [123]:
# Read the whole corpus into memory. The context manager closes the file handle
# (the original left it open) and the explicit encoding keeps the decoded
# characters independent of the platform default.
with open('input.txt', 'r', encoding='utf-8') as corpus_file:
    corpus = corpus_file.read()

# Character-level vocabulary: every distinct character of the corpus, sorted so
# that the resulting token ids are the same on every run.
chars = sorted(set(corpus))
VOCAB_SIZE = len(chars)
print(f'Vocabulary({VOCAB_SIZE}):', chars)

Vocabulary(65): ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


Create Tokenizer

In [26]:
class Tokenizer():
    """Character-level tokenizer: maps each vocabulary character to an integer id.

    Ids are the positions of the characters in the vocabulary sequence, so the
    first character gets id 0, the second id 1, and so on.
    """

    def __init__(self, vocab_size, vocab=None):
        """Build the char->id and id->char lookup tables.

        Args:
            vocab_size: Kept for backward compatibility. The tables are sized
                by the vocabulary itself; this value is not used to build them.
            vocab: Optional iterable of characters defining the vocabulary.
                When omitted, the notebook-level ``chars`` list is used, which
                is what the original implementation always did.
        """
        if vocab is None:
            # Fall back to the vocabulary defined in the "Define Vocabulary" cell.
            vocab = chars
        self.char_to_int_map = {c: i for i, c in enumerate(vocab)}
        # Reverse map, used by decode().
        self.int_to_char_map = {i: c for c, i in self.char_to_int_map.items()}
        # TODO: no out-of-vocabulary token yet; unknown characters raise KeyError.

    def encode(self, txt):
        """Return the list of token ids for the characters of ``txt``.

        Raises:
            KeyError: If ``txt`` contains a character that is not in the vocabulary.
        """
        return [self.char_to_int_map[c] for c in txt]

    def decode(self, tokens):
        """Return the string spelled by the iterable of token ids ``tokens``.

        Raises:
            KeyError: If a token id is not in the vocabulary.
        """
        # Local name deliberately differs from the global ``chars`` vocabulary.
        decoded_chars = [self.int_to_char_map[t] for t in tokens]
        return ''.join(decoded_chars)  # list to str

    
    
# Round-trip demo: encode a sentence, then decode a hand-written list of ids.
tokenizer = Tokenizer(VOCAB_SIZE)

sentence = 'hello scaccia!'
encoded_sentence = tokenizer.encode(sentence)
print(f'Sentence "{sentence}" tokenized: {encoded_sentence}')

token_list = [46, 43, 50, 50, 53, 1, 58, 53, 49, 43, 52, 57]
decoded_tokens = tokenizer.decode(token_list)
print(f'Token list {token_list} decoded: "{decoded_tokens}"')

Sentence "hello scaccia!" tokenized: [46, 43, 50, 50, 53, 1, 57, 41, 39, 41, 41, 47, 39, 2]
Token list [46, 43, 50, 50, 53, 1, 58, 53, 49, 43, 52, 57] decoded: "hello tokens"


--- 
## Train-Test Split

In [35]:
# Encode the whole corpus once into a 1-D tensor of token ids.
corpus_tensor = torch.tensor(tokenizer.encode(corpus), dtype=torch.long)
print(f'Corpus Len: {len(corpus_tensor)} tokens')

# Hold out the tail of the corpus for validation; the split is positional, not shuffled.
TRAIN_FRACTION = 0.9  # 90% for train
split_point = int(len(corpus_tensor) * TRAIN_FRACTION)
train_data = corpus_tensor[:split_point]
validation_data = corpus_tensor[split_point:]

# The two slices must partition the corpus exactly.
assert len(train_data) + len(validation_data) == len(corpus_tensor)

print(f'Train Len: {len(train_data)} tokens')
print(f'Validation Len: {len(validation_data)} tokens')

Corpus Len: 1115394 tokens
Train Len: 1003854 tokens
Validation Len: 111540 tokens


---
## Instance Sampling
- block, context, chunk, sample, etc.
-> maximum context length

- O processo de criação das instâncias de treinamento supervisionado (predict next word) amostra pequenos blocos de tokens do corpus original. Cada bloco é convertido em várias instâncias, com contextos de tamanho 1 até block_size.
- A motivação é fazer com que o modelo se acostume a receber entradas tão pequenas quanto 1 token e tão grandes quanto block_size, para que no momento da inferência ele lide bem com sentenças de tamanhos variados.

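1. Amostra aleatoriamente da base um bloco de tokens de tamanho CONTEXT_LENGTH (tamanho máximo do contexto).
2. Para cada posição t do bloco, a entrada é o prefixo do bloco até t e o alvo é o token seguinte (posição t+1).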


In [82]:
# Sampling hyper-parameters: tokens per block and blocks per batch.
CONTEXT_LENGTH, BATCH_SIZE = 10, 4

# Fix the global PyTorch RNG so the sampled batches are reproducible.
torch.manual_seed(177)

def get_batch(data, batch_size, context_len, verbose=True):
    """Sample a random batch of (input, target) blocks for next-token prediction.

    Args:
        data: Tensor of token ids to sample from (1-D throughout this notebook).
        batch_size: Number of blocks (rows) in the batch.
        context_len: Number of tokens in each block.
        verbose: When True (the default, matching the original behaviour) the
            sampled start indexes are printed. Pass False inside a training
            loop to avoid flooding the cell output.

    Returns:
        Tuple ``(batch_x, batch_y)``. Each row of ``batch_x`` is a contiguous
        slice of ``data`` of length ``context_len``; the matching row of
        ``batch_y`` is the same slice shifted one position to the right, so
        ``batch_y[b, t]`` is the token that follows ``batch_x[b, :t+1]``.

    Raises:
        ValueError: If ``data`` has fewer than ``context_len + 1`` elements, so
            no block plus its shifted target fits.
    """
    if len(data) <= context_len:
        raise ValueError(
            f'data has {len(data)} tokens; at least {context_len + 1} are needed '
            f'for context_len={context_len}'
        )
    # One random start index per row. `high` is exclusive, so the largest start
    # is len(data)-context_len-1, which leaves room for the shifted target slice.
    start_ixs = torch.randint(low=0, high=len(data)-context_len, size=(batch_size,))
    if verbose:
        print(f'This batch start indexes: {start_ixs}')
    batch_x = torch.vstack([data[i:i+context_len] for i in start_ixs]) # stack each sample in a row
    batch_y = torch.vstack([data[i+1:i+context_len+1] for i in start_ixs]) # aligned: shifted by one

    return batch_x, batch_y


# Toy data 0..99 makes the sampled positions easy to read off the printed batch.
toy_data = torch.arange(0, 100)
X, Y = get_batch(toy_data, BATCH_SIZE, CONTEXT_LENGTH)
print(X)
print(Y)

# Each row of the batch really holds CONTEXT_LENGTH training instances,
# one for every prefix length of the block.
print('Example (first sample of batch):')
sample_x, sample_y = X[0], Y[0]
for prefix_len in range(1, CONTEXT_LENGTH + 1):  # walk the time (sequence) dimension
    _x = sample_x[:prefix_len].numpy()
    _y = sample_y[prefix_len - 1].numpy()
    print(f'For input {_x} the target is {_y}')

This batch start indexes: tensor([35, 77, 72,  8])
tensor([[35, 36, 37, 38, 39, 40, 41, 42, 43, 44],
        [77, 78, 79, 80, 81, 82, 83, 84, 85, 86],
        [72, 73, 74, 75, 76, 77, 78, 79, 80, 81],
        [ 8,  9, 10, 11, 12, 13, 14, 15, 16, 17]])
tensor([[36, 37, 38, 39, 40, 41, 42, 43, 44, 45],
        [78, 79, 80, 81, 82, 83, 84, 85, 86, 87],
        [73, 74, 75, 76, 77, 78, 79, 80, 81, 82],
        [ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]])
Example (first sample of batch):
For input [35] the target is 36
For input [35 36] the target is 37
For input [35 36 37] the target is 38
For input [35 36 37 38] the target is 39
For input [35 36 37 38 39] the target is 40
For input [35 36 37 38 39 40] the target is 41
For input [35 36 37 38 39 40 41] the target is 42
For input [35 36 37 38 39 40 41 42] the target is 43
For input [35 36 37 38 39 40 41 42 43] the target is 44
For input [35 36 37 38 39 40 41 42 43 44] the target is 45


---
## BigramLanguageModel

In [117]:
# Draw one random batch of (input, target) blocks from the training split.
train_batch_x, train_batch_y = get_batch(
    data=train_data,
    batch_size=BATCH_SIZE,
    context_len=CONTEXT_LENGTH,
)

This batch start indexes: tensor([958983, 873237, 212788, 708270])


In [119]:
# Tiny 2-dimensional embedding table, just to visualise what the lookup does.
embedding_layer = nn.Embedding(num_embeddings=VOCAB_SIZE, embedding_dim=2)
print(embedding_layer)

# Slice (rather than index) so the batch dimension is kept for the first sample.
_x = train_batch_x[:1]
print(_x, _x.shape)

# Every token id selects its row of the table (a lookup), which turns each
# token into a float vector: its embedding.
embedding_layer(_x)

Embedding(65, 2)
tensor([[ 1, 39, 52, 42,  1, 57, 43, 55, 59, 43]]) torch.Size([1, 10])


tensor([[[ 0.3921,  0.7385],
         [-0.0585, -0.2032],
         [ 1.1596, -0.4916],
         [ 0.4894, -1.4209],
         [ 0.3921,  0.7385],
         [ 0.5515, -1.0497],
         [ 0.4411, -1.2156],
         [ 0.8534, -1.0156],
         [-2.0627, -1.0285],
         [ 0.4411, -1.2156]]], grad_fn=<EmbeddingBackward0>)

In [125]:
class BigramLanguageModel(nn.Module):
    """Bigram language model: a single embedding lookup applied to each token.

    The output for a token is simply that token's row of the embedding table.
    """

    def __init__(self, vocab_size, embedding_dim, verbose=True):
        """Create the embedding table.

        Args:
            vocab_size: Number of rows of the table (one per token id).
            embedding_dim: Length of each row.
            verbose: When True (the default, matching the original behaviour)
                ``forward`` prints the input and output shapes on every call.
                Pass False before training to keep the output readable.
        """
        super().__init__()
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim)
        self.verbose = verbose

    def forward(self, idx, targets=None):
        """Look up the embedding row of every token id in ``idx``.

        Args:
            idx: Integer tensor of token ids, shape (batch_size, context_len).
            targets: Currently unused; optional so the model can also be
                called without targets.

        Returns:
            Tensor of shape (batch_size, context_len, embedding_dim).
        """
        logits = self.embedding_layer(idx)
        if self.verbose:
            # (batch_size, context_len) -> (batch_size, context_len, embedding_dim)
            print(idx.shape, logits.shape)
        return logits

# Using vocab_size as the embedding dimension gives every input token a vector
# with one entry per vocabulary token. Those entries are read as unnormalised
# scores for the next token, which is why they are called logits.
model = BigramLanguageModel(vocab_size=VOCAB_SIZE, embedding_dim=VOCAB_SIZE)
pred_y = model(train_batch_x, train_batch_y)

# For each sentence in the batch we get next-token logits at every position.
print(pred_y.shape)

torch.Size([4, 10]) torch.Size([4, 10, 65])
torch.Size([4, 10, 65])
