
Corpus from https://www.corpusdoportugues.org/web-dial/

In [1]:
import re 
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F 
torch.manual_seed(7)

device = 'cuda'


In [2]:
class CharLevelTokenizer():
    def __init__(self,):
        self.OOV = -1

    def fit_transform(self, corpus):
        # remove special chars
        corpus = re.sub(r'[^a-zA-Z\s\sàáâãäèéêëìíîïòóôõöùúûüçß]' , '', corpus)
        vocab = sorted(set(corpus))
        self.vocab = vocab
        self.vocab_size = len(vocab)
        # mapping 
        self.char_to_int = {c:i  for i,c in enumerate(vocab)}
        self.int_to_char = {i:c  for i,c in enumerate(vocab)}
        return corpus
    
    def encode(self, txt): # dont threat OOV 
        return torch.tensor([self.char_to_int[c] for c in txt], dtype=torch.long)
    
    def decode(self, token_seq):
        token_seq = token_seq.to('cpu').numpy()
        return ''.join([self.int_to_char[i] for i in token_seq])


In [3]:
corpus = open('text.txt','r').readlines()
print(f'There are {len(corpus)} lines of PT raw text')
corpus = ' '.join(corpus)
print(f'There are {len(corpus)} chars')
tokenizer = CharLevelTokenizer()
corpus = tokenizer.fit_transform(corpus)
print(f'There are {tokenizer.vocab_size} chars (without special chars)-> {tokenizer.vocab}')


# 80% train
train_corpus = tokenizer.encode(corpus[:int(len(corpus)*0.8)])
val_corpus = tokenizer.encode(corpus[int(len(corpus)*0.8):])
print(f'Train corpus:      {len(train_corpus)} tokens')
print(f'Validation corpus: {len(val_corpus)} tokens')
torch.manual_seed(7)


There are 10864 lines of PT raw text
There are 57077729 chars
There are 78 chars (without special chars)-> ['\n', ' ', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ò', 'ó', 'ô', 'õ', 'ö', 'ù', 'ú', 'û', 'ü']
Train corpus:      43551404 tokens
Validation corpus: 10887851 tokens


<torch._C.Generator at 0x7fc727fe22b0>

In [4]:
def get_batch(batch_size, block_size, from_train: bool):
    data = train_corpus if from_train else val_corpus
    # get random start batches indexes (one per batch)
    idx_start = torch.randint(len(train_corpus)-block_size, (batch_size,))
    # get each batch and stack them
    x = torch.stack([data[i:i+block_size] for i in idx_start])
    y = torch.stack([data[i+1:i+block_size+1] for i in idx_start])
    return x.to(device), y.to(device)

Dropout faz com que, aleatoriamente, alguns dos neuronios nao propaguem sua informacao
dessa forma, no momento de treinamento ẽ como se fizesemos umaamostragemd as subredes possiveis com a arquitetura completa
no momento de inferencia (desligando o dropout), é coo se tivessemos um emsenble de subredes

In [5]:
class AttentionHead(nn.Module):
    def __init__(self, num_features, num_steps, head_size, dropout=0.2):
        super().__init__()
        self.query = nn.Linear(num_features, head_size, bias=False)
        self.key = nn.Linear(num_features, head_size, bias=False)
        self.value = nn.Linear(num_features, head_size, bias=False)
        self.dropout = nn.Dropout(dropout)
        self.head_size = head_size
        # tensor that aren't parameters
        self.register_buffer('tril_mask', torch.tril(torch.ones((num_steps, num_steps))))
    
    def forward(self, x):
        B, T, C = x.shape
        q = self.query(x) # (B, T, H) - H=head_size
        k = self.key(x) # (B, T, H)
        v = self.value(x) # (B, T, H)
        # affinities inter tokens (and scale)
        wei = q @ k.transpose(-2, -1) * (self.head_size**-0.5) # (B, T, T)
        # mask future tokens
        wei = wei.masked_fill(self.tril_mask[:T, :T] == 0, float('-inf'))
        # normalize each row(each token interactions in last dim)
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei) # dropout some of the affinities
        y = wei @ v # apply affinities (B, T, H) weighted agreggation
        return y
X = torch.randn(4, 7, 2)
B, T, C = X.shape
att = AttentionHead(C, T,64)

In [6]:
class MultiHeadAttention(nn.Module):
    def __init__(self, n_embed, num_steps, head_size, num_heads, dropout=0.2):
        super().__init__()
        self.heads = nn.ModuleList([AttentionHead(n_embed, num_steps, head_size) for _ in range(num_heads)])
        self.linear = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = torch.concat([h(x) for h in self.heads], dim=-1)   
        x = self.linear(x)# projection to back from residual path
        x = self.dropout(x)
        return  x    
#
att = MultiHeadAttention(n_embed=64, num_steps=7, head_size= 64//2, num_heads=2)
att(torch.randn(4, 7, 64)).shape

torch.Size([4, 7, 64])

### --> Dropout pode ser adicionado na volta das conexoes residuais (n)

In [7]:
class FeedForward(nn.Module):
    def __init__(self, n_embed, dropout=0.2):
        super().__init__()
        feed_forward_dim = n_embed * 4 # can be a hyperparameter (its a projection)
        self.net = nn.Sequential(
            nn.Linear(n_embed, feed_forward_dim),
            nn.ReLU(),
            nn.Linear(feed_forward_dim, n_embed),# projection to back to residual path
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)
    

class TransformerBlock(nn.Module):
    def __init__(self, n_embed, num_steps, num_heads):
        super().__init__()
        head_size = n_embed // num_heads
        self.self_attention = MultiHeadAttention(n_embed, num_steps, head_size, num_heads)
        self.feedforward = FeedForward(n_embed)
        self.layer_norm_1 = nn.LayerNorm(n_embed) # per token normalization
        self.layer_norm_2 = nn.LayerNorm(n_embed)

    # input shape = (B, T, emb_size) --> output: (B, T, emb_size)
    def forward(self, x):
        # version 1
        # x = self.self_attention(x) 
        # x = self.feedforward(x)
        # version 2
        # x = x + self.self_attention(x)  # residual connections
        # x = x + self.feedforward(x)   # residual connections
        # version 3 (different from original paper
        # now layernorm is more commont to be applied before attention)
        x = x + self.self_attention(self.layer_norm_1(x))  # residual connections
        x = x + self.feedforward(self.layer_norm_2(x))   # residual connections
        return x

att = TransformerBlock(64, 10, 2)
att(torch.randn(2, 10, 64))


tensor([[[-0.1588,  0.3787, -0.8805,  ...,  1.0510,  0.4706, -1.0371],
         [-1.0621, -0.0123, -1.9748,  ...,  1.7488,  0.3916, -0.3095],
         [-0.3462,  1.4477, -1.0345,  ...,  0.8921, -0.6130, -1.4711],
         ...,
         [-1.1539, -0.9808, -0.0541,  ..., -0.0959, -0.7640, -1.1543],
         [ 1.2088,  1.9731,  0.7214,  ...,  2.4247,  1.2140,  1.4205],
         [ 0.3872,  0.4355,  2.2424,  ...,  1.5555, -0.5274, -1.2757]],

        [[ 0.0784,  1.5401,  0.8719,  ...,  1.1819, -0.2078,  0.8841],
         [-0.5080,  0.0797,  0.7205,  ..., -0.1810,  0.8352, -0.4017],
         [ 1.1117, -0.6180, -1.6750,  ...,  0.5678, -1.8029, -0.9104],
         ...,
         [-0.4415,  1.6644,  0.3497,  ..., -1.3769, -0.0759, -0.8147],
         [ 0.6626, -0.1242, -0.2670,  ...,  0.8746, -1.2493, -0.0885],
         [ 0.1783,  0.1121, -0.9020,  ..., -0.1184, -0.2558,  1.2034]]],
       grad_fn=<AddBackward0>)

In [8]:
class AttentionModel(nn.Module):
    def __init__(self, model_params):
        super(AttentionModel, self).__init__()
        # Set model vars
        expected_vars = ['vocab_size','block_size','emb_dim','num_heads','n_layers']
        for v in expected_vars:
            assert v in model_params.keys(), f'Key "{v}" is missing on params dict'
            vars(self)[v] = model_params[v]
        #
        assert (self.emb_dim % self.num_heads == 0), 'emb_dim must be divisible by num_heads'
        # maps each token integer in a vector of emb_dim dimensions
        # learnable embeddings
        self.token_emb = nn.Embedding(self.vocab_size, self.emb_dim)
        self.pos_emb = nn.Embedding(self.block_size, self.emb_dim) # positional embeddig (for each position from 0 to blocksize-1 it will return an embedding(different vector))
        
        self.blocks = nn.Sequential(*[TransformerBlock(self.emb_dim, self.block_size, self.num_heads) for _ in range(self.n_layers)])
        self.layer_norm = nn.LayerNorm(self.emb_dim) # model final layer norm
        
        # self.mh_attention = MultiHeadAttention(self.emb_dim, self.block_size, self.head_size//self.num_heads, self.num_heads)
        self.feedforward = FeedForward(self.emb_dim)
        self.dense = nn.Linear(self.emb_dim, self.vocab_size) # final scores
    
    def forward(self, token_seq):
        token_emb = self.token_emb(token_seq) # (B, T) --> (B, T, Emb)
        pos_emb = self.pos_emb(torch.arange(self.block_size).to(device))# (B, T) --> (B, T, Emb)
        x = token_emb + pos_emb # # concat embeddings x is holds token value(identity, embeddings from value) and positional information of this token (encoded as numbers(embs..))
        # x = self.mh_attention(x) # feature extraction outputs (B, T, head_size)
        # # computation
        # # x = self.feedforward(x) # outputs (B, T, head_size)
        x = self.blocks(x)
        x = self.layer_norm(x)
        logits = self.dense(x) # outputs (B, T, vocab_size)
        # produces a score for each other token, indicating the chance of it be the next
        return logits 

    def compute_loss(self, real_y, pred_y_logits):
        # logits = isn't normalized to probabilities
        # real_y contais the token index. logits contains one score per vocab possibility
        B, T, C = pred_y_logits.shape
        loss = F.cross_entropy(pred_y_logits.view(B*T, C), real_y.view(-1))# itsnot batch_first, need spreedout feature dim
        return loss
    
    @torch.no_grad
    def generate(self, idx, next_steps): # generate for each batch
        self.eval()
        idx = idx.clone()
        # idx (B, T) is the array of token indexes of current history/context
        for i in range(next_steps):
            # print(idx[0])
            logits = self(idx[:, -self.block_size:])
            logits = logits[:, -1, :] # (B, T, vocab_size)
            probs = F.softmax(logits, dim=-1) # (B, vocab_size) normalized logits = probability of each token be the next
            # sample from probability
            idx_next = torch.multinomial(probs, num_samples=1) # get 1 token_id per batch (B, 1)
            # concatenate in each batch along the time dimension
            idx = torch.concat((idx, idx_next), dim=1)
        self.train()
        return idx
    
    @torch.no_grad()
    def estimate_loss(self):
        self.eval()
        losses = []
        for i in range(100):# 100 batches
            batch_X, batch_y = get_batch(batch_size, self.block_size, True)
            loss = self.compute_loss(batch_y, self(batch_X)).item()
            losses.append(loss)
        self.train()
        return np.mean(losses)
#
# Train Config
#
batch_size = 32
block_size = 256
model_params = {
    'block_size': block_size,
    'emb_dim': 256,
    'num_heads': 8,
    'n_layers':6, # number of stacked attention+computation blocks
    'vocab_size':tokenizer.vocab_size,
}
model = AttentionModel(model_params).to(device)
X, y = get_batch(4, block_size, from_train=True)
# print(model)
pred = model(X)
print(pred.shape)
pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Num of weights:',pytorch_total_params)

torch.Size([4, 256, 78])
Num of weights: 5365582


In [9]:
lr = 3e-4
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

In [44]:
epochs = 1000
for step in range(epochs):
    X, y = get_batch(batch_size, block_size, True)
    pred_y = model(X)
    loss = model.compute_loss(y, pred_y)
    #
    optimizer.zero_grad() # current batch zero-out the loss
    loss.backward()
    optimizer.step()
    if step % 20 == 0:
        print(f'Batch {step+1}/{epochs}-> Estimated loss: {model.estimate_loss()}')


Batch 1/1000-> Estimated loss: 2.5900693273544313
Batch 21/1000-> Estimated loss: 2.4776795148849486
Batch 41/1000-> Estimated loss: 2.4356844449043273
Batch 61/1000-> Estimated loss: 2.4150777316093444
Batch 81/1000-> Estimated loss: 2.4057177114486694
Batch 101/1000-> Estimated loss: 2.39168523311615
Batch 121/1000-> Estimated loss: 2.3812510776519775
Batch 141/1000-> Estimated loss: 2.3755004525184633
Batch 161/1000-> Estimated loss: 2.3687994956970213
Batch 181/1000-> Estimated loss: 2.365294795036316
Batch 201/1000-> Estimated loss: 2.3600674867630005
Batch 221/1000-> Estimated loss: 2.3596919703483583
Batch 241/1000-> Estimated loss: 2.354325897693634
Batch 261/1000-> Estimated loss: 2.3484859108924865
Batch 281/1000-> Estimated loss: 2.348948585987091
Batch 301/1000-> Estimated loss: 2.3458184576034546
Batch 321/1000-> Estimated loss: 2.3417710971832277
Batch 341/1000-> Estimated loss: 2.3439987897872925
Batch 361/1000-> Estimated loss: 2.3399182963371277
Batch 381/1000-> Estima

KeyboardInterrupt: 

In [45]:
@torch.no_grad
def generate(model, idx, next_steps): # generate for each batch
        model.eval()
        idx = idx.clone()
        # idx (B, T) is the array of token indexes of current history/context
        for i in range(next_steps):
            # print(idx[0])
            logits = model(idx[:, -block_size:])
            logits = logits[:, -1, :] # (B, T, vocab_size)
            probs = F.softmax(logits, dim=-1) # (B, vocab_size) normalized logits = probability of each token be the next
            # sample from probability
            idx_next = torch.multinomial(probs, num_samples=1) # get 1 token_id per batch (B, 1)
            # concatenate in each batch along the time dimension
            idx = torch.concat((idx, idx_next), dim=1)
        model.train()
        return idx

SyntaxError: invalid syntax. Perhaps you forgot a comma? (2340843670.py, line 8)

In [49]:
new_tokens = 100
prompt, _ = get_batch(1, block_size, from_train=True)
gen = generate(model, prompt, new_tokens)[0]
print('--> ', tokenizer.decode(prompt[0]))
print('--> ', tokenizer.decode(gen))

-->  ossas ilhas em busca de algo para mostrarem em outras paragens como troféus de as suas fúrias gananciosas e potenciadas por as favoráveis abanadelas de as nossas árvores autóctones  Felizmente para nós  temos uma artista cheia de propósitos a emitir contin
-->  ossas ilhas em busca de algo para mostrarem em outras paragens como troféus de as suas fúrias gananciosas e potenciadas por as favoráveis abanadelas de as nossas árvores autóctones  Felizmente para nós  temos uma artista cheia de propósitos a emitir continicrátino  am der a cando a posera Byoncidoge cinoiadende óse u eoam fom TAnhomo  m Dicistltoralho de


In [47]:
#
#
#
new_tokens = 100
prompt = 'Todas as pessoas gostam de comer coisas gostodas em restaurante'
print(f'Generation for prompt "{prompt}":')
prompt = tokenizer.encode(prompt).unsqueeze(0).to(device) # add batch dimension (1, T)
print(prompt.shape)
gen = generate(model, prompt, new_tokens)[0]

print('--> ', tokenizer.decode(gen))

Generation for prompt "Todas as pessoas gostam de comer coisas gostodas em restaurante":
torch.Size([1, 63])


RuntimeError: The size of tensor a (63) must match the size of tensor b (256) at non-singleton dimension 1