In [None]:
# Install Hugging Face transformers into the environment of the running kernel.
# `%pip` targets the kernel's interpreter, whereas `! pip` runs whichever pip is first on the shell PATH.
%pip install transformers



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoConfig
import math

In [None]:
# Name of the pretrained checkpoint whose tokenizer and configuration are loaded
model_chkpt = 'bert-base-uncased'
# Tokenizer that belongs to that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_chkpt)
# Model configuration; later cells read sizes such as vocab_size and hidden_size from it
config = AutoConfig.from_pretrained(model_chkpt)

In [None]:
# Example sentence that is tokenized and fed through the layers built in this notebook
text = 'My life is wonderful because I get to train Deep Learning models every day.'

In [None]:
# Encode the sentence as a PyTorch tensor of token ids, without adding special tokens
tokens = tokenizer(text, return_tensors='pt', add_special_tokens=False)['input_ids']
# Display the resulting ids
tokens

tensor([[2026, 2166, 2003, 6919, 2138, 1045, 2131, 2000, 3345, 2784, 4083, 4275,
         2296, 2154, 1012]])

In [None]:
# Report the vocabulary size taken from the configuration
print(f'O número máximo de tokens é: {config.vocab_size}')
# Report the hidden size, used below as the embedding dimension D
print(f'A dimensão do embedding é: D={config.hidden_size}')

O número máximo de tokens é: 30522
A dimensão do embedding é: D=768


In [None]:
# Untrained lookup table: one hidden_size-dimensional vector per vocabulary entry
embedding_layer = nn.Embedding(config.vocab_size, config.hidden_size)
# Map every token id to its embedding vector
embeddings = embedding_layer(tokens)
# Compare the shape of the ids with the shape of their embeddings
print(f'O formato dos tokens de entrada é: {tokens.shape}')
print(f'O formato dos embeddings é: {embeddings.shape}')

O formato dos tokens de entrada é: torch.Size([1, 15])
O formato dos embeddings é: torch.Size([1, 15, 768])


In [None]:
def scale_dot_product_attention(query, key, value):
    """Compute scaled dot-product attention.

    Args:
        query: Tensor whose last dimension matches the last dimension of ``key``.
        key: Tensor of keys; the size of its last dimension sets the scaling.
        value: Tensor of values that are mixed using the attention weights.

    Returns:
        ``softmax(query @ key^T / sqrt(d_k)) @ value`` where ``d_k`` is
        ``key.size(-1)`` and the softmax is taken over the last dimension.
    """
    # scaling factor: square root of the number of columns of K
    scale = math.sqrt(key.shape[-1])
    # raw energies between every query and every key
    energies = query @ key.transpose(-1, -2)
    # alignment weights: softmax over the key axis of the scaled energies
    weights = (energies / scale).softmax(dim=-1)
    # weighted combination of the rows of V
    return weights @ value

class AttentionHead(nn.Module):
    """Single self-attention head with its own query, key and value projections.

    Args:
        embed_dim: Size of the last dimension of the incoming hidden state.
        head_dim: Size of the last dimension produced by each projection.
    """

    def __init__(self, embed_dim, head_dim):
        super().__init__()
        # Learned projections, created in the order W^(Q), W^(K), W^(V)
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)

    def forward(self, hidden_state):
        """Project ``hidden_state`` into Q, K and V and attend over them."""
        queries = self.q(hidden_state)
        keys = self.k(hidden_state)
        values = self.v(hidden_state)
        return scale_dot_product_attention(queries, keys, values)

class MultiHeadAttention(nn.Module):
    """Multi-head self-attention: independent heads, concatenated, then projected.

    Args:
        config: Object exposing ``hidden_size`` (embedding dimension) and
            ``num_attention_heads`` (number of heads).

    Raises:
        ValueError: If ``num_attention_heads`` is not positive or does not
            divide ``hidden_size`` evenly.
    """

    def __init__(self, config):
        super().__init__()
        # embedding dimension
        embed_dim = config.hidden_size
        # number of heads
        num_heads = config.num_attention_heads
        # Fail early: with a remainder, the concatenated heads would be narrower
        # than embed_dim and output_linear would only fail later, inside forward,
        # with an opaque shape error.
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f'hidden_size ({embed_dim}) must be a positive multiple of '
                f'num_attention_heads ({num_heads})'
            )
        # dimension of each head
        head_dim = embed_dim // num_heads
        self.heads = nn.ModuleList(
            [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
        )
        # final projection applied to the concatenated head outputs
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, hidden_state):
        """Run every head on ``hidden_state``, concatenate on the last axis, project."""
        x = torch.cat([h(hidden_state) for h in self.heads], dim=-1)
        x = self.output_linear(x)
        return x

In [None]:
# Build a multi-head attention layer from the checkpoint configuration
multihead_attn = MultiHeadAttention(config)
# Apply it to the token embeddings
attn_outputs = multihead_attn(embeddings)
# Show the shape of the attention output
print(f'O formato da saída da camada MHA é: {attn_outputs.shape}')

O formato da saída da camada MHA é: torch.Size([1, 15, 768])


In [None]:
class FeedForward(nn.Module):
    """Feed-forward block: Linear -> GELU -> Linear -> Dropout.

    Args:
        config: Object exposing ``hidden_size``, ``intermediate_size`` and
            ``hidden_dropout_prob``.
    """

    def __init__(self, config):
        super().__init__()
        hidden, inner = config.hidden_size, config.intermediate_size
        # layer 1: hidden_size -> intermediate_size
        self.linear_1 = nn.Linear(hidden, inner)
        # layer 2: intermediate_size -> hidden_size
        self.linear_2 = nn.Linear(inner, hidden)
        # activation function
        self.gelu = nn.GELU()
        # dropout applied to the block output
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, x):
        """Apply the two linear layers with GELU in between, then dropout."""
        activated = self.gelu(self.linear_1(x))
        return self.dropout(self.linear_2(activated))

In [None]:
# Build the feed-forward block from the checkpoint configuration
feed_forward = FeedForward(config)
# Apply it to the multi-head attention output
ff_outputs = feed_forward(attn_outputs)
# Show the shape of the feed-forward output
print(f'O formato da saída da camada FF é: {ff_outputs.shape}')

O formato da saída da camada FF é: torch.Size([1, 15, 768])


In [None]:
class LLMEncoderLayer(nn.Module):
    """Transformer encoder layer with pre-layer-normalisation.

    Each sub-layer (multi-head attention, then feed-forward) receives a
    layer-normalised copy of its input, and its output is added back to the
    un-normalised input, so the residual path is an identity mapping.

    Args:
        config: Object exposing ``hidden_size`` plus whatever
            ``MultiHeadAttention`` and ``FeedForward`` read from it.
    """

    def __init__(self, config):
        super().__init__()
        # layer norm 1 (applied before attention)
        self.layer_norm_1 = nn.LayerNorm(config.hidden_size)
        # layer norm 2 (applied before the feed-forward block)
        self.layer_norm_2 = nn.LayerNorm(config.hidden_size)
        # MHA
        self.attention = MultiHeadAttention(config)
        # feed-forward network
        self.feed_forward = FeedForward(config)

    def forward(self, hidden_state):
        """Return ``x + FF(LN2(x))`` where ``x = hidden_state + MHA(LN1(hidden_state))``."""
        # normalise only the branch that feeds the attention sub-layer
        attn_input = self.layer_norm_1(hidden_state)
        # skip connection: add the attention output to the layer's own input,
        # not to the normalised copy
        x = hidden_state + self.attention(attn_input)
        # normalise only the branch that feeds the feed-forward sub-layer
        ff_input = self.layer_norm_2(x)
        # skip connection around the feed-forward sub-layer
        x = x + self.feed_forward(ff_input)
        return x

In [None]:
# Build a single encoder layer from the checkpoint configuration
encoder_layer = LLMEncoderLayer(config)
# Run it on the token embeddings
encoder_outputs = encoder_layer(embeddings)
# Show the shape of the layer output
print(f'O formato da saída do encoder é: {encoder_outputs.shape}')

O formato da saída do encoder é: torch.Size([1, 15, 768])


In [None]:
class Embeddings(nn.Module):
    """Token plus learned position embeddings, followed by LayerNorm and dropout.

    Args:
        config: Object exposing ``vocab_size``, ``hidden_size``,
            ``max_position_embeddings`` and ``hidden_dropout_prob``.
    """

    def __init__(self, config):
        super().__init__()
        # token embedding table
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size)
        # position embedding table (one vector per position index)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        # normalisation
        self.layer_norm = nn.LayerNorm(config.hidden_size)
        # dropout layer
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, tokens):
        """Embed ``tokens`` (ids along the last axis) and add position information.

        Positions are ``0 .. tokens.size(-1) - 1``; the position embeddings
        get a leading axis of size 1 and are broadcast over the tokens.
        """
        # look up the token embeddings
        token_embeddings = self.word_embeddings(tokens)
        # position ids are created on the same device as the tokens so the
        # lookup also works when the module and its inputs are not on the CPU
        seq_len = tokens.size(-1)
        position_ids = torch.arange(
            seq_len, dtype=torch.long, device=tokens.device
        ).unsqueeze(0)
        position_embeddings = self.position_embeddings(position_ids)
        # add token and position embeddings
        embeddings = token_embeddings + position_embeddings
        # layer normalisation
        embeddings = self.layer_norm(embeddings)
        # dropout
        embeddings = self.dropout(embeddings)
        return embeddings

In [None]:
# Build the full embedding module (token + position embeddings, LayerNorm, dropout).
# Note: this rebinds `embedding_layer` and `embeddings`, which earlier held the plain
# nn.Embedding table and its output.
embedding_layer = Embeddings(config)
# Embed the token ids
embeddings = embedding_layer(tokens)
# Show the shape of the embeddings
print(f'O formato dos embeddings é: {embeddings.shape}')

O formato dos embeddings é: torch.Size([1, 15, 768])


In [None]:
class LLMEncoder(nn.Module):
    """Encoder: an embedding module followed by a stack of encoder layers.

    Args:
        config: Object exposing ``num_hidden_layers`` plus whatever
            ``Embeddings`` and ``LLMEncoderLayer`` read from it.
    """

    def __init__(self, config):
        super().__init__()
        # embedding module
        self.embeddings = Embeddings(config)
        # stack of encoder layers, one per configured hidden layer
        stack = nn.ModuleList()
        for _ in range(config.num_hidden_layers):
            stack.append(LLMEncoderLayer(config))
        self.layers = stack

    def forward(self, tokens):
        """Embed ``tokens`` and pass the result through every layer in order."""
        hidden_state = self.embeddings(tokens)
        for block in self.layers:
            hidden_state = block(hidden_state)
        return hidden_state

In [None]:
# Build the full encoder (embeddings + config.num_hidden_layers encoder layers)
encoder = LLMEncoder(config)
# Run it directly on the token ids
encoder_outputs = encoder(tokens)
# Show the shape of the encoder output
print(f'O formato da saída do encoder é: {encoder_outputs.shape}')

O formato da saída do encoder é: torch.Size([1, 15, 768])


In [None]:
def scale_dot_product_attention(query, key, value, mask=None):
    """Compute scaled dot-product attention with an optional mask.

    Args:
        query: Tensor whose last dimension matches the last dimension of ``key``.
        key: Tensor of keys; the size of its last dimension sets the scaling.
        value: Tensor of values that are mixed using the attention weights.
        mask: Optional tensor broadcastable to the score matrix. Positions
            where ``mask == 0`` are excluded from attention; ``None`` (the
            default) disables masking.

    Returns:
        ``softmax(scores) @ value`` where ``scores`` is
        ``query @ key^T / sqrt(key.size(-1))`` with masked positions suppressed.
    """
    # number of columns of K
    M_k = key.size(-1)
    # energies
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(M_k)
    # masking
    if mask is not None:
        # Fill with the most negative finite value of the score dtype instead of a
        # hard-coded -1e9: that constant does not fit in float16 and can be larger
        # than genuine scores, which would let a masked position win the softmax.
        # A finite fill (rather than -inf) keeps fully masked rows free of NaNs.
        scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
    # alignment weights
    attention_weights = F.softmax(scores, dim=-1)
    # weighted combination of the rows of V
    return torch.matmul(attention_weights, value)

In [None]:
# Random 15x15 matrix standing in for attention scores
# (no seed is set in the visible cells, so the values differ between runs)
scores = torch.randn(15, 15)
# Display the unmasked scores
scores

tensor([[-1.8417,  0.8060,  1.2483, -0.6898,  0.0068,  1.3136,  0.1675,  0.9666,
         -0.2039,  0.0974,  1.2938, -0.1983, -0.0227, -1.2601,  1.0182],
        [-0.5542, -1.7567,  0.5921, -1.0961, -0.1355,  0.2571, -0.5533, -1.4093,
          0.4308,  0.4744, -0.5886, -0.6547, -0.5521, -0.0437, -0.6133],
        [ 0.0748,  0.3582,  0.3468, -1.5498,  0.5766,  0.1037, -0.7184, -0.4379,
          0.9649, -1.2862, -0.5807,  1.3402, -1.5921,  0.6391, -0.2642],
        [-1.8382,  0.3454,  1.2742,  0.1523,  0.5679,  1.5151, -0.6248, -1.3665,
         -1.4625, -0.2983,  0.4415, -2.2831,  0.6963, -1.7661,  0.3628],
        [-1.0595,  0.9681, -1.5611,  0.4924,  0.8435,  1.5706, -1.6356, -0.3670,
          0.8104,  0.9904,  0.2503,  1.0078,  0.4570, -0.5926, -1.6973],
        [-0.2042, -0.0841,  0.0833,  1.0285, -0.4692, -1.6611,  0.5532,  0.8822,
         -0.5486,  1.3558,  0.4309,  1.5466,  1.1486,  1.2052, -1.5796],
        [-0.4567, -0.1212,  1.3571, -1.6048,  0.1859,  1.4100, -0.1368, -0.4

In [None]:
# Lower-triangular matrix of ones: entry (i, j) is 1 when j <= i and 0 otherwise
mask = torch.tril(torch.ones(15, 15))
# Replace every score whose mask entry is 0 (j > i) with -inf
scores = scores.masked_fill(mask == 0, -float('inf'))
# Display the masked scores
scores

tensor([[-1.8417,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf],
        [-0.5542, -1.7567,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf],
        [ 0.0748,  0.3582,  0.3468,    -inf,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf],
        [-1.8382,  0.3454,  1.2742,  0.1523,    -inf,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf],
        [-1.0595,  0.9681, -1.5611,  0.4924,  0.8435,    -inf,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf],
        [-0.2042, -0.0841,  0.0833,  1.0285, -0.4692, -1.6611,    -inf,    -inf,
            -inf,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf],
        [-0.4567, -0.1212,  1.3571, -1.6048,  0.1859,  1.4100, -0.1368,    -