In [5]:
import numpy as np
input_seq = [
  [0.2, 0.3, 0.5, 0.1, 0.4],
  [0.8, 0.6, 0.4, 0.9, 0.1],
  [0.1, 0.2, 0.7, 0.6, 0.8],
  [0.5, 0.6, 0.1, 0.3, 0.9],
  [0.3, 0.9, 0.2, 0.8, 0.1]
]

W_q = np.random.rand(5, 4)
W_k = np.random.rand(5, 4)
W_v = np.random.rand(5, 4)

queries = []
keys = []
values = []

for i in range(len(input_seq)):
  embedding = np.array(input_seq[i])
  query = np.dot(embedding, W_q)
  key = np.dot(embedding, W_k)
  value = np.dot(embedding, W_v)
  queries.append(query)
  keys.append(key)
  values.append(value)
  
print("Query vectors:")
print(queries)
print("Key vectors:")
print(keys)
print("Value vectors:")
print(value)





Query vectors:
[array([0.80965449, 0.86739571, 0.44637814, 0.61065135]), array([1.35205301, 1.94806632, 0.71269348, 1.285359  ]), array([1.44736219, 1.49829822, 0.55505602, 1.13644276]), array([1.56779446, 1.29529153, 0.80624619, 1.35735643]), array([1.33840339, 1.38194355, 0.76648119, 0.88539118])]
Key vectors:
[array([0.58830372, 0.51757278, 0.78007383, 0.67727276]), array([1.33032519, 0.81810004, 1.19974553, 1.77593388]), array([0.72963896, 0.71595254, 1.21622327, 1.09851723]), array([1.28852087, 0.80910579, 1.42485675, 1.20767137]), array([1.11655427, 0.5140023 , 0.89341356, 1.55382853])]
Value vectors:
[1.02228746 1.80730182 1.1828979  1.04265604]


In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Define the Transformer model
class TransformerModel(nn.Module):
    def __init__(self, num_tokens, embedding_dim, num_heads, hidden_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(num_tokens, embedding_dim)
        self.encoder = nn.TransformerEncoder(nn.TransformerEncoderLayer(embedding_dim, num_heads, hidden_dim), num_layers=num_layers)
        self.decoder = nn.TransformerDecoder(nn.TransformerDecoderLayer(embedding_dim, num_heads, hidden_dim), num_layers=num_layers)
        self.fc = nn.Linear(embedding_dim, num_tokens)

    def forward(self, input_seq, output_seq):
        # Embed the input and output sequences
        input_embeddings = self.embedding(input_seq)
        output_embeddings = self.embedding(output_seq)
        
        # Encode the input sequence
        encoded = self.encoder(input_embeddings)
        
        # Decode the output sequence
        decoded = self.decoder(output_embeddings, encoded)
        
        # Compute the probabilities of the next token for each position in the sequence
        logits = self.fc(decoded)
        probs = F.softmax(logits, dim=-1)
        
        return probs

# Instantiate the model
model = TransformerModel(num_tokens=100, embedding_dim=128, num_heads=8, hidden_dim=512, num_layers=2)

# Define the loss function and optimizer
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Define the input and output sequences
input_seq = torch.tensor([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
output_seq = torch.tensor([[2, 3, 4, 5, 6], [7, 8, 9, 10, 11]])

# Define the mask for the MLM objective
mask = torch.tensor([[0, 1, 1, 0, 0], [1, 0, 0, 1, 0]])

# Train the model
for epoch in range(10):
    optimizer.zero_grad()

    # Compute the probabilities of the next token for each position in the sequence
    probs = model(input_seq, output_seq)

    # Compute the negative log-likelihood of the correct tokens
    loss = -torch.log(probs[mask.bool()]).mean()

    # Backpropagate the gradients and update the weights
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


Epoch 1, Loss: 4.7858
Epoch 2, Loss: 4.6965
Epoch 3, Loss: 4.6685
Epoch 4, Loss: 4.6554
Epoch 5, Loss: 4.6383
Epoch 6, Loss: 4.6327
Epoch 7, Loss: 4.6342
Epoch 8, Loss: 4.6281
Epoch 9, Loss: 4.6238
Epoch 10, Loss: 4.6237
