In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

def generate_random_data(batch_size, seq_len, feature_dim, num_samples):
    # Generate input data with the correct feature dimension
    data = torch.randn(num_samples, seq_len, feature_dim)  # Random features with dimension = feature_dim
    # Targets should remain 2D: [num_samples, seq_len]
    targets = torch.randint(0, feature_dim, (num_samples, seq_len))
    return TensorDataset(data, targets)

# Transformer model definition
class TransformerModel(nn.Module):
    def __init__(self, feature_dim, num_heads, num_layers):
        super(TransformerModel, self).__init__()
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=feature_dim, nhead=num_heads, dim_feedforward=feature_dim * 4, batch_first=True
        )
        self.encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)

        self.decoder_layer = nn.TransformerDecoderLayer(
            d_model=feature_dim, nhead=num_heads, dim_feedforward=feature_dim * 4,
        )
        self.decoder = nn.TransformerDecoder(self.decoder_layer, num_layers=num_layers)

        self.embedding = nn.Embedding(128, feature_dim)  # Embedding for target tokens
        self.fc_out = nn.Linear(feature_dim, 128)  # Output layer to predict tokens

    def forward(self, src, tgt):
        memory = self.encoder(src)
        print(memory.shape)
        tgt_emb = self.embedding(tgt)  # Convert token IDs to embeddings
        print(tgt_emb.shape)
        output = self.decoder(tgt_emb, memory)
        return self.fc_out(output)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
batch_size = 64
seq_len = 50
feature_dim = 128
num_layers = 2
num_heads = 2
num_samples = 10000
epochs = 1

# Dataset and DataLoader
dataset = generate_random_data(batch_size, seq_len, feature_dim, num_samples)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Model, loss, and optimizer
model = TransformerModel(feature_dim, num_heads, num_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)


In [27]:
import time

start = time.perf_counter()
# Training
for epoch in range(epochs):
    total_loss = 0
    for src, tgt in dataloader:
        src, tgt = src.to(device), tgt.to(device)
        print(src.shape)
        print(tgt.shape)

        tgt_input = tgt.T  # Input for the decoder (remove last token)
        tgt_output = tgt  # Target tokens for loss computation (remove first token)

        optimizer.zero_grad()
        outputs = model(src.permute(1, 0, 2), tgt_input)  # No need to permute tgt_input (handled by embedding)
        loss = criterion(outputs.permute(1, 2, 0), tgt_output)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch + 1}/{epochs}], Loss: {total_loss / len(dataloader):.4f}")

end = time.perf_counter()
print(f'total time for : {end - start}')

torch.Size([64, 50, 128])
torch.Size([64, 50])
torch.Size([50, 64, 128])
torch.Size([50, 64, 128])
torch.Size([64, 50, 128])
torch.Size([64, 50])
torch.Size([50, 64, 128])
torch.Size([50, 64, 128])
torch.Size([64, 50, 128])
torch.Size([64, 50])
torch.Size([50, 64, 128])
torch.Size([50, 64, 128])
torch.Size([64, 50, 128])
torch.Size([64, 50])
torch.Size([50, 64, 128])
torch.Size([50, 64, 128])
torch.Size([64, 50, 128])
torch.Size([64, 50])
torch.Size([50, 64, 128])
torch.Size([50, 64, 128])
torch.Size([64, 50, 128])
torch.Size([64, 50])
torch.Size([50, 64, 128])
torch.Size([50, 64, 128])
torch.Size([64, 50, 128])
torch.Size([64, 50])
torch.Size([50, 64, 128])
torch.Size([50, 64, 128])
torch.Size([64, 50, 128])
torch.Size([64, 50])
torch.Size([50, 64, 128])
torch.Size([50, 64, 128])
torch.Size([64, 50, 128])
torch.Size([64, 50])
torch.Size([50, 64, 128])
torch.Size([50, 64, 128])
torch.Size([64, 50, 128])
torch.Size([64, 50])
torch.Size([50, 64, 128])
torch.Size([50, 64, 128])
torch.Size

KeyboardInterrupt: 

In [17]:
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)
src = torch.rand(10, 32, 512)
out = transformer_encoder(src)



In [10]:
decoder_layer = nn.TransformerDecoderLayer(d_model=512, nhead=8)
tgt = torch.rand(20, 32, 512)
out = decoder_layer(tgt, out)