In [7]:
import torch
# Pick the compute device once: CUDA when a GPU is visible, CPU otherwise,
# and report which one was chosen.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cpu':
    print("Running on CPU")
else:
    print(f"Running on {torch.cuda.get_device_name(0)}")

Running on Tesla T4


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy

In [9]:
class AttentionHead(nn.Module):
    """Multi-head scaled dot-product attention with learned Q/K/V/output projections.

    Despite its name, one instance holds all ``num_heads`` heads: inputs are
    projected to ``model_dim``, split into ``num_heads`` slices of
    ``head_dim = model_dim // num_heads`` features, attended independently,
    merged back and passed through a final linear layer.

    Args:
        model_dim: Feature size of the inputs and of the output.
        num_heads: Number of attention heads; must divide ``model_dim``.
    """

    def __init__(self, model_dim, num_heads):
        super(AttentionHead, self).__init__()
        assert model_dim % num_heads == 0, "model_dim must be divisible by num_heads"

        self.model_dim = model_dim
        self.num_heads = num_heads
        self.head_dim = model_dim // num_heads

        self.query_layer = nn.Linear(model_dim, model_dim)
        self.key_layer = nn.Linear(model_dim, model_dim)
        self.value_layer = nn.Linear(model_dim, model_dim)
        self.output_layer = nn.Linear(model_dim, model_dim)

    def compute_attention(self, query, key, value, mask=None):
        """Apply scaled dot-product attention to already-split heads.

        Args:
            query: Tensor of shape (batch, num_heads, query_len, head_dim).
            key: Tensor of shape (batch, num_heads, key_len, head_dim).
            value: Tensor of shape (batch, num_heads, key_len, head_dim).
            mask: Optional mask; positions where ``mask == 0`` are hidden.
                A 3-D mask (batch, 1 or query_len, key_len) gets a head axis
                inserted; a 4-D mask is used as-is. Either way it only has to
                be broadcastable to (batch, num_heads, query_len, key_len).

        Returns:
            Tensor of shape (batch, num_heads, query_len, head_dim).
        """
        # Similarity of every query with every key, scaled by sqrt(head_dim).
        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_dim)

        if mask is not None:
            if mask.dim() == 3:
                # Insert the head axis; broadcasting shares the same mask
                # across heads without materialising a per-head copy.
                mask = mask.unsqueeze(1)

            # Use the most negative value representable in the scores' dtype
            # so the fill also works in reduced precision (e.g. float16).
            scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)

        # Normalise over the key axis, then take the weighted sum of values.
        attention_probs = torch.softmax(scores, dim=-1)
        return torch.matmul(attention_probs, value)

    def split_heads(self, tensor):
        """Reshape (batch, seq_len, model_dim) to (batch, num_heads, seq_len, head_dim)."""
        batch_size, seq_len, _ = tensor.size()
        return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

    def merge_heads(self, tensor):
        """Reshape (batch, num_heads, seq_len, head_dim) back to (batch, seq_len, model_dim)."""
        batch_size, _, seq_len, _ = tensor.size()
        # transpose() yields a non-contiguous view, so contiguous() is required before view().
        return tensor.transpose(1, 2).contiguous().view(batch_size, seq_len, self.model_dim)

    def forward(self, query, key, value, mask=None):
        """Project the inputs, attend per head, and project the merged result.

        Args:
            query: Tensor of shape (batch, query_len, model_dim).
            key: Tensor of shape (batch, key_len, model_dim).
            value: Tensor of shape (batch, key_len, model_dim).
            mask: Optional mask forwarded to ``compute_attention``.

        Returns:
            Tensor of shape (batch, query_len, model_dim).
        """
        query = self.split_heads(self.query_layer(query))
        key = self.split_heads(self.key_layer(key))
        value = self.split_heads(self.value_layer(value))

        attention_output = self.compute_attention(query, key, value, mask)
        return self.output_layer(self.merge_heads(attention_output))

In [10]:
class FeedForwardNetwork(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear.

    Args:
        model_dim: Size of the input and output features.
        hidden_dim: Size of the intermediate (expanded) representation.
    """

    def __init__(self, model_dim, hidden_dim):
        super().__init__()
        self.fc1 = nn.Linear(model_dim, hidden_dim)   # expand
        self.fc2 = nn.Linear(hidden_dim, model_dim)   # project back
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the network to the last dimension of ``x``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

In [11]:
class PositionalEmbedding(nn.Module):
    """Adds fixed sinusoidal position encodings to a batch of embeddings.

    Even feature columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``, with ``freq = 10000 ** (-2i / model_dim)`` for the
    i-th sin/cos pair. The table is precomputed for ``max_len`` positions and
    stored as a (non-trainable) buffer named ``pos_enc``.

    Args:
        model_dim: Feature size of the embeddings (odd sizes are supported).
        max_len: Largest sequence length the table covers.
    """

    def __init__(self, model_dim, max_len):
        super(PositionalEmbedding, self).__init__()

        pos_enc = torch.zeros(max_len, model_dim)
        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, model_dim, 2).float() * -(math.log(10000.0) / model_dim))
        angles = positions * div_term  # (max_len, ceil(model_dim / 2))

        pos_enc[:, 0::2] = torch.sin(angles)
        # For an odd model_dim there is one fewer odd column than even column,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pos_enc[:, 1::2] = torch.cos(angles[:, : model_dim // 2])

        # Leading axis of size 1 lets the table broadcast over the batch.
        self.register_buffer('pos_enc', pos_enc.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor of shape (batch, seq_len, model_dim).

        Raises:
            ValueError: If ``seq_len`` exceeds the ``max_len`` the table was built for.
        """
        seq_len = x.size(1)
        max_len = self.pos_enc.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"Sequence length {seq_len} exceeds the maximum supported length {max_len}"
            )
        return x + self.pos_enc[:, :seq_len]


In [12]:
class EncoderBlock(nn.Module):
    """One encoder layer: self-attention, then a feed-forward network.

    Each sub-layer's output goes through dropout, is added back to its input
    (residual connection) and the sum is layer-normalised.

    Args:
        model_dim: Feature size of the layer's input and output.
        num_heads: Number of attention heads.
        hidden_dim: Inner size of the feed-forward network.
        dropout_rate: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, model_dim, num_heads, hidden_dim, dropout_rate):
        super().__init__()
        self.self_attention = AttentionHead(model_dim, num_heads)
        self.feed_forward = FeedForwardNetwork(model_dim, hidden_dim)
        self.norm1 = nn.LayerNorm(model_dim)
        self.norm2 = nn.LayerNorm(model_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, encoder_input, src_mask):
        """Run the layer on ``encoder_input``, hiding positions per ``src_mask``."""
        # Sub-layer 1: the sequence attends to itself.
        attended = self.self_attention(encoder_input, encoder_input, encoder_input, src_mask)
        hidden = self.norm1(encoder_input + self.dropout(attended))

        # Sub-layer 2: feed-forward network on the normalised result.
        transformed = self.feed_forward(hidden)
        return self.norm2(hidden + self.dropout(transformed))

In [13]:
class DecoderBlock(nn.Module):
    """One decoder layer: self-attention, cross-attention, feed-forward.

    Each of the three sub-layers is followed by dropout, a residual addition
    with the sub-layer's input, and layer normalisation.

    Args:
        model_dim: Feature size of the layer's input and output.
        num_heads: Number of attention heads in both attention modules.
        hidden_dim: Inner size of the feed-forward network.
        dropout_rate: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, model_dim, num_heads, hidden_dim, dropout_rate):
        super().__init__()
        self.self_attention = AttentionHead(model_dim, num_heads)
        self.cross_attention = AttentionHead(model_dim, num_heads)
        self.feed_forward = FeedForwardNetwork(model_dim, hidden_dim)
        self.norm1 = nn.LayerNorm(model_dim)
        self.norm2 = nn.LayerNorm(model_dim)
        self.norm3 = nn.LayerNorm(model_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, decoder_input, encoder_output, src_mask, tgt_mask):
        """Run the layer.

        ``tgt_mask`` restricts the self-attention over ``decoder_input``;
        ``src_mask`` restricts the cross-attention over ``encoder_output``.
        """
        # Sub-layer 1: the target sequence attends to itself.
        self_attended = self.self_attention(decoder_input, decoder_input, decoder_input, tgt_mask)
        hidden = self.norm1(decoder_input + self.dropout(self_attended))

        # Sub-layer 2: queries come from the decoder, keys/values from the encoder.
        cross_attended = self.cross_attention(hidden, encoder_output, encoder_output, src_mask)
        hidden = self.norm2(hidden + self.dropout(cross_attended))

        # Sub-layer 3: feed-forward network.
        transformed = self.feed_forward(hidden)
        return self.norm3(hidden + self.dropout(transformed))

In [14]:
class TransformerModel(nn.Module):
    """Encoder-decoder Transformer mapping source token ids to target-vocabulary logits.

    Args:
        input_vocab_size: Number of distinct source token ids.
        output_vocab_size: Number of distinct target token ids (size of the logits).
        model_dim: Embedding / hidden feature size.
        num_heads: Attention heads per attention module.
        num_layers: Number of encoder blocks and of decoder blocks.
        hidden_dim: Inner size of each block's feed-forward network.
        max_len: Longest sequence the positional encoding table supports.
        dropout_rate: Dropout probability used throughout the model.
        pad_idx: Token id treated as padding when building attention masks.
            Defaults to 0.
    """

    def __init__(self, input_vocab_size, output_vocab_size, model_dim, num_heads, num_layers, hidden_dim, max_len, dropout_rate, pad_idx=0):
        super(TransformerModel, self).__init__()
        self.pad_idx = pad_idx

        self.encoder_embedding = nn.Embedding(input_vocab_size, model_dim)
        self.decoder_embedding = nn.Embedding(output_vocab_size, model_dim)
        self.positional_embedding = PositionalEmbedding(model_dim, max_len)

        self.encoder_blocks = nn.ModuleList([EncoderBlock(model_dim, num_heads, hidden_dim, dropout_rate) for _ in range(num_layers)])
        self.decoder_blocks = nn.ModuleList([DecoderBlock(model_dim, num_heads, hidden_dim, dropout_rate) for _ in range(num_layers)])

        self.fc_out = nn.Linear(model_dim, output_vocab_size)
        self.dropout = nn.Dropout(dropout_rate)

    def generate_mask(self, source, target):
        """Build the boolean attention masks for a batch of token ids.

        Args:
            source: Integer tensor of shape (batch, src_len).
            target: Integer tensor of shape (batch, tgt_len).

        Returns:
            Tuple ``(src_mask, tgt_mask)``:
            * ``src_mask`` has shape (batch, 1, src_len) and is True at
              non-padding source positions.
            * ``tgt_mask`` has shape (batch, tgt_len, tgt_len); entry
              ``[b, i, j]`` is True when target position ``j`` is not padding
              and ``j <= i`` (no attending to later positions).
        """
        # Comparisons already yield tensors on the inputs' devices.
        src_mask = (source != self.pad_idx).unsqueeze(1)
        tgt_padding_mask = (target != self.pad_idx).unsqueeze(1)

        # Lower-triangular (causal) mask, built directly on the target's device
        # so no host-to-device copy is needed on every forward pass.
        seq_len = target.size(1)
        positions = torch.arange(seq_len, device=target.device)
        causal_mask = (positions.unsqueeze(0) <= positions.unsqueeze(1)).unsqueeze(0)

        tgt_mask = tgt_padding_mask & causal_mask
        return src_mask, tgt_mask

    def forward(self, src_input, tgt_input):
        """Compute logits over the target vocabulary.

        Args:
            src_input: Integer tensor of source token ids, shape (batch, src_len).
            tgt_input: Integer tensor of target token ids, shape (batch, tgt_len).

        Returns:
            Tensor of shape (batch, tgt_len, output_vocab_size) with unnormalised scores.
        """
        src_mask, tgt_mask = self.generate_mask(src_input, tgt_input)
        src_embedded = self.dropout(self.positional_embedding(self.encoder_embedding(src_input)))
        tgt_embedded = self.dropout(self.positional_embedding(self.decoder_embedding(tgt_input)))

        encoder_output = src_embedded
        for encoder_block in self.encoder_blocks:
            encoder_output = encoder_block(encoder_output, src_mask)

        decoder_output = tgt_embedded
        for decoder_block in self.decoder_blocks:
            decoder_output = decoder_block(decoder_output, encoder_output, src_mask, tgt_mask)

        return self.fc_out(decoder_output)


In [15]:
# Set hyperparameters
input_vocab_size = 5000
output_vocab_size = 5000
model_dim = 512
num_heads = 8
num_layers = 6
hidden_dim = 2048
max_len = 100
dropout_rate = 0.1

# Training configuration
batch_size = 64
num_epochs = 500
learning_rate = 0.0001
pad_token_id = 0  # id ignored by the loss; random data below never contains it
seed = 42

# Seed the RNG so weight initialisation, the random batch and dropout are repeatable
torch.manual_seed(seed)

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model and move it to GPU (if available)
transformer_model = TransformerModel(input_vocab_size, output_vocab_size, model_dim, num_heads, num_layers, hidden_dim, max_len, dropout_rate).to(device)

# Generate random sample data directly on the target device.
# Token ids start at pad_token_id + 1 so no position is treated as padding.
src_batch = torch.randint(pad_token_id + 1, input_vocab_size, (batch_size, max_len), device=device)  # (batch_size, seq_length)
tgt_batch = torch.randint(pad_token_id + 1, output_vocab_size, (batch_size, max_len), device=device)  # (batch_size, seq_length)

# Set loss function and optimizer
loss_fn = nn.CrossEntropyLoss(ignore_index=pad_token_id).to(device)
optimizer = optim.Adam(transformer_model.parameters(), lr=learning_rate, betas=(0.9, 0.98), eps=1e-9)

# Training loop
# NOTE: every "epoch" is a single optimizer step on the same fixed random batch.
transformer_model.train()

for epoch in range(num_epochs):
    optimizer.zero_grad()

    # Teacher forcing: the decoder sees tokens [0, T-1) and must predict tokens [1, T)
    predictions = transformer_model(src_batch, tgt_batch[:, :-1])

    # Flatten (batch, seq, vocab) logits and (batch, seq) labels for cross-entropy
    loss = loss_fn(predictions.reshape(-1, output_vocab_size), tgt_batch[:, 1:].reshape(-1))

    # Backpropagation
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")


Epoch 1, Loss: 8.684684753417969
Epoch 2, Loss: 8.552639961242676
Epoch 3, Loss: 8.483640670776367
Epoch 4, Loss: 8.42927360534668
Epoch 5, Loss: 8.371991157531738
Epoch 6, Loss: 8.30825424194336
Epoch 7, Loss: 8.227605819702148
Epoch 8, Loss: 8.151398658752441
Epoch 9, Loss: 8.06497573852539
Epoch 10, Loss: 7.992325782775879
Epoch 11, Loss: 7.903415679931641
Epoch 12, Loss: 7.822192192077637
Epoch 13, Loss: 7.738828659057617
Epoch 14, Loss: 7.654994010925293
Epoch 15, Loss: 7.575342655181885
Epoch 16, Loss: 7.493861198425293
Epoch 17, Loss: 7.406813621520996
Epoch 18, Loss: 7.333458423614502
Epoch 19, Loss: 7.247169017791748
Epoch 20, Loss: 7.165889739990234
Epoch 21, Loss: 7.090582847595215
Epoch 22, Loss: 7.0098371505737305
Epoch 23, Loss: 6.936218738555908
Epoch 24, Loss: 6.860867023468018
Epoch 25, Loss: 6.768840789794922
Epoch 26, Loss: 6.697455406188965
Epoch 27, Loss: 6.620323181152344
Epoch 28, Loss: 6.553584575653076
Epoch 29, Loss: 6.484265327453613
Epoch 30, Loss: 6.4164147