## Transformer

### Architecture

In [1]:
import torch
import torch.nn as nn
import math
import numpy as np

#### `SelfAttention`

In [2]:
class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * self.heads == self.embed_size
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(embed_size, embed_size)
        self.keys = nn.Linear(embed_size, embed_size)
        self.queries = nn.Linear(embed_size, embed_size)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, values, keys, query, mask):
        N = query.shape[0] # Batch size
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]
        # Shapes before splitting:
        # values:  (N, value_len, embed_size)
        # keys:    (N, key_len  , embed_size)
        # queries: (N, query_len, embed_size)

        # Split the embedding into self.heads different pieces
        values = self.values(values).reshape(N, value_len, self.heads, self.head_dim)
        keys = self.keys(keys).reshape(N, key_len, self.heads, self.head_dim)
        queries = self.queries(query).reshape(N, query_len, self.heads, self.head_dim)
        # Shapes after splitting: 
        # values:  (N, value_len, heads, head_dim)
        # keys:    (N, key_len  , heads, head_dim)
        # queries: (N, query_len, heads, head_dim)

        # Compute the dot product between queries and keys for each head, 
        # and divide by sqrt of head_dim for numerical stability
        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys]) / math.sqrt(self.head_dim)
        # Shape of energy: (N, heads, query_len, key_len)

        # Apply mask
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))
        # Shape of mask: (N, 1, 1, key_len)
        # 0 in key_len dimension means that the respective element in energy is set to -1e20
        # Mask will be broadcasted to (N, heads, query_len, key_len) by PyTorch automatically

        # Compute the attention weights for each head using the softmax function
        attention = torch.softmax(energy, dim=-1)
        # Shape of attention: (N, heads, query_len, key_len)

        # Multiply the attention weights with the values for each head and then concatenate
        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.embed_size
        )
        # Shape of out: (N, query_len, embed_size)

        out = self.fc_out(out)
        return out

#### `TransformerBlock`: 

`SelfAttention` -> layerNorm -> Feed-Forward -> layerNorm

In [3]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, forward_expansion, dropout):
        super(TransformerBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size),
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        # Compute self-attention
        attention = self.attention(value, key, query, mask)
        # Shape of attention: (N, query_len, embed_size)

        # Add skip connection, run through normalization and dropout
        x = self.dropout(self.norm1(attention + query))
        # Shape of x: (N, query_len, embed_size)

        # Feed-forward network
        forward = self.feed_forward(x)
        # Shape of forward: (N, query_len, embed_size)

        # Add skip connection, run through normalization and dropout
        out = self.dropout(self.norm2(forward + x))
        # Shape of out: (N, query_len, embed_size)

        return out

#### Encoder: `num_layers` of `TransformerBlock` 

In [4]:
class Encoder(nn.Module):
    def __init__(
        self,
        src_vocab_size, # Size of the source vocabulary
        num_layers, # Number of TransformerBlocks
        max_length, # Maximum length of the sentence
        embed_size,
        heads,
        forward_expansion,
        dropout,
        device,
    ):
        super(Encoder, self).__init__()
        self.embed_size = embed_size
        self.device = device
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(
                    embed_size,
                    heads,
                    forward_expansion,
                    dropout,
                )
                for _ in range(num_layers)
            ]
        )

        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        N, seq_length = x.shape

        # Positions is the index of the word in the sentence (0, 1, 2, ..., seq_length)
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)

        # Add word embeddings and position embeddings
        out = self.dropout(
            (self.word_embedding(x) + self.position_embedding(positions))
        )
        # Shape of out: (N, seq_length, embed_size)

        # In the Encoder the query, key, value are all the same
        for layer in self.layers:
            out = layer(out, out, out, mask)
            # Shape of out: (N, seq_length, embed_size)

        return out

#### `DecoderBlock`

Masked `SelfAttention` -> layerNorm -> `TransformerBlock`

In [5]:
class DecoderBlock(nn.Module):
    def __init__(self, embed_size, heads, forward_expansion, dropout):
        super(DecoderBlock, self).__init__()
        self.norm = nn.LayerNorm(embed_size)
        self.masked_attention = SelfAttention(embed_size, heads=heads)
        self.transformer_block = TransformerBlock(
            embed_size, heads, forward_expansion, dropout
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, encoder_value, encoder_key, src_mask, trg_mask):
        # Self attention on the target sentence with mask
        attention = self.masked_attention(x, x, x, trg_mask)

        # Add skip connection, run through normalization and dropout
        query = self.dropout(self.norm(attention + x))

        # Transformer block with encoder's output as value and key
        out = self.transformer_block(encoder_value, encoder_key, query, src_mask)
        # Shape of out: (N, query_len, embed_size)
        
        return out

#### Decoder: `num_layers` of `DecoderBlock`

In [6]:
class Decoder(nn.Module):
    def __init__(
        self,
        trg_vocab_size, # Size of the target vocabulary
        num_layers, # Number of DecoderBlocks
        max_length, # Maximum length of the sentence
        embed_size,
        heads,
        forward_expansion,
        dropout,
        device,
    ):
        super(Decoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)

        self.layers = nn.ModuleList(
            [
                DecoderBlock(
                    embed_size,
                    heads,
                    forward_expansion,
                    dropout)
                for _ in range(num_layers)
            ]
        )
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):
        N, seq_length = x.shape

        # Positions is the index of the word in the sentence (0, 1, 2, ..., seq_length)
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)

        # Add word embeddings and position embeddings
        x = self.dropout((self.word_embedding(x) + self.position_embedding(positions)))

        # In the Decoder the key and value are the encoder's output,
        # and the query is the output of the previous DecoderBlock
        for layer in self.layers:
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)

        out = self.fc_out(x)

        return out

#### `Transformer`

In [7]:
class Transformer(nn.Module):
    def __init__(
        self,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx, # Index of the padding token in the source vocabulary
        trg_pad_idx, # Index of the padding token in the target vocabulary
        num_layers,
        max_length,
        embed_size,
        heads,
        forward_expansion=4,
        dropout=0.0,
        device="cpu",
    ):

        super(Transformer, self).__init__()

        # Initialize the Encoder
        self.encoder = Encoder(
            src_vocab_size,
            num_layers,
            max_length,
            embed_size,
            heads,
            forward_expansion,
            dropout,
            device,
        )

        # Initialize the Decoder
        self.decoder = Decoder(
            trg_vocab_size,
            num_layers,
            max_length,
            embed_size,
            heads,
            forward_expansion,
            dropout,
            device,
        )

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        # Shape of src: (N, src_len)
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        # Shape of src_mask: (N, 1, 1, src_len)
        return src_mask.to(self.device)

    def make_trg_mask(self, trg):
        N, trg_len = trg.shape

        # Create a lower triangular matrix of ones with shape (trg_len, trg_len),
        # then expand it to (N, 1, trg_len, trg_len)
        trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(
            N, 1, trg_len, trg_len
        )

        return trg_mask.to(self.device)

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask) # raw l
        return out

### `Transformer` Training

In [8]:
import random

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Set seed for reproducibility
seed = 4212
set_seed(seed)

#### Hyperparameters

In [9]:
sos_idx = 100 # Start of sequence index
eos_idx = 101 # End of sequence index
pad_idx = 0 # Padding index
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
num_samples = 5000 # Number of samples in the dataset
max_length = 6 + 2 # Maximum length of the sequence including the <SOS> and <EOS> tokens
vocab_size = 99 + 3 # Numbers from 1 to 99 and three indices: padding, sos, eos
num_layers = 2 # Number of Blocks in the Encoder and Decoder
embed_size = 32
heads = 2
forward_expansion = 4
dropout = 0.0
learning_rate = 0.001
batch_size = 256
num_epochs = 20

#### Data Generation

In [10]:
# Function to generate data for the three tasks: copy, reverse, and sort
def generate_data(num_samples, max_length, vocab_size, pad_idx, sos_idx, eos_idx, task, seed=None):
    if seed is not None:
        set_seed(seed)

    src_data = []
    trg_data = []
    
    for _ in range(num_samples):
        seq_length = np.random.randint(1, max_length - 1)  # Length of the random sequence
        sequence = np.random.randint(1, vocab_size - 2, seq_length).tolist()
        
        if task == 'copy':
            target_sequence = sequence
        elif task == 'sort':
            target_sequence = sorted(sequence)
        elif task == 'reverse':
            target_sequence = sequence[::-1]
        else:
            raise ValueError("Unknown task")
        
        # Add the <SOS> and <EOS> tokens, then pad to the maximum length
        src_sequence = [sos_idx] + sequence + [eos_idx] + [pad_idx] * (max_length - len(sequence) - 2)
        trg_sequence = [sos_idx] + target_sequence + [eos_idx] + [pad_idx] * (max_length - len(target_sequence) - 2)
        
        src_data.append(src_sequence)
        trg_data.append(trg_sequence)
    
    src_data = torch.tensor(src_data, dtype=torch.long)
    trg_data = torch.tensor(trg_data, dtype=torch.long)
    
    return src_data, trg_data

In [11]:
set_seed(seed)

# Generate data for the copying task
src_data_copy, trg_data_copy = generate_data(num_samples, max_length, vocab_size, pad_idx, sos_idx, eos_idx, 'copy',)

# Generate data for the reversing task
src_data_reverse, trg_data_reverse = generate_data(num_samples, max_length, vocab_size, pad_idx, sos_idx, eos_idx, 'reverse')

# Generate data for the sorting task
src_data_sort, trg_data_sort = generate_data(num_samples, max_length, vocab_size, pad_idx, sos_idx, eos_idx, 'sort')

print("Copy Task - Source Data:")
print(src_data_copy[:3])
print("Copy Task - Target Data:")
print(trg_data_copy[:3])
print()
print("Reverse Task - Source Data:")
print(src_data_reverse[:3])
print("Reverse Task - Target Data:")
print(trg_data_reverse[:3])
print()
print("Sort Task - Source Data:")
print(src_data_sort[:3])
print("Sort Task - Target Data:")
print(trg_data_sort[:3])

Copy Task - Source Data:
tensor([[100,  10,  14,  82, 101,   0,   0,   0],
        [100,  99,  36,  70,  98, 101,   0,   0],
        [100,  61, 101,   0,   0,   0,   0,   0]])
Copy Task - Target Data:
tensor([[100,  10,  14,  82, 101,   0,   0,   0],
        [100,  99,  36,  70,  98, 101,   0,   0],
        [100,  61, 101,   0,   0,   0,   0,   0]])

Reverse Task - Source Data:
tensor([[100,  31, 101,   0,   0,   0,   0,   0],
        [100,   7,  38,  70, 101,   0,   0,   0],
        [100,  84,  72,  97,   2,   1,  99, 101]])
Reverse Task - Target Data:
tensor([[100,  31, 101,   0,   0,   0,   0,   0],
        [100,  70,  38,   7, 101,   0,   0,   0],
        [100,  99,   1,   2,  97,  72,  84, 101]])

Sort Task - Source Data:
tensor([[100,  58,  15, 101,   0,   0,   0,   0],
        [100,  83, 101,   0,   0,   0,   0,   0],
        [100,  75,  71,  24, 101,   0,   0,   0]])
Sort Task - Target Data:
tensor([[100,  15,  58, 101,   0,   0,   0,   0],
        [100,  83, 101,   0,   0,   0

#### Train-Test Split

In [12]:
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [13]:
# Split the data into training and test sets
def prepare_data(src_data, trg_data, test_size=0.2, seed=None):
    if seed is not None:
        set_seed(seed)

    src_train, src_test, trg_train, trg_test = train_test_split(src_data, trg_data, test_size=test_size)
    train_dataset = TensorDataset(src_train, trg_train)
    test_dataset = TensorDataset(src_test, trg_test)
    return train_dataset, test_dataset

In [14]:
set_seed(4212)

# For the 'copy' task
train_data, test_data = prepare_data(src_data_copy, trg_data_copy)
train_loader_copy = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader_copy = DataLoader(test_data, batch_size=batch_size)

# For the 'reverse' task
train_data, test_data = prepare_data(src_data_reverse, trg_data_reverse)
train_loader_reverse = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader_reverse = DataLoader(test_data, batch_size=batch_size)

# For the 'sort' task
train_data, test_data = prepare_data(src_data_sort, trg_data_sort)
train_loader_sort = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader_sort = DataLoader(test_data, batch_size=batch_size)

print(f'Size of the training dataset: {len(train_data)}')
print(f'Size of the test dataset: {len(test_data)}')

Size of the training dataset: 4000
Size of the test dataset: 1000


#### Training the Models

In [15]:
import torch.optim as optim

In [16]:
# Function to train the model
def train_model(model, train_loader, criterion, optimizer, num_epochs, device):
    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0
        for src, trg in train_loader:
            src = src.to(device)
            trg = trg.to(device)

            # Forward pass
            output = model(src, trg[:, :-1]) # Exclude the last token from the target sequence
            output = output.reshape(-1, output.shape[2])
            trg = trg[:, 1:].reshape(-1) # Exclude the <SOS> token from the target sequence

            # Compute loss
            loss = criterion(output, trg)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        if epoch % 2 == 1:
            print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/len(train_loader)}')

    print("Training complete.")
    return model

# Define the loss function
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

Copy Model

In [17]:
set_seed(4212)

# Initialize the Transformer model for the 'copy' task
model_copy = Transformer(
    src_vocab_size=vocab_size,
    trg_vocab_size=vocab_size,
    src_pad_idx=pad_idx,
    trg_pad_idx=pad_idx,
    num_layers=num_layers,
    max_length=max_length,
    embed_size=embed_size,
    heads=heads,
    forward_expansion=forward_expansion,
    dropout=dropout,
    device=device
).to(device)

# Define the optimizer
optimizer_copy = optim.Adam(model_copy.parameters(), lr=learning_rate)

# Train the model for the 'copy' task
model_copy = train_model(model_copy, train_loader_copy, criterion, optimizer_copy, num_epochs, device)

Epoch [2/20], Loss: 4.102633774280548
Epoch [4/20], Loss: 3.926971048116684
Epoch [6/20], Loss: 3.3467292189598083
Epoch [8/20], Loss: 2.0810547322034836
Epoch [10/20], Loss: 1.0563989281654358
Epoch [12/20], Loss: 0.48228208534419537
Epoch [14/20], Loss: 0.23481028713285923
Epoch [16/20], Loss: 0.13550485204905272
Epoch [18/20], Loss: 0.08979252399876714
Epoch [20/20], Loss: 0.06490117637440562
Training complete.


Reverse Model

In [18]:
set_seed(4212)

# Initialize the Transformer model for the 'reverse' task
model_reverse = Transformer(
    src_vocab_size=vocab_size,
    trg_vocab_size=vocab_size,
    src_pad_idx=pad_idx,
    trg_pad_idx=pad_idx,
    num_layers=num_layers,
    max_length=max_length,
    embed_size=embed_size,
    heads=heads,
    forward_expansion=forward_expansion,
    dropout=dropout,
    device=device
).to(device)

# Define the optimizer
optimizer_reverse = optim.Adam(model_reverse.parameters(), lr=learning_rate)

# Train the model for the 'reverse' task
model_reverse = train_model(model_reverse, train_loader_reverse, criterion, optimizer_reverse, num_epochs, device)

Epoch [2/20], Loss: 4.10052752494812
Epoch [4/20], Loss: 3.9241564571857452
Epoch [6/20], Loss: 3.54915614426136
Epoch [8/20], Loss: 3.142897590994835
Epoch [10/20], Loss: 2.496100053191185
Epoch [12/20], Loss: 1.5218461453914642
Epoch [14/20], Loss: 0.6681469455361366
Epoch [16/20], Loss: 0.28762717358767986
Epoch [18/20], Loss: 0.155604082159698
Epoch [20/20], Loss: 0.09971246775239706
Training complete.


Sort Model

In [19]:
set_seed(4212)

# Initialize the Transformer model for the 'sort' task
model_sort = Transformer(
    src_vocab_size=vocab_size,
    trg_vocab_size=vocab_size,
    src_pad_idx=pad_idx,
    trg_pad_idx=pad_idx,
    num_layers=num_layers,
    max_length=max_length,
    embed_size=embed_size,
    heads=heads,
    forward_expansion=forward_expansion,
    dropout=dropout,
    device=device
).to(device)

# Define the optimizer
optimizer_sort = optim.Adam(model_sort.parameters(), lr=learning_rate)

# Train the model for the 'sort' task
model_sort = train_model(model_sort, train_loader_sort, criterion, optimizer_sort, num_epochs, device)

Epoch [2/20], Loss: 4.047736406326294
Epoch [4/20], Loss: 3.688209608197212
Epoch [6/20], Loss: 3.2033394277095795
Epoch [8/20], Loss: 2.660450294613838
Epoch [10/20], Loss: 2.0859378278255463
Epoch [12/20], Loss: 1.5131881088018417
Epoch [14/20], Loss: 0.9860993400216103
Epoch [16/20], Loss: 0.6012790687382221
Epoch [18/20], Loss: 0.39554205164313316
Epoch [20/20], Loss: 0.2926633283495903
Training complete.


### Evaluating the Models

In [20]:
from nltk.translate.bleu_score import sentence_bleu

In [23]:
# Function to generate output for a given input tensor
def generate_output(model, input_tensors, max_length, pad_idx, sos_idx, eos_idx, device):
    model.eval()
    batch_size = input_tensors.size(0)
    with torch.no_grad():
        trg_tensors = torch.full((batch_size, max_length), pad_idx).to(device)
        trg_tensors[:, 0] = sos_idx
        
        for i in range(1, max_length):
            output = model(input_tensors, trg_tensors[:, :i])
            next_tokens = output.argmax(2)[:, -1]
            trg_tensors[:, i] = next_tokens
            if (next_tokens == eos_idx).all():
                trg_tensors[:, i+1:] = pad_idx  # Fill the rest with padding tokens
                break
        
        return trg_tensors
    
# Function to process the predicted output tensor and return the predicted tokens    
def process_output(output_tensor, pad_idx, sos_idx, eos_idx):
    output_tokens = output_tensor.squeeze().tolist()
    processed_tokens = [token for token in output_tokens if token not in [pad_idx, sos_idx, eos_idx]]
    return processed_tokens

# Function to evaluate the model on the test set
def evaluate_model(model, test_loader, criterion, max_length, pad_idx, sos_idx, eos_idx, device):
    model.eval()
    total_loss = 0
    total_correct_sequences = 0
    total_sequences = 0
    
    with torch.no_grad():
        for src, trg in test_loader:
            src = src.to(device)
            trg = trg.to(device)
            
            # Forward pass
            output = model(src, trg[:, :-1])
            
            # Reshape output and target to match the expected input shape for CrossEntropyLoss
            output = output.reshape(-1, output.shape[2])
            trg_flat = trg[:, 1:].reshape(-1)
            
            # Calculate loss
            loss = criterion(output, trg_flat)
            total_loss += loss.item()
            
            # Generate predictions
            output_tensors = generate_output(model, src, max_length, pad_idx, sos_idx, eos_idx, device)
            
            # Compare the tensors directly for sequence-level accuracy
            for i in range(src.size(0)):
                target_tensor = trg[i].unsqueeze(0)
                predicted_tensor = output_tensors[i].unsqueeze(0)
                
                if torch.equal(target_tensor, predicted_tensor):
                    total_correct_sequences += 1
                else:
                    print(f"Target: {target_tensor.squeeze()}")
                    print(f"Predicted: {predicted_tensor.squeeze()}")
                total_sequences += 1
    
    average_loss = total_loss / len(test_loader)
    accuracy = total_correct_sequences / total_sequences
    return average_loss, accuracy

#### Using Test Dataset

something wrong with the generate output function maybe. \
its repeating the EOS idx instead of padding with 0

In [24]:
# Example usage
average_loss, accuracy = evaluate_model(model_copy, test_loader_copy, criterion, max_length, pad_idx, sos_idx, eos_idx, device)
print("Average Loss:", average_loss)
print("Accuracy:", accuracy)

Target: tensor([100,  82,  27,  75, 101,   0,   0,   0])
Predicted: tensor([100,  82,  27,  75, 101, 101, 101, 101])
Target: tensor([100,  61, 101,   0,   0,   0,   0,   0])
Predicted: tensor([100,  61, 101,  61,  61, 101, 101, 101])
Target: tensor([100,  49,  46,  53,  37,  44, 101,   0])
Predicted: tensor([100,  49,  46,  53,  37,  44, 101, 101])
Target: tensor([100,  68,  39,  78,   7,   9, 101,   0])
Predicted: tensor([100,  68,  39,  78,   7,   9, 101, 101])
Target: tensor([100,  40,  67,  67, 101,   0,   0,   0])
Predicted: tensor([100,  40,  67,  67, 101, 101,  67, 101])
Target: tensor([100,  71, 101,   0,   0,   0,   0,   0])
Predicted: tensor([100,  71, 101,  38,  71,  81, 101, 101])
Target: tensor([100,  80,  45, 101,   0,   0,   0,   0])
Predicted: tensor([100,  80,  45, 101, 101, 101, 101, 101])
Target: tensor([100,  94,  31,  96, 101,   0,   0,   0])
Predicted: tensor([100,  94,  31,  96, 101, 101, 101, 101])
Target: tensor([100,  34, 101,   0,   0,   0,   0,   0])
Predict

#### A Single Input

In [22]:
# Example input sequence (maximum length: 6)
input_sequence = [6,5,1,4,3]

# Prepare the input sequence: Add <SOS> and <EOS> tokens, pad the sequence to the maximum length
input_tensor = torch.tensor([sos_idx] + input_sequence + [eos_idx] + [pad_idx] * (max_length - len(input_sequence) - 2)).unsqueeze(0)  # Add batch dimension

# Move the input tensor to the appropriate device
input_tensor = input_tensor.to(device)

# Generate output for each model
output_tensor_copy = generate_output(model_copy, input_tensor, max_length, pad_idx, sos_idx, eos_idx, device)
output_tensor_reverse = generate_output(model_reverse, input_tensor, max_length, pad_idx, sos_idx, eos_idx, device)
output_tensor_sort = generate_output(model_sort, input_tensor, max_length, pad_idx, sos_idx, eos_idx, device)

# Remove padding, start, and end tokens
predicted_output_copy = process_output(output_tensor_copy, pad_idx, sos_idx, eos_idx)
predicted_output_reverse = process_output(output_tensor_reverse, pad_idx, sos_idx, eos_idx)
predicted_output_sort = process_output(output_tensor_sort, pad_idx, sos_idx, eos_idx)

# Expected outputs for each task
expected_output_copy = input_sequence
expected_output_reverse = input_sequence[::-1]
expected_output_sort = sorted(input_sequence)

print("Input Sequence:", input_sequence)
print()

print(f'Task: Copy')
print(f'Predicted Output: {predicted_output_copy}')
print(f'Expected Output:  {expected_output_copy}')
print(f'Match?: {predicted_output_copy == expected_output_copy}')
print()

print(f'Task: Reverse')
print(f'Predicted Output: {predicted_output_reverse}')
print(f'Expected Output:  {expected_output_reverse}')
print(f'Match?: {predicted_output_reverse == expected_output_reverse}')
print()

print(f'Task: Sort')
print(f'Predicted Output: {predicted_output_sort}')
print(f'Expected Output:  {expected_output_sort}')
print(f'Match?: {predicted_output_sort == expected_output_sort}')

Input Sequence: [6, 5, 1, 4, 3]

Task: Copy
Predicted Output: [6, 5, 1, 4, 3]
Expected Output:  [6, 5, 1, 4, 3]
Match?: True

Task: Reverse
Predicted Output: [3, 4, 1, 5, 6]
Expected Output:  [3, 4, 1, 5, 6]
Match?: True

Task: Sort
Predicted Output: [1, 3, 5, 4]
Expected Output:  [1, 3, 4, 5, 6]
Match?: False


### Further Tuning