# Install required libraries

In [1]:
!pip install transformers datasets seaborn

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

# Import necessary libraries

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from transformers import GPT2Tokenizer
import matplotlib.pyplot as plt
import seaborn as sns

# Mount Google Drive

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Check GPU availability

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 1. Preprocessing the Penn Treebank (PTB) Dataset

## Load the PTB dataset from Hugging Face

In [5]:
dataset = load_dataset('ptb_text_only')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.21k [00:00<?, ?B/s]

ptb_text_only.py:   0%|          | 0.00/6.50k [00:00<?, ?B/s]

The repository for ptb_text_only contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/ptb_text_only.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/5.10M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/400k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/450k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/42068 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3761 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3370 [00:00<?, ? examples/s]

## Tokenizer initialization (using GPT2 tokenizer for demonstration)

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



In [None]:
tokenizer.pad_token = tokenizer.eos_token

## Tokenize the PTB dataset

In [None]:
def tokenize_function(examples):
    return tokenizer(examples['sentence'], truncation=True, padding='max_length', max_length=64)

## Apply tokenization to the dataset

In [None]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/42068 [00:00<?, ? examples/s]

Map:   0%|          | 0/3761 [00:00<?, ? examples/s]

Map:   0%|          | 0/3370 [00:00<?, ? examples/s]

## Define data loaders

In [None]:
from torch.utils.data import DataLoader

In [None]:
batch_size = 32
train_dataset = tokenized_datasets["train"]
val_dataset = tokenized_datasets["validation"]
test_dataset = tokenized_datasets["test"]

In [None]:
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# 2. Baseline GAM-RHN Model (without Attention)

In [None]:
class RHNCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(RHNCell, self).__init__()
        self.hidden_size = hidden_size
        self.rhn_layers = 5  # Number of recurrent layers
        self.input_size = input_size

        # Projection layer to match the input size to the hidden size
        self.input_projection = nn.Linear(input_size, hidden_size)

        # Hidden layers for each recurrent step (they should accept hidden_size as input and output)
        self.hidden_layers = nn.ModuleList(
            [nn.Linear(hidden_size, hidden_size) for _ in range(self.rhn_layers)]
        )

    def forward(self, x, h):

        # Project input to match the hidden size (from input_size to hidden_size)
        x = self.input_projection(x)  # Shape: (batch_size, hidden_size)

        # Recurrent Highway Network with multiple layers
        for layer in self.hidden_layers:
            h = torch.tanh(layer(x) + h)  # Shape: (batch_size, hidden_size)

        return h

In [None]:
class GAM_RHN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes):
        super(GAM_RHN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)  # Embedding Layer
        self.rhn_cell = RHNCell(embedding_dim, hidden_size)  # RHN Cell
        self.fc = nn.Linear(hidden_size, num_classes)  # Output layer (from hidden_size to num_classes)

    def forward(self, x):

        # Get the embeddings
        embedded = self.embedding(x)  # Shape: (batch_size, sequence_length, embedding_dim)

        # Initialize hidden state
        h = torch.zeros(x.size(0), self.rhn_cell.hidden_size).to(device)  # Shape: (batch_size, hidden_size)

        # Iterate over the sequence
        for step in range(x.size(1)):  # x.size(1) is the sequence length
            h = self.rhn_cell(embedded[:, step, :], h)  # Shape: (batch_size, hidden_size)

        # Return final output after passing through the fully connected layer
        output = self.fc(h)  # Shape: (batch_size, num_classes)

        return output

## Hyperparameters

In [None]:
vocab_size = tokenizer.vocab_size
embedding_dim = 256
hidden_size = 512
num_classes = vocab_size
epochs = 10
num_heads = 8

## Initialize the model

In [None]:
model = GAM_RHN(vocab_size, embedding_dim, hidden_size, num_classes).to(device)

# 3. Integrating Attention Mechanisms

In [None]:
class GAM_RHN_Attention(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes, attention_type='dot_product', num_heads=8):
        super(GAM_RHN_Attention, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rhn_cell = RHNCell(embedding_dim, hidden_size)
        self.attention_type = attention_type

        # Initialize different attention mechanisms based on attention_type
        if attention_type == 'dot_product':
            self.attention = DotProductAttention(hidden_size)
        elif attention_type == 'additive':
            self.attention = AdditiveAttention(hidden_size)
        elif attention_type == 'scaled_dot_product':
            self.attention = ScaledDotProductAttention(hidden_size)
        elif attention_type == 'multi_head':
            self.attention = MultiHeadAttention(hidden_size, num_heads)
        else:
            raise ValueError(f"Unknown attention type: {attention_type}")

        # Fully connected output layer
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)  # Shape: (batch_size, sequence_length, embedding_dim)
        h = torch.zeros(x.size(0), self.rhn_cell.hidden_size).to(device)  # Initialize hidden state

        # Process the input sequence through RHN cell and collect hidden states
        rhn_outputs = []  # To store hidden states from each time step
        for step in range(x.size(1)):
            h = self.rhn_cell(embedded[:, step, :], h)  # Forward step through RHN cell
            rhn_outputs.append(h.unsqueeze(1))  # Shape: (batch_size, 1, hidden_size)

        rhn_outputs = torch.cat(rhn_outputs, dim=1)  # Shape: (batch_size, sequence_length, hidden_size)

        # Attention mechanism: Query is the last hidden state, Key and Value are the entire sequence
        query = h.unsqueeze(1)  # Shape: (batch_size, 1, hidden_size)
        key = rhn_outputs  # Shape: (batch_size, sequence_length, hidden_size)
        value = rhn_outputs  # Shape: (batch_size, sequence_length, hidden_size)

        # Apply the selected attention mechanism
        context, attn_weights = self.attention(query, key, value)
        context = context.squeeze(1)  # Remove the singleton dimension after attention

        # Output prediction via fully connected layer
        output = self.fc(context)  # Shape: (batch_size, num_classes)

        return output, attn_weights


## (a) Dot-Product Attention

In [None]:
class DotProductAttention(nn.Module):
    def __init__(self, hidden_size):
        super(DotProductAttention, self).__init__()
        self.hidden_size = hidden_size

    def forward(self, query, key, value):
        attention_scores = torch.matmul(query, key.transpose(-2, -1)) / (self.hidden_size ** 0.5)
        attention_weights = nn.functional.softmax(attention_scores, dim=-1)
        return torch.matmul(attention_weights, value), attention_weights

## (b) Additive Attention

In [None]:
class AdditiveAttention(nn.Module):
    def __init__(self, hidden_size):
        super(AdditiveAttention, self).__init__()
        self.hidden_size = hidden_size
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))

    def forward(self, query, key, value):
        # Repeat query to match the sequence length of key
        query = query.repeat(1, key.size(1), 1)  # Shape: (batch_size, sequence_length, hidden_size)

        # Concatenate query and key along the last dimension
        attn_input = torch.cat((query, key), dim=-1)  # Shape: (batch_size, sequence_length, hidden_size * 2)

        # Linear transformation for attention weights
        attn_weights = torch.tanh(self.attn(attn_input))  # Shape: (batch_size, sequence_length, hidden_size)
        attn_weights = torch.matmul(attn_weights, self.v.unsqueeze(-1)).squeeze(-1)  # Shape: (batch_size, sequence_length)

        # Softmax to normalize the weights
        attn_weights = nn.functional.softmax(attn_weights, dim=-1)  # Shape: (batch_size, sequence_length)

        # Compute the weighted sum of values based on the attention weights
        context = torch.matmul(attn_weights.unsqueeze(1), value)  # Shape: (batch_size, 1, hidden_size)

        return context, attn_weights

## (c) Scaled Dot Product Attention

In [None]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self, hidden_size):
        super(ScaledDotProductAttention, self).__init__()
        self.hidden_size = hidden_size

    def forward(self, query, key, value):
        # Scale the dot product of query and key by the square root of hidden_size
        scale = self.hidden_size ** 0.5
        attention_scores = torch.matmul(query, key.transpose(-2, -1)) / scale  # Shape: (batch_size, query_len, key_len)

        # Softmax to get attention weights
        attention_weights = nn.functional.softmax(attention_scores, dim=-1)  # Shape: (batch_size, query_len, key_len)

        # Multiply the attention weights by the value
        output = torch.matmul(attention_weights, value)  # Shape: (batch_size, query_len, hidden_size)

        return output, attention_weights


## (d) Multi-Head Attention

In [None]:
class MultiHeadAttention(nn.Module):
    def __init__(self, hidden_size, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert hidden_size % num_heads == 0, "Hidden size must be divisible by the number of heads"

        self.num_heads = num_heads
        self.head_dim = hidden_size // num_heads

        # Linear layers to project queries, keys, and values to multiple heads
        self.query_proj = nn.Linear(hidden_size, hidden_size)
        self.key_proj = nn.Linear(hidden_size, hidden_size)
        self.value_proj = nn.Linear(hidden_size, hidden_size)

        # Final linear layer after concatenating all heads
        self.out_proj = nn.Linear(hidden_size, hidden_size)

    def forward(self, query, key, value):
        batch_size = query.size(0)

        # Project and reshape queries, keys, and values for multi-head attention
        query = self.query_proj(query).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        key = self.key_proj(key).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        value = self.value_proj(value).view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

        # Perform scaled dot-product attention on each head
        attention_scores = torch.matmul(query, key.transpose(-2, -1)) / (self.head_dim ** 0.5)
        attention_weights = nn.functional.softmax(attention_scores, dim=-1)

        # Multiply the attention weights by the values
        multi_head_output = torch.matmul(attention_weights, value)  # Shape: (batch_size, num_heads, query_len, head_dim)

        # Concatenate all heads and project back to the original hidden size
        multi_head_output = multi_head_output.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.head_dim)

        # Apply final projection
        output = self.out_proj(multi_head_output)

        return output, attention_weights


## Initialize the model

In [None]:
model_with_attention_dot = GAM_RHN_Attention(vocab_size, embedding_dim, hidden_size, num_classes, attention_type='dot_product').to(device)

In [None]:
model_with_attention_additive = GAM_RHN_Attention(vocab_size, embedding_dim, hidden_size, num_classes, attention_type='additive').to(device)

In [None]:
model_with_attention_scaled = GAM_RHN_Attention(vocab_size, embedding_dim, hidden_size, num_classes, attention_type='scaled_dot_product').to(device)

In [None]:
model_with_attention_multi_head = GAM_RHN_Attention(vocab_size, embedding_dim, hidden_size, num_classes,attention_type='multi_head', num_heads=num_heads).to(device)

# 4. Training and Evaluation Functions

In [None]:
def train_model(model, train_loader, val_loader, epochs=5):
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()
    model.train()

    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            # Extract inputs and move to the device
            inputs = torch.stack(batch['input_ids']).to(device)
            labels = torch.stack(batch['input_ids']).to(device)

            optimizer.zero_grad()
            outputs = model(inputs)  # Unpack outputs and attention weights, use only outputs (logits)
            labels = labels.argmax(dim=1)  # Assuming labels are one-hot encoded
            loss = criterion(outputs, labels.squeeze())  # Calculate loss using logits

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')


In [None]:
def train_model_attn(model, train_loader, val_loader, epochs=5):
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()
    model.train()

    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            # Extract inputs and move to the device
            inputs = torch.stack(batch['input_ids']).to(device)
            labels = torch.stack(batch['input_ids']).to(device)

            optimizer.zero_grad()
            outputs, _ = model(inputs)  # Unpack outputs and attention weights, use only outputs (logits)
            labels = labels.argmax(dim=1)  # Assuming labels are one-hot encoded
            loss = criterion(outputs, labels.squeeze())  # Calculate loss using logits

            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}')


In [None]:
def evaluate_model(model, val_loader):
    model.eval()  # Set model to evaluation mode
    total_loss = 0
    perplexity_loss = 0
    total_tokens = 0
    correct = 0
    total = 0
    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():  # Disable gradient calculation for evaluation
        for batch in val_loader:
            # Extract inputs and move to the device
            inputs = torch.stack(batch['input_ids']).to(device)
            labels = torch.stack(batch['input_ids']).to(device)

            outputs = model(inputs)  # Get model outputs
            labels = labels.argmax(dim=1)  # Match the labels as in the training function

            # Calculate loss
            loss = criterion(outputs, labels.squeeze())
            total_loss += loss.item()
            perplexity_loss += loss.item()*labels.size(0)
            total_tokens += labels.size(0)

            # Calculate accuracy
            _, predicted = torch.max(outputs, -1)  # Get predicted class
            correct += (predicted == labels).sum().item()  # Compare predictions to true labels
            total += labels.size(0)

    accuracy = correct / total  # Compute accuracy
    perplexity = torch.exp(torch.tensor(perplexity_loss / total_tokens))  # Compute perplexity
    print(f'Validation Loss: {total_loss/len(val_loader)}, Accuracy: {accuracy*100:.2f}%, Perplexity: {perplexity:.4f}')

In [None]:
def evaluate_model_attn(model, val_loader):
    model.eval()  # Set model to evaluation mode
    total_loss = 0
    perplexity_loss = 0
    total_tokens = 0
    correct = 0
    total = 0
    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():  # Disable gradient calculation for evaluation
        for batch in val_loader:
            # Extract inputs and move to the device
            inputs = torch.stack(batch['input_ids']).to(device)
            labels = torch.stack(batch['input_ids']).to(device)

            outputs, _ = model(inputs)  # Unpack outputs and attention weights, use only outputs (logits)
            labels = labels.argmax(dim=1)  # Match the labels as in the training function

            # Calculate loss
            loss = criterion(outputs, labels.squeeze())
            total_loss += loss.item()
            perplexity_loss += loss.item()*labels.size(0)
            total_tokens += labels.size(0)

            # Calculate accuracy
            _, predicted = torch.max(outputs, dim=1)  # Get predicted class
            correct += (predicted == labels).sum().item()  # Compare predictions to true labels
            total += labels.size(0)

    accuracy = correct / total  # Compute accuracy
    perplexity = torch.exp(torch.tensor(perplexity_loss / total_tokens))  # Compute perplexity
    print(f'Validation Loss: {total_loss/len(val_loader)}, Accuracy: {accuracy*100:.2f}%, Perplexity: {perplexity:.4f}')


# 5. Running the Training and Evaluation

In [None]:
drive_model_path = '/content/drive/MyDrive/GAM_RHN_Models/'

## Load models

In [None]:
model.load_state_dict(torch.load(drive_model_path+'baseline_model.pth'))

  model.load_state_dict(torch.load(drive_model_path+'baseline_model.pth'))


<All keys matched successfully>

In [None]:
model_with_attention_dot.load_state_dict(torch.load(drive_model_path+'model_with_attention_dot.pth'))

  model_with_attention_dot.load_state_dict(torch.load(drive_model_path+'model_with_attention_dot.pth'))


<All keys matched successfully>

In [None]:
model_with_attention_additive.load_state_dict(torch.load(drive_model_path+'model_with_attention_additive.pth'))

  model_with_attention_additive.load_state_dict(torch.load(drive_model_path+'model_with_attention_additive.pth'))


<All keys matched successfully>

In [None]:
model_with_attention_scaled.load_state_dict(torch.load(drive_model_path+'model_with_attention_scaled.pth'))

  model_with_attention_scaled.load_state_dict(torch.load(drive_model_path+'model_with_attention_scaled.pth'))


<All keys matched successfully>

In [None]:
model_with_attention_multi_head.load_state_dict(torch.load(drive_model_path+'model_with_attention_multi_head.pth'))

  model_with_attention_multi_head.load_state_dict(torch.load(drive_model_path+'model_with_attention_multi_head.pth'))


<All keys matched successfully>

## Train and Save the base model

In [None]:
train_model(model, train_loader, val_loader, epochs)

Epoch 1, Loss: 1.702924354901332
Epoch 2, Loss: 1.6931042525704822
Epoch 3, Loss: 1.6886845801946329
Epoch 4, Loss: 1.6792589662419526
Epoch 5, Loss: 1.6784979684724555
Epoch 6, Loss: 1.669092068132792
Epoch 7, Loss: 1.678046857540145
Epoch 8, Loss: 1.6911044550939205
Epoch 9, Loss: 1.687586255037286
Epoch 10, Loss: 1.683333332230383


In [None]:
torch.save(model.state_dict(), drive_model_path+'baseline_model.pth')

## Train the attention-based models

### Dot Product

In [None]:
train_model_attn(model_with_attention_dot, train_loader, val_loader, epochs)

Epoch 1, Loss: 1.5029080079535568
Epoch 2, Loss: 1.5449438955620667
Epoch 3, Loss: 1.5392065106235984
Epoch 4, Loss: 1.5041965535385073
Epoch 5, Loss: 1.5427856260379458
Epoch 6, Loss: 1.5535714959463693
Epoch 7, Loss: 1.512100071553495
Epoch 8, Loss: 1.5315617144334452
Epoch 9, Loss: 1.520805040081191
Epoch 10, Loss: 1.5394496066035426


In [None]:
torch.save(model_with_attention_dot.state_dict(), drive_model_path+'model_with_attention_dot.pth')

### Additive

In [None]:
train_model_attn(model_with_attention_additive, train_loader, val_loader, epochs)

Epoch 1, Loss: 1.4023516687376871
Epoch 2, Loss: 1.4025496743931063
Epoch 3, Loss: 1.380897620286325
Epoch 4, Loss: 1.3969976309587746
Epoch 5, Loss: 1.38962312048379
Epoch 6, Loss: 1.3703761347573067
Epoch 7, Loss: 1.384744517567493
Epoch 8, Loss: 1.3937774291056644
Epoch 9, Loss: 1.3952377247266443


In [None]:
torch.save(model_with_attention_additive.state_dict(), drive_model_path+'model_with_attention_additive.pth')

### Scaled Dot Product

In [None]:
train_model_attn(model_with_attention_scaled, train_loader, val_loader, epochs)

Epoch 1, Loss: 1.597168147790568
Epoch 2, Loss: 1.5943747822322774
Epoch 3, Loss: 1.5783449635532871
Epoch 4, Loss: 1.6047045328544574
Epoch 5, Loss: 1.5936922358469365
Epoch 6, Loss: 1.5405459218152122
Epoch 7, Loss: 1.5425199512066496
Epoch 8, Loss: 1.502246460443214
Epoch 9, Loss: 1.5264660508913686
Epoch 10, Loss: 1.5074200053178766


In [None]:
torch.save(model_with_attention_scaled.state_dict(), drive_model_path+'model_with_attention_scaled.pth')

### Multi-Head

In [None]:
train_model_attn(model_with_attention_multi_head, train_loader, val_loader, epochs)

Epoch 1, Loss: 1.5513135179140722
Epoch 2, Loss: 1.5540823666541748
Epoch 3, Loss: 1.5253408987939132
Epoch 4, Loss: 1.5244066514443082
Epoch 5, Loss: 1.5210926742608104
Epoch 6, Loss: 1.531933231294835
Epoch 7, Loss: 1.506683950029852
Epoch 8, Loss: 1.5321828037840333
Epoch 9, Loss: 1.5029307933814626
Epoch 10, Loss: 1.5296422215468983


In [None]:
torch.save(model_with_attention_multi_head.state_dict(), drive_model_path+'model_with_attention_multi_head.pth')

## Evaluate

In [None]:
evaluate_model(model, test_loader)

Validation Loss: 1.753053731584953, Accuracy: 57.77%, Perplexity: 5.7722


In [None]:
evaluate_model_attn(model_with_attention_dot, test_loader)

Validation Loss: 1.595713795746787, Accuracy: 56.14%, Perplexity: 4.9318


In [None]:
evaluate_model_attn(model_with_attention_additive, test_loader)

Validation Loss: 1.46426950976, Accuracy: 57.69%, Perplexity: 4.3244


In [None]:
evaluate_model_attn(model_with_attention_scaled, test_loader)

Validation Loss: 1.5841315863496166, Accuracy: 57.39%, Perplexity: 4.8751


In [None]:
evaluate_model_attn(model_with_attention_multi_head, test_loader)

Validation Loss: 1.5476916785967552, Accuracy: 57.40%, Perplexity: 4.7006
