## Compare the performance and applicability of learnable positional encoding versus relative positional encoding for audio processing with the Transformer architecture. Write a script for both architectures in which only this component of the model changes.

In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import torchaudio
from torchaudio.transforms import MelSpectrogram
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
class LocalAttention(nn.Module):
    """Self-attention restricted to non-overlapping windows of the sequence.

    The sequence axis is cut into consecutive chunks of ``window_size`` frames
    (the last chunk may be shorter). Multi-head self-attention is run inside
    each chunk independently, so a frame never attends outside its own chunk.

    Args:
        embed_size: Feature dimension of the input and of the output.
        num_heads: Number of attention heads.
        window_size: Number of consecutive frames per attention window.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """

    def __init__(self, embed_size, num_heads, window_size):
        super().__init__()
        if window_size < 1:
            raise ValueError(f"window_size must be >= 1, got {window_size}")
        # batch_first=True is required because forward() receives
        # (batch, seq, embed) tensors. With the default (False) the batch axis
        # is interpreted as the sequence axis and attention mixes information
        # between unrelated samples of the batch.
        self.attention = nn.MultiheadAttention(embed_size, num_heads, batch_first=True)
        self.window_size = window_size

    def forward(self, x):
        """Apply windowed self-attention.

        Args:
            x: Tensor of shape ``(batch, seq_len, embed_size)``.

        Returns:
            Tensor with the same shape as ``x``.

        Raises:
            ValueError: If ``x`` is not 3-dimensional.
        """
        if x.dim() != 3:
            raise ValueError(
                f"expected input of shape (batch, seq_len, embed_size), got {tuple(x.shape)}"
            )
        seq_len = x.size(1)
        attended_windows = []
        for start in range(0, seq_len, self.window_size):
            # Slicing past the end is clamped, so the last window may be shorter.
            window = x[:, start:start + self.window_size, :]
            # The attention weights are never used, so skip computing them.
            attended, _ = self.attention(window, window, window, need_weights=False)
            attended_windows.append(attended)
        if not attended_windows:
            # Empty sequence: nothing to attend over.
            return torch.zeros_like(x)
        return torch.cat(attended_windows, dim=1)

In [3]:
class LearnablePositionalEncoding(nn.Module):
    """Trainable absolute positional embedding added to a sequence.

    A single parameter of shape ``(1, max_len, embed_size)``, initialised to
    zeros, stores one vector per position index.

    Args:
        embed_size: Feature dimension of the sequence elements.
        max_len: Number of position vectors stored in the table.
    """

    def __init__(self, embed_size, max_len=5000):
        super().__init__()
        table = torch.zeros(1, max_len, embed_size)
        self.positional_encoding = nn.Parameter(table)

    def forward(self, x):
        """Return ``x`` plus the vectors of its first ``x.size(1)`` positions.

        The leading axis of the table has size 1, so the same position vectors
        are broadcast over every element of the batch.
        """
        n_positions = x.shape[1]
        position_vectors = self.positional_encoding[:, :n_positions, :]
        return x + position_vectors

class TransformerLearnablePositionalEncoding(nn.Module):
    """Sequence classifier using ``LearnablePositionalEncoding``.

    Pipeline: linear projection of each frame to ``embed_size``, addition of
    the positional encoding, ``num_layers`` stacked ``LocalAttention`` layers,
    mean pooling over the sequence axis, and a linear classification head.

    Args:
        embed_size: Width of the internal frame representation.
        num_heads: Number of attention heads in every attention layer.
        num_layers: Number of stacked ``LocalAttention`` layers.
        window_size: Attention window length passed to ``LocalAttention``.
        input_dim: Number of features per input frame (previously fixed at 128).
        num_classes: Number of output logits (previously fixed at 10).
    """

    def __init__(self, embed_size, num_heads, num_layers, window_size,
                 input_dim=128, num_classes=10):
        super().__init__()
        self.embedding = nn.Linear(input_dim, embed_size)
        self.positional_encoding = LearnablePositionalEncoding(embed_size)
        self.local_attention_layers = nn.ModuleList(
            [LocalAttention(embed_size, num_heads, window_size) for _ in range(num_layers)]
        )
        self.fc = nn.Linear(embed_size, num_classes)

    def forward(self, x):
        """Classify a batch of feature sequences.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_dim)``.

        Returns:
            Logits of shape ``(batch, num_classes)``.
        """
        x = self.embedding(x)
        x = self.positional_encoding(x)
        for local_layer in self.local_attention_layers:
            x = local_layer(x)
        # Average over the sequence axis to get one vector per sample.
        pooled = x.mean(dim=1)
        return self.fc(pooled)

In [4]:
class RelativePositionalEncoding(nn.Module):
    """Convolutional relative positional encoding.

    The previous implementation added a learned vector per absolute position
    index, which is the same computation as a learnable absolute encoding.
    This version derives the positional term from each frame's neighbourhood
    with a depthwise 1-D convolution followed by GELU, the scheme used for
    speech models such as wav2vec 2.0. The term added at a frame depends only
    on the frames at fixed offsets around it, so shifting the input shifts the
    output by the same amount (away from the zero-padded sequence edges).

    Args:
        embed_size: Feature dimension of the sequence elements.
        max_len: Accepted for backward compatibility and stored on the module;
            the convolution imposes no limit on the sequence length.
        kernel_size: Number of neighbouring frames the convolution looks at.
            Must be a positive odd integer so the output length equals the
            input length.

    Raises:
        ValueError: If ``kernel_size`` is not a positive odd integer.
    """

    def __init__(self, embed_size, max_len=5000, kernel_size=31):
        super().__init__()
        if kernel_size < 1 or kernel_size % 2 == 0:
            raise ValueError(
                f"kernel_size must be a positive odd integer, got {kernel_size}"
            )
        self.max_len = max_len
        # groups=embed_size makes the convolution depthwise: every feature
        # channel has its own filter over relative offsets.
        self.conv = nn.Conv1d(
            embed_size,
            embed_size,
            kernel_size,
            padding=kernel_size // 2,
            groups=embed_size,
        )
        self.activation = nn.GELU()

    def forward(self, x):
        """Return ``x`` plus its convolutional positional term.

        Args:
            x: Tensor of shape ``(batch, seq_len, embed_size)``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Conv1d expects (batch, channels, time), so swap the last two axes
        # before the convolution and swap them back afterwards.
        positional = self.conv(x.transpose(1, 2))
        positional = self.activation(positional).transpose(1, 2)
        return x + positional

class TransformerRelativePositionalEncoding(nn.Module):
    """Sequence classifier using ``RelativePositionalEncoding``.

    Pipeline: linear projection of each frame to ``embed_size``, application
    of ``RelativePositionalEncoding``, ``num_layers`` stacked
    ``LocalAttention`` layers, mean pooling over the sequence axis, and a
    linear classification head.

    Args:
        embed_size: Width of the internal frame representation.
        num_heads: Number of attention heads in every attention layer.
        num_layers: Number of stacked ``LocalAttention`` layers.
        window_size: Attention window length passed to ``LocalAttention``.
        input_dim: Number of features per input frame (previously fixed at 128).
        num_classes: Number of output logits (previously fixed at 10).
    """

    def __init__(self, embed_size, num_heads, num_layers, window_size,
                 input_dim=128, num_classes=10):
        super().__init__()
        self.embedding = nn.Linear(input_dim, embed_size)
        self.positional_encoding = RelativePositionalEncoding(embed_size)
        self.local_attention_layers = nn.ModuleList(
            [LocalAttention(embed_size, num_heads, window_size) for _ in range(num_layers)]
        )
        self.fc = nn.Linear(embed_size, num_classes)

    def forward(self, x):
        """Classify a batch of feature sequences.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_dim)``.

        Returns:
            Logits of shape ``(batch, num_classes)``.
        """
        x = self.embedding(x)
        x = self.positional_encoding(x)
        for local_layer in self.local_attention_layers:
            x = local_layer(x)
        # Average over the sequence axis to get one vector per sample.
        pooled = x.mean(dim=1)
        return self.fc(pooled)

In [None]:
def prepare_datasets():
    """Download LibriSpeech into ``data/`` and build train/val/test datasets.

    Leftover ``*.partial`` files in ``data/`` are deleted before downloading.
    The ``train-clean-100`` subset is split 80/20 into training and validation
    parts; ``test-clean`` is used as the test set.

    Returns:
        ``(train_dataset, val_dataset, test_dataset)``, or
        ``(None, None, None)`` if loading a dataset raised an exception (the
        error is printed).
    """
    # Make sure the download directory is present.
    if not os.path.exists('data'):
        os.makedirs('data')

    # Clear out downloads that were interrupted earlier.
    for entry in os.listdir('data'):
        if entry.endswith('.partial'):
            os.remove(os.path.join('data', entry))

    try:
        full_train = torchaudio.datasets.LIBRISPEECH(root="data", url="train-clean-100", download=True)
        test_dataset = torchaudio.datasets.LIBRISPEECH(root="data", url="test-clean", download=True)
    except Exception as e:
        print(f"Error downloading dataset: {e}")
        return None, None, None

    # 80% of the training subset for training, the remainder for validation.
    n_total = len(full_train)
    n_train = int(0.8 * n_total)
    n_val = n_total - n_train
    train_part, val_part = random_split(full_train, [n_train, n_val])

    return train_part, val_part, test_dataset

class AudioDataset(Dataset):
    """Wrap a raw audio dataset and serve mel-spectrogram features.

    Each item of the wrapped dataset must be a 6-tuple whose first element is
    the waveform and whose fifth element (index 4) is used as the label.

    NOTE(review): per the torchaudio LIBRISPEECH docs, index 4 appears to be
    the chapter ID. Confirm this is the intended target and that its value
    range matches the classifier's number of outputs.

    Args:
        dataset: Indexable dataset yielding the 6-tuples described above.
    """

    def __init__(self, dataset):
        self.dataset = dataset
        self.mel_spectrogram = MelSpectrogram()

    def __len__(self):
        """Return the number of items in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for item ``idx``.

        ``features`` has shape ``(frames, n_mels)``. The previous version kept
        the leading channel axis, so batching produced 4-D tensors instead of
        the ``(batch, seq_len, feature_dim)`` layout the models unpack.
        """
        waveform, _sample_rate, _, _, label, _ = self.dataset[idx]
        # Assumes the transform returns (channels, n_mels, frames), as the
        # torchaudio docs describe for a (channels, time) waveform.
        mel = self.mel_spectrogram(waveform)
        # Average the channels away (a no-op on values for mono audio), then
        # put time first so every row is one frame's feature vector.
        features = mel.mean(dim=0).transpose(0, 1)
        return features, label

def train_model(model, dataloader, criterion, optimizer, num_epochs=10):
    """Train ``model`` in place and print the average loss of every epoch.

    Args:
        model: Module called as ``model(inputs)``.
        dataloader: Iterable yielding ``(inputs, labels)`` batches; it is
            iterated once per epoch.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar
            loss tensor.
        optimizer: Optimizer updating the parameters of ``model``.
        num_epochs: Number of passes over ``dataloader``.
    """
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        samples_seen = 0
        for inputs, labels in dataloader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            batch_size = inputs.size(0)
            # Weight by batch size so a smaller final batch does not skew the
            # epoch average (assumes the criterion returns a per-batch mean).
            running_loss += loss.item() * batch_size
            samples_seen += batch_size
        # Divide by the samples actually processed rather than
        # len(dataloader.dataset): the two differ with drop_last or a sampler,
        # and plain iterables have no .dataset attribute at all.
        epoch_loss = running_loss / samples_seen if samples_seen else float('nan')
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}')

def evaluate_model(model, dataloader):
    """Score ``model`` on every batch of ``dataloader``.

    The model is switched to evaluation mode and gradients are disabled. The
    predicted class of a sample is the index of its largest output.

    Args:
        model: Module called as ``model(inputs)``, returning per-class scores
            along dimension 1.
        dataloader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        ``(accuracy, precision, recall, f1)``; precision, recall and F1 use
        weighted averaging over the classes.
    """
    model.eval()
    y_pred, y_true = [], []
    with torch.no_grad():
        for batch_inputs, batch_labels in dataloader:
            scores = model(batch_inputs)
            # torch.max over dim 1 returns (values, indices); keep the indices.
            predicted = torch.max(scores, 1)[1]
            y_pred.extend(predicted.cpu().numpy())
            y_true.extend(batch_labels.cpu().numpy())

    accuracy = accuracy_score(y_true, y_pred)
    weighted_scores = [
        metric(y_true, y_pred, average='weighted')
        for metric in (precision_score, recall_score, f1_score)
    ]
    return (accuracy, *weighted_scores)

if __name__ == "__main__":
    # Local import: only the script entry point needs padding support.
    from torch.nn.utils.rnn import pad_sequence

    def pad_collate(batch):
        """Zero-pad variable-length spectrograms so they stack into one batch.

        Utterances differ in length, so the default collate function cannot
        stack them. Returns ``(features, labels)`` with features of shape
        ``(batch, longest_seq_len, feature_dim)``.
        """
        # Collapse any leading singleton axis so each item is (frames, feature_dim).
        features = [item.reshape(-1, item.size(-1)) for item, _ in batch]
        labels = torch.as_tensor([label for _, label in batch])
        return pad_sequence(features, batch_first=True), labels

    # Prepare the datasets
    train_dataset, val_dataset, test_dataset = prepare_datasets()
    if train_dataset is None or val_dataset is None or test_dataset is None:
        print("Failed to prepare datasets.")
        # `exit` is injected by the `site` module and is not guaranteed to
        # exist; raising SystemExit works everywhere.
        raise SystemExit(1)

    # Create data loaders
    train_dataloader = DataLoader(AudioDataset(train_dataset), batch_size=32, shuffle=True, collate_fn=pad_collate)
    val_dataloader = DataLoader(AudioDataset(val_dataset), batch_size=32, shuffle=False, collate_fn=pad_collate)
    test_dataloader = DataLoader(AudioDataset(test_dataset), batch_size=32, shuffle=False, collate_fn=pad_collate)

    # Initialize the models, criterion, and optimizer
    model_learnable = TransformerLearnablePositionalEncoding(embed_size=256, num_heads=8, num_layers=4, window_size=10)
    model_relative = TransformerRelativePositionalEncoding(embed_size=256, num_heads=8, num_layers=4, window_size=10)
    criterion = nn.CrossEntropyLoss()
    optimizer_learnable = optim.Adam(model_learnable.parameters(), lr=0.001)
    optimizer_relative = optim.Adam(model_relative.parameters(), lr=0.001)

    # Train the learnable positional encoding model
    print("Training Learnable Positional Encoding Model")
    train_model(model_learnable, train_dataloader, criterion, optimizer_learnable, num_epochs=10)
    accuracy_learnable, precision_learnable, recall_learnable, f1_learnable = evaluate_model(model_learnable, test_dataloader)
    print(f'Learnable Positional Encoding Model - Accuracy: {accuracy_learnable:.4f}, Precision: {precision_learnable:.4f}, Recall: {recall_learnable:.4f}, F1 Score: {f1_learnable:.4f}')

    # Train the relative positional encoding model
    print("Training Relative Positional Encoding Model")
    train_model(model_relative, train_dataloader, criterion, optimizer_relative, num_epochs=10)
    accuracy_relative, precision_relative, recall_relative, f1_relative = evaluate_model(model_relative, test_dataloader)
    print(f'Relative Positional Encoding Model - Accuracy: {accuracy_relative:.4f}, Precision: {precision_relative:.4f}, Recall: {recall_relative:.4f}, F1 Score: {f1_relative:.4f}')