**Lab3-CopyDelayTask.ipynb**

This notebook provides the initial code for comparing the capacity of Vanilla RNN and LSTM networks on a sequence memory task.

**Lab3 1st task:** understand exactly what this code does (what the task is, and why it makes sense to use this task), and then execute it with different parameters in order to draw conclusions about the differences obtained for the RNN and the LSTM. With the default settings both models behave similarly, and the LSTM can even perform worse, so your task is to look at the learning curves, optimize the parameters, and then make a fair comparison of both models.

**To be uploaded to the Virtual Campus:** A PDF file with a short explanation of what the code does (no code), explaining the task, together with a description of your experiments with the final parameters used and your conclusions about the behavior of both models. A figure with your results (in the same format as, or similar to, the one printed by the code) should be included.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np

# Experiment parameters (module-level; read directly by the functions below)
input_size = 5  # number of features in the input vector at each time step
hidden_size = 32  # hidden units passed to nn.RNN / nn.LSTM (default: 32)
output_size = input_size  # the models emit one logit per input feature
batch_size = 32  # sequences per generated batch (default: 32)
learning_rate = 0.01  # learning rate given to the Adam optimizer (default: 0.01)
max_epochs = 15  # upper bound on training epochs per model (default: 15)
patience = 3  # epochs without validation-loss improvement before stopping (default: 3)
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # use the GPU when one is available
num_runs = 5  # independently trained models averaged per delay (default: 5)

# Delay values to test: 1, 5, 9, ..., 29
delays = list(range(1, 30 , 4))  # default: range(1, 30, 4)

# Data generation
def generate_batch_delay(batch_size, delay, input_size):
    """Build one random batch for the copy-delay task.

    The input consists of ``delay`` time steps of random 0/1 vectors followed
    by ``delay`` all-zero time steps.  The target is all zeros during the
    first ``delay`` steps and repeats the random pattern during the last
    ``delay`` steps.

    Returns:
        A tuple ``(inputs, targets, total_steps)``.  Both tensors have shape
        ``(batch_size, 2 * delay, input_size)`` and are moved to the
        module-level ``device``; ``total_steps`` equals ``2 * delay``.
    """
    total_steps = delay * 2
    # Random binary pattern the network has to remember.
    pattern = (torch.rand(batch_size, delay, input_size) > 0.5).float()
    silence = torch.zeros(batch_size, delay, input_size)
    # Input: pattern first, then silence.  Target: silence first, then pattern.
    inputs = torch.cat([pattern, silence], dim=1)
    targets = torch.cat([silence, pattern], dim=1)
    return inputs.to(device), targets.to(device), total_steps

# Models
class VanillaRNN(nn.Module):
    """Recurrent layer (``nn.RNN``) followed by a linear readout applied at every time step."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # batch_first=True: tensors are laid out as (batch, time, features).
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Map ``x`` to one ``output_size``-dimensional vector per time step."""
        hidden_sequence, _final_hidden = self.rnn(x)
        return self.fc(hidden_sequence)

class LSTMModel(nn.Module):
    """LSTM layer (``nn.LSTM``) followed by a linear readout applied at every time step."""

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # batch_first=True: tensors are laid out as (batch, time, features).
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Map ``x`` to one ``output_size``-dimensional vector per time step."""
        hidden_sequence, _final_state = self.lstm(x)
        return self.fc(hidden_sequence)

# Accuracy calculation
def binary_accuracy(output, target):
    """Return the fraction of elements whose thresholded prediction equals ``target``.

    ``output`` holds raw logits; a logit counts as a predicted 1 when its
    sigmoid is strictly greater than 0.5, otherwise as a predicted 0.
    """
    probabilities = torch.sigmoid(output)
    predicted_bits = probabilities.gt(0.5).float()
    matches = predicted_bits.eq(target).float()
    return matches.mean().item()

# Train model for given delay with early stopping
def train_for_delay_early_stopping(model, delay, steps_per_epoch=100, val_steps=20):
    """Train ``model`` on the copy-delay task and restore its best weights.

    Each epoch runs ``steps_per_epoch`` optimizer steps on freshly generated
    batches, then measures the mean loss over ``val_steps`` fresh batches.
    Training stops after ``max_epochs`` epochs, or earlier once the validation
    loss has failed to improve for ``patience`` consecutive epochs.  The loss
    is computed only on the last ``delay`` time steps, i.e. the part of the
    sequence where the pattern must be reproduced.

    Reads the module-level ``batch_size``, ``input_size``, ``learning_rate``,
    ``max_epochs`` and ``patience``.

    Args:
        model: network mapping ``(batch, time, input_size)`` inputs to logits
            of the same shape.
        delay: number of time steps between presentation and recall.
        steps_per_epoch: training batches per epoch (must be positive).
        val_steps: validation batches per epoch (must be positive).

    Returns:
        The same ``model`` object, loaded with the weights of the epoch that
        had the lowest validation loss, and left in eval mode.
    """
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    best_val_loss = float('inf')
    epochs_no_improve = 0
    best_model_state = None

    for _epoch in range(max_epochs):
        model.train()
        # Mean training loss of the epoch; not used by early stopping, kept so
        # it can be logged or plotted when inspecting learning curves.
        train_loss = 0.0
        for _ in range(steps_per_epoch):
            seq_in, seq_target, _ = generate_batch_delay(batch_size, delay, input_size)
            optimizer.zero_grad()
            output = model(seq_in)
            loss = criterion(output[:, delay:, :], seq_target[:, delay:, :])
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= steps_per_epoch

        # Validation loss on fresh batches, without gradient tracking.
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for _ in range(val_steps):
                seq_in, seq_target, _ = generate_batch_delay(batch_size, delay, input_size)
                output = model(seq_in)
                loss = criterion(output[:, delay:, :], seq_target[:, delay:, :])
                val_loss += loss.item()
        val_loss /= val_steps

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # state_dict() hands back references to the live parameter tensors,
            # which later optimizer steps modify in place.  Clone them so the
            # snapshot really preserves this epoch's weights.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                break

    # None means no epoch ever improved on +inf (e.g. NaN losses, or
    # max_epochs == 0); in that case the model is returned as trained.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)
    return model

# Evaluate model accuracy for given delay
def evaluate_for_delay(model, delay, trials=20):
    """Return the mean recall accuracy of ``model`` over ``trials`` fresh batches.

    Accuracy is measured only on the last ``delay`` time steps of each
    sequence, where the target contains the pattern to reproduce.  The model
    is switched to eval mode and gradients are disabled during evaluation.
    """
    model.eval()
    per_batch_accuracy = []
    with torch.no_grad():
        for _ in range(trials):
            inputs, targets, _ = generate_batch_delay(batch_size, delay, input_size)
            logits = model(inputs)
            recall_logits = logits[:, delay:, :]
            recall_targets = targets[:, delay:, :]
            per_batch_accuracy.append(binary_accuracy(recall_logits, recall_targets))
    return sum(per_batch_accuracy) / trials

# Mean accuracy per delay, in the same order as `delays`.
rnn_accuracies = []
lstm_accuracies = []

# For every delay, train and evaluate `num_runs` freshly initialised models of
# each type and record the average accuracy across those runs.
for delay in delays:
    print(f"Evaluating delay {delay} over {num_runs} runs ...")
    # Accuracies of the individual runs at this delay.
    rnn_accs = []
    lstm_accs = []
    for _ in range(num_runs):
        # A new model is created for each run, so runs do not share weights.
        rnn = VanillaRNN(input_size, hidden_size, output_size).to(device)
        rnn = train_for_delay_early_stopping(rnn, delay)
        rnn_accs.append(evaluate_for_delay(rnn, delay))

        lstm = LSTMModel(input_size, hidden_size, output_size).to(device)
        lstm = train_for_delay_early_stopping(lstm, delay)
        lstm_accs.append(evaluate_for_delay(lstm, delay))

    # Average over runs to get one point per model on the accuracy curve.
    rnn_accuracies.append(np.mean(rnn_accs))
    lstm_accuracies.append(np.mean(lstm_accs))

# Plot accuracy vs delay
# One curve per model: (accuracies, line/marker format, legend label).
plt.figure(figsize=(8, 5))
for series, fmt, legend_label in (
    (rnn_accuracies, 'o-', 'Vanilla RNN'),
    (lstm_accuracies, 's-', 'LSTM'),
):
    plt.plot(delays, series, fmt, label=legend_label)
plt.title(f'Copy Delay Task: Accuracy vs Delay Length (averaged over {num_runs} runs)')
plt.xlabel('Delay Length')
plt.ylabel('Accuracy')
plt.grid(True)
plt.legend()
plt.show()
