<a href="https://colab.research.google.com/github/ShriyaGandotra/Intro-to-Deep-Learning/blob/main/HW_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
################################# PROBLEM 1 - RNN #############################
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
import time

# Sample text
text = """Next character prediction is a fundamental task in the field of natural language processing (NLP) that involves predicting the next character in a sequence of text based on the characters that precede it. This task is essential for various applications, including text auto-completion, spell checking, and even in the development of sophisticated AI models capable of generating human-like text.
At its core, next character prediction relies on statistical models or deep learning algorithms to analyze a given sequence of text and predict which character is most likely to follow. These predictions are based on patterns and relationships learned from large datasets of text during the training phase of the model.
One of the most popular approaches to next character prediction involves the use of Recurrent Neural Networks (RNNs), and more specifically, a variant called Long Short-Term Memory (LSTM) networks. RNNs are particularly well-suited for sequential data like text, as they can maintain information in 'memory' about previous characters to inform the prediction of the next character. LSTM networks enhance this capability by being able to remember long-term dependencies, making them even more effective for next character prediction tasks.
Training a model for next character prediction involves feeding it large amounts of text data, allowing it to learn the probability of each character's appearance following a sequence of characters. During this training process, the model adjusts its parameters to minimize the difference between its predictions and the actual outcomes, thus improving its predictive accuracy over time.
Once trained, the model can be used to predict the next character in a given piece of text by considering the sequence of characters that precede it. This can enhance user experience in text editing software, improve efficiency in coding environments with auto-completion features, and enable more natural interactions with AI-based chatbots and virtual assistants.
In summary, next character prediction plays a crucial role in enhancing the capabilities of various NLP applications, making text-based interactions more efficient, accurate, and human-like. Through the use of advanced machine learning models like RNNs and LSTMs, next character prediction continues to evolve, opening new possibilities for the future of text-based technology."""

# Defining the RNN model
class CharRNN(nn.Module):
    """Next-character classifier built on a single-layer vanilla RNN.

    Integer character indices are embedded, passed through a batch-first
    ``nn.RNN``, and the output at the final time step is projected to one
    logit per output class.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        # The embedding width equals the recurrent width, so embedded
        # sequences feed the RNN directly.
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.rnn = nn.RNN(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return the logits for the character that follows each sequence in ``x``."""
        step_outputs, _ = self.rnn(self.embedding(x))
        # Only the last time step is used for the prediction.
        last_step = step_outputs[:, -1, :]
        return self.fc(last_step)

# Function to prepare the dataset
def prepare_dataset(text, max_length):
    """Turn ``text`` into fixed-length index windows and split them 80/20.

    Every run of ``max_length`` consecutive characters becomes one input row,
    and the character immediately after that run is its label.

    Returns:
        ``(X_train, X_val, y_train, y_val, chars, char_to_ix, ix_to_char)``
        where ``chars`` is the sorted list of distinct characters and the two
        dicts map characters to indices and back.
    """
    chars = sorted(set(text))
    char_to_ix = {}
    ix_to_char = {}
    for position, symbol in enumerate(chars):
        char_to_ix[symbol] = position
        ix_to_char[position] = symbol

    window_count = len(text) - max_length
    X = np.array([
        [char_to_ix[symbol] for symbol in text[start:start + max_length]]
        for start in range(window_count)
    ])
    y = np.array([char_to_ix[text[start + max_length]] for start in range(window_count)])

    # A fixed random_state keeps the split reproducible across runs.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_val, y_train, y_val, chars, char_to_ix, ix_to_char

def train_and_evaluate(X_train, y_train, X_val, y_val, chars, epochs=100, hidden_size=128, learning_rate=0.005):
    """Train a ``CharRNN`` with full-batch Adam and report validation metrics.

    Args:
        X_train, y_train: Training inputs and target indices; both are
            converted to ``torch.long`` tensors.
        X_val, y_val: Validation inputs and targets, converted the same way.
        chars: Vocabulary; only its length is used (embedding rows and number
            of output classes).
        epochs: Number of optimisation steps, each over the whole training set.
        hidden_size: Embedding and recurrent width passed to ``CharRNN``.
        learning_rate: Adam learning rate.

    Returns:
        ``(model, final_training_loss, val_accuracy, execution_time, model_size)``.
        ``val_accuracy`` is the fraction of correct validation predictions at
        the last evaluation, ``execution_time`` is the wall-clock seconds spent
        in the loop (periodic evaluation included) and ``model_size`` is the
        number of trainable parameters. Loss and accuracy are ``nan`` when
        ``epochs`` is 0.
    """
    model = CharRNN(len(chars), hidden_size, len(chars))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    X_train_tensor = torch.tensor(X_train, dtype=torch.long)
    y_train_tensor = torch.tensor(y_train, dtype=torch.long)
    X_val_tensor = torch.tensor(X_val, dtype=torch.long)
    y_val_tensor = torch.tensor(y_val, dtype=torch.long)

    # Defined up front so the return statement is valid even when the loop
    # body (or the evaluation branch) never runs.
    final_loss = float('nan')
    val_accuracy = float('nan')

    start_time = time.time()
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        output = model(X_train_tensor)
        loss = criterion(output, y_train_tensor)
        loss.backward()
        optimizer.step()
        final_loss = loss.item()

        # Evaluate every 10th epoch and always on the last one, so the
        # returned accuracy describes the returned model.
        if (epoch + 1) % 10 == 0 or epoch + 1 == epochs:
            model.eval()
            with torch.no_grad():
                val_output = model(X_val_tensor)
                val_loss = criterion(val_output, y_val_tensor)
                _, predicted = torch.max(val_output, 1)
                correct = (predicted == y_val_tensor).sum().item()
                total = y_val_tensor.size(0)
                val_accuracy = correct / total
                print(f'Epoch {epoch+1}, Loss: {loss.item()}, Validation Loss: {val_loss.item()}, Validation Accuracy: {val_accuracy}')

    end_time = time.time()
    execution_time = end_time - start_time
    model_size = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return model, final_loss, val_accuracy, execution_time, model_size

# Prediction function
def predict_next_char(model, char_to_ix, ix_to_char, initial_str, max_length):
    """Return the character ``model`` scores highest after ``initial_str``.

    Only the trailing ``max_length`` characters are encoded and passed to the
    model as a batch of one. A character missing from ``char_to_ix`` raises
    ``KeyError``.
    """
    model.eval()
    context = initial_str[-max_length:]
    encoded = [char_to_ix[symbol] for symbol in context]
    with torch.no_grad():
        batch = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)
        logits = model(batch)
        best_index = torch.argmax(logits, dim=1).item()
    return ix_to_char[best_index]

# Main loop for different sequence lengths
sequence_lengths = [10, 20, 30]
results = {}

# Fixed prompt used to sample one prediction from each trained model.
test_str = "Next character prediction is a funda"

for seq_len in sequence_lengths:
    print(f"\nTraining with sequence length: {seq_len}")
    X_train, X_val, y_train, y_val, chars, char_to_ix, ix_to_char = prepare_dataset(text, seq_len)
    model, loss, val_accuracy, execution_time, model_size = train_and_evaluate(
        X_train, y_train, X_val, y_val, chars
    )

    # Keep the headline metrics of this run, keyed by sequence length.
    results[seq_len] = dict(zip(
        ('Training Loss', 'Validation Accuracy', 'Execution Time', 'Model Size'),
        (loss, val_accuracy, execution_time, model_size),
    ))

    # Use the freshly trained model for a single prediction.
    predicted_char = predict_next_char(model, char_to_ix, ix_to_char, test_str, seq_len)
    print(f"Predicted next character for sequence length {seq_len}: '{predicted_char}'")

# Display results
for seq_len, metrics in results.items():
    print(f"\nSequence Length: {seq_len}")
    for key, value in metrics.items():
        print(f"{key}: {value}")


Training with sequence length: 10
Epoch 10, Loss: 2.282480239868164, Validation Loss: 2.314403772354126, Validation Accuracy: 0.36764705882352944
Epoch 20, Loss: 1.825457215309143, Validation Loss: 2.070347547531128, Validation Accuracy: 0.43067226890756305
Epoch 30, Loss: 1.472890853881836, Validation Loss: 1.92751944065094, Validation Accuracy: 0.4579831932773109
Epoch 40, Loss: 1.1645146608352661, Validation Loss: 1.8645515441894531, Validation Accuracy: 0.5063025210084033
Epoch 50, Loss: 0.8783525824546814, Validation Loss: 1.8699814081192017, Validation Accuracy: 0.5063025210084033
Epoch 60, Loss: 0.6238076090812683, Validation Loss: 1.9481017589569092, Validation Accuracy: 0.5105042016806722
Epoch 70, Loss: 0.42213141918182373, Validation Loss: 2.0520007610321045, Validation Accuracy: 0.5126050420168067
Epoch 80, Loss: 0.2613602876663208, Validation Loss: 2.1806812286376953, Validation Accuracy: 0.5210084033613446
Epoch 90, Loss: 0.15947242081165314, Validation Loss: 2.304035902

In [None]:
################################# PROBLEM 1 - LSTM #############################
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
import time

# Sample text
text = """Next character prediction is a fundamental task in the field of natural language processing (NLP) that involves predicting the next character in a sequence of text based on the characters that precede it. This task is essential for various applications, including text auto-completion, spell checking, and even in the development of sophisticated AI models capable of generating human-like text.
At its core, next character prediction relies on statistical models or deep learning algorithms to analyze a given sequence of text and predict which character is most likely to follow. These predictions are based on patterns and relationships learned from large datasets of text during the training phase of the model.
One of the most popular approaches to next character prediction involves the use of Recurrent Neural Networks (RNNs), and more specifically, a variant called Long Short-Term Memory (LSTM) networks. RNNs are particularly well-suited for sequential data like text, as they can maintain information in 'memory' about previous characters to inform the prediction of the next character. LSTM networks enhance this capability by being able to remember long-term dependencies, making them even more effective for next character prediction tasks.
Training a model for next character prediction involves feeding it large amounts of text data, allowing it to learn the probability of each character's appearance following a sequence of characters. During this training process, the model adjusts its parameters to minimize the difference between its predictions and the actual outcomes, thus improving its predictive accuracy over time.
Once trained, the model can be used to predict the next character in a given piece of text by considering the sequence of characters that precede it. This can enhance user experience in text editing software, improve efficiency in coding environments with auto-completion features, and enable more natural interactions with AI-based chatbots and virtual assistants.
In summary, next character prediction plays a crucial role in enhancing the capabilities of various NLP applications, making text-based interactions more efficient, accurate, and human-like. Through the use of advanced machine learning models like RNNs and LSTMs, next character prediction continues to evolve, opening new possibilities for the future of text-based technology."""

# Defining the LSTM model
class CharLSTM(nn.Module):
    """Next-character classifier built on a single-layer LSTM.

    Integer character indices are embedded, passed through a batch-first
    ``nn.LSTM``, and the output at the final time step is projected to one
    logit per output class.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        # The embedding width equals the recurrent width, so embedded
        # sequences feed the LSTM directly.
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.lstm = nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return the logits for the character that follows each sequence in ``x``."""
        # The LSTM also returns its final (hidden, cell) state; it is not needed here.
        step_outputs, _final_state = self.lstm(self.embedding(x))
        last_step = step_outputs[:, -1, :]
        return self.fc(last_step)

# Function to prepare the dataset
def prepare_dataset(text, max_length):
    """Turn ``text`` into fixed-length index windows and split them 80/20.

    Every run of ``max_length`` consecutive characters becomes one input row,
    and the character immediately after that run is its label.

    Returns:
        ``(X_train, X_val, y_train, y_val, chars, char_to_ix, ix_to_char)``
        where ``chars`` is the sorted list of distinct characters and the two
        dicts map characters to indices and back.
    """
    chars = sorted(set(text))
    char_to_ix = {}
    ix_to_char = {}
    for position, symbol in enumerate(chars):
        char_to_ix[symbol] = position
        ix_to_char[position] = symbol

    window_count = len(text) - max_length
    X = np.array([
        [char_to_ix[symbol] for symbol in text[start:start + max_length]]
        for start in range(window_count)
    ])
    y = np.array([char_to_ix[text[start + max_length]] for start in range(window_count)])

    # A fixed random_state keeps the split reproducible across runs.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_val, y_train, y_val, chars, char_to_ix, ix_to_char

def train_and_evaluate(X_train, y_train, X_val, y_val, chars, epochs=100, hidden_size=128, learning_rate=0.005):
    """Train a ``CharLSTM`` with full-batch Adam and report validation metrics.

    Args:
        X_train, y_train: Training inputs and target indices; both are
            converted to ``torch.long`` tensors.
        X_val, y_val: Validation inputs and targets, converted the same way.
        chars: Vocabulary; only its length is used (embedding rows and number
            of output classes).
        epochs: Number of optimisation steps, each over the whole training set.
        hidden_size: Embedding and recurrent width passed to ``CharLSTM``.
        learning_rate: Adam learning rate.

    Returns:
        ``(model, final_training_loss, val_accuracy, execution_time, model_size)``.
        ``val_accuracy`` is the fraction of correct validation predictions at
        the last evaluation, ``execution_time`` is the wall-clock seconds spent
        in the loop (periodic evaluation included) and ``model_size`` is the
        number of trainable parameters. Loss and accuracy are ``nan`` when
        ``epochs`` is 0.
    """
    model = CharLSTM(len(chars), hidden_size, len(chars))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    X_train_tensor = torch.tensor(X_train, dtype=torch.long)
    y_train_tensor = torch.tensor(y_train, dtype=torch.long)
    X_val_tensor = torch.tensor(X_val, dtype=torch.long)
    y_val_tensor = torch.tensor(y_val, dtype=torch.long)

    # Defined up front so the return statement is valid even when the loop
    # body (or the evaluation branch) never runs.
    final_loss = float('nan')
    val_accuracy = float('nan')

    start_time = time.time()
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        output = model(X_train_tensor)
        loss = criterion(output, y_train_tensor)
        loss.backward()
        optimizer.step()
        final_loss = loss.item()

        # Evaluate every 10th epoch and always on the last one, so the
        # returned accuracy describes the returned model.
        if (epoch + 1) % 10 == 0 or epoch + 1 == epochs:
            model.eval()
            with torch.no_grad():
                val_output = model(X_val_tensor)
                val_loss = criterion(val_output, y_val_tensor)
                _, predicted = torch.max(val_output, 1)
                correct = (predicted == y_val_tensor).sum().item()
                total = y_val_tensor.size(0)
                val_accuracy = correct / total
                print(f'Epoch {epoch+1}, Loss: {loss.item()}, Validation Loss: {val_loss.item()}, Validation Accuracy: {val_accuracy}')

    end_time = time.time()
    execution_time = end_time - start_time
    model_size = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return model, final_loss, val_accuracy, execution_time, model_size

# Prediction function
def predict_next_char(model, char_to_ix, ix_to_char, initial_str, max_length):
    """Return the character ``model`` scores highest after ``initial_str``.

    Only the trailing ``max_length`` characters are encoded and passed to the
    model as a batch of one. A character missing from ``char_to_ix`` raises
    ``KeyError``.
    """
    model.eval()
    context = initial_str[-max_length:]
    encoded = [char_to_ix[symbol] for symbol in context]
    with torch.no_grad():
        batch = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)
        logits = model(batch)
        best_index = torch.argmax(logits, dim=1).item()
    return ix_to_char[best_index]

# Main loop for different sequence lengths
sequence_lengths = [10, 20, 30]
results = {}

# Fixed prompt used to sample one prediction from each trained model.
test_str = "Next character prediction is a funda"

for seq_len in sequence_lengths:
    print(f"\nTraining with sequence length: {seq_len}")
    X_train, X_val, y_train, y_val, chars, char_to_ix, ix_to_char = prepare_dataset(text, seq_len)
    model, loss, val_accuracy, execution_time, model_size = train_and_evaluate(
        X_train, y_train, X_val, y_val, chars
    )

    # Keep the headline metrics of this run, keyed by sequence length.
    results[seq_len] = dict(zip(
        ('Training Loss', 'Validation Accuracy', 'Execution Time', 'Model Size'),
        (loss, val_accuracy, execution_time, model_size),
    ))

    # Use the freshly trained model for a single prediction.
    predicted_char = predict_next_char(model, char_to_ix, ix_to_char, test_str, seq_len)
    print(f"Predicted next character for sequence length {seq_len}: '{predicted_char}'")

# Display results
for seq_len, metrics in results.items():
    print(f"\nSequence Length: {seq_len}")
    for key, value in metrics.items():
        print(f"{key}: {value}")


Training with sequence length: 10
Epoch 10, Loss: 2.5216989517211914, Validation Loss: 2.4801862239837646, Validation Accuracy: 0.3319327731092437
Epoch 20, Loss: 2.036288261413574, Validation Loss: 2.1897504329681396, Validation Accuracy: 0.4117647058823529
Epoch 30, Loss: 1.6475170850753784, Validation Loss: 2.020236015319824, Validation Accuracy: 0.4579831932773109
Epoch 40, Loss: 1.2916737794876099, Validation Loss: 1.9275712966918945, Validation Accuracy: 0.48739495798319327
Epoch 50, Loss: 0.969119668006897, Validation Loss: 1.9110673666000366, Validation Accuracy: 0.523109243697479
Epoch 60, Loss: 0.6784646511077881, Validation Loss: 1.9523450136184692, Validation Accuracy: 0.5147058823529411
Epoch 70, Loss: 0.4379761815071106, Validation Loss: 2.0450613498687744, Validation Accuracy: 0.523109243697479
Epoch 80, Loss: 0.2670011520385742, Validation Loss: 2.1699278354644775, Validation Accuracy: 0.5189075630252101
Epoch 90, Loss: 0.15875595808029175, Validation Loss: 2.309719800

In [None]:
################################# PROBLEM 1 - GRU #############################
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
import time

# Sample text
text = """Next character prediction is a fundamental task in the field of natural language processing (NLP) that involves predicting the next character in a sequence of text based on the characters that precede it. This task is essential for various applications, including text auto-completion, spell checking, and even in the development of sophisticated AI models capable of generating human-like text.
At its core, next character prediction relies on statistical models or deep learning algorithms to analyze a given sequence of text and predict which character is most likely to follow. These predictions are based on patterns and relationships learned from large datasets of text during the training phase of the model.
One of the most popular approaches to next character prediction involves the use of Recurrent Neural Networks (RNNs), and more specifically, a variant called Long Short-Term Memory (LSTM) networks. RNNs are particularly well-suited for sequential data like text, as they can maintain information in 'memory' about previous characters to inform the prediction of the next character. LSTM networks enhance this capability by being able to remember long-term dependencies, making them even more effective for next character prediction tasks.
Training a model for next character prediction involves feeding it large amounts of text data, allowing it to learn the probability of each character's appearance following a sequence of characters. During this training process, the model adjusts its parameters to minimize the difference between its predictions and the actual outcomes, thus improving its predictive accuracy over time.
Once trained, the model can be used to predict the next character in a given piece of text by considering the sequence of characters that precede it. This can enhance user experience in text editing software, improve efficiency in coding environments with auto-completion features, and enable more natural interactions with AI-based chatbots and virtual assistants.
In summary, next character prediction plays a crucial role in enhancing the capabilities of various NLP applications, making text-based interactions more efficient, accurate, and human-like. Through the use of advanced machine learning models like RNNs and LSTMs, next character prediction continues to evolve, opening new possibilities for the future of text-based technology."""

# Defining the GRU model
class CharGRU(nn.Module):
    """Next-character classifier built on a single-layer GRU.

    Integer character indices are embedded, passed through a batch-first
    ``nn.GRU``, and the output at the final time step is projected to one
    logit per output class.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        # The embedding width equals the recurrent width, so embedded
        # sequences feed the GRU directly.
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=hidden_size)
        self.gru = nn.GRU(input_size=hidden_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return the logits for the character that follows each sequence in ``x``."""
        # The GRU also returns its final hidden state; it is not needed here.
        step_outputs, _final_hidden = self.gru(self.embedding(x))
        last_step = step_outputs[:, -1, :]
        return self.fc(last_step)

# Function to prepare the dataset
def prepare_dataset(text, max_length):
    """Turn ``text`` into fixed-length index windows and split them 80/20.

    Every run of ``max_length`` consecutive characters becomes one input row,
    and the character immediately after that run is its label.

    Returns:
        ``(X_train, X_val, y_train, y_val, chars, char_to_ix, ix_to_char)``
        where ``chars`` is the sorted list of distinct characters and the two
        dicts map characters to indices and back.
    """
    chars = sorted(set(text))
    char_to_ix = {}
    ix_to_char = {}
    for position, symbol in enumerate(chars):
        char_to_ix[symbol] = position
        ix_to_char[position] = symbol

    window_count = len(text) - max_length
    X = np.array([
        [char_to_ix[symbol] for symbol in text[start:start + max_length]]
        for start in range(window_count)
    ])
    y = np.array([char_to_ix[text[start + max_length]] for start in range(window_count)])

    # A fixed random_state keeps the split reproducible across runs.
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_val, y_train, y_val, chars, char_to_ix, ix_to_char

def train_and_evaluate(X_train, y_train, X_val, y_val, chars, epochs=100, hidden_size=128, learning_rate=0.005):
    """Train a ``CharGRU`` with full-batch Adam and report validation metrics.

    Args:
        X_train, y_train: Training inputs and target indices; both are
            converted to ``torch.long`` tensors.
        X_val, y_val: Validation inputs and targets, converted the same way.
        chars: Vocabulary; only its length is used (embedding rows and number
            of output classes).
        epochs: Number of optimisation steps, each over the whole training set.
        hidden_size: Embedding and recurrent width passed to ``CharGRU``.
        learning_rate: Adam learning rate.

    Returns:
        ``(model, final_training_loss, val_accuracy, execution_time, model_size)``.
        ``val_accuracy`` is the fraction of correct validation predictions at
        the last evaluation, ``execution_time`` is the wall-clock seconds spent
        in the loop (periodic evaluation included) and ``model_size`` is the
        number of trainable parameters. Loss and accuracy are ``nan`` when
        ``epochs`` is 0.
    """
    model = CharGRU(len(chars), hidden_size, len(chars))
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    X_train_tensor = torch.tensor(X_train, dtype=torch.long)
    y_train_tensor = torch.tensor(y_train, dtype=torch.long)
    X_val_tensor = torch.tensor(X_val, dtype=torch.long)
    y_val_tensor = torch.tensor(y_val, dtype=torch.long)

    # Defined up front so the return statement is valid even when the loop
    # body (or the evaluation branch) never runs.
    final_loss = float('nan')
    val_accuracy = float('nan')

    start_time = time.time()
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        output = model(X_train_tensor)
        loss = criterion(output, y_train_tensor)
        loss.backward()
        optimizer.step()
        final_loss = loss.item()

        # Evaluate every 10th epoch and always on the last one, so the
        # returned accuracy describes the returned model.
        if (epoch + 1) % 10 == 0 or epoch + 1 == epochs:
            model.eval()
            with torch.no_grad():
                val_output = model(X_val_tensor)
                val_loss = criterion(val_output, y_val_tensor)
                _, predicted = torch.max(val_output, 1)
                correct = (predicted == y_val_tensor).sum().item()
                total = y_val_tensor.size(0)
                val_accuracy = correct / total
                print(f'Epoch {epoch+1}, Loss: {loss.item()}, Validation Loss: {val_loss.item()}, Validation Accuracy: {val_accuracy}')

    end_time = time.time()
    execution_time = end_time - start_time
    model_size = sum(p.numel() for p in model.parameters() if p.requires_grad)
    return model, final_loss, val_accuracy, execution_time, model_size

# Prediction function
def predict_next_char(model, char_to_ix, ix_to_char, initial_str, max_length):
    """Return the character ``model`` scores highest after ``initial_str``.

    Only the trailing ``max_length`` characters are encoded and passed to the
    model as a batch of one. A character missing from ``char_to_ix`` raises
    ``KeyError``.
    """
    model.eval()
    context = initial_str[-max_length:]
    encoded = [char_to_ix[symbol] for symbol in context]
    with torch.no_grad():
        batch = torch.tensor(encoded, dtype=torch.long).unsqueeze(0)
        logits = model(batch)
        best_index = torch.argmax(logits, dim=1).item()
    return ix_to_char[best_index]

# Main loop for different sequence lengths
sequence_lengths = [10, 20, 30]
results = {}

# Fixed prompt used to sample one prediction from each trained model.
test_str = "Next character prediction is a funda"

for seq_len in sequence_lengths:
    print(f"\nTraining with sequence length: {seq_len}")
    X_train, X_val, y_train, y_val, chars, char_to_ix, ix_to_char = prepare_dataset(text, seq_len)
    model, loss, val_accuracy, execution_time, model_size = train_and_evaluate(
        X_train, y_train, X_val, y_val, chars
    )

    # Keep the headline metrics of this run, keyed by sequence length.
    results[seq_len] = dict(zip(
        ('Training Loss', 'Validation Accuracy', 'Execution Time', 'Model Size'),
        (loss, val_accuracy, execution_time, model_size),
    ))

    # Use the freshly trained model for a single prediction.
    predicted_char = predict_next_char(model, char_to_ix, ix_to_char, test_str, seq_len)
    print(f"Predicted next character for sequence length {seq_len}: '{predicted_char}'")

# Display results
for seq_len, metrics in results.items():
    print(f"\nSequence Length: {seq_len}")
    for key, value in metrics.items():
        print(f"{key}: {value}")


Training with sequence length: 10
Epoch 10, Loss: 2.375535249710083, Validation Loss: 2.361632823944092, Validation Accuracy: 0.36134453781512604
Epoch 20, Loss: 1.8528224229812622, Validation Loss: 2.0633323192596436, Validation Accuracy: 0.4432773109243697
Epoch 30, Loss: 1.4482131004333496, Validation Loss: 1.9354394674301147, Validation Accuracy: 0.4957983193277311
Epoch 40, Loss: 1.08462655544281, Validation Loss: 1.880557656288147, Validation Accuracy: 0.5063025210084033
Epoch 50, Loss: 0.7647031545639038, Validation Loss: 1.8864102363586426, Validation Accuracy: 0.5294117647058824
Epoch 60, Loss: 0.4978640079498291, Validation Loss: 1.9605859518051147, Validation Accuracy: 0.5210084033613446
Epoch 70, Loss: 0.2996021509170532, Validation Loss: 2.079310417175293, Validation Accuracy: 0.5315126050420168
Epoch 80, Loss: 0.17222781479358673, Validation Loss: 2.224405527114868, Validation Accuracy: 0.5294117647058824
Epoch 90, Loss: 0.10416688024997711, Validation Loss: 2.3575181961

In [None]:
################################# PROBLEM 2 - LSTM #############################
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
import requests
import time

# Download the dataset
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# The timeout keeps a stalled connection from hanging the run, and
# raise_for_status() stops an HTTP error page from being used as training text.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text

# Character mapping to integers: each distinct character gets the index of its
# position in the sorted character list.
chars = sorted(set(text))
char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = {i: ch for i, ch in enumerate(chars)}
vocab_size = len(chars)

# Dataset class
class CharDataset(Dataset):
    """Map-style dataset that pairs each input sequence with its target.

    ``sequences`` and ``targets`` are stored exactly as given and are indexed
    with the same key; the dataset length is taken from ``sequences``.
    """

    def __init__(self, sequences, targets):
        self.sequences, self.targets = sequences, targets

    def __len__(self):
        total = len(self.sequences)
        return total

    def __getitem__(self, index):
        sample = self.sequences[index]
        label = self.targets[index]
        return sample, label

# LSTM Model class
class LSTMModel(nn.Module):
    """Stacked LSTM over one-hot character vectors with a linear read-out.

    Args:
        vocab_size: Width of each input vector and number of output classes.
        hidden_dim: LSTM hidden size.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, hidden_dim, num_layers):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(vocab_size, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return logits computed from the LSTM output at the last time step.

        ``x`` is indexed batch-first: ``x.size(0)`` is used as the batch size.
        """
        # Take the sizes from the layer itself rather than from module-level
        # names, so the model works wherever it is constructed; new_zeros puts
        # the initial state on the same device and dtype as the input.
        state_shape = (self.lstm.num_layers, x.size(0), self.lstm.hidden_size)
        h0 = x.new_zeros(state_shape)
        c0 = x.new_zeros(state_shape)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# Function to train and validate the model
def train_and_validate(model, train_loader, test_loader, criterion, optimizer, epochs=5):
    """Train ``model`` for ``epochs`` passes and validate after each pass.

    Both loaders yield ``(sequences, targets)`` batches of integer indices.
    Sequences are one-hot encoded to ``vocab_size`` classes (a module-level
    name) before being passed to the model. After every epoch the mean
    per-batch loss and the accuracy in percent are printed for both loaders.
    The total wall-clock time (validation included) is printed once, after
    the final epoch.
    """
    start_time = time.time()
    for epoch in range(epochs):
        model.train()
        total_train_loss = 0
        total_train_correct = 0
        total_train_samples = 0

        for sequences, targets in train_loader:
            sequences_one_hot = torch.nn.functional.one_hot(sequences, num_classes=vocab_size).float()
            optimizer.zero_grad()
            outputs = model(sequences_one_hot)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            total_train_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total_train_correct += (predicted == targets).sum().item()
            total_train_samples += targets.size(0)

        train_accuracy = 100 * total_train_correct / total_train_samples

        model.eval()
        total_val_loss = 0
        total_val_correct = 0
        total_val_samples = 0
        with torch.no_grad():
            for sequences, targets in test_loader:
                sequences_one_hot = torch.nn.functional.one_hot(sequences, num_classes=vocab_size).float()
                outputs = model(sequences_one_hot)
                loss = criterion(outputs, targets)
                total_val_loss += loss.item()
                _, predicted = torch.max(outputs, 1)
                total_val_correct += (predicted == targets).sum().item()
                total_val_samples += targets.size(0)

        val_accuracy = 100 * total_val_correct / total_val_samples

        print(f'Epoch {epoch+1}, Train Loss: {total_train_loss / len(train_loader)}, '
              f'Train Accuracy: {train_accuracy:.2f}%, '
              f'Val Loss: {total_val_loss / len(test_loader)}, '
              f'Val Accuracy: {val_accuracy:.2f}%')

    # Measured after the loop so the figure really is the total, not a
    # running value reported once per epoch.
    end_time = time.time()
    training_time = end_time - start_time
    print(f'Total Training Time: {training_time:.2f} seconds')

# Prediction function
def predict_next_char(model, char_to_ix, ix_to_char, initial_str, max_length):
    """Return the character ``model`` scores highest after ``initial_str``.

    The trailing ``max_length`` characters are mapped to indices (characters
    missing from ``char_to_ix`` fall back to index 0), one-hot encoded and
    passed to the model as a batch of one.

    Args:
        model: Callable module taking a float one-hot tensor and returning
            one row of class scores per batch item.
        char_to_ix: Mapping from character to index.
        ix_to_char: Mapping from index back to character; its size sets the
            one-hot width.
        initial_str: Prompt text.
        max_length: Number of trailing prompt characters to use.
    """
    model.eval()
    with torch.no_grad():
        sequence = [char_to_ix.get(c, 0) for c in initial_str[-max_length:]]  # Handle characters not in dict
        sequence = torch.tensor(sequence, dtype=torch.long).unsqueeze(0)
        # Size the encoding from the vocabulary that was passed in, so the
        # function does not depend on a module-level ``vocab_size``.
        sequence_one_hot = torch.nn.functional.one_hot(sequence, num_classes=len(ix_to_char)).float()
        prediction = model(sequence_one_hot)
        predicted_index = torch.argmax(prediction, dim=1).item()
        return ix_to_char[predicted_index]

# Training configurations
sequence_lengths = [20, 30, 50]
hidden_dims = [64, 128]
num_layers_list = [1, 2]

# Encode the corpus once; every sequence length reuses the same index tensor.
encoded_text = torch.tensor([char_to_int[ch] for ch in text], dtype=torch.long)

# Fixed prompt used to sample one prediction from each trained model.
test_str = "We are accounted poor citiz"

for sequence_length in sequence_lengths:
    # unfold() exposes every window of `sequence_length` consecutive indices as
    # a view over encoded_text, avoiding one Python list per window. The last
    # window is dropped because no character follows it.
    sequences = encoded_text.unfold(0, sequence_length, 1)[:-1]
    # targets[i] is the character right after window i.
    targets = encoded_text[sequence_length:]

    dataset = CharDataset(sequences, targets)
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

    # Train one model per (hidden_dim, num_layers) combination on this split.
    for hidden_dim in hidden_dims:
        for num_layers in num_layers_list:
            print(f"Training with sequence_length={sequence_length}, hidden_dim={hidden_dim}, num_layers={num_layers}")
            model = LSTMModel(vocab_size, hidden_dim, num_layers)
            model_complexity = sum(p.numel() for p in model.parameters() if p.requires_grad)
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=0.01)
            train_and_validate(model, train_loader, test_loader, criterion, optimizer, epochs=5)
            print(f'Model Complexity (Number of Trainable Parameters): {model_complexity}')
            # Prediction example after training
            predicted_char = predict_next_char(model, char_to_int, int_to_char, test_str, sequence_length)
            print(f"Predicted next character for sequence length {sequence_length}: '{predicted_char}'")

Training with sequence_length=20, hidden_dim=64, num_layers=1
Total Training Time: 232.07 seconds
Epoch 1, Train Loss: 1.8344687885138335, Train Accuracy: 45.96%, Val Loss: 1.6776731163348813, Val Accuracy: 49.89%
Total Training Time: 442.61 seconds
Epoch 2, Train Loss: 1.635450831036833, Train Accuracy: 51.00%, Val Loss: 1.6193877157642453, Val Accuracy: 51.38%
Total Training Time: 654.36 seconds
Epoch 3, Train Loss: 1.5996053367767673, Train Accuracy: 51.92%, Val Loss: 1.5951640675687544, Val Accuracy: 51.99%
Total Training Time: 863.89 seconds
Epoch 4, Train Loss: 1.5844321570140634, Train Accuracy: 52.35%, Val Loss: 1.5814144905449259, Val Accuracy: 52.43%
Total Training Time: 1073.57 seconds
Epoch 5, Train Loss: 1.5713836720498622, Train Accuracy: 52.66%, Val Loss: 1.5769753077901512, Val Accuracy: 52.82%
Model Complexity (Number of Trainable Parameters): 37761
Predicted next character for sequence length 20: 'e'
Training with sequence_length=20, hidden_dim=64, num_layers=2
Total 

In [None]:
################################# PROBLEM 2 - GRU #############################
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
import requests
import time

# Download the dataset
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# The timeout keeps a stalled connection from hanging the run, and
# raise_for_status() stops an HTTP error page from being used as training text.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text

# Character mapping to integers: each distinct character gets the index of its
# position in the sorted character list.
chars = sorted(set(text))
char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = {i: ch for i, ch in enumerate(chars)}
vocab_size = len(chars)

# Dataset class
class CharDataset(Dataset):
    """Map-style dataset that pairs each input sequence with its target.

    ``sequences`` and ``targets`` are stored exactly as given and are indexed
    with the same key; the dataset length is taken from ``sequences``.
    """

    def __init__(self, sequences, targets):
        self.sequences, self.targets = sequences, targets

    def __len__(self):
        total = len(self.sequences)
        return total

    def __getitem__(self, index):
        sample = self.sequences[index]
        label = self.targets[index]
        return sample, label

# GRU Model class
class GRUModel(nn.Module):
    """GRU over a one-hot sequence, followed by a linear layer on the last step.

    Args:
        vocab_size: Size of each input feature vector and of the output logits.
        hidden_dim: Number of features in the GRU hidden state.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(self, vocab_size, hidden_dim, num_layers):
        super(GRUModel, self).__init__()
        # Keep the sizes on the instance: forward() must not depend on
        # module-level variables that merely happen to share these names.
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.gru = nn.GRU(vocab_size, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return logits computed from the last time step of ``x``.

        Args:
            x: Float tensor of shape ``(batch, seq_len, vocab_size)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)``.
        """
        # Zero initial hidden state, created on the same device/dtype as the
        # input so the model also works when moved off the CPU.
        h0 = torch.zeros(
            self.num_layers, x.size(0), self.hidden_dim,
            device=x.device, dtype=x.dtype,
        )
        out, _ = self.gru(x, h0)
        # Only the output of the final time step feeds the classifier.
        out = self.fc(out[:, -1, :])
        return out

# Function to train and validate the model
def train_and_validate(model, train_loader, test_loader, criterion, optimizer, epochs=5, num_classes=None):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Args:
        model: Module mapping a one-hot float batch to per-class logits.
        train_loader: Sized iterable of ``(sequences, targets)`` index batches.
        test_loader: Sized iterable of ``(sequences, targets)`` index batches
            used for validation.
        criterion: Loss callable applied to ``(logits, targets)``.
        optimizer: Optimizer updating the parameters of ``model``.
        epochs: Number of passes over ``train_loader``.
        num_classes: Width of the one-hot encoding. When ``None`` the
            module-level ``vocab_size`` is used, as before.

    Returns:
        A list with one dict per epoch holding ``epoch``, ``train_loss``,
        ``train_accuracy``, ``val_loss`` and ``val_accuracy``.

    Raises:
        ValueError: If either loader yields no batches.
    """
    if num_classes is None:
        # Backward-compatible fallback to the notebook-level vocabulary size.
        num_classes = vocab_size
    if len(train_loader) == 0 or len(test_loader) == 0:
        # Fail before training rather than with a ZeroDivisionError after it.
        raise ValueError("train_loader and test_loader must each yield at least one batch")

    history = []
    start_time = time.time()
    for epoch in range(epochs):
        model.train()
        train_loss, train_accuracy = _run_epoch(model, train_loader, criterion, num_classes, optimizer)

        model.eval()
        with torch.no_grad():
            val_loss, val_accuracy = _run_epoch(model, test_loader, criterion, num_classes)

        # Cumulative wall-clock time since the first epoch started; it covers
        # the validation passes as well as the training passes.
        training_time = time.time() - start_time
        print(f'Total Training Time: {training_time:.2f} seconds')

        print(f'Epoch {epoch+1}, Train Loss: {train_loss}, '
              f'Train Accuracy: {train_accuracy:.2f}%, '
              f'Val Loss: {val_loss}, '
              f'Val Accuracy: {val_accuracy:.2f}%')

        history.append({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "train_accuracy": train_accuracy,
            "val_loss": val_loss,
            "val_accuracy": val_accuracy,
        })
    return history


def _run_epoch(model, loader, criterion, num_classes, optimizer=None):
    """Make one pass over ``loader``; update weights only when ``optimizer`` is given.

    Returns a ``(mean_batch_loss, accuracy_percent)`` tuple.
    """
    loss_sum = 0.0
    n_correct = 0
    n_samples = 0
    for sequences, targets in loader:
        inputs = torch.nn.functional.one_hot(sequences, num_classes=num_classes).float()
        if optimizer is not None:
            optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        if optimizer is not None:
            loss.backward()
            optimizer.step()
        loss_sum += loss.item()
        _, predicted = torch.max(outputs, 1)
        n_correct += (predicted == targets).sum().item()
        n_samples += targets.size(0)
    # Loss is averaged per batch, accuracy per sample.
    return loss_sum / len(loader), 100 * n_correct / n_samples

# Prediction function
def predict_next_char(model, char_to_ix, ix_to_char, initial_str, max_length):
    """Predict the character most likely to follow ``initial_str``.

    Args:
        model: Module mapping a one-hot float batch to per-class logits.
        char_to_ix: Mapping from character to class index.
        ix_to_char: Mapping from class index back to character.
        initial_str: Prompt text; only its last ``max_length`` characters are used.
        max_length: Maximum number of trailing characters fed to the model.

    Returns:
        The character whose logit is largest.

    Raises:
        ValueError: If ``initial_str`` is empty.
    """
    if not initial_str:
        raise ValueError("initial_str must contain at least one character")

    model.eval()
    # Size the one-hot encoding from the mapping that was passed in, so the
    # function does not depend on a module-level vocabulary size.
    num_classes = len(char_to_ix)
    with torch.no_grad():
        # Characters missing from the mapping fall back to index 0.
        indices = [char_to_ix.get(c, 0) for c in initial_str[-max_length:]]
        # unsqueeze(0) adds the batch dimension (a batch of one sequence).
        batch = torch.tensor(indices, dtype=torch.long).unsqueeze(0)
        one_hot = torch.nn.functional.one_hot(batch, num_classes=num_classes).float()
        logits = model(one_hot)
        predicted_index = torch.argmax(logits, dim=1).item()
    return ix_to_char[predicted_index]

# Training configurations
sequence_lengths = [20, 30]
hidden_dims = [64, 128]
num_layers_list = [1, 2]

# Encode the corpus once; every sequence length below reuses this tensor.
encoded_text = torch.tensor([char_to_int[ch] for ch in text], dtype=torch.long)

for sequence_length in sequence_lengths:
    # Sliding windows of `sequence_length` characters (stride 1), taken as a
    # view on encoded_text instead of building one Python list per window.
    # The final window is dropped because no character follows it.
    sequences = encoded_text.unfold(0, sequence_length, 1)[:-1]
    # targets[i] is the character immediately after sequences[i].
    targets = encoded_text[sequence_length:]

    dataset = CharDataset(sequences, targets)
    # 80/20 random split into training and validation subsets.
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

    # The loop variable names below are kept unchanged because they are
    # visible as module-level globals to the rest of the cell.
    for hidden_dim in hidden_dims:
        for num_layers in num_layers_list:
            print(f"Training with sequence_length={sequence_length}, hidden_dim={hidden_dim}, num_layers={num_layers}")
            model = GRUModel(vocab_size, hidden_dim, num_layers)
            model_complexity = sum(p.numel() for p in model.parameters() if p.requires_grad)
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=0.005)
            train_and_validate(model, train_loader, test_loader, criterion, optimizer, epochs=5)
            print(f'Model Complexity (Number of Trainable Parameters): {model_complexity}')
            test_str = "We are accounted poor citiz"
            predicted_char = predict_next_char(model, char_to_int, int_to_char, test_str, sequence_length)
            print(f"Predicted next character for sequence length {sequence_length}: '{predicted_char}'")


Training with sequence_length=20, hidden_dim=64, num_layers=1
Total Training Time: 106.58 seconds
Epoch 1, Train Loss: 1.850730815924794, Train Accuracy: 45.71%, Val Loss: 1.6985545118034262, Val Accuracy: 49.42%
Total Training Time: 213.27 seconds
Epoch 2, Train Loss: 1.6591577898546395, Train Accuracy: 50.57%, Val Loss: 1.6518393607229866, Val Accuracy: 50.67%
Total Training Time: 319.02 seconds
Epoch 3, Train Loss: 1.624965954360371, Train Accuracy: 51.40%, Val Loss: 1.6235939019449948, Val Accuracy: 51.82%
Total Training Time: 426.51 seconds
Epoch 4, Train Loss: 1.6078284868582595, Train Accuracy: 51.82%, Val Loss: 1.6210839941935231, Val Accuracy: 51.43%
Total Training Time: 533.14 seconds
Epoch 5, Train Loss: 1.5991160287914505, Train Accuracy: 52.01%, Val Loss: 1.6128737849882808, Val Accuracy: 51.80%
Model Complexity (Number of Trainable Parameters): 29377
Predicted next character for sequence length 20: 'e'
Training with sequence_length=20, hidden_dim=64, num_layers=2
Total Tr

In [None]:
########################## PART 3 LSTM - SEQUENCE LENGTH 50 ####################

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
import requests
import time

# Download the dataset
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error; otherwise the body of an error page would
# silently be used as the training corpus.
response.raise_for_status()
text = response.text

# Character mapping to integers
# Sorting the distinct characters makes the index assignment deterministic.
chars = sorted(set(text))
char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = {i: ch for i, ch in enumerate(chars)}
vocab_size = len(chars)

# Dataset class
class CharDataset(Dataset):
    """Indexable collection of ``(sequence, target)`` training pairs."""

    def __init__(self, sequences, targets):
        # Both containers are kept as given; nothing is copied or validated.
        self.sequences, self.targets = sequences, targets

    def __len__(self):
        """Return how many sequences are stored."""
        n_items = len(self.sequences)
        return n_items

    def __getitem__(self, index):
        """Return the ``(sequence, target)`` pair found at ``index``."""
        seq = self.sequences[index]
        tgt = self.targets[index]
        return seq, tgt

# LSTM Model class
class LSTMModel(nn.Module):
    """LSTM over a one-hot sequence, followed by a linear layer on the last step.

    Args:
        vocab_size: Size of each input feature vector and of the output logits.
        hidden_dim: Number of features in the LSTM hidden and cell states.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, hidden_dim, num_layers):
        super(LSTMModel, self).__init__()
        # Keep the sizes on the instance: forward() must not depend on
        # module-level variables that merely happen to share these names.
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(vocab_size, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return logits computed from the last time step of ``x``.

        Args:
            x: Float tensor of shape ``(batch, seq_len, vocab_size)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)``.
        """
        # Zero initial hidden and cell states, created on the same
        # device/dtype as the input so the model also works off the CPU.
        state_shape = (self.num_layers, x.size(0), self.hidden_dim)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        out, _ = self.lstm(x, (h0, c0))
        # Only the output of the final time step feeds the classifier.
        out = self.fc(out[:, -1, :])
        return out

# Function to train and validate the model
def train_and_validate(model, train_loader, test_loader, criterion, optimizer, epochs=5, num_classes=None):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Args:
        model: Module mapping a one-hot float batch to per-class logits.
        train_loader: Sized iterable of ``(sequences, targets)`` index batches.
        test_loader: Sized iterable of ``(sequences, targets)`` index batches
            used for validation.
        criterion: Loss callable applied to ``(logits, targets)``.
        optimizer: Optimizer updating the parameters of ``model``.
        epochs: Number of passes over ``train_loader``.
        num_classes: Width of the one-hot encoding. When ``None`` the
            module-level ``vocab_size`` is used, as before.

    Returns:
        A list with one dict per epoch holding ``epoch``, ``train_loss``,
        ``train_accuracy``, ``val_loss`` and ``val_accuracy``.

    Raises:
        ValueError: If either loader yields no batches.
    """
    if num_classes is None:
        # Backward-compatible fallback to the notebook-level vocabulary size.
        num_classes = vocab_size
    if len(train_loader) == 0 or len(test_loader) == 0:
        # Fail before training rather than with a ZeroDivisionError after it.
        raise ValueError("train_loader and test_loader must each yield at least one batch")

    history = []
    start_time = time.time()
    for epoch in range(epochs):
        model.train()
        train_loss, train_accuracy = _run_epoch(model, train_loader, criterion, num_classes, optimizer)

        model.eval()
        with torch.no_grad():
            val_loss, val_accuracy = _run_epoch(model, test_loader, criterion, num_classes)

        # Cumulative wall-clock time since the first epoch started; it covers
        # the validation passes as well as the training passes.
        training_time = time.time() - start_time
        print(f'Total Training Time: {training_time:.2f} seconds')

        print(f'Epoch {epoch+1}, Train Loss: {train_loss}, '
              f'Train Accuracy: {train_accuracy:.2f}%, '
              f'Val Loss: {val_loss}, '
              f'Val Accuracy: {val_accuracy:.2f}%')

        history.append({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "train_accuracy": train_accuracy,
            "val_loss": val_loss,
            "val_accuracy": val_accuracy,
        })
    return history


def _run_epoch(model, loader, criterion, num_classes, optimizer=None):
    """Make one pass over ``loader``; update weights only when ``optimizer`` is given.

    Returns a ``(mean_batch_loss, accuracy_percent)`` tuple.
    """
    loss_sum = 0.0
    n_correct = 0
    n_samples = 0
    for sequences, targets in loader:
        inputs = torch.nn.functional.one_hot(sequences, num_classes=num_classes).float()
        if optimizer is not None:
            optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        if optimizer is not None:
            loss.backward()
            optimizer.step()
        loss_sum += loss.item()
        _, predicted = torch.max(outputs, 1)
        n_correct += (predicted == targets).sum().item()
        n_samples += targets.size(0)
    # Loss is averaged per batch, accuracy per sample.
    return loss_sum / len(loader), 100 * n_correct / n_samples

# Prediction function
def predict_next_char(model, char_to_ix, ix_to_char, initial_str, max_length):
    """Predict the character most likely to follow ``initial_str``.

    Args:
        model: Module mapping a one-hot float batch to per-class logits.
        char_to_ix: Mapping from character to class index.
        ix_to_char: Mapping from class index back to character.
        initial_str: Prompt text; only its last ``max_length`` characters are used.
        max_length: Maximum number of trailing characters fed to the model.

    Returns:
        The character whose logit is largest.

    Raises:
        ValueError: If ``initial_str`` is empty.
    """
    if not initial_str:
        raise ValueError("initial_str must contain at least one character")

    model.eval()
    # Size the one-hot encoding from the mapping that was passed in, so the
    # function does not depend on a module-level vocabulary size.
    num_classes = len(char_to_ix)
    with torch.no_grad():
        # Characters missing from the mapping fall back to index 0.
        indices = [char_to_ix.get(c, 0) for c in initial_str[-max_length:]]
        # unsqueeze(0) adds the batch dimension (a batch of one sequence).
        batch = torch.tensor(indices, dtype=torch.long).unsqueeze(0)
        one_hot = torch.nn.functional.one_hot(batch, num_classes=num_classes).float()
        logits = model(one_hot)
        predicted_index = torch.argmax(logits, dim=1).item()
    return ix_to_char[predicted_index]

# Training configurations
sequence_lengths = [50]
hidden_dims = [128]
num_layers_list = [1]

# Encode the corpus once; every sequence length below reuses this tensor.
encoded_text = torch.tensor([char_to_int[ch] for ch in text], dtype=torch.long)

for sequence_length in sequence_lengths:
    # Sliding windows of `sequence_length` characters (stride 1), taken as a
    # view on encoded_text instead of building one Python list per window.
    # The final window is dropped because no character follows it.
    sequences = encoded_text.unfold(0, sequence_length, 1)[:-1]
    # targets[i] is the character immediately after sequences[i].
    targets = encoded_text[sequence_length:]

    dataset = CharDataset(sequences, targets)
    # 80/20 random split into training and validation subsets.
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

    # The loop variable names below are kept unchanged because they are
    # visible as module-level globals to the rest of the cell.
    for hidden_dim in hidden_dims:
        for num_layers in num_layers_list:
            print(f"Training with sequence_length={sequence_length}, hidden_dim={hidden_dim}, num_layers={num_layers}")
            model = LSTMModel(vocab_size, hidden_dim, num_layers)
            model_complexity = sum(p.numel() for p in model.parameters() if p.requires_grad)
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=0.01)
            train_and_validate(model, train_loader, test_loader, criterion, optimizer, epochs=5)
            print(f'Model Complexity (Number of Trainable Parameters): {model_complexity}')
            test_str = "We are accounted poor citiz"
            predicted_char = predict_next_char(model, char_to_int, int_to_char, test_str, sequence_length)
            print(f"Predicted next character for sequence length {sequence_length}: '{predicted_char}'")

Training with sequence_length=50, hidden_dim=128, num_layers=1
Total Training Time: 1196.74 seconds
Epoch 1, Train Loss: 1.7219909401162936, Train Accuracy: 49.00%, Val Loss: 1.5473143732281027, Val Accuracy: 53.40%
Total Training Time: 2384.16 seconds
Epoch 2, Train Loss: 1.5160577504984738, Train Accuracy: 54.23%, Val Loss: 1.5003331482444628, Val Accuracy: 54.41%
Total Training Time: 3573.92 seconds
Epoch 3, Train Loss: 1.4735207109125696, Train Accuracy: 55.24%, Val Loss: 1.4728702473284798, Val Accuracy: 55.23%
Total Training Time: 4763.03 seconds
Epoch 4, Train Loss: 1.4515145211544018, Train Accuracy: 55.76%, Val Loss: 1.4644388814322646, Val Accuracy: 55.46%
Total Training Time: 5977.59 seconds
Epoch 5, Train Loss: 1.4364180721694197, Train Accuracy: 56.11%, Val Loss: 1.447383544633803, Val Accuracy: 55.95%
Model Complexity (Number of Trainable Parameters): 108225
Predicted next character for sequence length 50: 'e'


In [None]:
########################## PART 3 GRU - SEQUENCE LENGTH 50 #####################
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim
import requests
import time

# Download the dataset
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error; otherwise the body of an error page would
# silently be used as the training corpus.
response.raise_for_status()
text = response.text

# Character mapping to integers
# Sorting the distinct characters makes the index assignment deterministic.
chars = sorted(set(text))
char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = {i: ch for i, ch in enumerate(chars)}
vocab_size = len(chars)

# Dataset class
class CharDataset(Dataset):
    """Position-indexed store of input sequences and their targets."""

    def __init__(self, sequences, targets):
        # Both containers are kept as given; nothing is copied or validated.
        self.sequences, self.targets = sequences, targets

    def __len__(self):
        """Return how many sequences are stored."""
        n_items = len(self.sequences)
        return n_items

    def __getitem__(self, index):
        """Return the ``(sequence, target)`` pair found at ``index``."""
        seq = self.sequences[index]
        tgt = self.targets[index]
        return seq, tgt

# GRU Model class
class GRUModel(nn.Module):
    """GRU over a one-hot sequence, followed by a linear layer on the last step.

    Args:
        vocab_size: Size of each input feature vector and of the output logits.
        hidden_dim: Number of features in the GRU hidden state.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(self, vocab_size, hidden_dim, num_layers):
        super(GRUModel, self).__init__()
        # Keep the sizes on the instance: forward() must not depend on
        # module-level variables that merely happen to share these names.
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.gru = nn.GRU(vocab_size, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return logits computed from the last time step of ``x``.

        Args:
            x: Float tensor of shape ``(batch, seq_len, vocab_size)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)``.
        """
        # Zero initial hidden state, created on the same device/dtype as the
        # input so the model also works when moved off the CPU.
        h0 = torch.zeros(
            self.num_layers, x.size(0), self.hidden_dim,
            device=x.device, dtype=x.dtype,
        )
        out, _ = self.gru(x, h0)
        # Only the output of the final time step feeds the classifier.
        out = self.fc(out[:, -1, :])
        return out

# Function to train and validate the model
def train_and_validate(model, train_loader, test_loader, criterion, optimizer, epochs=5, num_classes=None):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Args:
        model: Module mapping a one-hot float batch to per-class logits.
        train_loader: Sized iterable of ``(sequences, targets)`` index batches.
        test_loader: Sized iterable of ``(sequences, targets)`` index batches
            used for validation.
        criterion: Loss callable applied to ``(logits, targets)``.
        optimizer: Optimizer updating the parameters of ``model``.
        epochs: Number of passes over ``train_loader``.
        num_classes: Width of the one-hot encoding. When ``None`` the
            module-level ``vocab_size`` is used, as before.

    Returns:
        A list with one dict per epoch holding ``epoch``, ``train_loss``,
        ``train_accuracy``, ``val_loss`` and ``val_accuracy``.

    Raises:
        ValueError: If either loader yields no batches.
    """
    if num_classes is None:
        # Backward-compatible fallback to the notebook-level vocabulary size.
        num_classes = vocab_size
    if len(train_loader) == 0 or len(test_loader) == 0:
        # Fail before training rather than with a ZeroDivisionError after it.
        raise ValueError("train_loader and test_loader must each yield at least one batch")

    history = []
    start_time = time.time()
    for epoch in range(epochs):
        model.train()
        train_loss, train_accuracy = _run_epoch(model, train_loader, criterion, num_classes, optimizer)

        model.eval()
        with torch.no_grad():
            val_loss, val_accuracy = _run_epoch(model, test_loader, criterion, num_classes)

        # Cumulative wall-clock time since the first epoch started; it covers
        # the validation passes as well as the training passes.
        training_time = time.time() - start_time
        print(f'Total Training Time: {training_time:.2f} seconds')

        print(f'Epoch {epoch+1}, Train Loss: {train_loss}, '
              f'Train Accuracy: {train_accuracy:.2f}%, '
              f'Val Loss: {val_loss}, '
              f'Val Accuracy: {val_accuracy:.2f}%')

        history.append({
            "epoch": epoch + 1,
            "train_loss": train_loss,
            "train_accuracy": train_accuracy,
            "val_loss": val_loss,
            "val_accuracy": val_accuracy,
        })
    return history


def _run_epoch(model, loader, criterion, num_classes, optimizer=None):
    """Make one pass over ``loader``; update weights only when ``optimizer`` is given.

    Returns a ``(mean_batch_loss, accuracy_percent)`` tuple.
    """
    loss_sum = 0.0
    n_correct = 0
    n_samples = 0
    for sequences, targets in loader:
        inputs = torch.nn.functional.one_hot(sequences, num_classes=num_classes).float()
        if optimizer is not None:
            optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        if optimizer is not None:
            loss.backward()
            optimizer.step()
        loss_sum += loss.item()
        _, predicted = torch.max(outputs, 1)
        n_correct += (predicted == targets).sum().item()
        n_samples += targets.size(0)
    # Loss is averaged per batch, accuracy per sample.
    return loss_sum / len(loader), 100 * n_correct / n_samples

# Prediction function
def predict_next_char(model, char_to_ix, ix_to_char, initial_str, max_length):
    """Predict the character most likely to follow ``initial_str``.

    Args:
        model: Module mapping a one-hot float batch to per-class logits.
        char_to_ix: Mapping from character to class index.
        ix_to_char: Mapping from class index back to character.
        initial_str: Prompt text; only its last ``max_length`` characters are used.
        max_length: Maximum number of trailing characters fed to the model.

    Returns:
        The character whose logit is largest.

    Raises:
        ValueError: If ``initial_str`` is empty.
    """
    if not initial_str:
        raise ValueError("initial_str must contain at least one character")

    model.eval()
    # Size the one-hot encoding from the mapping that was passed in, so the
    # function does not depend on a module-level vocabulary size.
    num_classes = len(char_to_ix)
    with torch.no_grad():
        # Characters missing from the mapping fall back to index 0.
        indices = [char_to_ix.get(c, 0) for c in initial_str[-max_length:]]
        # unsqueeze(0) adds the batch dimension (a batch of one sequence).
        batch = torch.tensor(indices, dtype=torch.long).unsqueeze(0)
        one_hot = torch.nn.functional.one_hot(batch, num_classes=num_classes).float()
        logits = model(one_hot)
        predicted_index = torch.argmax(logits, dim=1).item()
    return ix_to_char[predicted_index]

# Training configurations
sequence_lengths = [50]
hidden_dims = [128]
num_layers_list = [1]

# Encode the corpus once; every sequence length below reuses this tensor.
encoded_text = torch.tensor([char_to_int[ch] for ch in text], dtype=torch.long)

for sequence_length in sequence_lengths:
    # Sliding windows of `sequence_length` characters (stride 1), taken as a
    # view on encoded_text instead of building one Python list per window.
    # The final window is dropped because no character follows it.
    sequences = encoded_text.unfold(0, sequence_length, 1)[:-1]
    # targets[i] is the character immediately after sequences[i].
    targets = encoded_text[sequence_length:]

    dataset = CharDataset(sequences, targets)
    # 80/20 random split into training and validation subsets.
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

    # The loop variable names below are kept unchanged because they are
    # visible as module-level globals to the rest of the cell.
    for hidden_dim in hidden_dims:
        for num_layers in num_layers_list:
            print(f"Training with sequence_length={sequence_length}, hidden_dim={hidden_dim}, num_layers={num_layers}")
            model = GRUModel(vocab_size, hidden_dim, num_layers)
            model_complexity = sum(p.numel() for p in model.parameters() if p.requires_grad)
            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=0.01)
            train_and_validate(model, train_loader, test_loader, criterion, optimizer, epochs=5)
            print(f'Model Complexity (Number of Trainable Parameters): {model_complexity}')
            test_str = "We are accounted poor citiz"
            predicted_char = predict_next_char(model, char_to_int, int_to_char, test_str, sequence_length)
            print(f"Predicted next character for sequence length {sequence_length}: '{predicted_char}'")


Training with sequence_length=50, hidden_dim=128, num_layers=1
Total Training Time: 505.09 seconds
Epoch 1, Train Loss: 1.812353704978123, Train Accuracy: 46.74%, Val Loss: 1.7430219107910743, Val Accuracy: 48.54%
Total Training Time: 1007.24 seconds
Epoch 2, Train Loss: 2.2774022956298734, Train Accuracy: 35.15%, Val Loss: 2.438851301586635, Val Accuracy: 30.97%
Total Training Time: 1514.93 seconds
Epoch 3, Train Loss: 2.3355426010246205, Train Accuracy: 33.23%, Val Loss: 2.235202285932934, Val Accuracy: 36.30%
Total Training Time: 2034.46 seconds
Epoch 4, Train Loss: 2.0461352515333506, Train Accuracy: 40.64%, Val Loss: 1.9315357864142422, Val Accuracy: 43.82%
Total Training Time: 2557.02 seconds
Epoch 5, Train Loss: 1.8624211294444797, Train Accuracy: 45.30%, Val Loss: 1.8252340101744344, Val Accuracy: 46.40%
Model Complexity (Number of Trainable Parameters): 83265
Predicted next character for sequence length 50: 'e'
