<a href="https://colab.research.google.com/github/hurricane195/Intro-to-Deep-Learning/blob/Homework_5/HW5_P2_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Like Homework 3. Problem 2, Build a transformer mode, for the tiny Shakespeare dataset, the data loader code is already provided.

1. **Train the models for the sequence of 20 and 30, report and compare training loss, validation accuracy, execution time for training, and computational and mode size complexities, and compare it against RNN-based models.**

2. Adjust the hyperparameters (number of layers, hidden size, and the number of heads) and compare your results (training and validation loss, computation complexity, model size, training and inference time, and the output sequence). Analyze their influence on accuracy, running time, and computational perplexity.

3. What if we increase the sequence length to 50. Perform the training and report the accuracy and model complexity results.


In [None]:
#Using a modided example of Dr. Tabkhi's "RNN" available at https://github.com/HamedTabkhi/Intro-to-DL/blob/main/RNN.py
#Using a modided example of Dr. Tabkhi's "RNN-CharDataset" available at https://github.com/HamedTabkhi/Intro-to-DL/blob/main/RNN-CharDataset.py
#Using a modided example of Dr. Tabkhi's "shakespeare-loader.py" available at https://github.com/HamedTabkhi/Intro-to-DL/blob/main/shakespeare-loader.py
#Using a modided example of Dr. Tabkhi's "transformer_encoder_nextcharactor" available at https://github.com/HamedTabkhi/Intro-to-DL/blob/main/transformer_encoder_nextcharactor.py
#Random help from Chat GPT on formatting, sytntax, etc.
#Random help from Chat Colab AI on formatting, sytntax, etc.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
import time

import torch
from torch.utils.data import Dataset, DataLoader
import requests

In [None]:
# Check for CUDA support and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [None]:
#Download the dataset
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
response = requests.get(url)
text = response.text  # This is the entire text data

**MAXIMUM LENGTH OF INPUT SECQUENCES = 20**

In [None]:
#Prepare the dataset
sequence_length = 20
# Create a character mapping to integers
chars = sorted(list(set(text)))
char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = {i: ch for i, ch in enumerate(chars)}

In [None]:
# Encode the text into integers
encoded_text = [char_to_int[ch] for ch in text]

In [None]:
# Create sequences and targets
sequences = []
targets = []
for i in range(0, len(encoded_text) - sequence_length):
    seq = encoded_text[i:i+sequence_length]
    target = encoded_text[i+sequence_length]
    sequences.append(seq)
    targets.append(target)

sequences = np.array(sequences)
targets = np.array(targets)

In [None]:
# Splitting the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(sequences, targets, test_size=0.2, random_state=42)

In [None]:
# Converting data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.long)
X_val = torch.tensor(X_val, dtype=torch.long)
y_val = torch.tensor(y_val, dtype=torch.long)

In [None]:
#Create a dataset class
class CharDataset(Dataset):
    def __init__(self, sequences, targets):
        self.sequences = sequences
        self.targets = targets

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, index):
        return self.sequences[index], self.targets[index]

# Instantiate the dataset
dataset = CharDataset(sequences, targets)

In [None]:
# Create datasets and data loaders
batch_size = 128
train_dataset = CharDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = CharDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [None]:
# Defining the Transformer model
class CharTransformer(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, nhead):
        super(CharTransformer, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        encoder_layers = nn.TransformerEncoderLayer(hidden_size, nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedded = self.embedding(x)
        transformer_output = self.transformer_encoder(embedded)
        output = self.fc(transformer_output[:, -1, :])  # Get the output of the last Transformer block
        return output

In [None]:
# Hyperparameters
hidden_size = 128
num_layers = 3
nhead = 2
learning_rate = 0.001
epochs = 10

In [None]:
# Model, loss, and optimizer
model = CharTransformer(len(chars), hidden_size, len(chars), num_layers, nhead)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)



In [None]:
#count trainable parameters of the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

count_parameters(model)

1795777

In [None]:
# Training the model
start_time = time.time()
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        output = model(batch_X)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        _, predicted = torch.max(output.data, 1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()
    train_accuracy = correct / total
    avg_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    with torch.no_grad():
        total_val_loss = 0
        correct = 0
        total = 0
        for batch_X, batch_y in val_loader:
            val_output = model(batch_X)
            val_loss = criterion(val_output, batch_y)
            total_val_loss += val_loss.item()
            _, predicted = torch.max(val_output.data, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
    val_accuracy = correct / total
    avg_val_loss = total_val_loss / len(val_loader)

    """
    if (epoch+1) % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_loss:.4f}, Training Accuracy: {train_accuracy:.4f}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
    """
    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_loss:.4f}, Training Accuracy: {train_accuracy:.4f}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

end_time = time.time()
training_time = end_time - start_time
print(f"Training time: {training_time} seconds")

Epoch 1/10, Training Loss: 2.5166, Training Accuracy: 0.2608, Validation Loss: 2.4869, Validation Accuracy: 0.2684
Epoch 2/10, Training Loss: 2.4855, Training Accuracy: 0.2658, Validation Loss: 2.4731, Validation Accuracy: 0.2685
Epoch 3/10, Training Loss: 2.4809, Training Accuracy: 0.2666, Validation Loss: 2.4687, Validation Accuracy: 0.2659
Epoch 4/10, Training Loss: 2.4764, Training Accuracy: 0.2677, Validation Loss: 2.4671, Validation Accuracy: 0.2639
Epoch 5/10, Training Loss: 2.4745, Training Accuracy: 0.2677, Validation Loss: 2.4657, Validation Accuracy: 0.2697
Epoch 6/10, Training Loss: 2.4723, Training Accuracy: 0.2683, Validation Loss: 2.4646, Validation Accuracy: 0.2677
Epoch 7/10, Training Loss: 2.4704, Training Accuracy: 0.2688, Validation Loss: 2.4634, Validation Accuracy: 0.2697
Epoch 8/10, Training Loss: 2.4725, Training Accuracy: 0.2687, Validation Loss: 2.4632, Validation Accuracy: 0.2682
Epoch 9/10, Training Loss: 2.4702, Training Accuracy: 0.2688, Validation Loss: 2

In [None]:
# Prediction function
def predict_next_char(model, char_to_int, int_to_char, initial_str):
    model.eval()
    with torch.no_grad():
        initial_input = torch.tensor([char_to_int[c] for c in initial_str[-sequence_length:]], dtype=torch.long).unsqueeze(0)
        prediction = model(initial_input)
        predicted_index = torch.argmax(prediction, dim=1).item()
        return int_to_char[predicted_index]

# Predicting the first next character
test_str = "This is a simple example to demonstrate how to predict the next char"
predicted_char = predict_next_char(model, char_to_int, int_to_char, test_str)
print('FIRST TEST STRING: This is a simple example to demonstrate how to predict the next char..')
print(f"First predicted next character: '{predicted_char}'")
print("")

# Predicting the second next character
test_str = "Long live the quee"
predicted_char = predict_next_char(model, char_to_int, int_to_char, test_str)
print('SECOND TEST STRING: Long live the quee..')
print(f"Second predicted next character: '{predicted_char}'")
print("")


# Predicting the third next character
test_str = "Be quiet and do not spea"
predicted_char = predict_next_char(model, char_to_int, int_to_char, test_str)
print('THIRD TEST STRING: Be quiet and do not spea..')
print(f"Third predicted next character: '{predicted_char}'")
print("")

FIRST TEST STRING: This is a simple example to demonstrate how to predict the next char..
First predicted next character: 'e'

SECOND TEST STRING: Long live the quee..
Second predicted next character: ' '

THIRD TEST STRING: Be quiet and do not spea..
Third predicted next character: 'n'



**MAXIMUM LENGTH OF INPUT SECQUENCES = 30**

In [None]:
#Prepare the dataset
sequence_length = 30
# Create a character mapping to integers
chars = sorted(list(set(text)))
char_to_int = {ch: i for i, ch in enumerate(chars)}
int_to_char = {i: ch for i, ch in enumerate(chars)}

In [None]:
# Encode the text into integers
encoded_text = [char_to_int[ch] for ch in text]

In [None]:
# Create sequences and targets
sequences = []
targets = []
for i in range(0, len(encoded_text) - sequence_length):
    seq = encoded_text[i:i+sequence_length]
    target = encoded_text[i+sequence_length]
    sequences.append(seq)
    targets.append(target)

sequences = np.array(sequences)
targets = np.array(targets)

In [None]:
# Splitting the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(sequences, targets, test_size=0.2, random_state=42)

In [None]:
# Converting data to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.long)
y_train = torch.tensor(y_train, dtype=torch.long)
X_val = torch.tensor(X_val, dtype=torch.long)
y_val = torch.tensor(y_val, dtype=torch.long)

In [None]:
#Create a dataset class
class CharDataset(Dataset):
    def __init__(self, sequences, targets):
        self.sequences = sequences
        self.targets = targets

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, index):
        return self.sequences[index], self.targets[index]

# Instantiate the dataset
dataset = CharDataset(sequences, targets)

In [None]:
# Create datasets and data loaders
batch_size = 128
train_dataset = CharDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = CharDataset(X_val, y_val)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [None]:
# Defining the Transformer model
class CharTransformer(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_layers, nhead):
        super(CharTransformer, self).__init__()
        self.embedding = nn.Embedding(input_size, hidden_size)
        encoder_layers = nn.TransformerEncoderLayer(hidden_size, nhead)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedded = self.embedding(x)
        transformer_output = self.transformer_encoder(embedded)
        output = self.fc(transformer_output[:, -1, :])  # Get the output of the last Transformer block
        return output

In [None]:
# Hyperparameters
hidden_size = 128
num_layers = 3
nhead = 2
learning_rate = 0.001
epochs = 10

In [None]:
# Model, loss, and optimizer
model = CharTransformer(len(chars), hidden_size, len(chars), num_layers, nhead)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)



In [None]:
#count trainable parameters of the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

count_parameters(model)

1795777

In [None]:
# Training the model
start_time = time.time()
for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        output = model(batch_X)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        _, predicted = torch.max(output.data, 1)
        total += batch_y.size(0)
        correct += (predicted == batch_y).sum().item()
    train_accuracy = correct / total
    avg_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    with torch.no_grad():
        total_val_loss = 0
        correct = 0
        total = 0
        for batch_X, batch_y in val_loader:
            val_output = model(batch_X)
            val_loss = criterion(val_output, batch_y)
            total_val_loss += val_loss.item()
            _, predicted = torch.max(val_output.data, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
    val_accuracy = correct / total
    avg_val_loss = total_val_loss / len(val_loader)

    """
    if (epoch+1) % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_loss:.4f}, Training Accuracy: {train_accuracy:.4f}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
    """
    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_loss:.4f}, Training Accuracy: {train_accuracy:.4f}, Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

end_time = time.time()
training_time = end_time - start_time
print(f"Training time: {training_time} seconds")

Epoch 1/10, Training Loss: 2.5161, Training Accuracy: 0.2607, Validation Loss: 2.5102, Validation Accuracy: 0.2642
Epoch 2/10, Training Loss: 2.4873, Training Accuracy: 0.2649, Validation Loss: 2.4790, Validation Accuracy: 0.2709
Epoch 3/10, Training Loss: 2.4790, Training Accuracy: 0.2669, Validation Loss: 2.4714, Validation Accuracy: 0.2700
Epoch 4/10, Training Loss: 2.4779, Training Accuracy: 0.2673, Validation Loss: 2.4711, Validation Accuracy: 0.2700
Epoch 5/10, Training Loss: 2.4731, Training Accuracy: 0.2682, Validation Loss: 2.4714, Validation Accuracy: 0.2712
Epoch 6/10, Training Loss: 2.4713, Training Accuracy: 0.2683, Validation Loss: 2.4671, Validation Accuracy: 0.2713
Epoch 7/10, Training Loss: 2.4737, Training Accuracy: 0.2680, Validation Loss: 2.4657, Validation Accuracy: 0.2697
Epoch 8/10, Training Loss: 2.4729, Training Accuracy: 0.2682, Validation Loss: 2.4647, Validation Accuracy: 0.2706
Epoch 9/10, Training Loss: 2.4706, Training Accuracy: 0.2686, Validation Loss: 2

In [None]:
# Prediction function
def predict_next_char(model, char_to_int, int_to_char, initial_str):
    model.eval()
    with torch.no_grad():
        initial_input = torch.tensor([char_to_int[c] for c in initial_str[-sequence_length:]], dtype=torch.long).unsqueeze(0)
        prediction = model(initial_input)
        predicted_index = torch.argmax(prediction, dim=1).item()
        return int_to_char[predicted_index]

# Predicting the first next character
test_str = "This is a simple example to demonstrate how to predict the next char"
predicted_char = predict_next_char(model, char_to_int, int_to_char, test_str)
print('FIRST TEST STRING: This is a simple example to demonstrate how to predict the next char..')
print(f"First predicted next character: '{predicted_char}'")
print("")

# Predicting the second next character
test_str = "Long live the quee"
predicted_char = predict_next_char(model, char_to_int, int_to_char, test_str)
print('SECOND TEST STRING: Long live the quee..')
print(f"Second predicted next character: '{predicted_char}'")
print("")


# Predicting the third next character
test_str = "Be quiet and do not spea"
predicted_char = predict_next_char(model, char_to_int, int_to_char, test_str)
print('THIRD TEST STRING: Be quiet and do not spea..')
print(f"Third predicted next character: '{predicted_char}'")
print("")

FIRST TEST STRING: This is a simple example to demonstrate how to predict the next char..
First predicted next character: 'n'

SECOND TEST STRING: Long live the quee..
Second predicted next character: ' '

THIRD TEST STRING: Be quiet and do not spea..
Third predicted next character: 'o'

