# Scribing Assignment - Part II

# Import libraries

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data

In [None]:
## mount to google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Load data

In [None]:
path = '/content/drive/MyDrive/Dataset/assignment4/testdata.txt'

In [None]:
# Read the text file
with open(path, 'r',encoding='utf-8') as file:
    raw_text = file.read()

# Print the content of the text file
raw_text = raw_text.lower()



In [None]:
# load text and covert to lowercase
#filename = "textdata.txt"
#raw_text = open(filename, 'r', encoding='utf-8').read()
#raw_text = raw_text.lower()  ## convert the text to lower case

In [None]:
print(raw_text[:1000])

three rings for the elven-kings under the sky,
               seven for the dwarf-lords in their halls of stone,
            nine for mortal men doomed to die,
              one for the dark lord on his dark throne
           in the land of mordor where the shadows lie.
               one ring to rule them all, one ring to find them,
               one ring to bring them all and in the darkness bind them
           in the land of mordor where the shadows lie.
           
foreword

this tale grew in the telling, until it became a history of the great war of the ring and included many glimpses of the yet more ancient history that preceded it. it was begun soon after _the hobbit_ was written and before its publication in 1937; but i did not go on with this sequel, for i wished first to complete and set in order the mythology and legends of the elder days, which had then been taking shape for some years. i desired to do this for my own satisfaction, and i had little hope that other people 

## data preprocessing

In [None]:
raw_text = raw_text.replace('\n\n','\n').replace('\n','|')  ## | is used to represent EOS --> END OF SENTENCE

In [None]:
raw_text[:1000]

'three rings for the elven-kings under the sky,|               seven for the dwarf-lords in their halls of stone,|            nine for mortal men doomed to die,|              one for the dark lord on his dark throne|           in the land of mordor where the shadows lie.|               one ring to rule them all, one ring to find them,|               one ring to bring them all and in the darkness bind them|           in the land of mordor where the shadows lie.|           |foreword|this tale grew in the telling, until it became a history of the great war of the ring and included many glimpses of the yet more ancient history that preceded it. it was begun soon after _the hobbit_ was written and before its publication in 1937; but i did not go on with this sequel, for i wished first to complete and set in order the mythology and legends of the elder days, which had then been taking shape for some years. i desired to do this for my own satisfaction, and i had little hope that other people 

In [None]:
## PUNCTUATION REMOVAL
import string

punctuations = string.punctuation
print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [None]:
punctuations = punctuations.replace('|','')  ## replace '|' from string of punctuations. we need '|' to represent EOS
print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{}~


In [None]:
def remove_punc(text):
    # Create a translation table to remove punctuation characters
    translator = str.maketrans('','', punctuations)

    # Use the translation table to remove punctuation
    text = text.translate(translator)
    return text

In [None]:
raw_text = remove_punc(raw_text)

In [None]:
raw_text[:1000]

'three rings for the elvenkings under the sky|               seven for the dwarflords in their halls of stone|            nine for mortal men doomed to die|              one for the dark lord on his dark throne|           in the land of mordor where the shadows lie|               one ring to rule them all one ring to find them|               one ring to bring them all and in the darkness bind them|           in the land of mordor where the shadows lie|           |foreword|this tale grew in the telling until it became a history of the great war of the ring and included many glimpses of the yet more ancient history that preceded it it was begun soon after the hobbit was written and before its publication in 1937 but i did not go on with this sequel for i wished first to complete and set in order the mythology and legends of the elder days which had then been taking shape for some years i desired to do this for my own satisfaction and i had little hope that other people would be intereste

### create mapping of unique chars to integers and integers to characters

In [None]:
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i,c) for i,c in enumerate(chars))

In [None]:
chars

[' ',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '|',
 '\x96',
 'á',
 'â',
 'ä',
 'é',
 'ë',
 'í',
 'ó',
 'ú',
 'û']

### summarize the loaded data

In [None]:
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  982155
Total Vocab:  48


# prepare the dataset

In [None]:
seq_length = 7  ## length of one sequence /timesteps
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)  ## number of patters / datapoints
print("Total Patterns: ", n_patterns)

Total Patterns:  982148


In [None]:
## print a sample datapoint. You can uncomment to see the sample
dataX[0]
dataY[0]

19

In [None]:
# reshape X to be [samples, time steps] --> embedding layer in LSTM accepts the inputs in this shape only.
#timesteps = sequence length
# convert dataX to torch tensor and reshaping
X = torch.tensor(dataX, dtype=torch.int32).reshape(n_patterns, seq_length)

# convert dataY to torch tensor
y = torch.tensor(dataY)

In [None]:
X.shape

torch.Size([982148, 7])

## Write code to prepare a train, val and test split from X and y to create

X_train, y_train

X_val, y_val

X_test, y_test

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into test (80%) and temporary (20%)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split the temporary data into training (80%) and validation (20%)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

# Print the sizes of the splits
print("Training set size:", len(X_train))
print("Validation set size:", len(X_val))
print("Test set size:", len(X_test))

Training set size: 589288
Validation set size: 196430
Test set size: 196430


# Create NN architecture

## Custom LSTM network

In [None]:
"""
parameters of LSTM/RNN/GRU layer:

input_size – The number of expected features in the input x

hidden_size – The number of features in the hidden state h

num_layers – Number of recurrent layers. E.g., setting num_layers=2 would mean stacking two LSTMs together to form a stacked LSTM,
with the second LSTM taking in outputs of the first LSTM and computing the final results

batch_first – If True, then the input and output tensors are provided as (batch, seq, feature)

"""
######
# hyperparameters
#lstm_embeding_dim = ??
#lstm_hid_size = ??
#lstm_layers=  ??
######

# Set hyperparameters
input_size = n_vocab  # Number of unique characters
output_size = n_vocab  # Number of unique characters
embedding_dim = 50
hidden_size = 100
num_layers = 2

class CharModel_LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, embedding_dim=50):
        super(CharModel_LSTM, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = self.linear(out[:, -1, :])  # Select the output from the last time step
        return out

## Custom RNN Network

In [None]:
"""
parameters of LSTM/RNN/GRU layer:

input_size – The number of expected features in the input x

hidden_size – The number of features in the hidden state h

num_layers – Number of recurrent layers. E.g., setting num_layers=2 would mean stacking two RNNs together to form a stacked RNN,
with the second RNN taking in outputs of the first RNN and computing the final results

batch_first – If True, then the input and output tensors are provided as (batch, seq, feature)

"""
######
# hyperparameters
#rnn_embeding_dim = ??
#rnn_hid_size = ??
#rnn_layers=  ??


######
# Set hyperparameters
input_size = n_vocab  # Number of unique characters
output_size = n_vocab  # Number of unique characters
embedding_dim = 50
hidden_size = 100
num_layers = 2

class CharModel_RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, embedding_dim=50):
        super(CharModel_RNN, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.rnn(x)
        out = self.linear(out[:, -1, :])  # Select the output from the last time step
        return out

## Custom GRU Net

In [None]:
"""
parameters of LSTM/RNN/GRU layer:

input_size – The number of expected features in the input x

hidden_size – The number of features in the hidden state h

num_layers – Number of recurrent layers. E.g., setting num_layers=2 would mean stacking two GRU together to form a stacked GRU,
with the second GRU taking in outputs of the first GRU and computing the final results

batch_first – If True, then the input and output tensors are provided as (batch, seq, feature)

"""
######
# hyperparameters
#gru_embeding_dim = ??
#gru_hid_size = ??
#gru_layers=  ??


######
# Set hyperparameters
input_size = n_vocab  # Number of unique characters
output_size = n_vocab  # Number of unique characters
embedding_dim = 50
hidden_size = 100
num_layers = 2



class CharModel_GRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, embedding_dim=50):
        super(CharModel_GRU, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.gru(x)
        out = self.linear(out[:, -1, :])  # Select the output from the last time step
        return out

In [None]:
# Instantiate models
#lstm_model = CharModel_LSTM(input_size, hidden_size, num_layers, output_size, embedding_dim)
#rnn_model = CharModel_RNN(input_size, hidden_size, num_layers, output_size, embedding_dim)
#gru_model = CharModel_GRU(input_size, hidden_size, num_layers, output_size, embedding_dim)

In [None]:
# Set hyperparameters
input_size = n_vocab  # Number of unique characters
output_size = n_vocab  # Number of unique characters
embedding_dim = 50
hidden_size = 100
num_layers = 2

# Check for GPU availability
model = CharModel_LSTM(input_size, hidden_size, num_layers, output_size, embedding_dim)


if torch.cuda.is_available:
  print('cuda available')

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

# Now you can use the lstm_model for training or inference.


cuda available


CharModel_LSTM(
  (embedding): Embedding(48, 50)
  (lstm): LSTM(50, 100, num_layers=2, batch_first=True)
  (linear): Linear(in_features=100, out_features=48, bias=True)
)

# Training

1. Write code to train the LSTM network and store the model as a checkpoint. Use validation data to tune your hyperparameters learning rate, batch size, num epochs, etc.

In [None]:

"""model = CharModel_LSTM()
if torch.cuda.is_available:
  print('cuda available')

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)"""

'model = CharModel_LSTM()\nif torch.cuda.is_available:\n  print(\'cuda available\')\n\ndevice = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")\nmodel.to(device)'

In [None]:
# Set up the parameters for training,
# tune the tunable parameters if necessary using val data
batch_size = 40
optimizer = optim.Adam(model.parameters())
loss_fn = nn.CrossEntropyLoss(reduction="sum")
train_loader = data.DataLoader(data.TensorDataset(X_train, y_train), shuffle=True, batch_size=batch_size)
val_loader = data.DataLoader(data.TensorDataset(X_train, y_train), shuffle=True, batch_size=batch_size)
test_loader = data.DataLoader(data.TensorDataset(X_train, y_train), shuffle=True, batch_size=batch_size)

In [None]:
import os
n_epochs = 40
best_model = None
### ckpt_path =  # set up check point path
## If you have stored a checkpoint, In this way you can resume training model after the last training step by just loading it from the directory given.
# if os.path.isfile(ckpt_path):
#     model.load_state_dict(torch.load(ckpt_path)[0])
#     print('loading from model check point')

best_loss = np.inf
for epoch in range(n_epochs):
    model.train()
    #Write training code
    for X_batch, y_batch in train_loader:
        y_pred = model(X_batch.to(device))
        loss = loss_fn(y_pred.to(device), y_batch.to(device))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    # Validation
    # Writevalidation code and tune parameters if necessary
    #Include codes to print necessary logs
    model.eval()
    loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            y_pred = model(X_batch.to(device))
            loss += loss_fn(y_pred.to(device), y_batch.to(device))
        if loss < best_loss:
            best_loss = loss
            best_model = model.state_dict()
        print("Epoch %d: Cross-entropy: %.4f" % (epoch, loss))
torch.save([best_model, char_to_int], "single-char.pth")


Epoch 0: Cross-entropy: 832755.0625
Epoch 1: Cross-entropy: 777695.0000
Epoch 2: Cross-entropy: 749372.1875
Epoch 3: Cross-entropy: 735536.3125
Epoch 4: Cross-entropy: 726456.3750
Epoch 5: Cross-entropy: 715060.0625
Epoch 6: Cross-entropy: 705209.6875
Epoch 7: Cross-entropy: 702151.1250
Epoch 8: Cross-entropy: 696727.9375
Epoch 9: Cross-entropy: 695195.1250
Epoch 10: Cross-entropy: 687252.4375
Epoch 11: Cross-entropy: 686725.1875
Epoch 12: Cross-entropy: 683332.9375
Epoch 13: Cross-entropy: 682335.1875
Epoch 14: Cross-entropy: 674999.8125
Epoch 15: Cross-entropy: 674161.0000
Epoch 16: Cross-entropy: 673590.7500
Epoch 17: Cross-entropy: 670990.5625
Epoch 18: Cross-entropy: 669421.3750
Epoch 19: Cross-entropy: 668560.2500
Epoch 20: Cross-entropy: 668059.8750
Epoch 21: Cross-entropy: 665401.6250
Epoch 22: Cross-entropy: 662319.3750
Epoch 23: Cross-entropy: 662683.5000
Epoch 24: Cross-entropy: 662400.8750
Epoch 25: Cross-entropy: 663001.6250
Epoch 26: Cross-entropy: 660435.6250
Epoch 27: C

# Generating character using trained model

In [None]:
best_model, char_to_int = torch.load("single-char.pth")  ## loading saved model
n_vocab = len(char_to_int)
int_to_char = dict((i, c) for c, i in char_to_int.items())
model.load_state_dict(best_model)

<All keys matched successfully>

In [None]:
#Test your model's generative performance using the following testing code
# randomly generate a prompt
seq_length = 7
n =10 ## number of characters to predict
rand_index = np.random.choice(len(X_test))
prompt = X_test[rand_index]
next_char_actual = y_test[rand_index]

# Convert the PyTorch tensor to a list of integers
prompt_list = prompt.tolist()
# Map the list of integers to characters using char_to_int dictionary
pattern = [char_to_int.get(c, '<unknown>') for c in prompt_list]


#---------------------------------------------------------------------------#
#prediction

model.eval()
print('Prompt: "%s"' % prompt)
print('Next chars (actual):',next_char_actual )
print('----')
print('prediction:\n')
#Use your model to predict the next seq_length characters and check the performance
print('\n--------')


Prompt: "tensor([18, 11, 32, 19, 24, 17,  0], dtype=torch.int32)"
Next chars (actual): tensor(11)
----
prediction:


--------


In [None]:
# Use your model to predict the next seq_length characters and check the performance
seq_length = 7
raw_text = open(path, 'r', encoding='utf-8').read()
raw_text = raw_text.lower()
raw_text = remove_punc(raw_text)
start = np.random.randint(0, len(raw_text)-seq_length)
prompt = raw_text[start:start+seq_length]
pattern = [char_to_int[c] for c in prompt]
with torch.no_grad():
    for i in range(n):
        # format input array of int into PyTorch tensor
        x = torch.tensor(np.reshape(pattern, (1, len(pattern))), dtype=torch.long)  # Use dtype=torch.long
        # generate logits as output from the model
        prediction = model(x.to(device))
        # convert logits into one character
        index = int(prediction.argmax())
        result = int_to_char[index]
        print(result, end="")
        # append the new character into the prompt for the next iteration
        pattern.append(index)
        pattern = pattern[1:]
print()
print("Done.")


d not see 
Done.


In [None]:
#Use your model to predict the next seq_length characters and check the performance
# CODE HERE
seq_length = 7
raw_text = open(path, 'r', encoding='utf-8').read()
raw_text = raw_text.lower()
raw_text = remove_punc(raw_text)
start = np.random.randint(0, len(raw_text)-seq_length)
prompt = raw_text[start:start+seq_length]
pattern = [char_to_int[c] for c in prompt]
with torch.no_grad():
    for i in range(n):
        # format input array of int into PyTorch tensor
        x = np.reshape(pattern, (1, len(pattern), 1))
        x = torch.tensor(x, dtype=torch.float32)
        # generate logits as output from the model
        prediction = model(x.to())
        # convert logits into one character
        index = int(prediction.argmax())
        result = int_to_char[index]
        print(result, end="")
        # append the new character into the prompt for the next iteration
        pattern.append(index)
        pattern = pattern[1:]
print()
print("Done.")

RuntimeError: ignored

2. Using the template above, train the RNN network and store the model as a checkpoint. Use validation data to tune your hyperparameters learning rate, batch size, num epochs, etc. And test the models performance on an arbitrary test data point (or a set of test data points). Also evaluate the performance on full test data using character error rate.

In [None]:
!pip install Levenshtein

Collecting Levenshtein
  Downloading Levenshtein-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (169 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.4/169.4 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=3.1.0 (from Levenshtein)
  Downloading rapidfuzz-3.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.23.0 rapidfuzz-3.5.2


In [None]:
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from Levenshtein import distance  # For calculating Character Error Rate (CER)

# Set hyperparameters
learning_rate = 0.001
batch_size = 64
n_epochs = 20

# Instantiate RNN model
rnn_model = CharModel_RNN(input_size, hidden_size, num_layers, output_size, embedding_dim)
rnn_model.to(device)

# Set up optimizer and loss function
optimizer = optim.Adam(rnn_model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss(reduction="sum")

# Set up data loaders for training, validation, and test data
train_loader = DataLoader(data.TensorDataset(X_train, y_train), shuffle=True, batch_size=batch_size)
val_loader = DataLoader(data.TensorDataset(X_val, y_val), shuffle=False, batch_size=batch_size)
test_loader = DataLoader(data.TensorDataset(X_test, y_test), shuffle=False, batch_size=batch_size)

# Training loop
best_model = None
best_loss = np.inf

for epoch in range(n_epochs):
    rnn_model.train()
    total_loss = 0

    for X_batch, y_batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{n_epochs}'):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        # Forward pass
        y_pred = rnn_model(X_batch)

        # Compute loss
        loss = loss_fn(y_pred, y_batch)
        total_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_loader.dataset)

    # Validation
    rnn_model.eval()
    val_loss = 0

    with torch.no_grad():
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            y_val_pred = rnn_model(X_val_batch)
            val_loss += loss_fn(y_val_pred, y_val_batch).item()

    average_val_loss = val_loss / len(val_loader.dataset)

    print(f"Epoch {epoch + 1}/{n_epochs}, Training Loss: {average_loss:.4f}, Validation Loss: {average_val_loss:.4f}")

    # Save the best model based on validation loss
    if average_val_loss < best_loss:
        best_loss = average_val_loss
        best_model = rnn_model.state_dict()

# Save the best model as a checkpoint
torch.save(best_model, "rnn_model_checkpoint.pth")

# Load the best model from the checkpoint
rnn_model.load_state_dict(torch.load("rnn_model_checkpoint.pth"))
rnn_model.eval()

# Evaluate performance on an arbitrary test data point
seq_length = 7
start = np.random.randint(0, len(X_test)-seq_length)
prompt = X_test[start:start+seq_length].to(device)
next_char_actual = y_test[start+seq_length-1]

pattern = [char_to_int[c] for c in prompt.tolist()]

# Prediction
predicted_chars = []
with torch.no_grad():
    for i in range(n):
        x = torch.tensor(np.reshape(pattern, (1, len(pattern))), dtype=torch.long).to(device)
        prediction = rnn_model(x)
        index = int(prediction.argmax())
        result = int_to_char[index]
        predicted_chars.append(result)
        pattern.append(index)
        pattern = pattern[1:]

predicted_sequence = "".join(predicted_chars)
print(f"Prompt: \"{prompt}\"")
print(f"Next chars (actual): {int_to_char[next_char_actual]}")
print(f"Prediction: {predicted_sequence}")

# Evaluate performance on the full test data using Character Error Rate (CER)
def calculate_cer(predictions, targets):
    return sum(distance(prediction, target) for prediction, target in zip(predictions, targets)) / sum(len(target) for target in targets)

test_predictions = []
test_targets = []

with torch.no_grad():
    for X_test_batch, y_test_batch in tqdm(test_loader, desc="Evaluating on test data"):
        X_test_batch, y_test_batch = X_test_batch.to(device), y_test_batch.to(device)
        y_test_pred = rnn_model(X_test_batch)
        predicted_indices = y_test_pred.argmax(dim=2)
        test_predictions.extend(["".join(int_to_char[i.item()] for i in indices) for indices in predicted_indices])
        test_targets.extend(["".join(int_to_char[i.item()] for i in seq) for seq in y_test_batch])

cer = calculate_cer(test_predictions, test_targets)
print(f"Character Error Rate (CER) on the full test data: {cer:.4f}")


Epoch 1/20: 100%|██████████| 9208/9208 [00:31<00:00, 290.07it/s]


Epoch 1/20, Training Loss: 1.6386, Validation Loss: 1.4763


Epoch 2/20: 100%|██████████| 9208/9208 [00:27<00:00, 332.06it/s]


Epoch 2/20, Training Loss: 1.4320, Validation Loss: 1.4141


Epoch 3/20: 100%|██████████| 9208/9208 [00:27<00:00, 339.22it/s]


Epoch 3/20, Training Loss: 1.3833, Validation Loss: 1.3828


Epoch 4/20: 100%|██████████| 9208/9208 [00:26<00:00, 343.50it/s]


Epoch 4/20, Training Loss: 1.3580, Validation Loss: 1.3663


Epoch 5/20: 100%|██████████| 9208/9208 [00:27<00:00, 337.42it/s]


Epoch 5/20, Training Loss: 1.3424, Validation Loss: 1.3578


Epoch 6/20: 100%|██████████| 9208/9208 [00:27<00:00, 339.37it/s]


Epoch 6/20, Training Loss: 1.3311, Validation Loss: 1.3488


Epoch 7/20: 100%|██████████| 9208/9208 [00:25<00:00, 365.35it/s]


Epoch 7/20, Training Loss: 1.3230, Validation Loss: 1.3461


Epoch 8/20: 100%|██████████| 9208/9208 [00:25<00:00, 358.98it/s]


Epoch 8/20, Training Loss: 1.3170, Validation Loss: 1.3422


Epoch 9/20: 100%|██████████| 9208/9208 [00:26<00:00, 350.45it/s]


Epoch 9/20, Training Loss: 1.3117, Validation Loss: 1.3391


Epoch 10/20: 100%|██████████| 9208/9208 [00:26<00:00, 348.21it/s]


Epoch 10/20, Training Loss: 1.3083, Validation Loss: 1.3339


Epoch 11/20: 100%|██████████| 9208/9208 [00:27<00:00, 339.67it/s]


Epoch 11/20, Training Loss: 1.3053, Validation Loss: 1.3321


Epoch 12/20: 100%|██████████| 9208/9208 [00:26<00:00, 343.47it/s]


Epoch 12/20, Training Loss: 1.3033, Validation Loss: 1.3280


Epoch 13/20: 100%|██████████| 9208/9208 [00:26<00:00, 352.66it/s]


Epoch 13/20, Training Loss: 1.3014, Validation Loss: 1.3294


Epoch 14/20: 100%|██████████| 9208/9208 [00:25<00:00, 359.17it/s]


Epoch 14/20, Training Loss: 1.2994, Validation Loss: 1.3330


Epoch 15/20: 100%|██████████| 9208/9208 [00:24<00:00, 368.75it/s]


Epoch 15/20, Training Loss: 1.2993, Validation Loss: 1.3269


Epoch 16/20: 100%|██████████| 9208/9208 [00:27<00:00, 332.11it/s]


Epoch 16/20, Training Loss: 1.2978, Validation Loss: 1.3313


Epoch 17/20: 100%|██████████| 9208/9208 [00:26<00:00, 346.90it/s]


Epoch 17/20, Training Loss: 1.2967, Validation Loss: 1.3300


Epoch 18/20: 100%|██████████| 9208/9208 [00:26<00:00, 342.74it/s]


Epoch 18/20, Training Loss: 1.2968, Validation Loss: 1.3386


Epoch 19/20: 100%|██████████| 9208/9208 [00:26<00:00, 342.08it/s]


Epoch 19/20, Training Loss: 1.2961, Validation Loss: 1.3277


Epoch 20/20: 100%|██████████| 9208/9208 [00:26<00:00, 342.77it/s]


Epoch 20/20, Training Loss: 1.2956, Validation Loss: 1.3264


TypeError: ignored

3. Using the template above, train the GRU network and store the model as a checkpoint. Use validation data to tune your hyperparameters learning rate, batch size, num epochs, etc. And test the models performance on an arbitrary test data point (or a set of test data points). Also evaluate the performance on full test data using character error rate.

In [None]:
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from Levenshtein import distance  # For calculating Character Error Rate (CER)

# Set hyperparameters
learning_rate = 0.001
batch_size = 64
n_epochs = 20

# Instantiate GRU model
gru_model = CharModel_GRU(input_size, hidden_size, num_layers, output_size, embedding_dim)
gru_model.to(device)

# Set up optimizer and loss function
optimizer = optim.Adam(gru_model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss(reduction="sum")

# Set up data loaders for training, validation, and test data
train_loader = DataLoader(data.TensorDataset(X_train, y_train), shuffle=True, batch_size=batch_size)
val_loader = DataLoader(data.TensorDataset(X_val, y_val), shuffle=False, batch_size=batch_size)
test_loader = DataLoader(data.TensorDataset(X_test, y_test), shuffle=False, batch_size=batch_size)

# Training loop
best_model = None
best_loss = np.inf

for epoch in range(n_epochs):
    gru_model.train()
    total_loss = 0

    for X_batch, y_batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{n_epochs}'):
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        # Forward pass
        y_pred = gru_model(X_batch)

        # Compute loss
        loss = loss_fn(y_pred, y_batch)
        total_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_loader.dataset)

    # Validation
    gru_model.eval()
    val_loss = 0

    with torch.no_grad():
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch, y_val_batch = X_val_batch.to(device), y_val_batch.to(device)
            y_val_pred = gru_model(X_val_batch)
            val_loss += loss_fn(y_val_pred, y_val_batch).item()

    average_val_loss = val_loss / len(val_loader.dataset)

    print(f"Epoch {epoch + 1}/{n_epochs}, Training Loss: {average_loss:.4f}, Validation Loss: {average_val_loss:.4f}")

    # Save the best model based on validation loss
    if average_val_loss < best_loss:
        best_loss = average_val_loss
        best_model = gru_model.state_dict()

# Save the best model as a checkpoint
torch.save(best_model, "gru_model_checkpoint.pth")

# Load the best model from the checkpoint
gru_model.load_state_dict(torch.load("gru_model_checkpoint.pth"))
gru_model.eval()

# Evaluate performance on an arbitrary test data point
seq_length = 7
start = np.random.randint(0, len(X_test)-seq_length)
prompt = X_test[start:start+seq_length].to(device)
next_char_actual = y_test[start+seq_length-1]

pattern = [char_to_int[c] for c in prompt.tolist()]

# Prediction
predicted_chars = []
with torch.no_grad():
    for i in range(n):
        x = torch.tensor(np.reshape(pattern, (1, len(pattern))), dtype=torch.long).to(device)
        prediction = gru_model(x)
        index = int(prediction.argmax())
        result = int_to_char[index]
        predicted_chars.append(result)
        pattern.append(index)
        pattern = pattern[1:]

predicted_sequence = "".join(predicted_chars)
print(f"Prompt: \"{prompt}\"")
print(f"Next chars (actual): {int_to_char[next_char_actual]}")
print(f"Prediction: {predicted_sequence}")

# Evaluate performance on the full test data using Character Error Rate (CER)
test_predictions = []
test_targets = []

with torch.no_grad():
    for X_test_batch, y_test_batch in tqdm(test_loader, desc="Evaluating on test data"):
        X_test_batch, y_test_batch = X_test_batch.to(device), y_test_batch.to(device)
        y_test_pred = gru_model(X_test_batch)
        predicted_indices = y_test_pred.argmax(dim=2)
        test_predictions.extend(["".join(int_to_char[i.item()] for i in indices) for indices in predicted_indices])
        test_targets.extend(["".join(int_to_char[i.item()] for i in seq) for seq in y_test_batch])

cer = calculate_cer(test_predictions, test_targets)
print(f"Character Error Rate (CER) on the full test data: {cer:.4f}")


Epoch 1/20: 100%|██████████| 9208/9208 [00:34<00:00, 267.11it/s]


Epoch 1/20, Training Loss: 1.5718, Validation Loss: 1.4088


Epoch 2/20: 100%|██████████| 9208/9208 [00:28<00:00, 326.50it/s]


Epoch 2/20, Training Loss: 1.3569, Validation Loss: 1.3436


Epoch 3/20: 100%|██████████| 9208/9208 [00:27<00:00, 330.27it/s]


Epoch 3/20, Training Loss: 1.3073, Validation Loss: 1.3171


Epoch 4/20: 100%|██████████| 9208/9208 [00:35<00:00, 263.04it/s]


Epoch 4/20, Training Loss: 1.2806, Validation Loss: 1.3002


Epoch 5/20: 100%|██████████| 9208/9208 [00:28<00:00, 318.56it/s]


Epoch 5/20, Training Loss: 1.2643, Validation Loss: 1.2911


Epoch 6/20: 100%|██████████| 9208/9208 [00:29<00:00, 314.10it/s]


Epoch 6/20, Training Loss: 1.2533, Validation Loss: 1.2830


Epoch 7/20: 100%|██████████| 9208/9208 [00:28<00:00, 322.35it/s]


Epoch 7/20, Training Loss: 1.2447, Validation Loss: 1.2830


Epoch 8/20: 100%|██████████| 9208/9208 [00:28<00:00, 324.48it/s]


Epoch 8/20, Training Loss: 1.2383, Validation Loss: 1.2789


Epoch 9/20: 100%|██████████| 9208/9208 [00:29<00:00, 309.00it/s]


Epoch 9/20, Training Loss: 1.2334, Validation Loss: 1.2726


Epoch 10/20: 100%|██████████| 9208/9208 [00:29<00:00, 315.18it/s]


Epoch 10/20, Training Loss: 1.2293, Validation Loss: 1.2732


Epoch 11/20: 100%|██████████| 9208/9208 [00:29<00:00, 308.84it/s]


Epoch 11/20, Training Loss: 1.2258, Validation Loss: 1.2751


Epoch 12/20: 100%|██████████| 9208/9208 [00:28<00:00, 324.12it/s]


Epoch 12/20, Training Loss: 1.2243, Validation Loss: 1.2720


Epoch 13/20: 100%|██████████| 9208/9208 [00:28<00:00, 318.80it/s]


Epoch 13/20, Training Loss: 1.2218, Validation Loss: 1.2703


Epoch 14/20: 100%|██████████| 9208/9208 [00:29<00:00, 311.34it/s]


Epoch 14/20, Training Loss: 1.2190, Validation Loss: 1.2717


Epoch 15/20: 100%|██████████| 9208/9208 [00:28<00:00, 319.98it/s]


Epoch 15/20, Training Loss: 1.2201, Validation Loss: 1.2734


Epoch 16/20: 100%|██████████| 9208/9208 [00:28<00:00, 328.74it/s]


Epoch 16/20, Training Loss: 1.2184, Validation Loss: 1.2752


Epoch 17/20: 100%|██████████| 9208/9208 [00:28<00:00, 326.88it/s]


Epoch 17/20, Training Loss: 1.2175, Validation Loss: 1.2712


Epoch 18/20: 100%|██████████| 9208/9208 [00:30<00:00, 306.91it/s]


Epoch 18/20, Training Loss: 1.2178, Validation Loss: 1.2726


Epoch 19/20: 100%|██████████| 9208/9208 [00:28<00:00, 321.82it/s]


Epoch 19/20, Training Loss: 1.2178, Validation Loss: 1.2733


Epoch 20/20: 100%|██████████| 9208/9208 [00:29<00:00, 313.41it/s]


Epoch 20/20, Training Loss: 1.2173, Validation Loss: 1.2742


TypeError: ignored