Assignment 2: Neural Language Model Training (PyTorch)

Step-1: Loading the data

In [1]:
# Relative path to the plain-text corpus; resolved against the current working directory.
file_path = 'Pride_and_Prejudice-Jane_Austen.txt'

try:
    # Read the entire book into memory as a single string.
    corpus_file = open(file_path, 'r', encoding='utf-8')
    with corpus_file:
        text_data = corpus_file.read()

    # Sanity check: show the beginning of the text and its overall size.
    preview = text_data[:500]
    char_count = len(text_data)
    print("First 500 characters of the dataset:")
    print(preview)
    print(f"\nTotal number of characters in the dataset: {char_count}")
except FileNotFoundError:
    # The corpus is missing; `text_data` stays undefined in that case.
    print(f"Error: The file '{file_path}' was not found.")
except Exception as err:
    # Any other failure (e.g. a decoding error) is reported but not re-raised.
    print(f"An error occurred: {err}")

First 500 characters of the dataset:
The Project Gutenberg eBook, Pride and Prejudice, by Jane Austen, Edited
by R. W. (Robert William) Chapman


This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.org





Title: Pride and Prejudice


Author: Jane Austen

Editor: R. W. (Robert William) Chapman

Release Date: May 9, 2013  [eBook #42671]

Lang

Total number of characters in the dataset: 711331


Step-2: Data Preprocessing - Tokenization

In [2]:
import re

# Case-fold the corpus so that e.g. "The" and "the" map to the same token.
text_data_lower = text_data.lower()

# Tokenize: every maximal run of word characters (letters, digits, underscore)
# becomes one token; punctuation and whitespace are discarded.
word_pattern = re.compile(r'\b\w+\b')
tokens = word_pattern.findall(text_data_lower)

# Quick look at the result: the first 20 tokens and the total count.
print("First 20 tokens:")
print(tokens[:20])
token_count = len(tokens)
print(f"\nTotal number of tokens: {token_count}")

First 20 tokens:
['the', 'project', 'gutenberg', 'ebook', 'pride', 'and', 'prejudice', 'by', 'jane', 'austen', 'edited', 'by', 'r', 'w', 'robert', 'william', 'chapman', 'this', 'ebook', 'is']

Total number of tokens: 126711


Step-3: Data Preprocessing - Vocabulary Creation

In [3]:
# Placeholder token used for any word that is not in the vocabulary.
UNK_TOKEN = '<unk>'

# Distinct tokens in sorted order, so that ID assignment is deterministic.
unique_tokens = sorted(set(tokens))

# ID 0 is reserved for UNK_TOKEN; real words receive IDs 1..len(unique_tokens).
word_to_idx = {UNK_TOKEN: 0}
word_to_idx.update((word, token_id) for token_id, word in enumerate(unique_tokens, start=1))

# Reverse mapping (ID -> word), built from the same enumeration.
idx_to_word = {0: UNK_TOKEN}
idx_to_word.update(enumerate(unique_tokens, start=1))

# Total vocabulary size (includes the UNK entry)
vocabulary_size = len(word_to_idx)

print(f"Total vocabulary size: {vocabulary_size}")

# Spot-check a few word -> ID lookups.
print("\nSample word_to_idx mappings:")
sample_words = ['the', 'and', 'prejudice', UNK_TOKEN, 'elizabeth', 'darcy']
for word in sample_words:
    if word not in word_to_idx:
        print(f"'{word}': (Not in vocab, maps to UNK: {word_to_idx[UNK_TOKEN]}) ")
    else:
        print(f"'{word}': {word_to_idx[word]}")

# Spot-check a few ID -> word lookups, including both ends of the ID range.
print("\nSample idx_to_word mappings:")
sample_ids = [0, 1, 2, 3, vocabulary_size - 1, vocabulary_size - 2]
for idx in sample_ids:
    if idx not in idx_to_word:
        print(f"{idx}: (ID not found) ")
    else:
        print(f"{idx}: '{idx_to_word[idx]}' ")

Total vocabulary size: 6763

Sample word_to_idx mappings:
'the': 6036
'and': 453
'prejudice': 4621
'<unk>': 0
'elizabeth': 2144
'darcy': 1596

Sample idx_to_word mappings:
0: '<unk>' 
1: '000' 
2: '1' 
3: '1500' 
6762: 'Ã ' 
6761: 'zip' 


Data Preprocessing - Dataset and DataLoader Preparation

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader

# 1. Map every token to its vocabulary ID (unknown words fall back to the UNK ID)
#    and store the whole corpus as a single 1-D int64 tensor.
unk_id = word_to_idx[UNK_TOKEN]
numerical_tokens = torch.tensor(
    [word_to_idx.get(token, unk_id) for token in tokens],
    dtype=torch.long,
)

first_ids = numerical_tokens[:20].tolist()
print(f"First 20 numerical tokens: {first_ids}")
print(f"Total numerical tokens: {len(numerical_tokens)}")

# 2. Number of tokens in each input/target window
sequence_length = 50
print(f"\nDefined sequence length: {sequence_length}")

# 3. Create a custom PyTorch Dataset class
class TextDataset(Dataset):
    """Sliding-window dataset for next-token prediction.

    Sample ``i`` is the pair ``(data[i : i + L], data[i + 1 : i + L + 1])`` with
    ``L = sequence_length``: an input window and the same window shifted one
    position to the right, so that every input token is paired with the token
    that follows it.

    Args:
        data: 1-D sequence of token IDs supporting ``len()`` and slicing
            (the notebook passes a ``torch.long`` tensor).
        sequence_length: Number of tokens in each window; must be >= 1.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """

    def __init__(self, data, sequence_length):
        if sequence_length < 1:
            raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")
        self.data = data
        self.sequence_length = sequence_length

    def __len__(self):
        """Return the number of complete (input, target) pairs.

        E.g., for data [t1, t2, t3, t4, t5] and seq_len 3:
            Input: [t1, t2, t3], Target: [t2, t3, t4]
            Input: [t2, t3, t4], Target: [t3, t4, t5]
        i.e. ``len(data) - sequence_length`` pairs.
        """
        # Clamp at zero: when the data is shorter than one window there are no
        # samples, and a negative value would make len() raise ValueError.
        return max(0, len(self.data) - self.sequence_length)

    def __getitem__(self, idx):
        """Return the ``idx``-th ``(input_sequence, target_sequence)`` pair.

        Negative indices count from the end, as for built-in sequences.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        num_sequences = len(self)
        if idx < 0:
            # Rebind instead of using += so a caller-owned tensor index is not mutated.
            idx = idx + num_sequences
        # Slicing never fails on its own; without this check an out-of-range index
        # would yield truncated windows and plain iteration would never stop.
        if not 0 <= idx < num_sequences:
            raise IndexError(
                f"index {idx} is out of range for a dataset with {num_sequences} sequences"
            )
        window_end = idx + self.sequence_length
        input_sequence = self.data[idx:window_end]
        target_sequence = self.data[idx + 1 : window_end + 1]
        return input_sequence, target_sequence

# 4. Contiguous split of the token stream: first 80% for training, next 10% for
#    validation, and whatever remains (about 10%) for testing.
total_size = len(numerical_tokens)
train_size = int(0.8 * total_size)
val_size = int(0.1 * total_size)
test_size = total_size - train_size - val_size  # remainder, so no token is left out

val_start = train_size
test_start = train_size + val_size

train_data = numerical_tokens[:val_start]
val_data = numerical_tokens[val_start:test_start]
test_data = numerical_tokens[test_start:]

print(f"\nTrain data size: {len(train_data)}")
print(f"Validation data size: {len(val_data)}")
print(f"Test data size: {len(test_data)}")

# 5. Wrap each split in a sliding-window dataset
train_dataset = TextDataset(train_data, sequence_length)
val_dataset = TextDataset(val_data, sequence_length)
test_dataset = TextDataset(test_data, sequence_length)

print(f"\nTrain dataset (number of sequences): {len(train_dataset)}")
print(f"Validation dataset (number of sequences): {len(val_dataset)}")
print(f"Test dataset (number of sequences): {len(test_dataset)}")

# 6. Batch the datasets. Only the training loader shuffles; every loader drops
#    the final incomplete batch.
batch_size = 64

train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=True,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    drop_last=True,
)

print(f"\nBatch size: {batch_size}")
print(f"Number of training batches: {len(train_dataloader)}")
print(f"Number of validation batches: {len(val_dataloader)}")
print(f"Number of test batches: {len(test_dataloader)}")

# Inspect the first training batch only, then stop iterating.
for inputs, targets in train_dataloader:
    first_input = inputs[0].tolist()
    first_target = targets[0].tolist()
    print(f"\nSample batch shapes - Inputs: {inputs.shape}, Targets: {targets.shape}")
    print(f"Sample input sequence (first in batch): {first_input}")
    print(f"Sample target sequence (first in batch): {first_target}")
    break

First 20 numerical tokens: [6036, 4729, 2898, 2093, 4674, 453, 4621, 965, 3482, 672, 2104, 965, 4857, 6510, 5237, 6637, 1068, 6062, 2093, 3473]
Total numerical tokens: 126711

Defined sequence length: 50

Train data size: 101368
Validation data size: 12671
Test data size: 12672

Train dataset (number of sequences): 101318
Validation dataset (number of sequences): 12621
Test dataset (number of sequences): 12622

Batch size: 64
Number of training batches: 1583
Number of validation batches: 197
Number of test batches: 197

Sample batch shapes - Inputs: torch.Size([64, 50]), Targets: torch.Size([64, 50])
Sample input sequence (first in batch): [453, 6039, 2726, 5874, 6102, 6036, 2668, 4184, 6039, 4764, 960, 2144, 5313, 6662, 300, 4184, 3035, 5978, 6035, 3474, 6543, 4055, 2751, 4098, 6400, 2556, 6662, 3634, 4184, 5692, 453, 3966, 4903, 2134, 6027, 6036, 2726, 4184, 5248, 453, 4184, 6062, 4495, 6068, 5470, 3111, 3883, 2965, 750, 3937]
Sample target sequence (first in batch): [6039, 2726, 587

Step-4 : Implement a neural language model using PyTorch, such as an LSTM, GRU, or simple RNN.

In [5]:
import torch.nn as nn

# Define hyperparameters for the model
embedding_dim = 128  # size of each token embedding vector
hidden_size = 256    # number of features in each LSTM hidden state
num_layers = 2       # number of stacked LSTM layers
dropout_rate = 0.3   # dropout applied between stacked LSTM layers

# 2. Define a Python class for your neural language model
class LanguageModel(nn.Module):
    """Word-level language model: Embedding -> stacked LSTM -> Linear.

    Args:
        vocabulary_size: Number of distinct token IDs; sizes both the embedding
            table and the output layer.
        embedding_dim: Size of each token embedding vector.
        hidden_size: Number of features in the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        dropout_rate: Dropout probability applied to the output of every LSTM
            layer except the last. It is ignored (treated as 0) when
            ``num_layers == 1``, because there is no layer boundary to apply it to.
    """

    def __init__(self, vocabulary_size, embedding_dim, hidden_size, num_layers, dropout_rate=0.0):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # 3. Define the layers
        # Embedding layer: token ID -> dense vector
        self.embedding = nn.Embedding(vocabulary_size, embedding_dim)

        # nn.LSTM only applies dropout between stacked layers and warns when a
        # non-zero value is combined with a single layer, so disable it there.
        inter_layer_dropout = dropout_rate if num_layers > 1 else 0.0

        # Recurrent layer (LSTM)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=inter_layer_dropout,
            batch_first=True,  # tensors are laid out as (batch, seq, feature)
        )

        # Linear output layer: hidden state -> one logit per vocabulary entry
        self.linear = nn.Linear(hidden_size, vocabulary_size)

    def forward(self, x):
        """Compute next-token logits for every position of every sequence.

        Args:
            x: Integer tensor of token IDs with shape (batch_size, sequence_length).

        Returns:
            Tensor of shape (batch_size * sequence_length, vocabulary_size) holding
            unnormalized scores. The batch and time dimensions are flattened so the
            result can be passed to ``nn.CrossEntropyLoss`` together with
            ``targets.reshape(-1)``.
        """
        # 4. Implement the forward pass
        embedded = self.embedding(x)

        # lstm_output: (batch_size, sequence_length, hidden_size), the top layer's
        # output at every time step. The discarded second value is (h_n, c_n), the
        # final hidden and cell states of all layers. No initial state is passed,
        # so the LSTM starts from its default (zero) state on every call.
        lstm_output, _ = self.lstm(embedded)

        # Flatten (batch_size, sequence_length, hidden_size) to
        # (batch_size * sequence_length, hidden_size) so the linear layer is
        # applied independently to each time step.
        reshaped_output = lstm_output.reshape(-1, self.hidden_size)

        return self.linear(reshaped_output)

# 5. Build the model from the hyperparameters above and the vocabulary size
#    computed during preprocessing.
model = LanguageModel(
    vocabulary_size=vocabulary_size,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout_rate=dropout_rate,
)

# 6. Show the layer structure, followed by the configuration that produced it.
print(model)

print(f"\nModel instantiated with:\n  Embedding Dim: {embedding_dim}\n  Hidden Size: {hidden_size}\n  Num Layers: {num_layers}\n  Dropout Rate: {dropout_rate}\n  Vocabulary Size: {vocabulary_size}")

LanguageModel(
  (embedding): Embedding(6763, 128)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True, dropout=0.3)
  (linear): Linear(in_features=256, out_features=6763, bias=True)
)

Model instantiated with:
  Embedding Dim: 128
  Hidden Size: 256
  Num Layers: 2
  Dropout Rate: 0.3
  Vocabulary Size: 6763


Step-5 : Defining the loss function (e.g., nn.CrossEntropyLoss) and an optimizer (e.g., Adam, SGD) for training the model.

In [6]:
import torch.nn as nn
import torch.optim as optim

# 1. Loss: cross-entropy between the model's per-token logits and the target IDs
criterion = nn.CrossEntropyLoss()

# 2. Optimizer: Adam over all model parameters
learning_rate = 0.001
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

# Report the training configuration.
optimizer_name = type(optimizer).__name__
print(f"Loss function: {criterion}")
print(f"Optimizer: {optimizer_name} with learning rate: {learning_rate}")

Loss function: CrossEntropyLoss()
Optimizer: Adam with learning rate: 0.001


Step-6: Developing the core training loop, including forward pass, backpropagation, gradient clipping (if necessary), and optimizer step for a given number of epochs.

In [None]:
import torch

def _run_epoch(model, dataloader, criterion, device, optimizer=None, max_grad_norm=1.0):
    """Run one full pass over ``dataloader`` and return the mean per-batch loss.

    When ``optimizer`` is given, the model is put in training mode and updated
    after every batch (backward pass, gradient-norm clipping, optimizer step).
    When it is ``None``, the model is put in evaluation mode and the pass runs
    with gradient tracking disabled.

    Args:
        model: Module mapping a batch of inputs to logits accepted by ``criterion``.
        dataloader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, targets.reshape(-1))``.
        device: Device the batches are moved to before the forward pass.
        optimizer: Optimizer to step after each batch, or ``None`` to only evaluate.
        max_grad_norm: Maximum gradient norm used for clipping during training.

    Returns:
        The sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    is_training = optimizer is not None
    model.train(is_training)  # train() when updating, eval() otherwise

    total_loss = 0.0
    num_batches = 0
    with torch.set_grad_enabled(is_training):
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)

            if is_training:
                optimizer.zero_grad()

            # Outputs are (batch_size * sequence_length, vocabulary_size), so the
            # targets are flattened to (batch_size * sequence_length) to match.
            outputs = model(inputs)
            loss = criterion(outputs, targets.reshape(-1))

            if is_training:
                loss.backward()
                # Clip before stepping to keep exploding gradients in check.
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)
                optimizer.step()

            total_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        # Fail with a clear message instead of a bare ZeroDivisionError.
        raise ValueError("dataloader produced no batches; cannot compute an average loss")
    return total_loss / num_batches


# 1. Set up device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(f"Using device: {device}")

if device.type == 'cpu':
    print("\n*** IMPORTANT: For faster training, consider enabling GPU acceleration! ***")
    print("To do this, go to 'Runtime' -> 'Change runtime type' in the Colab menu, then select 'T4 GPU' (or similar) as the hardware accelerator.")

# 2. Define number of training epochs
num_epochs = 10
print(f"Number of epochs: {num_epochs}")

# 3. Per-epoch average losses, consumed later by the loss plot
train_losses = []
val_losses = []

# 4. Main loop: one training pass followed by one validation pass per epoch
for epoch in range(num_epochs):
    avg_train_loss = _run_epoch(model, train_dataloader, criterion, device, optimizer=optimizer)
    train_losses.append(avg_train_loss)
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}")

    # No optimizer is passed, so this pass only evaluates.
    avg_val_loss = _run_epoch(model, val_dataloader, criterion, device)
    val_losses.append(avg_val_loss)
    print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {avg_val_loss:.4f}")

print("\nTraining complete!")

Using device: cpu
Number of epochs: 10


Step-7: Reference Plots (synthetic curves illustrating underfitting, overfitting, and best fit)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# -----------------------------------------
# Helper function to plot loss curves
# -----------------------------------------
def plot_loss(train, val, title):
    """Draw training and validation loss curves on a new figure and display it.

    Args:
        train: Sequence of training-loss values, one per epoch.
        val: Sequence of validation-loss values, one per epoch.
        title: Text shown as the figure title.
    """
    _, ax = plt.subplots(figsize=(7, 5))

    # One line per curve; the labels feed the legend.
    ax.plot(train, label='Training Loss', linewidth=2)
    ax.plot(val, label='Validation Loss', linewidth=2)

    ax.set_xlabel("Epochs", fontsize=12)
    ax.set_ylabel("Loss", fontsize=12)
    ax.set_title(title, fontsize=15)
    ax.grid(True)
    ax.legend()

    plt.show()

# NOTE: every curve in this cell is synthetic. The values are hand-picked to
# illustrate the three typical shapes of a training/validation loss plot; they
# are not produced by the model trained above.

# -----------------------------------------
# 1. UNDERFITTING
# Both losses are high, model hasn't learned enough
# -----------------------------------------
underfit_train = np.linspace(1.4, 1.15, 12)   # high loss, slight improvement
underfit_val   = np.linspace(1.45, 1.25, 12)

plot_loss(underfit_train, underfit_val, "Underfitting: Model Undertrained / Too Simple")

# -----------------------------------------
# 2. OVERFITTING
# Training loss keeps dropping, validation loss rises
# -----------------------------------------
overfit_train = np.linspace(0.9, 0.18, 20)
overfit_val   = [0.95,0.85,0.75,0.70,0.68,0.69,0.71,0.75,0.80,0.88,
                 1.0,1.15,1.30,1.45,1.60,1.75,1.90,2.05,2.15,2.25]

plot_loss(overfit_train, overfit_val, "Overfitting: Model Memorizes Training Data")

# -----------------------------------------
# 3. BEST FIT (GOOD GENERALIZATION)
# Validation follows training curve closely
# -----------------------------------------
best_train = np.linspace(0.9, 0.22, 15)
# Seeded generator so the jitter, and therefore the figure, is identical on every run.
noise_rng = np.random.default_rng(42)
best_val   = best_train + noise_rng.uniform(-0.03, 0.03, 15)  # small noise

plot_loss(best_train, best_val, "Best Fit: Good Generalization")


Step-8: Visualization - Loss Plots

In [None]:
import matplotlib.pyplot as plt

# Plot the per-epoch average losses recorded by the training loop.
fig, ax = plt.subplots(figsize=(8, 6))

ax.plot(train_losses, label="Training Loss", linewidth=2)
ax.plot(val_losses, label="Validation Loss", linewidth=2)

ax.set_title("Training vs Validation Loss", fontsize=16)
ax.set_xlabel("Epoch", fontsize=13)
ax.set_ylabel("Loss", fontsize=13)
ax.grid(True)
ax.legend(fontsize=12)

plt.show()
