In [1]:
import torch
from torchtext.datasets import WikiText2
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torch.utils.data import DataLoader, Dataset

import torchtext
torchtext.disable_torchtext_deprecation_warning()
from datasets import load_dataset
from torch.nn.utils.rnn import pad_sequence  # Import pad_sequence function

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# empty GPU memory
torch.cuda.empty_cache() 

In [3]:
# Load the WikiText-2 dataset (larger - wikitext-103-v1, smaller - wikitext-2-v1)
# raw versions can also be used if custom tokenization is required

# dataset = load_dataset("wikitext", "wikitext-2-v1")
dataset = load_dataset("wikitext", "wikitext-103-v1")
dataset

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 1801350
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

In [None]:
import re

# Tokenizer function
def tokenize(text):
    # Simple tokenizer that splits on non-alphabetic characters
    return re.findall(r'\b\w+\b', text.lower())

# Build vocabulary
counter = Counter()
for line in dataset['train']['text']:
    counter.update(tokenize(line))

# Define vocabulary size and special tokens
vocab_size = 10000
special_tokens = ['<unk>', '<pad>', '<bos>', '<eos>']
vocab = {word: idx + len(special_tokens) for idx, (word, _) in enumerate(counter.most_common(vocab_size - len(special_tokens)))}
for idx, token in enumerate(special_tokens):
    vocab[token] = idx

# Inverse vocabulary for decoding
inv_vocab = {idx: word for word, idx in vocab.items()}

# Encode function
def encode(text):
    tokens = tokenize(text)
    return [vocab.get(token, vocab['<unk>']) for token in tokens]

# Add special tokens to each sentence
def add_special_tokens(encoded_text):
    return [vocab['<bos>']] + encoded_text + [vocab['<eos>']]

# Prepare dataset
def prepare_dataset(split):
    encoded_texts = [add_special_tokens(encode(line)) for line in dataset[split]['text']]
    return encoded_texts

train_data = prepare_dataset('train')
valid_data = prepare_dataset('validation')
test_data = prepare_dataset('test')

# Data collate function for DataLoader
def collate_fn(batch):
    batch = [torch.tensor(item) for item in batch]
    batch = pad_sequence(batch, batch_first=True, padding_value=vocab['<pad>'])
    inputs = batch[:, :-1]
    targets = batch[:, 1:]
    return inputs, targets

# Create DataLoaders
batch_size = 32
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_data, batch_size=batch_size, collate_fn=collate_fn)
test_loader = DataLoader(test_data, batch_size=batch_size, collate_fn=collate_fn)

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7ff3e089c9a0>>
Traceback (most recent call last):
  File "/home/shreyak_rekshda/ai-ml-test/venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(
KeyboardInterrupt: 


In [6]:
import torch.nn as nn
import torch.optim as optim

class RNNLanguageModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0.5):
        super(RNNLanguageModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=vocab['<pad>'])
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, hidden):
        x = self.embedding(x)
        x = self.dropout(x)
        output, hidden = self.lstm(x, hidden)
        output = self.dropout(output)
        output = self.fc(output)
        return output, hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        return (weight.new_zeros(self.lstm.num_layers, batch_size, self.lstm.hidden_size),
                weight.new_zeros(self.lstm.num_layers, batch_size, self.lstm.hidden_size))


In [7]:
class RNNLanguageModelv3(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers, dropout=0.5, tie_weights=True):
        super(RNNLanguageModelv3, self).__init__()
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Embedding layer with a padding index from your vocabulary
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=vocab['<pad>'])
        self.dropout = nn.Dropout(dropout)

        # LSTM layers with dropout applied between layers
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        
        # Optional layer normalization to stabilize LSTM outputs
        self.layer_norm = nn.LayerNorm(hidden_size)
        
        # Final linear layer to project LSTM outputs to vocabulary size
        self.fc = nn.Linear(hidden_size, vocab_size)
        
        # Tie weights between the embedding and the output layer if dimensions match.
        if tie_weights:
            if hidden_size != embed_size:
                print("Warning: Weight tying requires hidden_size == embed_size. Skipping weight tying.")
            else:
                self.fc.weight = self.embedding.weight

    def forward(self, x, hidden):
        # x shape: (batch, seq_len)
        x = self.embedding(x)            # => (batch, seq_len, embed_size)
        x = self.dropout(x)
        output, hidden = self.lstm(x, hidden)  # output => (batch, seq_len, hidden_size)
        output = self.layer_norm(output)       # Apply layer normalization
        output = self.dropout(output)
        output = self.fc(output)         # => (batch, seq_len, vocab_size)
        return output, hidden

    def init_hidden(self, batch_size):
        # Initialize hidden state (h_0, c_0) with zeros
        weight = next(self.parameters())
        return (weight.new_zeros(self.num_layers, batch_size, self.hidden_size),
                weight.new_zeros(self.num_layers, batch_size, self.hidden_size))


In [8]:
# Hyperparameters 

# (v1)
# embed_size = 128      # Size of word embeddings
# hidden_size = 256     # Number of features in the hidden state of the RNN
# num_layers = 2        # Number of recurrent layers (e.g., LSTM layers)
# num_epochs = 10       # Number of training epochs
# learning_rate = 0.001 # Learning rate for the optimizer

# (v2)
# embed_size = 128      # Size of word embeddings
# hidden_size = 256     # Number of features in the hidden state of the RNN
# num_layers = 4        # Number of recurrent layers (e.g., LSTM layers)
# num_epochs = 100       # Number of training epochs
# learning_rate = 0.002 # Learning rate for the optimizer

# (v3,v4)
embed_size = 512      # Size of word embeddings
hidden_size = 512     # Number of features in the hidden state of the RNN
num_layers = 4        # Number of recurrent layers (e.g., LSTM layers)
num_epochs = 1       # Number of training epochs
learning_rate = 0.002 # Learning rate for the optimizer
dropout = 0.5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Initialize the model
# (v1)
# model = RNNLanguageModel(
#     vocab_size=len(vocab),   # Size of the vocabulary
#     embed_size=embed_size,   # Embedding size
#     hidden_size=hidden_size, # Hidden state size
#     num_layers=num_layers,   # Number of LSTM layers
#     dropout=0.5              # Dropout rate
# ).to(device)

# (v3)
model = RNNLanguageModelv3(
    vocab_size=len(vocab), 
    embed_size=embed_size, 
    hidden_size=hidden_size, 
    num_layers=num_layers, 
    dropout=dropout,
    tie_weights=True
).to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>']) # Ignore padding in loss calculation
optimizer = optim.Adam(model.parameters(), lr=learning_rate) # Adam optimizer


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [169]:
import wandb
from tqdm import tqdm


# Initialize WandB
wandb.init(
    project="rnn-language-model",
    config={
        "embed_size": embed_size,
        "hidden_size": hidden_size,
        "num_layers": num_layers,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "num_epochs": num_epochs,
    }
)

model_version = "v4"

# Function to train the model with logging and dynamic hidden state initialization.
# The function now accepts a `start_epoch` parameter.
def train_model(model, train_loader, valid_loader, criterion, optimizer, num_epochs, start_epoch=1, log_interval=100):
    model.train()  # Set the model to training mode

    for epoch in range(start_epoch, start_epoch + num_epochs):
        total_loss = 0
        num_batches = len(train_loader)
        
        # Initialize hidden state for the first batch; will update it dynamically later.
        hidden = None

        # Use tqdm for progress bar
        pbar = tqdm(enumerate(train_loader), total=num_batches, desc=f"Epoch {epoch}")
        for batch_idx, (inputs, targets) in pbar:
            inputs, targets = inputs.to(device), targets.to(device)
            current_batch_size = inputs.size(0)
            
            # Initialize or update hidden state dynamically based on current batch size.
            if hidden is None or hidden[0].size(1) != current_batch_size:
                hidden = model.init_hidden(current_batch_size)
            else:
                # Detach hidden state to prevent backpropagating through the entire history
                hidden = tuple([h.detach() for h in hidden])
            
            optimizer.zero_grad()  # Clear gradients
            
            # Forward pass
            output, hidden = model(inputs, hidden)
            loss = criterion(output.view(-1, len(vocab)), targets.view(-1))
            loss.backward()  # Backward pass
            
            # Clip gradients to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()  # Update parameters
            
            total_loss += loss.item()
            
            # Logging every `log_interval` steps
            if (batch_idx + 1) % log_interval == 0:
                avg_loss = total_loss / log_interval
                wandb.log({"Training Loss": avg_loss, "Epoch": epoch, "Batch": batch_idx})
                pbar.set_postfix(loss=f"{avg_loss:.4f}")
                total_loss = 0  # Reset loss tracker

        # Validate after each epoch
        val_loss = evaluate_model(model, valid_loader, criterion)
        wandb.log({"Validation Loss": val_loss, "Epoch": epoch})
        print(f"Epoch {epoch}: Validation Loss: {val_loss:.4f}")

        #Save a checkpoint after 10 epochs
        if epoch % 10 == 0:
            checkpoint = {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'validation_loss': val_loss
            }
            torch.save(checkpoint, f'checkpoint_{model_version}_epoch{epoch}.pth')
            print(f"Checkpoint saved at epoch {epoch}")

# Function to evaluate the model remains unchanged.
def evaluate_model(model, data_loader, criterion):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    num_batches = len(data_loader)
    hidden = None

    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            current_batch_size = inputs.size(0)
            if hidden is None or hidden[0].size(1) != current_batch_size:
                hidden = model.init_hidden(current_batch_size)
            else:
                hidden = tuple([h.detach() for h in hidden])
            output, hidden = model(inputs, hidden)
            loss = criterion(output.view(-1, len(vocab)), targets.view(-1))
            total_loss += loss.item()
    model.train()  # Switch back to training mode
    return total_loss / num_batches

#### Training from beginning

In [151]:
# Start training (DO NOT RUN THIS IF RESUMING TRAINING)
train_model(model, train_loader, valid_loader, criterion, optimizer, num_epochs=num_epochs, start_epoch=1)

Epoch 1: 100%|██████████| 28147/28147 [1:01:45<00:00,  7.60it/s, loss=4.8923]


Epoch 1: Validation Loss: 4.6244


In [167]:
# manually save the checkpoint
checkpoint = {
    'epoch': 1,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'validation_loss': 4.6244
}
torch.save(checkpoint, f'checkpoint_{model_version}_epoch{1}.pth')

#### Resume training from a checkpoint

In [170]:
# Load the previous checkpoint
checkpoint = torch.load('checkpoint_v4_epoch1.pth', map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# Set the start epoch to one after the checkpoint's epoch
# start_epoch = checkpoint['epoch'] + 1
start_epoch = 1

print(f"Resuming training from epoch {start_epoch}")

# Specify the additional number of epochs you want to train for
additional_epochs = 1

# Resume training
train_model(model, train_loader, valid_loader, criterion, optimizer, num_epochs=additional_epochs, start_epoch=start_epoch)

Resuming training from epoch 1


Epoch 1:   1%|▏         | 372/28147 [00:49<1:01:31,  7.52it/s, loss=4.8977]


KeyboardInterrupt: 

#### Save model after training

In [152]:
wandb.finish()

0,1
Batch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇██
Epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Training Loss,█▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Validation Loss,▁

0,1
Batch,28099.0
Epoch,1.0
Training Loss,4.89228
Validation Loss,4.62441


In [153]:
# save trained model to disk
# saved_model_name = 'rnn_language_model_v1_160epochs.pth'
# saved_model_name = 'rnn_language_model_v3_60epochs.pth'
saved_model_name = 'rnn_language_model_v4_1epochs.pth'
torch.save(model.state_dict(), saved_model_name)

#### Inference testing

In [154]:
# Load the saved state dictionary
model.load_state_dict(torch.load(saved_model_name, map_location=device))

# Set the model to evaluation mode
model.eval()

print("Model loaded successfully: ",saved_model_name)

Model loaded successfully:  rnn_language_model_v4_1epochs.pth


In [156]:
def predict_next_word(model, input_text, vocab, inv_vocab, top_k=5):
    model.eval()  # Set the model to evaluation mode
    # Tokenize the input text
    tokens = tokenize(input_text)
    # Convert tokens to indices, using <unk> for unseen words
    input_ids = [vocab.get(token, vocab['<unk>']) for token in tokens]
    # Convert to tensor and add batch dimension (1, sequence_length)
    input_tensor = torch.tensor(input_ids).unsqueeze(0).to(device)
    
    # Initialize hidden state with batch size 1
    hidden = model.init_hidden(1)
    
    with torch.no_grad():
        # Forward pass: get output predictions and update hidden state
        output, hidden = model(input_tensor, hidden)
    
    # Get logits for the last token in the sequence
    logits = output[0, -1]  # Shape: [vocab_size]
    # Apply softmax to convert logits to probabilities
    probabilities = torch.softmax(logits, dim=0)
    # print(probabilities)
    # Get the indices of the top_k words with highest probability
    top_probs, top_indices = torch.topk(probabilities, top_k)
    # Map indices to words using the inverse vocabulary
    top_words = [inv_vocab[idx.item()] for idx in top_indices]
    
    return top_words

def generate_text_naive(prompt,num_tokens=100):
    generated_text = prompt
    for i in range(num_tokens):
        predicted_words = predict_next_word(model, generated_text, vocab, inv_vocab, top_k=5)
        next_word = predicted_words[0]
        if next_word == "<unk>" or next_word == "unk":
            next_word = predicted_words[2]
        
        generated_text += next_word + " "

    return generated_text

In [160]:
prompt = "Original novel written "

generated_text = generate_text_naive(prompt,10)
print("Generated text: ",generated_text)

Generated text:  Original novel written by david and a former writer of the new york times of the united states and the first two years of the film s release the film was released in the united states on november 3 2011 the film was released on dvd on november 3 2011 <eos> and a 


In [165]:
# Example usage:
prompt = "My name is the last time i am going to "
predicted_words = predict_next_word(model, prompt, vocab, inv_vocab, top_k=5)
print("Prompt:", prompt)
print("Next word predictions:", predicted_words)

Prompt: My name is the last time i am going to 
Next word predictions: ['be', '<unk>', 'do', 'get', 'make']
