# RNNs for Language Modeling

### Learning Objectives:

- Understand the motivation behind using Recurrent Neural Networks in NLP.
- Explore key concepts: sequence modeling, recurrence, hidden states, backpropagation through time (briefly).
- Implement RNN architectures (Simple RNN, LSTM) using PyTorch.
- Train an RNN-based language model on real-world data.
- Evaluate and visualize the results.

Recurrent Neural Networks (RNNs) are designed for sequential data—data where order matters, such as text, audio, or time series. Unlike feed-forward networks, which process each input independently, an RNN maintains a hidden state that summarizes the inputs seen so far, which lets it model dependencies across a sequence.

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel
from transformers import default_data_collator
import numpy as np

from datasets import load_dataset

## Dataset: Loading & Preprocessing

We use the WikiText-2 dataset, a practical benchmark for language modeling tasks.

In [2]:
# Load the raw (untokenized) WikiText-2 corpus via `load_dataset`.
# The result is a DatasetDict with "train", "validation" and "test" splits,
# each exposing a single "text" column (see the output below).
datasets = load_dataset("wikitext", "wikitext-2-raw-v1")
datasets

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

Let's tokenize the text with the GPT-2 tokenizer:

In [3]:
# Load the tokenizer that belongs to the "gpt2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Tokenize every split in batches. Attention masks are not requested, so the
# only column added is "input_ids" (see the printed DatasetDict below).
tokenized_datasets = datasets.map(
    lambda x: tokenizer(x["text"], return_attention_mask=False),
    batched=True
)

print(tokenized_datasets)

DatasetDict({
    test: Dataset({
        features: ['text', 'input_ids'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text', 'input_ids'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text', 'input_ids'],
        num_rows: 3760
    })
})


In [4]:
# Peek at the first rows; blank lines in the corpus tokenize to empty lists.
tokenized_datasets["train"].to_pandas().head()


Unnamed: 0,text,input_ids
0,,[]
1,= Valkyria Chronicles III = \n,"[796, 569, 18354, 7496, 17740, 6711, 796, 220,..."
2,,[]
3,Senjō no Valkyria 3 : Unrecorded Chronicles (...,"[2311, 73, 13090, 645, 569, 18354, 7496, 513, ..."
4,"The game began development in 2010 , carrying...","[383, 983, 2540, 2478, 287, 3050, 837, 6872, 6..."


## Preparing Data for Language Modeling

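We concatenate the tokenized lines into one long stream and cut it into fixed-length blocks of `block_size` tokens. The labels for each block are its inputs shifted one position to the left, so at every position the model is trained to predict the next token.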

In [5]:
block_size = 64  # number of tokens in every training sequence


def group_texts(examples):
    """Pack a batch of tokenized lines into fixed-length next-token examples.

    All ``input_ids`` lists in the batch are concatenated into one token
    stream, which is then cut into consecutive blocks of ``block_size``
    tokens. Tokens left over after the last full block are not used as
    inputs.

    The label at position ``t`` of a block is the token that follows
    position ``t`` in the stream, including across block boundaries: the last
    label of a block is the first token of the next block (or the first
    left-over token). Only when the stream ends exactly on a block boundary
    does the final position have no successor; it is then labelled ``-100``,
    the default ``ignore_index`` of ``torch.nn.CrossEntropyLoss``, so it does
    not contribute to the loss.

    Args:
        examples: Mapping with an ``"input_ids"`` entry holding a list of
            token-id lists (the batched format passed by ``Dataset.map``).

    Returns:
        Dict with ``"input_ids"`` and ``"labels"``, two equally long lists of
        ``block_size``-long token-id lists. Both are empty when the batch
        contains fewer than ``block_size`` tokens.
    """
    ignore_index = -100

    # Flatten in a single linear pass; ``sum(lists, [])`` re-copies the
    # accumulated list for every line and is quadratic in the batch size.
    concatenated = [token for ids in examples["input_ids"] for token in ids]
    total_length = (len(concatenated) // block_size) * block_size

    input_ids = []
    labels = []
    for start in range(0, total_length, block_size):
        end = start + block_size
        input_ids.append(concatenated[start:end])

        # Targets are the stream shifted by one token. The slice is short by
        # one element only for the final block of a stream that has no
        # left-over tokens; pad that position with the ignore index.
        targets = concatenated[start + 1:end + 1]
        targets.extend([ignore_index] * (block_size - len(targets)))
        labels.append(targets)

    return {"input_ids": input_ids, "labels": labels}

# Replace the per-line columns with fixed-length (input_ids, labels) blocks.
# The original columns are dropped because group_texts returns a different
# number of rows than it receives.
lm_dataset = tokenized_datasets.map(group_texts, batched=True, remove_columns=["input_ids", "text"])

# Batches of 32 blocks; only the training split is shuffled.
train_loader = DataLoader(lm_dataset["train"], batch_size=32, shuffle=True, collate_fn=default_data_collator)
valid_loader = DataLoader(lm_dataset["validation"], batch_size=32, collate_fn=default_data_collator)


In [6]:
# Sanity check: each row's labels are its input_ids shifted left by one token.
lm_dataset["train"].to_pandas().head()

Unnamed: 0,input_ids,labels
0,"[796, 569, 18354, 7496, 17740, 6711, 796, 220,...","[569, 18354, 7496, 17740, 6711, 796, 220, 198,..."
1,"[16106, 2597, 2488, 12, 31, 2712, 2008, 983, 4...","[2597, 2488, 12, 31, 2712, 2008, 983, 4166, 41..."
2,"[983, 290, 5679, 262, 366, 17871, 5321, 366, 8...","[290, 5679, 262, 366, 17871, 5321, 366, 837, 2..."
3,"[569, 18354, 7496, 17740, 2873, 764, 2893, 340...","[18354, 7496, 17740, 2873, 764, 2893, 340, 173..."
4,"[6909, 764, 317, 1588, 1074, 286, 8786, 12118,...","[764, 317, 1588, 1074, 286, 8786, 12118, 262, ..."


In [7]:
# Load GPT-2 pretrained embeddings
model_name = 'gpt2'
# Same checkpoint name as the tokenizer created earlier, so the token ids in
# the dataset index rows of this embedding table.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The full GPT-2 model is loaded here, but only its input embedding table is
# used by the cells below.
pretrained_model = AutoModel.from_pretrained(model_name)
# Embedding matrix with one row per vocabulary entry; the model summary
# printed further down reports it as 50257 x 768.
pretrained_embeddings = pretrained_model.get_input_embeddings().weight

In [8]:
class LSTMLanguageModel(nn.Module):
    """Single-layer LSTM language model over token ids.

    The model is embedding -> LSTM -> linear projection onto the vocabulary.

    Args:
        vocab_size: Number of tokens in the vocabulary. Used only when no
            pretrained embedding matrix is available (see ``embeddings``).
        embedding_dim: Width of the token embeddings. Used only when no
            pretrained embedding matrix is available.
        hidden_dim: Size of the LSTM hidden state.
        embeddings: Optional 2-D tensor of shape
            ``(vocab_size, embedding_dim)`` with pretrained embeddings. When
            given, its shape takes precedence over ``vocab_size`` and
            ``embedding_dim``. When omitted, a module-level
            ``pretrained_embeddings`` tensor is used if one exists (the
            behaviour of earlier versions of this class); otherwise a fresh,
            trainable embedding table is created from ``vocab_size`` and
            ``embedding_dim``.
        freeze: Whether pretrained embeddings are kept fixed during training.
            Ignored for a freshly initialised embedding table.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, embeddings=None, freeze=True):
        super().__init__()
        if embeddings is None:
            # Legacy fallback: this class used to read the notebook-level
            # ``pretrained_embeddings`` unconditionally. Keep doing so when
            # the caller does not pass a matrix explicitly.
            embeddings = globals().get("pretrained_embeddings")

        if embeddings is not None:
            # The pretrained matrix dictates both sizes so the LSTM input and
            # the output projection stay consistent with it.
            vocab_size, embedding_dim = embeddings.shape
            self.embedding = nn.Embedding.from_pretrained(embeddings, freeze=freeze)
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_ids):
        """Return next-token logits for every position.

        Args:
            input_ids: Integer tensor of token ids, shape ``(batch, seq_len)``
                (the LSTM is configured with ``batch_first=True``).

        Returns:
            Tensor of shape ``(batch, seq_len, vocab_size)``. The LSTM always
            starts from a zero state; its final state is discarded.
        """
        embeddings = self.embedding(input_ids)
        lstm_out, _ = self.lstm(embeddings)
        logits = self.fc(lstm_out)
        return logits


In [9]:
vocab_size = tokenizer.vocab_size
# NOTE: the model takes its embedding width from the pretrained GPT-2 matrix,
# so the effective embedding size is 768 (see the printed module below), not
# the 32 set here.
embedding_dim = 32
hidden_dim = 64

model = LSTMLanguageModel(vocab_size, embedding_dim, hidden_dim)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


LSTMLanguageModel(
  (embedding): Embedding(50257, 768)
  (lstm): LSTM(768, 64, batch_first=True)
  (fc): Linear(in_features=64, out_features=50257, bias=True)
)

In [10]:
# Adam with learning rate 1e-3 and a small weight-decay (L2) term.
optimizer = Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)
# Token-level cross-entropy. The training/evaluation loops below permute the
# model output to (batch, vocab, seq_len) before passing it to this loss.
criterion = nn.CrossEntropyLoss()

def train_epoch(model, loader, opt=None, loss_fn=None):
    """Train ``model`` for one pass over ``loader`` and return the mean loss.

    Args:
        model: Module mapping ``(batch, seq_len)`` token ids to
            ``(batch, seq_len, vocab)`` logits.
        loader: Iterable of batches, each a mapping with ``"input_ids"`` and
            ``"labels"`` tensors of shape ``(batch, seq_len)``.
        opt: Optimizer to step. Defaults to the module-level ``optimizer``.
        loss_fn: Loss taking ``(batch, vocab, seq_len)`` logits and
            ``(batch, seq_len)`` labels and returning the mean over
            non-ignored tokens. Defaults to the module-level ``criterion``.

    Returns:
        The average loss per (non-ignored) token over the epoch, or ``nan``
        if the loader produced no tokens.
    """
    opt = optimizer if opt is None else opt
    loss_fn = criterion if loss_fn is None else loss_fn
    ignore_index = getattr(loss_fn, "ignore_index", -100)

    # Move batches to wherever the model actually lives instead of relying on
    # a separate global device.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    loss_sum = 0.0
    token_count = 0
    for batch in tqdm(loader, desc="Training"):
        input_ids = batch["input_ids"].to(model_device)
        labels = batch["labels"].to(model_device)

        opt.zero_grad()
        outputs = model(input_ids).permute(0, 2, 1)  # (B, vocab, seq_len)
        loss = loss_fn(outputs, labels)
        loss.backward()
        opt.step()

        # Weight each batch by its token count: a plain mean of per-batch
        # means would over-weight a short final batch.
        n_tokens = int((labels != ignore_index).sum())
        loss_sum += loss.item() * n_tokens
        token_count += n_tokens

    return loss_sum / token_count if token_count else float("nan")

def evaluate(model, loader, loss_fn=None):
    """Return the mean per-token loss of ``model`` over ``loader``.

    The model is put in evaluation mode and no gradients are tracked.

    Args:
        model: Module mapping ``(batch, seq_len)`` token ids to
            ``(batch, seq_len, vocab)`` logits.
        loader: Iterable of batches, each a mapping with ``"input_ids"`` and
            ``"labels"`` tensors of shape ``(batch, seq_len)``.
        loss_fn: Loss taking ``(batch, vocab, seq_len)`` logits and
            ``(batch, seq_len)`` labels and returning the mean over
            non-ignored tokens. Defaults to the module-level ``criterion``.

    Returns:
        The average loss per (non-ignored) token, suitable for computing
        perplexity as ``exp(loss)``; ``nan`` if the loader produced no tokens.
    """
    loss_fn = criterion if loss_fn is None else loss_fn
    ignore_index = getattr(loss_fn, "ignore_index", -100)

    # Move batches to wherever the model actually lives instead of relying on
    # a separate global device.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    loss_sum = 0.0
    token_count = 0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch["input_ids"].to(model_device)
            labels = batch["labels"].to(model_device)
            outputs = model(input_ids).permute(0, 2, 1)  # (B, vocab, seq_len)
            loss = loss_fn(outputs, labels)

            # Weight each batch by its token count so a short final batch
            # does not bias the corpus-level average (and the perplexity).
            n_tokens = int((labels != ignore_index).sum())
            loss_sum += loss.item() * n_tokens
            token_count += n_tokens

    return loss_sum / token_count if token_count else float("nan")


In [None]:
epochs = 3

# One training pass followed by one validation pass per epoch.
for epoch in range(epochs):
    train_loss = train_epoch(model, train_loader)
    val_loss = evaluate(model, valid_loader)

    # Perplexity is the exponential of the validation cross-entropy.
    print(f"Epoch {epoch + 1}/{epochs} - Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Perplexity: {np.exp(val_loss):.2f}")


In [11]:
def generate(model, tokenizer, prompt, length=50, temperature=0.0, top_k=None):
    """Continue ``prompt`` with ``length`` tokens produced by ``model``.

    Args:
        model: Module mapping ``(batch, seq_len)`` token ids to
            ``(batch, seq_len, vocab)`` logits. It is switched to eval mode.
        tokenizer: Object providing ``encode(text, return_tensors="pt")``,
            ``decode(ids)`` and ``eos_token_id``.
        prompt: Text to condition on. An empty prompt is seeded with the
            tokenizer's EOS token so the model has at least one input.
        length: Number of tokens to append.
        temperature: ``0`` (default) selects the most likely token at every
            step (greedy decoding). A positive value samples from the
            softmax of ``logits / temperature``.
        top_k: When sampling, restrict the choice to the ``top_k`` highest
            scoring tokens. Ignored for greedy decoding.

    Returns:
        The decoded text of the prompt followed by the generated tokens.

    Raises:
        ValueError: If the prompt encodes to no tokens and the tokenizer has
            no EOS token to seed generation with.
    """
    model.eval()

    # Run on the device the model lives on rather than a separate global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    if input_ids.numel() == 0:
        # The model cannot produce logits for a zero-length sequence.
        seed_token = getattr(tokenizer, "eos_token_id", None)
        if seed_token is None:
            raise ValueError("prompt is empty and the tokenizer defines no EOS token")
        input_ids = torch.tensor([[seed_token]], dtype=torch.long)

    generated = input_ids.to(model_device)
    with torch.no_grad():
        for _ in range(length):
            # The whole sequence is re-encoded each step because the model's
            # forward pass does not expose its recurrent state.
            logits = model(generated)
            next_token_logits = logits[:, -1, :]

            if temperature > 0:
                scaled = next_token_logits / temperature
                if top_k is not None and top_k > 0:
                    k = min(top_k, scaled.size(-1))
                    # Smallest logit that is still inside the top-k set.
                    threshold = torch.topk(scaled, k).values[:, -1, None]
                    scaled = scaled.masked_fill(scaled < threshold, float("-inf"))
                probs = torch.softmax(scaled, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
            else:
                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)

            generated = torch.cat((generated, next_token), dim=1)

    return tokenizer.decode(generated[0])

# Generate a continuation of a short prompt with the default settings
# (50 new tokens).
# NOTE(review): the training cell above is marked `In [None]`, i.e. it appears
# not to have been executed in the recorded run, so the sample below
# presumably comes from a model with untrained LSTM/output layers -- confirm
# by re-running the notebook top to bottom.
print(generate(model, tokenizer, prompt="Generative AI"))


Generative AIULE begging stacks dilemmascar Sessionsnorth pled exoticProject Brus Meta Productions tmparmac demonstrfordWEWEWE�north viruseshes tmp begging stacks stacks clustered begging leather705 begging OTHER tmpnorth pled exoticProject Brus Meta Productions tmparmac demonstrfordWEWEWE�
