# 🧠 Final Project: Autoregressive Language Model
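This notebook builds and trains a small autoregressive Transformer language model with PyTorch, using the Hugging Face GPT-2 tokenizer, logs the training loss to Weights & Biases, and finishes with a greedy text-generation demo. It expects a plain-text training corpus named `shakespeare.txt` in the working directory.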

In [1]:
# ✅ Install dependencies
!pip install torch transformers datasets wandb

Collecting torch
  Downloading torch-2.6.0-cp310-cp310-win_amd64.whl.metadata (28 kB)
Collecting transformers
  Downloading transformers-4.51.1-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting wandb
  Downloading wandb-0.19.9-py3-none-win_amd64.whl.metadata (10 kB)
Collecting filelock (from torch)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting networkx (from torch)
  Downloading networkx-3.4.2-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2025.3.2-py3-none-any.whl.metadata (11 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy==1.13.1->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting rege

In [2]:
# 📌 Import Libraries
import torch
import torch.nn as nn
import pandas as pd
from transformers import GPT2Tokenizer
import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 📚 Tokenizer Setup
# Load the pretrained "gpt2" tokenizer; it is used both to encode the corpus
# and to size the model's vocabulary (len(tokenizer)) further down.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [4]:
# 🧾 Load Dataset
# The corpus is read from the current working directory. If it is missing,
# re-raise the same exception type with an actionable message instead of a
# bare traceback.
corpus_path = "shakespeare.txt"
try:
    with open(corpus_path, "r", encoding="utf-8") as f:
        text = f.read()
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"Could not find {corpus_path!r} in the current working directory. "
        "Place the plain-text training corpus there and re-run this cell."
    ) from err

FileNotFoundError: [Errno 2] No such file or directory: 'shakespeare.txt'

In [None]:
# 📂 Custom Dataset
from torch.utils.data import Dataset

class TextDataset(Dataset):
    """Sliding-window next-token dataset over a single tokenized text.

    Sample ``i`` is the pair ``(input_ids[i : i + block_size],
    input_ids[i + 1 : i + block_size + 1])``, i.e. the target is the input
    shifted left by one token. Every sample has exactly ``block_size`` tokens,
    so inputs and targets always have matching shapes.

    Args:
        text: The raw training text.
        tokenizer: A callable such that ``tokenizer(text, return_tensors="pt")``
            returns a mapping whose ``"input_ids"`` entry is a ``(1, n_tokens)``
            tensor.
        block_size: Number of tokens in each input/target window.

    Raises:
        ValueError: If ``block_size`` is smaller than 1.
    """

    def __init__(self, text, tokenizer, block_size=128):
        if block_size < 1:
            raise ValueError("block_size must be a positive integer")
        # Deliberately no truncation/padding: truncation=True would cut the
        # corpus down to the tokenizer's maximum model length, and padding a
        # single sequence has no effect.
        tokens = tokenizer(text, return_tensors="pt")
        self.input_ids = tokens["input_ids"]
        self.block_size = block_size

    def __len__(self):
        # A window starting at i needs tokens up to index i + block_size for
        # its shifted target, so the last valid start is n_tokens - block_size - 1.
        return max(0, self.input_ids.size(1) - self.block_size)

    def __getitem__(self, idx):
        n_samples = len(self)
        if idx < 0:
            idx += n_samples
        if not 0 <= idx < n_samples:
            raise IndexError(f"index {idx} out of range for {n_samples} samples")
        end = idx + self.block_size
        return self.input_ids[0, idx:end], self.input_ids[0, idx + 1:end + 1]

# Instantiate the dataset from the loaded text, using the default block_size.
dataset = TextDataset(text, tokenizer)

In [None]:
# 🔧 Model Definition
class SimpleDecoderLM(nn.Module):
    """Small Transformer language model that predicts the next token.

    Token embeddings are passed through a stack of ``nn.TransformerDecoder``
    layers (the embedded sequence serves as both target and memory) and
    projected back to vocabulary logits.

    NOTE(review): no positional encoding is added to the embeddings; adding one
    would introduce new parameters and change the saved state_dict layout.

    Args:
        vocab_size: Number of rows in the embedding table and of output logits.
        d_model: Embedding / hidden width.
        nhead: Number of attention heads per layer.
        num_layers: Number of stacked decoder layers.
    """

    def __init__(self, vocab_size, d_model=256, nhead=4, num_layers=3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        decoder_layer = nn.TransformerDecoderLayer(d_model, nhead)
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers)
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, input_ids):
        """Return logits of shape ``(batch, seq_len, vocab_size)``.

        Args:
            input_ids: Integer tensor of shape ``(batch, seq_len)``.
        """
        # The decoder layers are not batch_first, so move to (seq, batch, dim).
        embedded = self.embedding(input_ids).permute(1, 0, 2)
        seq_len = embedded.size(0)
        # Additive causal mask: -inf above the diagonal, 0 elsewhere. Built
        # directly instead of instantiating a whole nn.Transformer per call.
        causal_mask = torch.triu(
            torch.full(
                (seq_len, seq_len),
                float("-inf"),
                dtype=embedded.dtype,
                device=embedded.device,
            ),
            diagonal=1,
        )
        # The memory is the same sequence, so cross-attention must be masked
        # too; otherwise each position could attend to future tokens.
        decoded = self.transformer_decoder(
            embedded,
            embedded,
            tgt_mask=causal_mask,
            memory_mask=causal_mask,
        )
        return self.fc_out(decoded.permute(1, 0, 2))

In [None]:
# ⚙️ Training Loop
from torch.utils.data import DataLoader

# Seed before model construction so weight init and shuffle order are reproducible.
torch.manual_seed(0)

model = SimpleDecoderLM(len(tokenizer)).to("cpu")
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()

# Build the loader once rather than once per epoch. Shuffling avoids feeding
# the optimizer long runs of heavily overlapping consecutive windows.
loader = DataLoader(dataset, batch_size=1, shuffle=True)
if len(loader) == 0:
    raise ValueError("dataset is empty; there is nothing to train on")

wandb.init(project="lm-final-project")

epochs = 3
try:
    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for x, y in loader:
            logits = model(x)
            # Flatten (batch, seq, vocab) -> (batch*seq, vocab) to match the
            # flattened targets expected by CrossEntropyLoss.
            loss = loss_fn(logits.reshape(-1, logits.size(-1)), y.reshape(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # Log the mean per-batch loss for this epoch.
        wandb.log({"epoch": epoch + 1, "train_loss": total_loss / len(loader)})
finally:
    # Close the run even if training is interrupted, so it is not left open.
    wandb.finish()

In [None]:
# 💾 Save the model
# Persist only the parameters (state_dict) to "simple_decoder_lm.pt"; loading
# them back requires constructing a SimpleDecoderLM with matching sizes first.
torch.save(model.state_dict(), "simple_decoder_lm.pt")

In [None]:
# 🧪 Simple Text Generation
def generate(prompt, max_new_tokens=50):
    """Greedily extend ``prompt`` and return the decoded text.

    Uses the notebook-level ``model`` and ``tokenizer``. At each step the
    highest-scoring next token is appended and the full sequence is fed back
    through the model.

    Args:
        prompt: Text to continue; must encode to at least one token.
        max_new_tokens: Number of tokens to append.

    Returns:
        The decoded prompt followed by the generated continuation.

    Raises:
        ValueError: If ``prompt`` encodes to zero tokens.
    """
    model.eval()
    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
    if input_ids.size(1) == 0:
        raise ValueError("prompt must contain at least one token")
    # Inference only: skip autograd bookkeeping so no graph is built per step.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            logits = model(input_ids)
            # keepdim=True yields shape (batch, 1), ready to concatenate along
            # the sequence dimension.
            next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
            input_ids = torch.cat([input_ids, next_token], dim=1)
    return tokenizer.decode(input_ids[0])

# Demo: continue a prompt with the default max_new_tokens; the returned string
# is displayed as the cell output.
generate("To be or not to be")

In [5]:
# NOTE(review): no other cell in this notebook references this slider or
# ipywidgets; this looks like a leftover environment check — consider removing.
import ipywidgets as widgets
widgets.IntSlider()


IntSlider(value=0)