# Assignment 4

In this assignment, you will refactor the entire code to PyTorch, making it more modular and efficient.

## Importing Libraries

In [1]:
import os
from dataclasses import dataclass
from typing import List, Tuple
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import wandb
from utils import load_text, set_seed, configure_device

## Configuration

In [2]:
@dataclass
class MLPConfig:
    root_dir: str = os.getcwd() + "/../../"
    dataset_path: str = "data/names.txt"
    device: torch.device = torch.device('cpu')  # Automatic device configuration

    # Tokenizer
    vocab_size: int = 0  # Set later
    
    # Model
    context_size: int = 3
    d_embed: int = 8
    d_hidden: int = 64
    
    # Training
    val_size: float = 0.1
    batch_size: int = 32
    max_steps: int = 6000  # Max of max_steps = 6421
    lr: float = 0.01
    val_interval: int = 100
    log_interval: int = 100

    seed: int = 101

config = MLPConfig()

## Reproducibility

In [3]:
set_seed(config.seed)

Random seed set to 101


## Device

In [4]:
config.device = configure_device()

Running on mps


## Tokenizer

In [5]:
chars = [chr(i) for i in range(97, 123)]  # all alphabet characters
chars.insert(0, ".")  # Add special token
config.vocab_size = len(chars)
str2idx = {char: idx for idx, char in enumerate(chars)}
idx2str = {idx: char for char, idx in str2idx.items()}

## Dataset

In [6]:
names = load_text(config.root_dir + config.dataset_path).splitlines()

Loaded text data from /Users/jung-yoonsuh/Desktop/KHUDA/LLM101n/notebooks/Assignments/../../data/names.txt (length: 228145 characters).


## Preprocessing

In [7]:
# Train-Val Split
train_names, val_names = train_test_split(names, test_size=config.val_size, random_state=config.seed)

In [8]:
print(f"Train Size: {len(train_names)}")
print(f"Validation Size: {len(val_names)}")
print(f"Train Example: {train_names[0]}")
print(f"Validation Example: {val_names[0]}")

Train Size: 28829
Validation Size: 3204
Train Example: keyler
Validation Example: jessamae


In [9]:
def prepare_dataset(_names):
    _inputs, _targets = [], []

    for name in _names:
        context = [0] * config.context_size

        for char in name + ".":
            idx = str2idx[char]
            _inputs.append(context)
            _targets.append(idx)
            context = context[1:] + [idx]  # Shift the context by 1 character

    _inputs = torch.tensor(_inputs)
    _targets = torch.tensor(_targets)

    return _inputs, _targets

### Task 1: PyTorch DataLoader

We have been using plain Python lists to and then converted them to PyTorch tensors. This is not efficient since it is loading the entire dataset into memory.

PyTorch provides `Dataset` and `DataLoader` class to load the data in memory on the fly. [PyTorch Documentation](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html)

Refactor the `prepare_dataset` function into a PyTorch `Dataset` class and use the `DataLoader` to efficiently load the data in batches.

In [13]:
# Dataset
class NamesDataset(Dataset):
    ################################################################################
    # TODO:                                                                        #
    # PyTorch Dataset requires 3 methods:                                          #
    # __init__ method to initialize the dataset                                    #
    # __len__ method to return the size of the dataset                             #
    # __getitem__ method to return a sample from the dataset                       #
    ################################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    def __init__(self, _names: List[str], context_size: int):
        """
        Initialize the dataset
        
        Args:
            _names (List[str]): List of names
            context_size (int): Context size of the model
        """
        super().__init__()
        self.context_size = context_size
        self.names = _names
        
    def __len__(self) -> int:
        """
        Return the number of samples in the dataset

        Returns:
            (int): Number of samples
        """
        length = len(self.names)
        return length
    
    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Return a sample from the dataset

        Args:
            idx (int): Index of the sample

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: Input and target tensors
        """
        name = self.names[idx]
        context = [0] * self.context_size
        inputs, targets = [], []
        
        for char in name + ".":
            idx = str2idx[char]
            inputs.append(context)
            targets.append(idx)
            context = context[1:] + [idx]
        
        input_ids = torch.tensor(inputs)
        target_id = torch.tensor(targets)
        
        return input_ids, target_id
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

In [14]:
# Initialize the dataset
train_dataset = NamesDataset(train_names, config.context_size)
val_dataset = NamesDataset(val_names, config.context_size)

In [16]:
print(f"Number of Train Samples: {len(train_dataset)}")
print(f"Number of Validation Samples: {len(val_dataset)}")
print(f"First train (input, target): {train_dataset[0]}")
print(f"First validation (input, target): {val_dataset[0]}")
print(f"Second train (input, target): {train_dataset[1]}")
print(f"Second validation (input, target): {val_dataset[1]}")

Number of Train Samples: 28829
Number of Validation Samples: 3204
First train (input, target): (tensor([[ 0,  0,  0],
        [ 0,  0, 11],
        [ 0, 11,  5],
        [11,  5, 25],
        [ 5, 25, 12],
        [25, 12,  5],
        [12,  5, 18]]), tensor([11,  5, 25, 12,  5, 18,  0]))
First validation (input, target): (tensor([[ 0,  0,  0],
        [ 0,  0, 10],
        [ 0, 10,  5],
        [10,  5, 19],
        [ 5, 19, 19],
        [19, 19,  1],
        [19,  1, 13],
        [ 1, 13,  1],
        [13,  1,  5]]), tensor([10,  5, 19, 19,  1, 13,  1,  5,  0]))
Second train (input, target): (tensor([[ 0,  0,  0],
        [ 0,  0, 20],
        [ 0, 20,  9],
        [20,  9, 20],
        [ 9, 20, 21],
        [20, 21, 19]]), tensor([20,  9, 20, 21, 19,  0]))
Second validation (input, target): (tensor([[ 0,  0,  0],
        [ 0,  0,  8],
        [ 0,  8,  1],
        [ 8,  1, 25],
        [ 1, 25,  7],
        [25,  7,  5],
        [ 7,  5, 14]]), tensor([ 8,  1, 25,  7,  5, 14,  0]))


In [19]:
# DataLoader
################################################################################
# TODO:                                                                        #
# Initialize the DataLoader for the training and validation datasets.          #
################################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=config.batch_size, shuffle=False)

# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

In [20]:
# Example batch
_x, _y = next(iter(train_loader))
print(f"Input Shape: {_x.shape}")   # (batch_size, context_size)
print(f"Target Shape: {_y.shape}")  # (batch_size)
print(f"Input: {_x[0]}")
print(f"Target: {_y[0]}")

RuntimeError: stack expects each tensor to be equal size, but got [5, 3] at entry 0 and [7, 3] at entry 1

## Model

### Task 2: MLP Model

Initialize the weights of the model using the `Kaiming` initialization.

What are other activation functions that can be used instead of `tanh`? What are the advantages and disadvantages? Use different activation functions and compare the results.


In [21]:
class MLP(nn.Module):
    def __init__(self, vocab_size, context_size, d_embed, d_hidden):
        super().__init__()
        self.C = nn.Parameter(torch.randn(vocab_size, d_embed))
        self.W1 = nn.Parameter(torch.randn(context_size * d_embed, d_hidden))
        self.b1 = nn.Parameter(torch.randn(d_hidden))
        self.W2 = nn.Parameter(torch.randn(d_hidden, vocab_size))
        self.b2 = nn.Parameter(torch.randn(vocab_size))
        
    def forward(self, x):
        x_embed = self.C[x]
        x = x_embed.view(x.size(0), -1)
        
        h = F.relu(x @ self.W1 + self.b1)
        
        logits = torch.matmul(h, self.W2) + self.b2
        return logits


In [22]:
# Initialize the model
mlp = MLP(config.vocab_size, config.context_size, config.d_embed, config.d_hidden)
mlp.to(config.device) # Move the model to the device
print(mlp)
print("Number of parameters:", sum(p.numel() for p in mlp.parameters()))

MLP()
Number of parameters: 3571


## Training

### Task 3: Wandb Integration

[Weights and Biases](https://wandb.ai/site) is a platform to track your machine learning experiments. It is very useful to log the hyperparameters, metrics, and weights of the model. (We can't use matplotlib every time to visualize the results)

Create a free account on Wandb. Initialize the wandb run and log the hyperparameters and metrics.

**How to set up WANDB API KEY**
- Create an account on Wandb
- Go to `wandb.ai` -> `Settings` -> `API Keys` -> `Copy API Key`
- Set the API key as an environment variable `WANDB_API_KEY`
    - What is an environment variable? How to set it? Google `.env`

Note: Do not hardcode the API key in the script. Use environment variables.



In [26]:
wandb.login(key=os.environ.get("aa3f6196bb8c5303ba66c251386d5495551ea13e"))
wandb.init(
    project="Assignment-04",
    config={
        "d_embed": config.d_embed,
        "d_hidden": config.d_hidden,
        "lr": config.lr,
    },
    dir=config.root_dir
)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/jung-yoonsuh/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myoonsuh0615[0m ([33myoonsuh0615-kyung-hee-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


### Task 4: Training

Train the model. Change the hyperparameters and configurations. Log the results and analyze it.

In [27]:
def train(
        model: nn.Module,
        train_loader: DataLoader,
        val_loader: DataLoader,
        max_steps: int,
        lr: float,
        val_interval: int,
        log_interval: int,
        device: torch.device,
):
    """
    Train the model for a fixed number of steps.

    Args:
        model (nn.Module): The model to train.
        train_loader (DataLoader): DataLoader for the training data.
        val_loader (DataLoader): DataLoader for the validation data.
        max_steps (int): Maximum number of steps to train.
        lr (float): Learning rate.
        val_interval (int): Interval for validation.
        log_interval (int): Interval for logging.
        device (torch.device): Device to run the model on.
    """
    wandb.watch(model, log="all", log_freq=log_interval)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    progress_bar = tqdm(enumerate(train_loader, start=1), total=max_steps, desc="Training")
    running_loss = 0.0

    for step, (train_inputs, train_targets) in progress_bar:
        if step > max_steps:
            break
        model.train()
        train_inputs, train_targets = train_inputs.to(device), train_targets.to(device)
        optimizer.zero_grad()
        logits = model(train_inputs)
        loss = F.cross_entropy(logits, train_targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        progress_bar.set_postfix(loss=f"{running_loss / step:.4f}")

        if step % val_interval == 0:
            model.eval()
            val_loss = 0.0
            total_samples = 0
            with torch.no_grad():
                for val_inputs, val_targets in val_loader:
                    val_inputs, val_targets = val_inputs.to(device), val_targets.to(device)
                    val_logits = model(val_inputs)
                    batch_loss = F.cross_entropy(val_logits, val_targets)
                    val_loss += batch_loss.item() * val_inputs.size(0)
                    total_samples += val_inputs.size(0)
            wandb.log({"Val Loss": val_loss / total_samples}, step=step)

        if step % log_interval == 0:
            wandb.log({"Train Loss": running_loss / step}, step=step)

    progress_bar.close()
    wandb.finish()

Note: Unfortunatley PyTorch does not support infinite DataLoader. The train will stop when it reaches the end of the DataLoader. (max_steps=6421)

In [28]:
train(
    model=mlp,
    train_loader=train_loader,
    val_loader=val_loader,
    max_steps=config.max_steps,
    lr=config.lr,
    val_interval=config.val_interval,
    log_interval=config.log_interval,
    device=config.device
)

Training:   0%|          | 0/6000 [00:00<?, ?it/s]


RuntimeError: stack expects each tensor to be equal size, but got [8, 3] at entry 0 and [6, 3] at entry 2

In [None]:
################################################################################
# TODO:                                                                        #
# Analyze the results                                                          #
# What hyperparameters worked well? What activation did you use? etc.          #
################################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
#
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

## Inference

In [None]:
def generate_name(model: nn.Module, context_size: int, decoder: dict, end_id: int, device: torch.device) -> str:
    """
    Generate a name using the model.

    Args:
        model (nn.Module): Model to generate the name.
        context_size (int): Context size of the model.
        decoder (dict): Decoder dictionary to convert indices to characters.
        end_id (int): End token id.
        device (torch.device): Device to run the model on

    Returns:
        (str): Generated name
    """
    new_name = []
    context = [end_id] * context_size

    while True:
        x = torch.tensor(context).unsqueeze(0).to(device)
        logits = model(x)
        probs = F.softmax(logits, dim=-1)
        idx = torch.multinomial(probs, num_samples=1).item()
        new_name.append(decoder[idx])
        context = context[1:] + [idx]
        if idx == end_id:
            break

    return "".join(new_name)

In [None]:
for _ in range(5):
    print(generate_name(
        model=mlp,
        context_size=config.context_size,
        decoder=idx2str,
        end_id=str2idx["."],
        device=config.device
    ))