# Assignment 4

In this assignment, you will refactor the entire code to PyTorch, making it more modular and efficient.

## Importing Libraries

In [1]:
import os
from dataclasses import dataclass
from typing import List, Tuple
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
import wandb
from utils import load_text, set_seed, configure_device

## Configuration

In [2]:
@dataclass
class MLPConfig:
    """Paths and hyperparameters shared by every cell of this notebook."""

    # Paths: the dataset file is read from root_dir + dataset_path.
    root_dir: str = os.getcwd() + "/../../"  # Evaluated once, when the class body runs
    dataset_path: str = "data/names.txt"
    device: torch.device = torch.device('cpu')  # Placeholder; overwritten with configure_device() in the Device cell

    # Tokenizer
    vocab_size: int = 0  # Placeholder; set to len(chars) once the vocabulary is built
    
    # Model
    context_size: int = 3  # Number of previous characters fed to the model
    d_embed: int = 8       # Embedding dimension per character
    d_hidden: int = 64     # Width of the hidden layer
    
    # Training
    val_size: float = 0.1   # Fraction of the names held out for validation
    batch_size: int = 32
    max_steps: int = 6000  # One pass over the training loader is 6421 batches at batch_size=32
    lr: float = 0.01
    val_interval: int = 100  # Run validation every val_interval steps
    log_interval: int = 100  # Log the training loss every log_interval steps

    seed: int = 101  # Used for set_seed() and for the train/validation split

config = MLPConfig()

## Reproducibility

In [3]:
# Seed the random number generators before any data splitting or model initialisation.
# set_seed comes from the local utils module; presumably it seeds torch as well — verify.
set_seed(config.seed)

Random seed set to 101


## Device

In [4]:
# Replace the CPU placeholder stored in the config with whatever configure_device() selects.
config.device = configure_device()

Running on cpu


## Tokenizer

In [5]:
# Vocabulary: the "." special token at index 0, followed by the lowercase letters a-z
# (code points 97..122).
chars = ["."] + [chr(code_point) for code_point in range(97, 123)]
config.vocab_size = len(chars)
# Lookup tables in both directions: character -> id and id -> character.
str2idx = dict(zip(chars, range(len(chars))))
idx2str = dict(enumerate(chars))

## Dataset

In [6]:
# Build the path with os.path.join instead of string concatenation, so the result does
# not depend on root_dir ending with a path separator. One name per line.
names = load_text(os.path.join(config.root_dir, config.dataset_path)).splitlines()

Loaded text data from /home/user/LLM101n/notebooks/Assignments/../../data/names.txt (length: 228145 characters).


## Preprocessing

In [7]:
# Train-Val Split
# The split is made over whole names (not individual samples) and is seeded for repeatability.
train_names, val_names = train_test_split(
    names,
    test_size=config.val_size,
    random_state=config.seed,
)

In [8]:
# Sanity-check the split: number of names in each partition and the first name of each.
print(f"Train Size: {len(train_names)}")
print(f"Validation Size: {len(val_names)}")
print(f"Train Example: {train_names[0]}")
print(f"Validation Example: {val_names[0]}")

Train Size: 28829
Validation Size: 3204
Train Example: keyler
Validation Example: jessamae


In [9]:
def prepare_dataset(_names):
    """
    Turn a list of names into (context, next-character) training pairs.

    Every name is terminated with "." and starts from a context of
    `config.context_size` zeros; one pair is emitted per character.

    Args:
        _names: Iterable of names to encode with `str2idx`.

    Returns:
        Tuple of two tensors: the contexts and the matching target ids.
    """
    contexts, labels = [], []

    for word in _names:
        window = [0 for _ in range(config.context_size)]

        # map() is lazy, so characters are looked up one at a time, in order.
        for token in map(str2idx.__getitem__, word + "."):
            contexts.append(window)
            labels.append(token)
            # Drop the oldest id and append the newest one (builds a new list each time).
            window = [*window[1:], token]

    return torch.tensor(contexts), torch.tensor(labels)

### Task 1: PyTorch DataLoader

We have been building the dataset as plain Python lists and then converting them to PyTorch tensors. This is not efficient, since it loads the entire dataset into memory at once.

PyTorch provides the `Dataset` and `DataLoader` classes to load data into memory on the fly. See the [PyTorch Documentation](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html).

Refactor the `prepare_dataset` function into a PyTorch `Dataset` class and use the `DataLoader` to efficiently load the data in batches.

In [10]:
# Dataset
class NamesDataset(Dataset):
    ################################################################################
    # TODO:                                                                        #
    # PyTorch Dataset requires 3 methods:                                          #
    # __init__ method to initialize the dataset                                    #
    # __len__ method to return the size of the dataset                             #
    # __getitem__ method to return a sample from the dataset                       #
    ################################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    def __init__(self, _names: List[str], context_size: int, encoder: "dict | None" = None):
        """
        Build every (context, next-character) pair for a list of names.

        Each name is terminated with "." and starts from a context made of
        `context_size` zeros; one pair is produced per character. The pairs are
        converted to tensors once, here, so that `__getitem__` is a plain index
        instead of constructing two new tensors on every access.

        Args:
            _names (List[str]): List of names
            context_size (int): Context size of the model
            encoder (dict, optional): Character -> token id mapping. When omitted,
                the notebook-level `str2idx` table is used.

        Raises:
            KeyError: If a name contains a character that is not in the encoder.
        """
        if encoder is None:
            encoder = str2idx

        inputs, targets = [], []

        for name in _names:
            context = [0] * context_size

            for char in name + ".":
                idx = encoder[char]
                inputs.append(context)
                targets.append(idx)
                context = context[1:] + [idx]  # Slide the window by one character

        self.context_size = context_size
        # The explicit reshape keeps the (num_samples, context_size) layout even when
        # there are no samples at all.
        self.inputs = torch.tensor(inputs, dtype=torch.long).reshape(len(targets), context_size)
        self.targets = torch.tensor(targets, dtype=torch.long)

    def __len__(self) -> int:
        """
        Return the number of samples in the dataset

        Returns:
            (int): Number of samples
        """

        return self.targets.size(0)
    
    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Return a sample from the dataset

        Args:
            idx (int): Index of the sample

        Returns:
            Tuple[torch.Tensor, torch.Tensor]: Input tensor of shape (context_size,)
            and scalar target tensor, both of dtype torch.long
        """
        return self.inputs[idx], self.targets[idx]
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

In [11]:
# Initialize the dataset
# Same construction for both partitions: first the training names, then the validation names.
train_dataset, val_dataset = (
    NamesDataset(split_names, config.context_size)
    for split_names in (train_names, val_names)
)

In [12]:
# Sample counts exceed the name counts because every name contributes one sample per
# character plus one for its terminating "." (see NamesDataset.__init__).
print(f"Number of Train Samples: {len(train_dataset)}")
print(f"Number of Validation Samples: {len(val_dataset)}")
# The first sample of a name has an all-zero context; the second one already contains
# the first character's id in its last position.
print(f"First train (input, target): {train_dataset[0]}")
print(f"First validation (input, target): {val_dataset[0]}")
print(f"Second train (input, target): {train_dataset[1]}")
print(f"Second validation (input, target): {val_dataset[1]}")

Number of Train Samples: 205456
Number of Validation Samples: 22690
First train (input, target): (tensor([0, 0, 0]), tensor(11))
First validation (input, target): (tensor([0, 0, 0]), tensor(10))
Second train (input, target): (tensor([ 0,  0, 11]), tensor(5))
Second validation (input, target): (tensor([ 0,  0, 10]), tensor(5))


In [13]:
# DataLoader
################################################################################
# TODO:                                                                        #
# Initialize the DataLoader for the training and validation datasets.          #
################################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
# Training batches are reshuffled on every pass; validation keeps the dataset order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=config.batch_size,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=config.batch_size,
    shuffle=False,
)
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

In [14]:
# Example batch
# Pull a single batch from a fresh iterator over the training loader to check its layout.
_x, _y = next(iter(train_loader))
print(f"Input Shape: {_x.shape}")   # (batch_size, context_size)
print(f"Target Shape: {_y.shape}")  # (batch_size)
# First sample of the batch: a context of token ids and the id of the character that follows.
print(f"Input: {_x[0]}")
print(f"Target: {_y[0]}")

Input Shape: torch.Size([32, 3])
Target Shape: torch.Size([32])
Input: tensor([14,  1, 18])
Target: 15


## Model

### Task 2: MLP Model

Initialize the weights of the model using the `Kaiming` initialization.

What are other activation functions that can be used instead of `tanh`? What are the advantages and disadvantages? Use different activation functions and compare the results.


In [19]:
class MLP(nn.Module):
    ################################################################################
    # TODO:                                                                        #
    # Define the __init__ and forward methods for the MLP model.                   #
    # Use the Kaiming initialization for the weights.                              #
    # Use other activation functions instead of tanh and compare the results.      #
    ################################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    def __init__(self, vocab_size: int, context_size: int, d_embed: int, d_hidden: int, activation_fn: str = "relu"):
        """
        Initialize the MLP model: embedding -> linear -> activation -> linear.

        Args:
            vocab_size (int): Size of the character vocabulary
            context_size (int): Size of the context window
            d_embed (int): Embedding dimension
            d_hidden (int): Hidden layer size
            activation_fn (str): Name of the activation function; one of
                "relu", "tanh", "gelu", "leaky_relu"

        Raises:
            AssertionError: If `activation_fn` is not one of the supported names.
        """
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, d_embed)
        self.linear1 = nn.Linear(context_size * d_embed, d_hidden)
        self.linear2 = nn.Linear(d_hidden, vocab_size)

        # Activation table. Each entry is (callable, nonlinearity name passed to the
        # Kaiming initializer, negative slope passed to it as `a`).
        activations = {
            "relu": (F.relu, "relu", 0.0),
            "tanh": (torch.tanh, "tanh", 0.0),
            # The initializer's gain table has no "gelu" entry and raises ValueError for
            # it; GELU behaves much like ReLU, so ReLU's gain is used instead.
            "gelu": (F.gelu, "relu", 0.0),
            # 0.01 is the default negative_slope of F.leaky_relu, so the gain now
            # matches the activation that is actually applied.
            "leaky_relu": (F.leaky_relu, "leaky_relu", 0.01),
        }

        assert activation_fn in activations, f"지원하지 않는 활성화 함수: {activation_fn}"
        self.activation_fn, gain_name, negative_slope = activations[activation_fn]

        # Kaiming initialization (weights only; biases keep nn.Linear's default init)
        nn.init.kaiming_uniform_(self.linear1.weight, a=negative_slope, nonlinearity=gain_name)
        nn.init.kaiming_uniform_(self.linear2.weight, nonlinearity="linear")

    def forward(self, x):
        """
        Forward pass.

        Args:
            x (Tensor): (batch_size, context_size) token ids

        Returns:
            Tensor: (batch_size, vocab_size) unnormalised logits
        """
        x = self.embedding(x)                    # (B, C, d_embed)
        x = x.flatten(start_dim=1)               # (B, C*d_embed); also valid for B == 0
        x = self.activation_fn(self.linear1(x))  # (B, d_hidden)
        logits = self.linear2(x)                 # (B, vocab_size)
        return logits
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

In [20]:
# Initialize the model
# The activation is left at the MLP default; .to() returns the module itself, so the
# construction and the move to the configured device can be chained.
mlp = MLP(
    vocab_size=config.vocab_size,
    context_size=config.context_size,
    d_embed=config.d_embed,
    d_hidden=config.d_hidden,
).to(config.device)
print(mlp)
n_params = sum(param.numel() for param in mlp.parameters())
print("Number of parameters:", n_params)

MLP(
  (embedding): Embedding(27, 8)
  (linear1): Linear(in_features=24, out_features=64, bias=True)
  (linear2): Linear(in_features=64, out_features=27, bias=True)
)
Number of parameters: 3571


## Training

### Task 3: Wandb Integration

[Weights and Biases](https://wandb.ai/site) is a platform to track your machine learning experiments. It is very useful to log the hyperparameters, metrics, and weights of the model. (We can't use matplotlib every time to visualize the results)

Create a free account on Wandb. Initialize the wandb run and log the hyperparameters and metrics.

**How to set up WANDB API KEY**
- Create an account on Wandb
- Go to `wandb.ai` -> `Settings` -> `API Keys` -> `Copy API Key`
- Set the API key as an environment variable `WANDB_API_KEY`
    - What is an environment variable? How to set it? Google `.env`

Note: Do not hardcode the API key in the script. Use environment variables.



In [22]:
# The API key is read from the environment; os.environ.get returns None when
# WANDB_API_KEY is unset, and that None is what gets passed to wandb.login.
wandb.login(key=os.environ.get("WANDB_API_KEY"))
# Record every hyperparameter that is varied between experiments, so that runs with a
# different context size, batch size, step budget or seed can be told apart in W&B.
wandb.init(
    project="Assignment-04",
    config={
        "context_size": config.context_size,
        "d_embed": config.d_embed,
        "d_hidden": config.d_hidden,
        "batch_size": config.batch_size,
        "max_steps": config.max_steps,
        "lr": config.lr,
        "val_size": config.val_size,
        "seed": config.seed,
    },
    dir=config.root_dir
)

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

  ········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/user/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjsshin8128[0m ([33mjsshin8128-kyung-hee-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


### Task 4: Training

Train the model. Change the hyperparameters and configurations. Log the results and analyze them.

In [23]:
def _cycle_batches(loader):
    """Yield batches from `loader` indefinitely, starting a new pass whenever one ends."""
    while True:
        saw_batch = False
        # Re-iterating the loader (rather than caching batches, as itertools.cycle would)
        # lets a shuffling DataLoader draw a new order for every pass.
        for batch in loader:
            saw_batch = True
            yield batch
        if not saw_batch:
            return  # An empty loader would otherwise make this loop spin forever


def train(
        model: nn.Module,
        train_loader: DataLoader,
        val_loader: DataLoader,
        max_steps: int,
        lr: float,
        val_interval: int,
        log_interval: int,
        device: torch.device,
):
    """
    Train the model with SGD for a fixed number of steps.

    One step is one batch. When `max_steps` exceeds the number of batches in
    `train_loader`, the loader is iterated again, so training runs for exactly
    `max_steps` steps (or for zero steps if the loader is empty). The W&B run is
    finished on exit, including when an exception is raised.

    Args:
        model (nn.Module): The model to train.
        train_loader (DataLoader): DataLoader for the training data.
        val_loader (DataLoader): DataLoader for the validation data.
        max_steps (int): Maximum number of steps to train.
        lr (float): Learning rate.
        val_interval (int): Interval for validation.
        log_interval (int): Interval for logging.
        device (torch.device): Device to run the model on.
    """
    wandb.watch(model, log="all", log_freq=log_interval)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    # zip() stops as soon as the step counter is exhausted, so no batch beyond
    # max_steps is ever fetched.
    numbered_batches = zip(range(1, max_steps + 1), _cycle_batches(train_loader))
    progress_bar = tqdm(numbered_batches, total=max_steps, desc="Training")
    running_loss = 0.0

    try:
        for step, (train_inputs, train_targets) in progress_bar:
            model.train()  # Validation below switches to eval mode, so switch back each step
            train_inputs, train_targets = train_inputs.to(device), train_targets.to(device)
            optimizer.zero_grad()
            logits = model(train_inputs)
            loss = F.cross_entropy(logits, train_targets)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            # NOTE: running_loss / step is the mean over all steps so far, not the loss
            # of the most recent batches.
            progress_bar.set_postfix(loss=f"{running_loss / step:.4f}")

            if step % val_interval == 0:
                model.eval()
                val_loss = 0.0
                total_samples = 0
                with torch.no_grad():
                    for val_inputs, val_targets in val_loader:
                        val_inputs, val_targets = val_inputs.to(device), val_targets.to(device)
                        val_logits = model(val_inputs)
                        batch_loss = F.cross_entropy(val_logits, val_targets)
                        # Weight each batch by its size so a smaller final batch does
                        # not skew the per-sample mean.
                        val_loss += batch_loss.item() * val_inputs.size(0)
                        total_samples += val_inputs.size(0)
                wandb.log({"Val Loss": val_loss / total_samples}, step=step)

            if step % log_interval == 0:
                wandb.log({"Train Loss": running_loss / step}, step=step)
    finally:
        # Always release the progress bar and close the W&B run, even on failure.
        progress_bar.close()
        wandb.finish()

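Note: Unfortunately, a PyTorch `DataLoader` is not infinite: one pass over `train_loader` yields 6421 batches (205456 samples at `batch_size=32`), so a loop that iterates over the loader only once stops after at most 6421 steps, whatever `max_steps` is.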

In [24]:
# Run training with the hyperparameters from the config.
# train() calls wandb.finish() before it returns; presumably the wandb.init cell has to
# be re-run before this cell is executed again — verify.
train(
    model=mlp,
    train_loader=train_loader,
    val_loader=val_loader,
    max_steps=config.max_steps,
    lr=config.lr,
    val_interval=config.val_interval,
    log_interval=config.log_interval,
    device=config.device
)

Training: 100%|██████████████████████████████████████████| 6000/6000 [00:48<00:00, 122.67it/s, loss=2.5920]


0,1
Train Loss,█▇▇▆▆▅▅▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁
Val Loss,█▆▅▅▅▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁

0,1
Train Loss,2.59205
Val Loss,2.45502


In [None]:
################################################################################
# TODO:                                                                        #
# Analyze the results                                                          #
# What hyperparameters worked well? What activation did you use? etc.          #
################################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

# Activation function: ReLU
# - The most basic activation function; it is cheap to compute and works well early in training
# - In the actual experiment it gave a stable learning curve and converged without overfitting
# - Worth comparing against GELU or Leaky ReLU in later experiments

# Hyperparameters used
# - Embedding dimension (d_embed): 8
# - Hidden layer size (d_hidden): 64
# - Learning rate (lr): 0.01
# - Context size (context_size): 3
# - Batch size (batch_size): 32
# - Maximum number of training steps (max_steps): 6000

# Results
# - Final Train Loss: about 2.59
# - Final Validation Loss: about 2.45
# - The learning curve decreases steadily and converges without overfitting
# NOTE(review): train() logs "Train Loss" as running_loss / step, i.e. the mean over
#   every step since the start of training, while "Val Loss" is measured with the
#   current weights. The two numbers are therefore not directly comparable, and the
#   gap between them says little about overfitting.

# Analysis
# - Shows fairly stable convergence for such a small model
# - context_size is limited to 3, which makes it hard for the model to learn longer-range relationships -> could later be extended to an RNN/Transformer architecture
# - Activation functions other than ReLU still need to be tried (e.g. GELU's smooth curve may help generalization more)
# - A learning rate of 0.01 worked quite well, but too large a value risks divergence, so a sweep is recommended

# Ideas for future experiments
# - Increase d_embed and d_hidden to raise the model capacity
# - Increase context_size to 4-5 to take a longer context into account
# - Switch the optimizer to Adam and tune the learning rate
# - Compare the loss and the quality of generated samples across activation functions
# - Automate the hyperparameter search with a Wandb sweep
#
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

## Inference

In [25]:
def generate_name(model: nn.Module, context_size: int, decoder: dict, end_id: int, device: torch.device,
                  max_length: "int | None" = None) -> str:
    """
    Generate a name by sampling one character at a time from the model.

    Sampling starts from a context filled with `end_id` and stops once the end
    token has been sampled; the end token's character is part of the returned
    string. Gradients are disabled while sampling.

    Args:
        model (nn.Module): Model to generate the name.
        context_size (int): Context size of the model.
        decoder (dict): Decoder dictionary to convert indices to characters.
        end_id (int): End token id.
        device (torch.device): Device to run the model on
        max_length (int, optional): Upper bound on the number of sampled
            characters. None (the default) means no bound, so generation only
            stops when the end token is sampled.

    Returns:
        (str): Generated name
    """
    new_name = []
    context = [end_id] * context_size

    # Sampling needs no gradients; disabling autograd avoids building a graph per character.
    with torch.no_grad():
        while max_length is None or len(new_name) < max_length:
            x = torch.tensor(context).unsqueeze(0).to(device)  # (1, context_size)
            logits = model(x)
            probs = F.softmax(logits, dim=-1)
            idx = torch.multinomial(probs, num_samples=1).item()
            new_name.append(decoder[idx])
            context = context[1:] + [idx]  # Slide the window by one character
            if idx == end_id:
                break

    return "".join(new_name)

In [26]:
# Sample five names from the trained model. Each printed name ends with "." because
# generate_name appends the end token's character before it stops.
for _ in range(5):
    print(generate_name(
        model=mlp,
        context_size=config.context_size,
        decoder=idx2str,
        end_id=str2idx["."],
        device=config.device
    ))

aba.
melz.
jojahe.
qella.
n.
