# Assignment 2

In this assignment, you will continue working with the Bigram Language Model from the lecture. Implement the training loop and the inference procedure for the model.

## Importing Libraries

In [1]:
import os
import math
from dataclasses import dataclass
import torch
from torch.nn import functional as F

import os
import random
import numpy as np
import torch


def set_seed(seed: int):
    """
    Seed every random number generator this notebook relies on.

    Exports ``PYTHONHASHSEED``, seeds Python's ``random`` module, NumPy,
    and PyTorch (through ``torch.manual_seed`` and
    ``torch.cuda.manual_seed``), then puts cuDNN into deterministic mode
    with benchmarking switched off. A confirmation line is printed.

    Args:
        seed (int): The seed value handed to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Each library keeps its own generator state, so seed them one by one.
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    ):
        seed_fn(seed)

    # Favor repeatable cuDNN behavior over autotuned kernels.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

    print(f"Random seed set to {seed}")


def configure_device() -> torch.device:
    """
    Select the compute device, preferring CUDA, then MPS, then CPU.

    The chosen device is reported on stdout before it is returned.

    Returns:
        torch.device: The selected device.
    """
    if torch.cuda.is_available():
        gpu_count = torch.cuda.device_count()
        print(f"Running on {gpu_count} {torch.cuda.get_device_name()} GPU(s)")
        return torch.device("cuda")

    # The two non-CUDA backends are reported with the same message format.
    backend = "mps" if torch.backends.mps.is_available() else "cpu"
    device = torch.device(backend)
    print(f"Running on {device}")
    return device


def load_text(file_path: str, encoding: str = 'utf-8') -> str:
    """
    Read a whole text file into a single string.

    A short status line is printed on success; on failure the error
    message is printed before the exception is raised.

    Args:
        file_path (str): Path to the text file.
        encoding (str, optional): File encoding. Defaults to 'utf-8'.

    Returns:
        str: The full content of the file.

    Raises:
        FileNotFoundError: If ``file_path`` is not an existing regular file.
    """
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding=encoding) as handle:
            content = handle.read()
        print(f"Loaded text data from {file_path} (length: {len(content)} characters).")
        return content

    # Same message goes to stdout and into the exception.
    missing_msg = f"File not found: {file_path}"
    print(missing_msg)
    raise FileNotFoundError(missing_msg)


## Configuration

In [None]:
@dataclass
class BigramConfig:
    """
    Project settings gathered into a single object.

    Attributes are plain defaults that later cells read and, in the case
    of ``vocab_size``, overwrite once the tokenizer has been built.
    """
    # Working directory captured when the class is defined; always ends in "/".
    root_dir: str = os.getcwd() + "/"
    # Relative to root_dir. No leading "/" because root_dir already ends with
    # one: `root_dir + dataset_path` then has no doubled separator, and
    # os.path.join(root_dir, dataset_path) does not treat it as absolute.
    dataset_path: str = "names.txt"

    # Tokenizer
    vocab_size: int = 0  # Set later

    seed: int = 101


config = BigramConfig()

## Reproducibility

In [3]:
# Seed all RNGs with the configured value before any random tensors are created.
set_seed(config.seed)

Random seed set to 101


## Dataset

In [None]:
names = load_text(config.root_dir + config.dataset_path).splitlines()
# Split the loaded text data into individual lines

Loaded text data from /Users/jimni/Downloads//names.txt (length: 228145 characters).


## Preprocessing

In [5]:
# Add special token: wrap every name in "." so the same symbol marks both
# where a name starts and where it ends.
names = ["." + name + "." for name in names]

## Tokenizer

In [None]:
# Vocabulary: the special token "." at index 0, followed by 'a' through 'z'.
chars = ["."] + [chr(code) for code in range(ord("a"), ord("z") + 1)]
config.vocab_size = len(chars)
# Lookup tables between characters and their integer indices.
idx2str = dict(enumerate(chars))  # index -> character
str2idx = {char: idx for idx, char in idx2str.items()}  # character -> index

## Model

In [7]:
# Initialize weights
# W is a (vocab_size, vocab_size) matrix and b a (vocab_size,) vector, both
# drawn with torch.randn and flagged so autograd tracks them.
W = torch.randn(config.vocab_size, config.vocab_size, requires_grad=True)
b = torch.randn(config.vocab_size, requires_grad=True)
params = [W, b]  # the tensors the weight-update step iterates over

## Training

#### Task 1: Train Bigram Language Model (Neural Network Approach)

Implement the training loop for the Bigram Language Model.

In [8]:
# Set of Input, Target pairs
# Every adjacent character pair (char1, char2) inside a name yields one
# training example: the index of char1 is the input, the index of char2 the
# target.
inputs, targets = [], []
for name in names:
    for char1, char2 in zip(name, name[1:]):
        # Append the indices directly rather than binding a temporary called
        # `input`, which would shadow the builtin for the rest of the session.
        inputs.append(str2idx[char1])
        targets.append(str2idx[char2])

# Convert to tensor
inputs = torch.tensor(inputs, dtype=torch.long)
targets = torch.tensor(targets, dtype=torch.long)

In [9]:
# Sanity check: report how many pairs were built, the tensor shapes, and the
# first two (input, target) index pairs.
print(f"Number of Input, Target pairs: {len(inputs)}")
print(f"Input shape: {inputs.shape}")
print(f"Target shape: {targets.shape}")
print(f"First (Input, Target): ({inputs[0]}, {targets[0]})")
print(f"Second (Input, Target): ({inputs[1]}, {targets[1]})")

Number of Input, Target pairs: 228146
Input shape: torch.Size([228146])
Target shape: torch.Size([228146])
First (Input, Target): (0, 5)
Second (Input, Target): (5, 13)


In [None]:
################################################################################
# TODO:                                                                        #
# One-hot encode the input tensor.                                             #
################################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
# One-hot encoding turns categorical data into binary vectors that a model can
# consume: each category is assigned a unique index, and its vector holds a 1
# at that index and 0 everywhere else.
inputs_encoded = F.one_hot(inputs, num_classes=config.vocab_size)
# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

# Convert data type to float (F.one_hot yields an integer tensor, while the
# weights it is multiplied with are floating point)
inputs_encoded = inputs_encoded.float()

In [None]:
# Training Loop (full-batch gradient descent on the bigram network)
steps = 100
lr = 10

# Row indices used to pick each example's target log-probability. They are
# identical on every step, so build them once instead of once per iteration.
row_idx = torch.arange(len(targets))

for step in range(1, steps + 1):
    # Forward pass
    ################################################################################
    # TODO:                                                                        #
    # Implement the forward pass.                                                  #
    ################################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # Linearly transform the one-hot inputs with weight matrix W and bias b,
    # then apply softmax along dim 1 to turn each row of logits into a
    # probability distribution over the vocabulary.
    logits = inputs_encoded @ W + b
    probs = F.softmax(logits, dim=1)
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    # loss
    log_probs = torch.log(probs + 1e-9)  # Add small value to prevent log(0)
    # Negative log-likelihood: take the log-probability assigned to the correct
    # target index of every example, negate it, and average over all examples.
    loss = -log_probs[row_idx, targets].mean()

    # Backward pass
    ################################################################################
    # TODO:                                                                        #
    # Implement the backward pass.                                                 #
    ################################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # Let PyTorch autograd compute d(loss)/d(param) for every parameter (W, b).
    loss.backward()
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    # Update weights
    ################################################################################
    # TODO:                                                                        #
    # Update the weights of the model using the gradients.                         #
    ################################################################################
    # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
    # The update itself must not be recorded by autograd; gradients are cleared
    # afterwards so the next backward() starts from zero instead of accumulating.
    with torch.no_grad():
        for param in params:
            param -= lr * param.grad
            param.grad.zero_()
    # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    if step % 10 == 0:
        print(f"Step {step}, Loss {loss.item():.4f}")

Step 10, Loss 2.8777
Step 20, Loss 2.7156
Step 30, Loss 2.6413
Step 40, Loss 2.5989
Step 50, Loss 2.5718
Step 60, Loss 2.5532
Step 70, Loss 2.5397
Step 80, Loss 2.5294
Step 90, Loss 2.5214
Step 100, Loss 2.5150


## Inference

#### Task 2: Generate a Name

Create a function to generate a name using the trained Bigram Language Model.

In [12]:
# Create a function to generate a name
@torch.no_grad()  # inference only: do not build an autograd graph through W and b
def generate_name():
    """
    Sample one name from the trained bigram model.

    Starting from the special token ".", the model's next-character
    distribution for the current character is computed, one character is
    sampled from it, and the process repeats from the sampled character
    until "." is sampled again.

    Returns:
        str: The sampled characters joined together, without the "."
        markers. May be empty if "." is sampled on the first step.
    """
    new_name = []
    start_idx = str2idx["."]

    while True:
        ################################################################################
        # TODO:                                                                        #
        # 1. Forward pass                                                              #
        # 2. Sample the next token                                                     #
        # 3. Decode the token                                                          #
        # 4. Update the start_idx                                                      #
        # 5. Break if the next character is "."                                        #
        ################################################################################
        # *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****
        # One-hot encode the current token
        x = F.one_hot(torch.tensor([start_idx]), num_classes=config.vocab_size).float()

        # Compute logits and obtain probabilities
        logits = x @ W + b
        probs = F.softmax(logits, dim=1)

        # Sample the next token using multinomial sampling
        next_idx = torch.multinomial(probs, num_samples=1).item()

        # Decode the token back to its character representation
        next_char = idx2str[next_idx]

        # Update start_idx to the sampled token for the next iteration
        start_idx = next_idx

        # If the sampled token is the terminal marker, break the loop;
        # Otherwise, append it to the new_name list.
        if next_char == ".":
            break
        new_name.append(next_char)
        # *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

    return ''.join(new_name)

# Generate 5 names (each call samples a fresh name from the trained model)
for _ in range(5):
    print(generate_name())


in
am
kasskve
aaion
are


## Extra Credit

We have already made our own custom auto-grad Tensor class. Let's use it!

Train the Bigram Language Model using our custom auto-grad Tensor class.

**Do not use any built-in PyTorch functions.** (other deep learning libraries are also prohibited)

In [13]:
class Tensor:
    """
    Scalar value that records the operations applied to it so gradients
    can be computed by reverse-mode automatic differentiation.

    Attributes:
        data: The wrapped numeric value.
        gradient: d(root)/d(self), filled in by ``backward()`` on the root.
        _prev: The set of Tensors this one was computed from.
        _operation: Label of the operation that produced this Tensor
            ('' for leaves); kept for debugging.
        _backward: Closure that pushes this node's gradient to its inputs.

    Operators accept either Tensors or plain Python numbers; numbers are
    wrapped as constant Tensors.
    """

    def __init__(self, data, _children=(), _operation=''):
        self.data = data
        self._prev = set(_children)
        self._operation = _operation
        self.gradient = 0
        self._backward = lambda: None

    def __repr__(self):
        return f"tensor=({self.data})"

    @staticmethod
    def _wrap(value):
        """Return ``value`` unchanged if it is a Tensor, else wrap it."""
        return value if isinstance(value, Tensor) else Tensor(value)

    def __add__(self, other):  # self + other
        other = Tensor._wrap(other)
        output = Tensor(self.data + other.data, (self, other), '+')
        def _backward():
            # Accumulate (+=): a node feeding several consumers, or used as
            # both operands (a + a), must sum every contribution.
            self.gradient += 1 * output.gradient
            other.gradient += 1 * output.gradient
        output._backward = _backward
        return output

    def __radd__(self, other):  # other + self, with a non-Tensor on the left
        return self + other

    def __mul__(self, other):  # self * other
        other = Tensor._wrap(other)
        output = Tensor(self.data * other.data, (self, other), '*')
        def _backward():
            self.gradient += other.data * output.gradient
            other.gradient += self.data * output.gradient
        output._backward = _backward
        return output

    def __rmul__(self, other):  # other * self, with a non-Tensor on the left
        return self * other

    def tanh(self):  # tanh(self)
        output = Tensor(math.tanh(self.data), (self,), 'tanh')
        def _backward():
            # d/dx tanh(x) = 1 - tanh(x)^2; output.data already holds tanh(x).
            self.gradient += (1.0 - output.data ** 2) * output.gradient
        output._backward = _backward
        return output

    def __pow__(self, power):  # self ** power
        assert isinstance(power, (int, float)), "Power must be an int or a float"
        output = Tensor(self.data ** power, (self,), f'**{power}')
        def _backward():
            self.gradient += power * (self.data ** (power - 1)) * output.gradient
        output._backward = _backward
        return output

    def backward(self):
        """
        Compute d(self)/d(node) for every node in the graph below ``self``.

        Gradients of all nodes in this graph are reset to 0 first, so
        calling ``backward()`` again on the same graph gives the same
        values instead of adding to the previous ones.
        """
        # Iterative post-order DFS. A recursive walk hits Python's recursion
        # limit on long chains such as a loss summed over many examples.
        topo = []
        visited = {self}
        stack = [(self, iter(self._prev))]
        while stack:
            node, children = stack[-1]
            for child in children:
                if child not in visited:
                    visited.add(child)
                    stack.append((child, iter(child._prev)))
                    break
            else:
                # All inputs of `node` are already emitted.
                topo.append(node)
                stack.pop()

        for node in topo:
            node.gradient = 0
        self.gradient = 1
        # Reverse topological order: each node's gradient is complete before
        # it is propagated to its inputs.
        for node in reversed(topo):
            node._backward()

    def __neg__(self): # -self
        return self * Tensor(-1.0)

    def __sub__(self, other): # self - other
        return self + (-Tensor._wrap(other))

    def __rsub__(self, other): # other - self, with a non-Tensor on the left
        return Tensor._wrap(other) - self

In [14]:
################################################################################
# TODO:                                                                        #
################################################################################
# *****START OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****

# *****END OF YOUR CODE (DO NOT DELETE/MODIFY THIS LINE)*****