In [26]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [27]:
batch_size = 32 # 한 번에 병렬로 처리할 시퀀스의 수
block_size = 128 # 예측을 위한 최대 컨텍스트 길이
max_iters = 1000
eval_interval = 200
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 100
n_embd = 128
n_head = 4
n_layer = 4
dropout = 0.2

In [28]:
torch.manual_seed(42)

<torch._C.Generator at 0x187f3e09510>

In [20]:
# 임베딩
# 데이터를 처음 조직하는 방법은 반드시 알고 있어야 함
# BE -> N+1 문제 -> 왜 생기는가? -> JPA
# DATA -> Size, Len
class DataHandler:
    '''
    데이터 로드해야 함
    문자 -> 숫자
    숫자 -> 문자
    '''
    def __init__(self, block_size, batch_size):
        self.block_size = block_size
        self.batch_size = batch_size
        # 파일 읽어오기
        with open("data/tiny_shakespeare.txt") as f:
            self.text = f.read()

        # 고유 문자 집합 및 매핑
        self.chars = sorted(list(set(self.text)))
        self.vocab_size = len(self.chars)
        self.stoi = {ch:i for i, ch in enumerate(self.chars)}
        self.itos = {i:ch for i, ch in enumerate(self.chars)}

        # 데이터 분할(학습, 검증)
        data = torch.tensor(self.encoder(self.text), dtype=torch.long)
        n = int(0.9 * len(data))
        self.train_data = data[:n]
        self.val_data = data[n:]

    def encoder(self, s):
        # 문자 > 숫자
        return [self.stoi[c] for c in s]

    def decoder(self, l):
        # 숫자 > 문자
        return "".join([self.itos[i] for i in l])

    def get_batch(self, split):
        data = self.train_data if split == "train" else self.val_data
        ix = torch.randint(len(data) - self.block_size, (self.batch_size))
        x = torch.stack([data[i:i+self.block_size] for i in ix])
        y = torch.stack([data[i+1:i+self.block_size] for i in ix])
        return x, y

In [29]:
class DataHandler:
    def __init__(self, data_path, block_size, batch_size, device):
        self.block_size = block_size
        self.batch_size = batch_size
        self.device = device
        
        with open(data_path, 'r', encoding='utf-8') as f:
            self.text = f.read()

        self.chars = sorted(list(set(self.text)))
        self.vocab_size = len(self.chars)
        self.stoi = { ch:i for i,ch in enumerate(self.chars) }
        self.itos = { i:ch for i,ch in enumerate(self.chars) }
        
        data = torch.tensor(self.encode(self.text), dtype=torch.long)
        n = int(0.9*len(data))
        self.train_data = data[:n]
        self.val_data = data[n:]

    def encode(self, s):
        return [self.stoi[c] for c in s]

    def decode(self, l):
        return ''.join([self.itos[i] for i in l])

    def get_batch(self, split):
        data = self.train_data if split == 'train' else self.val_data
        ix = torch.randint(len(data) - self.block_size, (self.batch_size,))
        x = torch.stack([data[i:i+self.block_size] for i in ix])
        y = torch.stack([data[i+1:i+self.block_size+1] for i in ix])
        x, y = x.to(self.device), y.to(self.device)
        return x, y

## 셀프 어텐션 메커니즘

### 같은 시퀸스 내에 단어들 간의 관계

> 기존: 인코더와 디코더의 사이 관계

> 셀프: 같은 시퀸스

- 문장에서 단어의 의미는 주변에 따라 결정
- 셀프 어텐션 계산
    1. 입력 준비(각 단어가 벡터)
    2. 유사도 계산
    3. 가중치 계산
    4. 문맥 벡터 생성

- 훈련 가능한 어텐션 메커니즘: 점곱 어텐션
    - Q: "무엇을 찾고 있어요?" - 현재 단어가 찾는 것
    - K: "무엇을 제공할 수 있어요?"
    - V: "실제로 사용할 값"

In [None]:
n_embd = 128
dropout = 0.1
block_size= 128
class Head(nn.Module):
    # Self-Attention의 단일 헤드(one head)
    def __init__(self, head_size):
        super(Head, self).__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        '''...'''

In [30]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        
        wei = q @ k.transpose(-2, -1) * C**-0.5
        
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        
        v = self.value(x)
        out = wei @ v # (B, T, T) @ (B, T, hs) -> (B, T, hs)
        return out

In [31]:
class MultiHeadAttention(nn.Module):

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        out = self.dropout(out)
        return out

In [32]:
class FeedFoward(nn.Module):
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [33]:
class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [34]:
class GPTLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # 1. 토큰 임베딩 테이블: 각 문자를 벡터로 변환
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        # 2. 포지션 임베딩 테이블: 위치 정보를 벡터로 변환
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        # 3. 트랜스포머 블록들
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        # 4. 최종 Layer Normalization
        self.ln_f = nn.LayerNorm(n_embd)
        # 5. 최종 선형 계층 (Logits 생성)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [35]:
@torch.no_grad()
def estimate_loss(model, data_handler):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = data_handler.get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [36]:
def train_model(model, data_handler):
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    
    for iter in range(max_iters):
        if iter % eval_interval == 0 or iter == max_iters - 1:
            losses = estimate_loss(model, data_handler)
            print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        xb, yb = data_handler.get_batch('train')
        logits, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
    print("학습 완료. (Training finished.)")

In [37]:
def main():
    print(f"Using device: {device}")
    
    # 1. 데이터 준비
    data_handler = DataHandler('./data/tiny_shakespeare.txt', block_size, batch_size, device)
    print(f"Vocab size: {data_handler.vocab_size}")
    
    # 2. 모델 초기화
    model = GPTLanguageModel(data_handler.vocab_size)
    m = model.to(device)
    print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

    # 3. 학습 수행
    train_model(m, data_handler)
    
    # 4. 생성 (Generation)
    print("생성된 텍스트 샘플 (Generated sample):")
    context = torch.zeros((1, 1), dtype=torch.long, device=device)
    generated_indices = m.generate(context, max_new_tokens=500)[0].tolist()
    print(generated_indices)
    print(data_handler.decode(generated_indices))

In [38]:
if __name__ == '__main__':
    main()

Using device: cpu
Vocab size: 65
0.824897 M parameters
step 0: train loss 4.3037, val loss 4.3055
step 200: train loss 2.4406, val loss 2.4435
step 400: train loss 2.2260, val loss 2.2523
step 600: train loss 2.0182, val loss 2.0801
step 800: train loss 1.8942, val loss 1.9937
step 999: train loss 1.8051, val loss 1.9364
학습 완료. (Training finished.)
생성된 텍스트 샘플 (Generated sample):
[0, 32, 53, 1, 50, 39, 63, 1, 52, 53, 40, 50, 43, 1, 44, 53, 56, 1, 46, 47, 57, 1, 45, 53, 1, 46, 43, 39, 1, 57, 46, 39, 50, 57, 1, 42, 47, 42, 43, 42, 1, 57, 59, 41, 46, 5, 42, 0, 13, 1, 41, 39, 50, 50, 58, 47, 43, 52, 58, 1, 58, 46, 43, 1, 56, 43, 43, 54, 57, 6, 1, 40, 43, 1, 63, 53, 59, 58, 1, 42, 53, 1, 57, 46, 43, 51, 47, 50, 50, 42, 57, 8, 1, 35, 39, 56, 1, 44, 47, 56, 5, 57, 1, 58, 46, 43, 52, 43, 57, 0, 32, 39, 63, 1, 40, 50, 43, 1, 40, 43, 1, 52, 53, 58, 43, 57, 58, 1, 39, 52, 42, 57, 58, 39, 52, 58, 1, 57, 58, 47, 43, 45, 1, 46, 43, 52, 47, 45, 39, 50, 43, 0, 27, 1, 44, 47, 45, 46, 58, 47, 56, 1, 42, 

In [39]:
%pip install transformers

Collecting transformers
  Downloading transformers-4.57.5-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2026.1.15-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.2-cp39-abi3-win_amd64.whl.metadata (7.4 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.7.0-cp38-abi3-win_amd64.whl.metadata (4.2 kB)
Downloading transformers-4.57.5-py3-none-any.whl (12.0 MB)
   ---------------------------------------- 0.0/12.0 MB ? eta -:--:--
   -- ------------------------------------- 0.8/12.0 MB 5.0 MB/s eta 0:00:03
   ------ --------------------------------- 2.1/12.0 MB 5.4 MB/s eta 0:00:02
   ------------ --------------------------- 3.7/12.0 MB 5.9 MB/s eta 0:00:02
   --------------- -------------------