In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# 데이터 로드
data = load_dataset("text", data_files={"train": "data/tiny_shakespeare.txt"})

# 토크나이저
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# 토큰화
def tokenize_fn(example):
    return tokenizer(example["text"], padding="max_length", truncation=True, max_length=64)

tok_data = data.map(tokenize_fn, batched=True, remove_columns=["text"])

print(tok_data["train"][0])

Generating train split: 40000 examples [00:00, 2057266.74 examples/s]
Map: 100%|██████████| 40000/40000 [00:01<00:00, 32342.64 examples/s]

{'input_ids': [101, 2034, 6926, 1024, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}





In [3]:
# HF Dataset을 바로 torch 텐서로 꺼내 쓰기 위한 포맷 지정
tok_data = tok_data.with_format("torch")

In [4]:
print(tok_data["train"][0])

{'input_ids': tensor([ 101, 2034, 6926, 1024,  102,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])}


In [None]:
import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class TokPosEmbedding(nn.Module):
    def __init__(self, vocab_size, max_len, d_model, dropout):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Embedding(max_len, d_model)
        self.layernorm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids):
       batch_size, seq_len = input_ids.shape
       pos_ids = torch.arange(seq_len, device=input_ids.device).unsqueeze(0).expand(batch_size, seq_len)
       x = self.token_embedding(input_ids) + self.position_embedding(pos_ids)
       return self.dropout(self.layernorm(x))


In [15]:
vocab_size = 30522
max_len    = 64
d_model    = 256
dropout    = 0.1

In [16]:
embedding = TokPosEmbedding(vocab_size, max_len, d_model, dropout)
embedding = embedding.to(device)
embedding.eval()

TokPosEmbedding(
  (token_embedding): Embedding(30522, 256)
  (position_embedding): Embedding(64, 256)
  (layernorm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [17]:
batch = tok_data["train"][:16]

input_ids = batch["input_ids"].to(device)
attention_mask = batch["attention_mask"].to(device)

with torch.no_grad():
    x = embedding(input_ids)

print("input_ids shape:", input_ids.shape)
print("attention_mask shape:", attention_mask.shape)
print("embedding output:", x.shape)


input_ids shape: torch.Size([16, 64])
attention_mask shape: torch.Size([16, 64])
embedding output: torch.Size([16, 64, 256])


In [19]:
import torch

def build_causal_mask(seq_len, device):
    mask = torch.full((1, 1, seq_len, seq_len), float("-inf"), device = device) # 마지막 두 축 (s,s)는 "쿼리 토큰 위치(행), 키 토큰 위치(열)를 볼 수 있나?"
    mask = torch.triu(mask, diagonal = 1)
    return mask

def combine_attention_mask(attn_mask_01, causal_mask):
    is_pad = (attn_mask_01 == 0).unsqueeze(1).unsqueeze(2)  # (B,1,1,S) bool
    minus_inf = torch.tensor(float("-inf"), device=attn_mask_01.device)
    pad_mask = torch.where(is_pad, minus_inf, torch.tensor(0.0, device=attn_mask_01.device))  # (B,1,1,S)
    return causal_mask + pad_mask  # (B,1,S,S)로 브로드캐스트 합


In [20]:
import torch.nn.functional as F


In [22]:
class ScaledDotProductAttention(nn.Module):
    def __init__(self):   
        super().__init__()
    
    def forward(self, Q, K, V, attn_mask=None):
        """
        Q, K, V : (Batch_size, Num_head, Sequence_length, d_k)
        attn_mask : (B, 1, S, S) 또는 (B, h, S, S)
        """
        d_k = Q.size(-1) # 각 head의 차원의 크기

        # Q와 K의 내적 -> 어텐션 유사도(score)
        scores = torch.matmul(Q, K.transpose(-1,-2)) / (d_k ** 0.5)

        # 마스크 더하기 (-inf가 있는 자리는 softmax 후 0이 됨)
        if attn_mask is not None:
            scores = scores + attn_mask

        # softmax로 확률 분포화 (가장 관련 있는 토큰의 가중치 업!)
        attn = F.softmax(scores, dim = -1)

        # Value 가중합
        out = torch.matmul(attn, V)

        return out, attn

In [24]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout):
        super().__init__()
        assert d_model % num_heads == 0
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        # input을 Q/K/V로 projection
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)

        self.attn_score = ScaledDotProductAttention()
        self.W_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, x):
        """
        B: 배치크기
        S: 시퀀스 길이(토큰 개수)
        D: 전체 임베딩 차원
        """
        B, S, D = x.shape
        x = x.view(B, S, self.num_heads, self.d_k).transpose(1, 2)
        return x

    def _merge_heads(self, x):
        # x: (B,h,S,d_k) -> (B,S,D)
        B, h, S, d_k = x.shape
        x = x.transpose(1, 2).contiguous().view(B, S, h * d_k)
        return x

    def forward(self, x, attn_mask=None):
        """
        x: (B,S,d_model)
        attn_mask: (B,1,S,S) 또는 (B,h,S,S)
        """
        Q = self._split_heads(self.W_q(x))
        K = self._split_heads(self.W_k(x))
        V = self._split_heads(self.W_v(x))

        out, attn = self.attn_score(Q, K, V, attn_mask)
        out = self._merge_heads(out)
        out = self.dropout(self.W_o(out))
        return out, attn

In [25]:
class PositionwiseFFN(nn.Module):
    def __init__(self, d_model, d_ff, dropout):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.act = nn.GELU()
        self.dropout = nn.Dropout(dropout)
    def forward(self, x):
        return self.fc2(self.dropout(self.act(self.fc1(x))))

In [26]:
import torch, random, numpy as np

In [27]:
print("pad_token_id:", tokenizer.pad_token_id)


pad_token_id: 0


In [28]:
def set_seed(seed=42):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)


In [29]:
splits = tok_data["train"].train_test_split(test_size=0.05, seed=42)
train_data = splits["train"]
valid_data = splits["test"]


In [30]:
class TransformerBlock(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.mha = MultiHeadAttention(d_model, num_heads, dropout)
        self.ffn = PositionwiseFFN(d_model, d_ff, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, attn_mask=None):
        h, attn = self.mha(self.ln1(x), attn_mask=attn_mask)
        x = x + self.dropout(h) # Residual
        h2 = self.ffn(self.ln2(x))
        x = x + self.dropout(h2)
        return x, attn

In [36]:
# Cell 17 수정본
class DecoderOnlyLM(nn.Module):
    def __init__(self, vocab_size, max_len, d_model, num_heads, d_ff, num_layers, dropout):
        super().__init__()
        self.embedding = TokPosEmbedding(vocab_size, max_len, d_model, dropout)
        self.layers = nn.ModuleList([  # layer -> layers로 수정
            TransformerBlock(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)
        ])
        self.ln_f = nn.LayerNorm(d_model)
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False)

        # Weight tying: 임베딩 가중치와 LM head 공유
        self.lm_head.weight = self.embedding.token_embedding.weight

    def forward(self, input_ids, attention_mask=None):
        x = self.embedding(input_ids)
        batch_size, sequence_length = input_ids.size()

        causal = build_causal_mask(sequence_length, device=input_ids.device)
        if attention_mask is not None:
            attn_mask = combine_attention_mask(attention_mask, causal)
        else:
            attn_mask = causal

        attns=[]
        for block in self.layers:  # 이제 정상 동작
            x, attn = block(x, attn_mask)
            attns.append(attn)

        x = self.ln_f(x)
        logits = self.lm_head(x)

        return logits, attns

In [37]:
# 하이퍼파라미터
num_layers = 4
num_heads  = 8
d_ff       = 4 * d_model      # 보통 4*d_model
lr         = 3e-4
weight_decay = 0.01
pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0

model = DecoderOnlyLM(
    vocab_size=vocab_size,
    max_len=max_len,
    d_model=d_model,
    num_heads=num_heads,
    d_ff=d_ff,
    num_layers=num_layers,
    dropout=dropout
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=lr, betas=(0.9, 0.95), weight_decay=weight_decay)
criterion = nn.CrossEntropyLoss(ignore_index=pad_id)


In [38]:
from torch.utils.data import DataLoader

def collate_fn(examples):
    # with_format("torch") 덕분에 텐서로 바로 스택 가능
    return {
        "input_ids": torch.stack([e["input_ids"] for e in examples]),
        "attention_mask": torch.stack([e["attention_mask"] for e in examples]),
    }

train_loader = DataLoader(train_data, batch_size=32, shuffle=True, collate_fn=collate_fn)
valid_loader = DataLoader(valid_data, batch_size=64, shuffle=False, collate_fn=collate_fn)

def make_lm_batch(batch):
    input_ids = batch["input_ids"].to(device)          # (B,S)
    attn_mask = batch["attention_mask"].to(device)     # (B,S)

    # 다음 토큰 예측: 입력/라벨 한 칸 시프트
    x_in  = input_ids[:, :-1].contiguous()             # (B,S-1)
    y_out = input_ids[:, 1:].contiguous()              # (B,S-1)
    m_in  = attn_mask[:, :-1].contiguous()             # (B,S-1)
    return x_in, m_in, y_out


In [39]:
import math

def train_one_epoch(model, loader, optimizer, criterion, clip=1.0):
    model.train()
    total_loss, total_tokens = 0.0, 0
    for step, batch in enumerate(loader):
        x_in, m_in, y_out = make_lm_batch(batch)
        logits, _ = model(x_in, attention_mask=m_in)   # (B,S-1,V)

        B, S1, V = logits.shape
        loss = criterion(logits.view(B*S1, V), y_out.view(-1))

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)

        # 통계: 유효 토큰(패딩 제외)
        n_valid = (y_out != pad_id).sum().item()
        total_loss += loss.item() * n_valid
        total_tokens += n_valid

        if (step+1) % 50 == 0:
            ppl = math.exp(total_loss / max(total_tokens, 1))
            print(f"[train step {step+1}] loss(avg)={total_loss/total_tokens:.4f} | ppl={ppl:.2f}")

    avg_loss = total_loss / max(total_tokens, 1)
    return avg_loss, math.exp(avg_loss)

@torch.no_grad()
def evaluate(model, loader, criterion):
    model.eval()
    total_loss, total_tokens = 0.0, 0
    for batch in loader:
        x_in, m_in, y_out = make_lm_batch(batch)
        logits, _ = model(x_in, attention_mask=m_in)
        B, S1, V = logits.shape
        loss = criterion(logits.view(B*S1, V), y_out.view(-1))

        n_valid = (y_out != pad_id).sum().item()
        total_loss += loss.item() * n_valid
        total_tokens += n_valid

    avg_loss = total_loss / max(total_tokens, 1)
    return avg_loss, math.exp(avg_loss)


In [40]:
EPOCHS = 1  # 우선 1 에폭만 돌려보기(스모크 테스트)
best_val = float("inf")

for epoch in range(1, EPOCHS+1):
    tr_loss, tr_ppl = train_one_epoch(model, train_loader, optimizer, criterion)
    va_loss, va_ppl = evaluate(model, valid_loader, criterion)
    print(f"[epoch {epoch}] train_loss={tr_loss:.4f} (ppl={tr_ppl:.2f}) | valid_loss={va_loss:.4f} (ppl={va_ppl:.2f})")

    # 간단 체크포인트(유효성 손실이 좋아지면 저장)
    if va_loss < best_val:
        best_val = va_loss
        torch.save({"model": model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "config": {
                        "vocab_size": vocab_size, "max_len": max_len, "d_model": d_model,
                        "num_heads": num_heads, "d_ff": d_ff, "num_layers": num_layers, "dropout": dropout
                    }}, "decoder_only_ckpt.pt")
        print("  -> checkpoint saved: decoder_only_ckpt.pt")


[train step 50] loss(avg)=43.8364 | ppl=10912167756289087488.00
[train step 100] loss(avg)=34.2628 | ppl=758833594836645.88
[train step 150] loss(avg)=29.9449 | ppl=10113906046277.23
[train step 200] loss(avg)=27.0795 | ppl=576092854723.49
[train step 250] loss(avg)=24.9267 | ppl=66915520908.06
[train step 300] loss(avg)=23.1895 | ppl=11777530997.74
[train step 350] loss(avg)=21.8241 | ppl=3006641202.49
[train step 400] loss(avg)=20.6322 | ppl=912939770.61
[train step 450] loss(avg)=19.6395 | ppl=338329479.27
[train step 500] loss(avg)=18.7655 | ppl=141172394.72
[train step 550] loss(avg)=17.9779 | ppl=64227199.57
[train step 600] loss(avg)=17.2921 | ppl=32347759.62
[train step 650] loss(avg)=16.6771 | ppl=17488582.51
[train step 700] loss(avg)=16.1293 | ppl=10112376.37
[train step 750] loss(avg)=15.6384 | ppl=6189789.33
[train step 800] loss(avg)=15.1928 | ppl=3964049.26
[train step 850] loss(avg)=14.7857 | ppl=2638421.56
[train step 900] loss(avg)=14.4120 | ppl=1815673.55
[train step

In [42]:
@torch.no_grad()
def generate(model, tokenizer, prompt_text, max_new_tokens=80, temperature=1.0, top_k=None):
    model.eval()
    enc = tokenizer(prompt_text, return_tensors="pt", add_special_tokens=False)
    x = enc["input_ids"].to(device)   # (1, S0)
    attn_mask_ones = torch.ones_like(x, dtype=torch.long)  # 프롬프트는 패딩 없음

    for _ in range(max_new_tokens):
        # 길이 유지: (최근 max_len 토큰만)
        x_cond = x[:, -max_len:]
        m_cond = attn_mask_ones[:, -max_len:]

        logits, _ = model(x_cond, attention_mask=m_cond)
        next_logits = logits[:, -1, :] / temperature  # 마지막 토큰의 분포

        if top_k is not None:
            v, ix = torch.topk(next_logits, top_k)
            mask = next_logits < v[:, [-1]]
            next_logits = next_logits.masked_fill(mask, -float("inf"))

        probs = F.softmax(next_logits, dim=-1)
        next_id = torch.multinomial(probs, num_samples=1)  # (1,1)

        x = torch.cat([x, next_id], dim=1)
        attn_mask_ones = torch.ones_like(x, dtype=torch.long)

    text = tokenizer.batch_decode(x.tolist(), skip_special_tokens=True)[0]
    return text

# 예시
sample = generate(model, tokenizer, prompt_text="To be, or not to be", max_new_tokens=60, temperature=1.0, top_k=50)
print("=== SAMPLE ===\n", sample)


=== SAMPLE ===
 to be, or not to be o ' d what : : ' t a ' d in the aus his must come, and i ' do that that that ay oure a. i think of, more but that a us


In [43]:
# 저장했던 체크포인트 불러와서 바로 생성만 해보기
ckpt = torch.load("decoder_only_ckpt.pt", map_location=device)
model.load_state_dict(ckpt["model"])
model.eval()

print(generate(model, tokenizer, "O Romeo, Romeo! wherefore art thou Romeo?", 80, 0.9, top_k=40))


o romeo, romeo! wherefore art thou romeo? see ' ll in the you it is, all a no a king you think, what you i ' ll a us us us us york my york you his, i will me on : in ' d to! ' d his : how ' d to? ' d '
