# Imports and Setup

In [1]:
# 라이브러리 설치
!pip install -q datasets portalocker

import torch
import torch.nn as nn
import torch.optim as optim
import math
import time
from collections import Counter
from datasets import load_dataset

# 1. Device & Seed Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

SEED = 42
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)


Using device: cuda


# Data Loading & Preprocessing

In [2]:
# 2. Data Loading & Processing
print("\nLoading Dataset...")
dataset = load_dataset("wikitext", "wikitext-2-v1")

def tokenizer(text):
    return text.lower().split()

class SimpleVocab:
    def __init__(self, token_to_idx, idx_to_token, unk_token="<unk>"):
        self.token_to_idx = token_to_idx
        self.idx_to_token = idx_to_token
        self.unk_token = unk_token
        self.unk_idx = token_to_idx.get(unk_token, 0)

    def __len__(self):
        return len(self.token_to_idx)

    def __getitem__(self, token):
        return self.token_to_idx.get(token, self.unk_idx)

    def lookup_token(self, idx):
        return self.idx_to_token.get(idx, self.unk_token)

def build_vocab(dataset, min_freq=2):
    counter = Counter()
    for item in dataset:
        counter.update(tokenizer(item['text']))

    # 빈도순 정렬
    sorted_by_freq = sorted(counter.items(), key=lambda x: x[1], reverse=True)

    # 특수 토큰
    token_to_idx = {"<unk>": 0, "<eos>": 1}
    idx_to_token = {0: "<unk>", 1: "<eos>"}

    idx = 2
    for token, freq in sorted_by_freq:
        if freq >= min_freq:
            token_to_idx[token] = idx
            idx_to_token[idx] = token
            idx += 1

    return SimpleVocab(token_to_idx, idx_to_token)

# Vocab 생성
vocab = build_vocab(dataset['train'])
print(f"Vocabulary Size: {len(vocab)}")

def data_process(split_name):
    data_iter = dataset[split_name]
    ids = []
    for item in data_iter:
        text = item['text']
        if not text.strip(): continue
        token_ids = [vocab[token] for token in tokenizer(text)]
        ids.extend(token_ids)
        ids.append(vocab.token_to_idx["<eos>"])
    return torch.tensor(ids, dtype=torch.long)

print("Processing Data...")
train_data = data_process('train')
val_data = data_process('validation')
test_data = data_process('test')

# [핵심 수정] 데이터 무결성 강제 확인 및 수정 (Clamping)
# 만약 데이터에 범위를 벗어나는 인덱스가 있다면 <unk>로 강제 치환
def clamp_data(tensor, vocab_size):
    max_val = tensor.max().item()
    if max_val >= vocab_size:
        print(f"WARNING: Found index {max_val} >= vocab size {vocab_size}. Fixing...")
        # 범위를 벗어나는 값은 0(<unk>)으로 변경
        tensor[tensor >= vocab_size] = 0
    return tensor

train_data = clamp_data(train_data, len(vocab))
val_data = clamp_data(val_data, len(vocab))
test_data = clamp_data(test_data, len(vocab))

# Batchify
def batchify(data, bsz):
    seq_len = data.size(0) // bsz
    data = data[:seq_len * bsz]
    data = data.view(bsz, seq_len).t().contiguous()
    return data.to(device)

BATCH_SIZE = 20
EVAL_BATCH_SIZE = 10

train_data = batchify(train_data, BATCH_SIZE)
val_data = batchify(val_data, EVAL_BATCH_SIZE)
test_data = batchify(test_data, EVAL_BATCH_SIZE)

print("Data Ready. Integrity Verified.")



Loading Dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

wikitext-2-v1/test-00000-of-00001.parque(…):   0%|          | 0.00/685k [00:00<?, ?B/s]

wikitext-2-v1/train-00000-of-00001.parqu(…):   0%|          | 0.00/6.07M [00:00<?, ?B/s]

wikitext-2-v1/validation-00000-of-00001.(…):   0%|          | 0.00/618k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Vocabulary Size: 28912
Processing Data...
Data Ready. Integrity Verified.


# GPT Model Definition

In [3]:
# 3. Model Definition
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0).transpose(0, 1))

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class GPTModel(nn.Module):
    def __init__(self, ntoken, d_model, nhead, d_hid, nlayers, dropout=0.2):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = nn.TransformerEncoderLayer(d_model, nhead, d_hid, dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layers, nlayers)
        self.encoder = nn.Embedding(ntoken, d_model)
        self.d_model = d_model
        self.decoder = nn.Linear(d_model, ntoken)
        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, src, src_mask):
        src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, mask=src_mask)
        output = self.decoder(output)
        return output



# Training Function

In [4]:
# 4. Training Setup
ntokens = len(vocab)
emsize = 200
d_hid = 200
nlayers = 2
nhead = 2
dropout = 0.2
bptt = 35

model = GPTModel(ntokens, emsize, nhead, d_hid, nlayers, dropout).to(device)
criterion = nn.CrossEntropyLoss()
lr = 0.001 # AdamW 기본 LR
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i+seq_len]
    target = source[i+1:i+1+seq_len].reshape(-1)
    return data, target

def generate_mask(sz):
    return nn.Transformer.generate_square_subsequent_mask(sz).to(device)

def train():
    model.train()
    total_loss = 0.
    start_time = time.time()
    src_mask = generate_mask(bptt)

    num_batches = len(train_data) // bptt

    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)

        if data.size(0) != bptt:
            src_mask = generate_mask(data.size(0))

        optimizer.zero_grad()
        output = model(data, src_mask)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        if batch % 200 == 0 and batch > 0:
            cur_loss = total_loss / 200
            elapsed = time.time() - start_time
            print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
                  f'lr {scheduler.get_last_lr()[0]:02.5f} | loss {cur_loss:5.2f} | ppl {math.exp(cur_loss):8.2f}')
            total_loss = 0
            start_time = time.time()

def evaluate(eval_model, data_source):
    eval_model.eval()
    total_loss = 0.
    src_mask = generate_mask(bptt)
    with torch.no_grad():
        for i in range(0, data_source.size(0) - 1, bptt):
            data, targets = get_batch(data_source, i)
            if data.size(0) != bptt:
                src_mask = generate_mask(data.size(0))
            output = eval_model(data, src_mask)
            output_flat = output.view(-1, ntokens)
            total_loss += data.size(0) * criterion(output_flat, targets).item()
    return total_loss / (len(data_source) - 1)




# Run Training

In [5]:
# 5. Run Training
print("\nStarting Training...")
epochs = 3
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, val_data)
    print('-' * 89)
    print(f'| end of epoch {epoch:3d} | time: {(time.time() - epoch_start_time):5.2f}s | '
          f'valid loss {val_loss:5.2f} | valid ppl {math.exp(val_loss):8.2f}')
    print('-' * 89)
    scheduler.step()



Starting Training...
| epoch   1 |   200/ 2965 batches | lr 0.00100 | loss  6.93 | ppl  1020.86
| epoch   1 |   400/ 2965 batches | lr 0.00100 | loss  6.24 | ppl   510.66
| epoch   1 |   600/ 2965 batches | lr 0.00100 | loss  6.00 | ppl   402.83
| epoch   1 |   800/ 2965 batches | lr 0.00100 | loss  5.93 | ppl   374.80
| epoch   1 |  1000/ 2965 batches | lr 0.00100 | loss  5.83 | ppl   342.00
| epoch   1 |  1200/ 2965 batches | lr 0.00100 | loss  5.82 | ppl   337.92
| epoch   1 |  1400/ 2965 batches | lr 0.00100 | loss  5.79 | ppl   325.44
| epoch   1 |  1600/ 2965 batches | lr 0.00100 | loss  5.78 | ppl   325.12
| epoch   1 |  1800/ 2965 batches | lr 0.00100 | loss  5.66 | ppl   287.20
| epoch   1 |  2000/ 2965 batches | lr 0.00100 | loss  5.66 | ppl   285.89
| epoch   1 |  2200/ 2965 batches | lr 0.00100 | loss  5.56 | ppl   259.35
| epoch   1 |  2400/ 2965 batches | lr 0.00100 | loss  5.61 | ppl   274.10
| epoch   1 |  2600/ 2965 batches | lr 0.00100 | loss  5.60 | ppl   271.51
| e

# Generation

In [23]:

# 6. Generation Test
def generate_text(model, prompt_str, max_len=50, temperature=1.0):
    model.eval()
    tokens = tokenizer(prompt_str)
    indices = [vocab[t] for t in tokens]
    inp = torch.tensor(indices, dtype=torch.long).unsqueeze(1).to(device)

    with torch.no_grad():
        for _ in range(max_len):
            mask = generate_mask(inp.size(0))
            output = model(inp, mask)
            word_weights = output[-1].squeeze().div(temperature).exp().cpu()
            word_idx = torch.multinomial(word_weights, 1)[0]
            inp = torch.cat([inp, torch.tensor([[word_idx]], device=device)], dim=0)

    word_idxs = inp.squeeze().tolist()
    return ' '.join([vocab.lookup_token(idx) for idx in word_idxs])

print("\nGenerated Text:")
print(generate_text(model, "the game was", max_len=30, temperature=0.8))


Generated Text:
the game was published in his second half of the regular season , o 'malley . <eos> = = = game = <eos> = = = virginia tech , jordan averaged 21 @.@
