In [None]:
# !pip install transformers torch  # 이미 설치했다면 생략 가능

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import GPT2Tokenizer
import json


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


In [None]:
class GPTBlock(nn.Module):
    def __init__(self, dim, heads, mlp_dim):
        super().__init__()
        self.ln1 = nn.LayerNorm(dim)
        self.attn = nn.MultiheadAttention(embed_dim=dim, num_heads=heads, batch_first=True)
        self.ln2 = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(
            nn.Linear(dim, mlp_dim),
            nn.GELU(),
            nn.Linear(mlp_dim, dim)
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x), self.ln1(x), self.ln1(x))[0]
        x = x + self.mlp(self.ln2(x))
        return x

class MiniGPT(nn.Module):
    def __init__(self, vocab_size, dim=512, depth=8, heads=8, mlp_dim=2048, max_len=256):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, dim)
        self.pos_emb = nn.Embedding(max_len, dim)
        self.blocks = nn.Sequential(*[GPTBlock(dim, heads, mlp_dim) for _ in range(depth)])
        self.ln_f = nn.LayerNorm(dim)
        self.head = nn.Linear(dim, vocab_size, bias=False)

    def forward(self, x):
        B, T = x.shape
        pos = torch.arange(T, device=x.device).unsqueeze(0)
        x = self.token_emb(x) + self.pos_emb(pos)
        x = self.blocks(x)
        x = self.ln_f(x)
        return self.head(x)


In [None]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # padding은 eos로 처리

class PromptCompletionDataset(Dataset):
    def __init__(self, file_path, tokenizer, max_length=128):
        self.samples = []
        with open(file_path, encoding="utf-8") as f:
            for line in f:
                item = json.loads(line)
                text = f"{item['prompt']} -> {item['completion']}"
                tokens = tokenizer.encode(text, truncation=True, max_length=max_length)
                self.samples.append(torch.tensor(tokens))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

def collate_fn(batch):
    return pad_sequence(batch, batch_first=True, padding_value=tokenizer.pad_token_id)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


pad_token: [PAD]
pad_token_id: 51200


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dataset = PromptCompletionDataset("traffic_data.jsonl", tokenizer)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

model = MiniGPT(vocab_size=tokenizer.vocab_size).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)
scaler = torch.cuda.amp.GradScaler()
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in dataloader:
        inputs = batch.to(device)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            logits = model(inputs)
            loss = loss_fn(logits[:, :-1].reshape(-1, tokenizer.vocab_size),
                           inputs[:, 1:].reshape(-1))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} | Loss: {total_loss / len(dataloader):.4f}")
    torch.save(model.state_dict(), f"mini_gpt_epoch{epoch+1}.pth")


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
model.eval()

prompt = "(10,2) 지점에서 '정체' 상황을 적용해"
input_text = f"{prompt} ->"
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

for _ in range(50):
    with torch.no_grad():
        logits = model(input_ids)
        next_token = torch.argmax(logits[:, -1, :], dim=-1).unsqueeze(0)
        input_ids = torch.cat([input_ids, next_token], dim=1)
        if next_token.item() == tokenizer.eos_token_id:
            break

output = tokenizer.decode(input_ids[0])
print(output)


결과: {"action": "blocked덥������지량��"}_blockedreduction�blocked량 � �낮�_bl


In [21]:
import json
from tqdm import tqdm
from transformers import GPT2Tokenizer
import torch

# 모델 및 토크나이저 준비
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# 평가용 모델 로딩
model = MiniGPT(vocab_size=tokenizer.vocab_size).to(device)
model.load_state_dict(torch.load("./models/mini_gpt_epoch5.pth", map_location=device))
model.eval()

# 평가용 데이터셋 로딩 (prompt, completion 포함)
with open("./create-dataset/dataset/test.jsonl", encoding="utf-8") as f:
    eval_data = [json.loads(line) for line in f]

# 평가 지표 집계
total = len(eval_data)
json_success = 0
exact_match = 0

# 평가 루프
for sample in tqdm(eval_data):
    prompt = sample["prompt"]
    gold = sample["completion"]

    # 모델 추론
    input_text = f"{prompt} ->"
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    for _ in range(50):
        with torch.no_grad():
            logits = model(input_ids)
            next_token = torch.argmax(logits[:, -1, :], dim=-1).unsqueeze(0)
            input_ids = torch.cat([input_ids, next_token], dim=1)
            if next_token.item() == tokenizer.eos_token_id:
                break

    # 디코딩 개선
    output_text = tokenizer.decode(
        input_ids[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )

    # 프롬프트 이후 부분만 추출
    try:
        generated = output_text.split("->", 1)[1].strip()
    except IndexError:
        generated = ""

    # 1. JSON 파싱 가능 여부
    try:
        _ = json.loads(generated)
        json_success += 1
    except:
        pass

    # 2. 문자열 정확도 비교 (또는 json.loads로 비교 가능)
    if generated == gold:
        exact_match += 1

# 결과 출력
print("\n--- 평가 결과 ---")
print(f"총 샘플 수: {total}")
print(f"JSON 파싱 성공률: {json_success / total:.2%}")
print(f"정확한 문자열 일치: {exact_match / total:.2%}")


 10%|█         | 210/2000 [01:17<10:56,  2.73it/s]


KeyboardInterrupt: 