In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### import

In [3]:
import json, torch
from pathlib import Path
from torch.utils.data import Dataset
from transformers import (
    BertTokenizerFast, BertForQuestionAnswering,
    TrainingArguments, Trainer
)

### 경로 지정

In [7]:
train_path = "/content/drive/MyDrive/NLP (1)/training.json"
val_path   = "/content/drive/MyDrive/NLP (1)/validation.json"
model_ckpt = "beomi/kcbert-base"

### 모델 불러오기

In [8]:
tokenizer = BertTokenizerFast.from_pretrained(model_ckpt)
tokenizer.model_max_length = 300            # ← 300 이하로 고정
MAX_LEN   = 256                             # 실제 입력 길이
DOC_STRIDE = 128                            # 슬라이딩 윈도

model = BertForQuestionAnswering.from_pretrained(model_ckpt)


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at beomi/kcbert-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 데이터 불러오기

In [9]:
def flatten(json_path):
    with open(json_path, encoding="utf-8") as f:
        raw = json.load(f)["data"]
    buf = []
    for art in raw:
        for para in art["paragraphs"]:
            ctx = para["context"]
            for qa in para["qas"]:
                for ans in qa["answers"]:
                    buf.append({
                        "id"     : qa["id"],
                        "context": ctx,
                        "question": qa["question"],
                        "answer_text" : ans["text"],
                        "answer_start": ans["answer_start"],
                    })
    return buf

train_samples = flatten(train_path)
val_samples   = flatten(val_path)


class KorQuAD(Dataset):
    def __init__(self, samples):
        self.samples = samples

    def __len__(self): return len(self.samples)

    def __getitem__(self, idx):
        ex = self.samples[idx]

        enc = tokenizer(
            ex["question"], ex["context"],
            max_length=MAX_LEN,
            truncation="only_second",
            stride=DOC_STRIDE,
            return_overflowing_tokens=False,
            return_offsets_mapping=True,
            padding="max_length",
            return_tensors="pt",
        )

        offset = enc.pop("offset_mapping")[0]
        ans_s, ans_e = ex["answer_start"], ex["answer_start"] + len(ex["answer_text"])

        tok_start = tok_end = None
        for i, (s, e) in enumerate(offset):
            if s <= ans_s < e: tok_start = i
            if s <  ans_e <= e: tok_end   = i
        # 답이 잘려 나가면 샘플 drop
        if tok_start is None or tok_end is None:
            return self.__getitem__((idx+1)%len(self))   # 재귀로 다음 샘플

        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["start_positions"] = torch.tensor(tok_start)
        item["end_positions"]   = torch.tensor(tok_end)
        return item

train_ds = KorQuAD(train_samples)
val_ds   = KorQuAD(val_samples)

### 훈련하기

In [11]:
training_args = TrainingArguments(
    output_dir       = "/content/qa-out",
    eval_strategy = "epoch",
    save_strategy    = "epoch",
    learning_rate    = 5e-5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size  = 8,
    num_train_epochs = 5,
    weight_decay     = 0.01,
    report_to        = "none",
)

trainer = Trainer(
    model         = model,
    args          = training_args,
    train_dataset = train_ds,
    eval_dataset  = val_ds,
    tokenizer     = tokenizer,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.8042,0.794534
2,0.5196,0.786869
3,0.3306,0.984986
4,0.1829,1.456881
5,0.0728,1.734986


TrainOutput(global_step=29020, training_loss=0.4284141697775159, metrics={'train_runtime': 4560.6064, 'train_samples_per_second': 50.901, 'train_steps_per_second': 6.363, 'total_flos': 3.032871455434752e+16, 'train_loss': 0.4284141697775159, 'epoch': 5.0})

### 평가하기

In [14]:
# ────────── flatten 후 그대로 보존 ──────────
train_samples = flatten_korquad(train_path)   # ← answers 포함
val_samples   = flatten_korquad(val_path)     # ← answers 포함

# … Dataset/Trainer 코드 …

# ----------  예측 ----------
start_logits, end_logits = trainer.predict(val_ds).predictions

# ----------  gold │ pred ----------
pred_texts = []
for (s_log, e_log), enc in zip(zip(start_logits, end_logits), val_ds):
    s = int(np.argmax(s_log));  e = int(np.argmax(e_log))
    if e < s: e = s
    pred_texts.append(
        tokenizer.decode(enc["input_ids"][s:e+1], skip_special_tokens=True).strip()
    )

gold_texts = [sample["answers"]["text"][0] for sample in val_samples]  # ★ 여기 수정


NameError: name 'flatten_korquad' is not defined

In [13]:
import re, string, collections, numpy as np
from itertools import zip_longest

# ── 1. SQuAD 공식 정규화 · 토큰화 ──────────────────────────
def _normalize(text: str) -> str:
    text = text.lower()
    text = "".join(ch for ch in text if ch not in string.punctuation)
    text = re.sub(r"\b(a|an|the)\b", " ", text)        # 관사 제거
    return " ".join(text.split())

def _tok(text: str):
    return _normalize(text).split()

# ── 2. 메트릭 함수 (Exact-Match / F1) ──────────────────────
def exact_match(pred: str, gold: str) -> int:
    return int(_normalize(pred) == _normalize(gold))

def f1_squad(pred: str, gold: str) -> float:
    p_toks, g_toks = map(_tok, (pred, gold))
    common = collections.Counter(p_toks) & collections.Counter(g_toks)
    same = sum(common.values())
    if same == 0:
        return 0.0
    precision = same / len(p_toks)
    recall    = same / len(g_toks)
    return 2 * precision * recall / (precision + recall)

# ── 3. 검증 셋 inference → 예측 문자열 ─────────────────────
start_logits, end_logits = trainer.predict(val_ds).predictions
pred_texts = []
for (s_log, e_log), sample in zip(zip(start_logits, end_logits), val_ds):
    s = int(np.argmax(s_log));  e = int(np.argmax(e_log))
    if e < s:                   e = s
    text = tokenizer.decode(sample["input_ids"][s:e+1],
                            skip_special_tokens=True).strip()
    pred_texts.append(text)

gold_texts = [ex["answers"]["text"][0] for ex in val_samples]

# ── 4. 지표 집계 ───────────────────────────────────────────
EM  = np.mean([exact_match(p, g) for p, g in zip_longest(pred_texts, gold_texts, fillvalue="")])
F1  = np.mean([f1_squad (p, g)   for p, g in zip_longest(pred_texts, gold_texts, fillvalue="")])
loss_val = trainer.evaluate(eval_dataset=val_ds).get("eval_loss", float("nan"))

print(f"📊  Validation │ Loss={loss_val:.4f} │ EM={EM:.4f} │ F1={F1:.4f}")

KeyError: 'answers'

In [16]:
# ── 5. 데모: 임의 샘플로 QA 인터랙션 ───────────────────────
def ask_demo(idx:int=None):
    """idx 없으면 검증셋 첫 항목 사용"""
    sample = val_samples[idx or 0]
    context = sample["context"]
    print("─ 지문 ─")
    print(context[:400], "..." if len(context)>400 else "")  # 길면 잘라서 표시
    print("\n(※ 위 지문 일부만 표시 - 전체는 model 에 입력됩니다)\n")
    q = input("🗨️  질문을 입력하세요: ").strip()
    enc = tokenizer(q, context,
                    truncation="only_second", max_length=512,
                    return_offsets_mapping=False, return_tensors="pt").to(trainer.model.device)
    with torch.no_grad():
        out = trainer.model(**enc)
    s = int(out.start_logits.argmax()); e = int(out.end_logits.argmax())
    if e < s: e = s
    answer = tokenizer.decode(enc["input_ids"][0][s:e+1],
                              skip_special_tokens=True).strip()
    print(f"🤖  답변: {answer}")

# 사용 예시
ask_demo(2)     # 셀 실행 후 콘솔에 질문 입력

─ 지문 ─
어느 마을에 릴리엔탈과 구스타프라는 형제가 살았어. 릴리엔탈과 구스타프는 어릴 때부터 하늘을 자유롭게 날아다니는 새를 부러워했어. 그래서 날마다 언덕에 올라 하늘을 나는 새를 구경하곤 했지. 

(※ 위 지문 일부만 표시 - 전체는 model 에 입력됩니다)

🗨️  질문을 입력하세요: 릴리엔탈은 무엇을 부러워했나요?
🤖  답변: 하늘을 자유롭게 날아다니는 새


### Save

In [None]:
# 모델과 토크나이저를 원하는 경로에 저장
save_path = './my_model'  # colab 내 디렉토리명

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

In [None]:
import shutil
shutil.make_archive('my_model', 'zip', './my_model')