In [None]:
# Mount Google Drive inside the Colab VM at /content/drive.
from google.colab import drive

drive.mount("/content/drive")


Mounted at /content/drive


In [None]:
import json
from pathlib import Path

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForQuestionAnswering, TrainingArguments, Trainer

# Path setup: KorQuAD v1 train / validation JSON files on the mounted Drive.
train_path = "/content/drive/MyDrive/nlpbook/downstream/korquad-v1/train.json"
val_path = "/content/drive/MyDrive/nlpbook/downstream/korquad-v1/val.json"

# Pretrained checkpoint (a KR-BERT family model, etc. could be used instead).
# Note: KLUE BERT has max_position_embeddings = 512.
model_ckpt = "klue/bert-base"
tokenizer = BertTokenizerFast.from_pretrained(model_ckpt)
model = BertForQuestionAnswering.from_pretrained(model_ckpt)

# 1. Convert a KorQuAD JSON file into a flat list of QA records
def flatten_korquad(json_path):
    """Load a KorQuAD-style JSON file and flatten it to one record per answer.

    Args:
        json_path: Path of a JSON file whose top-level "data" entry is a list
            of articles, each with a "title" and "paragraphs"; every paragraph
            has a "context" and "qas"; every qa has "id", "question" and
            "answers" (items with "text" and "answer_start").

    Returns:
        A list of dicts with keys "id", "title", "context", "question" and
        "answers", where "answers" is {"text": [str], "answer_start": [int]}.
        A question with several answers produces several records; a question
        with no answers produces none.
    """
    with open(json_path, encoding="utf-8") as handle:
        articles = json.load(handle)["data"]

    def iter_records():
        # Walk article -> paragraph -> question -> answer, emitting one record
        # for each answer found.
        for article in articles:
            article_title = article["title"]
            for paragraph in article["paragraphs"]:
                passage = paragraph["context"]
                for qa in paragraph["qas"]:
                    for answer in qa["answers"]:
                        yield {
                            "id": qa["id"],
                            "title": article_title,
                            "context": passage,
                            "question": qa["question"],
                            "answers": {
                                "text": [answer["text"]],
                                "answer_start": [answer["answer_start"]],
                            },
                        }

    return list(iter_records())

# Flatten both splits into lists of QA records.
train_samples = flatten_korquad(train_path)
val_samples = flatten_korquad(val_path)

# 2. Dataset class definition
class KorQuADDataset(Dataset):
    """Torch dataset that tokenizes flattened QA records for extractive QA.

    Each item is tokenized as a (question, context) pair, padded/truncated to
    ``max_length`` (only the context is truncated), and labelled with the
    token indices of the first answer's start and end.

    Args:
        samples: Sequence of dicts with "question", "context" and "answers"
            ({"text": [...], "answer_start": [...]}) entries.
        tokenizer: A fast tokenizer; it must support
            ``return_offsets_mapping=True`` and ``sequence_ids()``.
        max_length: Fixed sequence length of every returned example.
    """

    def __init__(self, samples, tokenizer, max_length=512):
        self.samples = samples
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of QA records."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Tokenize record ``idx`` and attach start/end token labels.

        Returns:
            A dict of 1-D tensors produced by the tokenizer (batch dimension
            removed, offset mapping dropped) plus scalar tensors
            "start_positions" and "end_positions". When the answer span is
            not fully contained in the (possibly truncated) context tokens,
            both labels are 0, i.e. the first token of the sequence.
        """
        sample = self.samples[idx]
        enc = self.tokenizer(
            sample["question"],
            sample["context"],
            max_length=self.max_length,
            truncation="only_second",
            padding="max_length",
            return_offsets_mapping=True,
            return_tensors="pt",
        )
        # Plain Python ints are much cheaper to loop over than tensor rows.
        offsets = enc.pop("offset_mapping")[0].tolist()
        # None for special/padding tokens, 0 for the question, 1 for the context.
        sequence_ids = enc.sequence_ids(0)

        answer = sample["answers"]["text"][0]
        start_char = sample["answers"]["answer_start"][0]
        end_char = start_char + len(answer)

        # Locate the tokens covering the answer's first and last character.
        # Only context tokens are considered: question-token offsets are
        # relative to the question string, so comparing them with a context
        # character position could label a question token as the answer.
        start_token = end_token = None
        for i, (tok_start, tok_end) in enumerate(offsets):
            if sequence_ids[i] != 1:
                continue
            if tok_start <= start_char < tok_end:
                start_token = i
            if tok_start < end_char <= tok_end:
                end_token = i

        # Answer missing or cut off by truncation: use (0, 0) for both ends
        # rather than a half-found span with end < start.
        if start_token is None or end_token is None:
            start_token = end_token = 0

        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["start_positions"] = torch.tensor(start_token)
        item["end_positions"] = torch.tensor(end_token)
        return item

# 3. Build the datasets.
# `tokenizer` and `model` were already loaded from `model_ckpt` earlier in this
# cell, so they are reused here instead of being loaded a second time.
train_dataset = KorQuADDataset(train_samples, tokenizer)
val_dataset = KorQuADDataset(val_samples, tokenizer)

# 4. TrainingArguments and Trainer definition
training_args = TrainingArguments(
    output_dir="/content/qa-out",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir="/content/qa-logs",
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated in recent transformers releases (this
    # notebook reports 4.52.4); `processing_class` is its replacement.
    processing_class=tokenizer,
)

# 5. Run training
trainer.train()


tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,1.7092,0.986757
2,0.6628,1.335749


TrainOutput(global_step=1218, training_loss=1.0531780512266362, metrics={'train_runtime': 629.9069, 'train_samples_per_second': 7.734, 'train_steps_per_second': 1.934, 'total_flos': 1273037798817792.0, 'train_loss': 1.0531780512266362, 'epoch': 2.0})

In [None]:
import transformers, inspect, textwrap, importlib
from transformers import TrainingArguments

# Show which transformers build is active and which file defines TrainingArguments.
print("transformers ver:", transformers.__version__)
print("TrainingArguments ÏúÑÏπò:", inspect.getfile(TrainingArguments))

# Inspect the TrainingArguments constructor signature (indented by two spaces).
sig = inspect.signature(TrainingArguments.__init__)
print(
    "\n-- TrainingArguments.__init__ signature --",
    textwrap.indent(str(sig), "  "),
    sep="\n",
)


transformers ver: 4.52.4
TrainingArguments ÏúÑÏπò: /usr/local/lib/python3.11/dist-packages/transformers/training_args.py

-- TrainingArguments.__init__ signature --


In [None]:
# 1. Run the validation set through the model once.
#    predict() returns the start/end logits together with the metrics; using
#    the "eval" prefix keeps the loss under the "eval_loss" key, so a separate
#    evaluate() pass over the same data is not needed.
prediction_output = trainer.predict(val_dataset, metric_key_prefix="eval")
metrics = prediction_output.metrics
start_logits, end_logits = prediction_output.predictions

# 2. Extract the predicted answer strings
pred_texts = []
for s_log, e_log, sample in zip(start_logits, end_logits, val_dataset):
    s = int(np.argmax(s_log))
    e = int(np.argmax(e_log))
    # Start and end are picked independently; collapse an inverted span to
    # the single start token.
    if e < s:
        e = s
    pred_texts.append(
        tokenizer.decode(sample["input_ids"][s:e + 1], skip_special_tokens=True).strip()
    )

# 3. Gold answers and metrics (EM / F1)
# NOTE(review): exact_match and f1_squad are not defined in the cells visible
# here; they must be defined before this cell runs -- confirm.
gold_texts = [ex["answers"]["text"][0] for ex in val_samples]
EM = np.mean([exact_match(p, g) for p, g in zip(pred_texts, gold_texts)])
F1 = np.mean([f1_squad(p, g) for p, g in zip(pred_texts, gold_texts)])

loss_val = metrics.get("eval_loss", float("nan"))
print(f"üìä Validation | Loss={loss_val:.4f}  EM={EM:.4f}  F1={F1:.4f}")


üìä Validation | Loss=1.3357  EM=0.6429  F1=0.7192


In [None]:
# ‚ïî‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïó
# ‚ïë  üü¢  Cell 8     ‚ïë  ÏÇ¨Ïö©Ïûê ÏßàÎ¨∏ Ïù∏ÌÑ∞ÎûôÌã∞Î∏å ÏòàÏ∏°
# ‚ïö‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïù
import textwrap, torch


def qa_infer(question: str, context: str, max_len: int = 512, max_answer_len: int = 30) -> str:
    """Answer ``question`` by extracting a span of ``context``.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        question: The question text.
        context: The passage the answer is extracted from.
        max_len: Maximum tokenized length; only the context is truncated.
        max_answer_len: Maximum number of tokens in the returned span
            (values below 1 are treated as 1).

    Returns:
        The highest-scoring context substring, stripped of surrounding
        whitespace, or "" when no context token is available.
    """
    inputs = tokenizer(
        question, context,
        truncation="only_second", max_length=max_len,
        return_offsets_mapping=True, return_tensors="pt",
    )
    # The offsets are only needed to map tokens back to characters; the model
    # does not take them as input, so remove them before the forward pass.
    offsets = inputs.pop("offset_mapping")[0].tolist()
    # None for special tokens, 0 for the question, 1 for the context.
    sequence_ids = inputs.sequence_ids(0)
    inputs = inputs.to(model.device)

    model.eval()  # make sure dropout is disabled for inference
    with torch.no_grad():
        out = model(**inputs)

    device = out.start_logits.device
    is_context = torch.tensor([sid == 1 for sid in sequence_ids], device=device)
    if not bool(is_context.any()):
        return ""

    # Only context tokens may start or end the answer; otherwise the argmax
    # can land on the question or on special tokens and echo the question.
    neg_inf = float("-inf")
    start_scores = out.start_logits[0].masked_fill(~is_context, neg_inf)
    end_scores = out.end_logits[0].masked_fill(~is_context, neg_inf)

    # Score every (start, end) pair jointly and keep only pairs with
    # start <= end < start + max_answer_len.
    n_tokens = start_scores.size(0)
    positions = torch.arange(n_tokens, device=device)
    width = positions[None, :] - positions[:, None]  # end index - start index
    valid = (width >= 0) & (width < max(1, max_answer_len))
    span_scores = (start_scores[:, None] + end_scores[None, :]).masked_fill(~valid, neg_inf)
    s, e = divmod(int(torch.argmax(span_scores)), n_tokens)

    # Slice the original text so spacing and casing match the passage exactly.
    return context[offsets[s][0]:offsets[e][1]].strip()


# 1) Fixed passage (swap in a different story if desired)
fixed_context = textwrap.dedent("""
    ÏòõÎÇ† Ïñ¥Îäê ÎßàÏùÑÏóê Ìïú Ìï†Î®∏ÎãàÍ∞Ä ÏÇ¥ÏïòÏñ¥Ïöî.
    Ìï†Î®∏ÎãàÎäî ÏïåÎ°ùÎã¨Î°ù ÏòàÏÅú ÍΩÉÎì§ÏùÑ ÌåîÏïÑÏÑú ÏÇ¥Î¶ºÏùÑ Íæ∏Î†§ ÎÇòÍ∞îÏßÄÏöî.
    Í∑∏Îü¨Îçò Ïñ¥Îäê ÎÇ† Ìï†Î®∏ÎãàÍ∞Ä ÍΩÉÎ∞≠Ïóê Î¨ºÏùÑ Ï£ºÍ≥† ÏûàÎäîÎç∞,
    Í∞ëÏûêÍ∏∞ ÌïòÎäòÏóêÏÑú Ï£ºÎ®πÎßå Ìïú Ïö∞Î∞ïÏù¥ Îñ®Ïñ¥Ï°åÏñ¥Ïöî.
    ‚ÄúÏóêÍµ¨Î®∏Îãà!‚Äù ÍπúÏßù ÎÜÄÎûÄ Ìï†Î®∏ÎãàÎäî Ïö∞Î∞ïÏùÑ ÌîºÌï¥ Ïßë ÏïàÏúºÎ°ú Îõ∞Ïñ¥ Îì§Ïñ¥Í∞îÏñ¥Ïöî.
""").strip()

print("\nüìñ  ÏßÄÎ¨∏(context)")
print("-" * 40)
print(fixed_context)
print("-" * 40)

# 2) Read the user's question and 3) print the predicted answer
user_q = input("‚ùì  ÏßàÎ¨∏ÏùÑ ÏûÖÎ†•ÌïòÏÑ∏Ïöî: ").strip()
if not user_q:
    print("‚û°Ô∏è  ÏßàÎ¨∏Ïù¥ ÎπÑÏñ¥ ÏûàÏäµÎãàÎã§.")
else:
    pred_ans = qa_infer(user_q, fixed_context)
    print("\nüìù  ÏòàÏ∏° ÎãµÎ≥Ä:", pred_ans)



üìñ  ÏßÄÎ¨∏(context)
----------------------------------------
ÏòõÎÇ† Ïñ¥Îäê ÎßàÏùÑÏóê Ìïú Ìï†Î®∏ÎãàÍ∞Ä ÏÇ¥ÏïòÏñ¥Ïöî.
Ìï†Î®∏ÎãàÎäî ÏïåÎ°ùÎã¨Î°ù ÏòàÏÅú ÍΩÉÎì§ÏùÑ ÌåîÏïÑÏÑú ÏÇ¥Î¶ºÏùÑ Íæ∏Î†§ ÎÇòÍ∞îÏßÄÏöî.
Í∑∏Îü¨Îçò Ïñ¥Îäê ÎÇ† Ìï†Î®∏ÎãàÍ∞Ä ÍΩÉÎ∞≠Ïóê Î¨ºÏùÑ Ï£ºÍ≥† ÏûàÎäîÎç∞,
Í∞ëÏûêÍ∏∞ ÌïòÎäòÏóêÏÑú Ï£ºÎ®πÎßå Ìïú Ïö∞Î∞ïÏù¥ Îñ®Ïñ¥Ï°åÏñ¥Ïöî.
‚ÄúÏóêÍµ¨Î®∏Îãà!‚Äù ÍπúÏßù ÎÜÄÎûÄ Ìï†Î®∏ÎãàÎäî Ïö∞Î∞ïÏùÑ ÌîºÌï¥ Ïßë ÏïàÏúºÎ°ú Îõ∞Ïñ¥ Îì§Ïñ¥Í∞îÏñ¥Ïöî.
----------------------------------------
‚ùì  ÏßàÎ¨∏ÏùÑ ÏûÖÎ†•ÌïòÏÑ∏Ïöî: Ìï†Î®∏ÎãàÎäî Ïôú Î∞îÎã§Î°ú Í∞îÎÇòÏöî?

üìù  ÏòàÏ∏° ÎãµÎ≥Ä: Ìï†Î®∏ÎãàÎäî Ïôú Î∞îÎã§Î°ú Í∞îÎÇòÏöî? ÏòõÎÇ† Ïñ¥Îäê ÎßàÏùÑÏóê Ìïú Ìï†Î®∏ÎãàÍ∞Ä ÏÇ¥ÏïòÏñ¥Ïöî. Ìï†Î®∏ÎãàÎäî ÏïåÎ°ùÎã¨Î°ù ÏòàÏÅú ÍΩÉÎì§ÏùÑ ÌåîÏïÑÏÑú ÏÇ¥Î¶ºÏùÑ Íæ∏Î†§ ÎÇòÍ∞îÏßÄÏöî.
