In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### import

In [3]:
import json, torch
from pathlib import Path
from torch.utils.data import Dataset
from transformers import (
    BertTokenizerFast, BertForQuestionAnswering,
    TrainingArguments, Trainer
)

### Í≤ΩÎ°ú ÏßÄÏ†ï

In [7]:
train_path = "/content/drive/MyDrive/NLP (1)/training.json"
val_path   = "/content/drive/MyDrive/NLP (1)/validation.json"
model_ckpt = "beomi/kcbert-base"

### Î™®Îç∏ Î∂àÎü¨Ïò§Í∏∞

In [8]:
tokenizer = BertTokenizerFast.from_pretrained(model_ckpt)
tokenizer.model_max_length = 300            # ‚Üê 300 Ïù¥ÌïòÎ°ú Í≥†Ï†ï
MAX_LEN   = 256                             # Ïã§Ï†ú ÏûÖÎ†• Í∏∏Ïù¥
DOC_STRIDE = 128                            # Ïä¨ÎùºÏù¥Îî© ÏúàÎèÑ

model = BertForQuestionAnswering.from_pretrained(model_ckpt)


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at beomi/kcbert-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Îç∞Ïù¥ÌÑ∞ Î∂àÎü¨Ïò§Í∏∞

In [9]:
def flatten(json_path):
    with open(json_path, encoding="utf-8") as f:
        raw = json.load(f)["data"]
    buf = []
    for art in raw:
        for para in art["paragraphs"]:
            ctx = para["context"]
            for qa in para["qas"]:
                for ans in qa["answers"]:
                    buf.append({
                        "id"     : qa["id"],
                        "context": ctx,
                        "question": qa["question"],
                        "answer_text" : ans["text"],
                        "answer_start": ans["answer_start"],
                    })
    return buf

train_samples = flatten(train_path)
val_samples   = flatten(val_path)


class KorQuAD(Dataset):
    def __init__(self, samples):
        self.samples = samples

    def __len__(self): return len(self.samples)

    def __getitem__(self, idx):
        ex = self.samples[idx]

        enc = tokenizer(
            ex["question"], ex["context"],
            max_length=MAX_LEN,
            truncation="only_second",
            stride=DOC_STRIDE,
            return_overflowing_tokens=False,
            return_offsets_mapping=True,
            padding="max_length",
            return_tensors="pt",
        )

        offset = enc.pop("offset_mapping")[0]
        ans_s, ans_e = ex["answer_start"], ex["answer_start"] + len(ex["answer_text"])

        tok_start = tok_end = None
        for i, (s, e) in enumerate(offset):
            if s <= ans_s < e: tok_start = i
            if s <  ans_e <= e: tok_end   = i
        # ÎãµÏù¥ ÏûòÎ†§ ÎÇòÍ∞ÄÎ©¥ ÏÉòÌîå drop
        if tok_start is None or tok_end is None:
            return self.__getitem__((idx+1)%len(self))   # Ïû¨Í∑ÄÎ°ú Îã§Ïùå ÏÉòÌîå

        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["start_positions"] = torch.tensor(tok_start)
        item["end_positions"]   = torch.tensor(tok_end)
        return item

train_ds = KorQuAD(train_samples)
val_ds   = KorQuAD(val_samples)

### ÌõàÎ†®ÌïòÍ∏∞

In [11]:
training_args = TrainingArguments(
    output_dir       = "/content/qa-out",
    eval_strategy = "epoch",
    save_strategy    = "epoch",
    learning_rate    = 5e-5,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size  = 8,
    num_train_epochs = 5,
    weight_decay     = 0.01,
    report_to        = "none",
)

trainer = Trainer(
    model         = model,
    args          = training_args,
    train_dataset = train_ds,
    eval_dataset  = val_ds,
    tokenizer     = tokenizer,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.8042,0.794534
2,0.5196,0.786869
3,0.3306,0.984986
4,0.1829,1.456881
5,0.0728,1.734986


TrainOutput(global_step=29020, training_loss=0.4284141697775159, metrics={'train_runtime': 4560.6064, 'train_samples_per_second': 50.901, 'train_steps_per_second': 6.363, 'total_flos': 3.032871455434752e+16, 'train_loss': 0.4284141697775159, 'epoch': 5.0})

### ÌèâÍ∞ÄÌïòÍ∏∞

Í∞ÄÏû• Í≤∞Í≥ºÍ∞Ä Ï¢ãÏïòÎçò Epoch 2 Î™®Îìà Î∂àÎü¨Ïò§Í∏∞

In [25]:
from transformers import BertForQuestionAnswering

checkpoint_path = "/content/qa-out/checkpoint-11608"
model = BertForQuestionAnswering.from_pretrained(checkpoint_path)

In [36]:
import re, string, collections, numpy as np
from itertools import zip_longest

# ‚îÄ‚îÄ 1. SQuAD Í≥µÏãù Ï†ïÍ∑úÌôî ¬∑ ÌÜ†ÌÅ∞Ìôî ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
def _normalize(text: str) -> str:
    text = text.lower()
    text = "".join(ch for ch in text if ch not in string.punctuation)
    text = re.sub(r"\b(a|an|the)\b", " ", text)        # Í¥ÄÏÇ¨ Ï†úÍ±∞
    return " ".join(text.split())

def _tok(text: str):
    return _normalize(text).split()

# ‚îÄ‚îÄ 2. Î©îÌä∏Î¶≠ Ìï®Ïàò (Exact-Match / F1) ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
def exact_match(pred: str, gold: str) -> int:
    return int(_normalize(pred) == _normalize(gold))

def f1_squad(pred: str, gold: str) -> float:
    p_toks, g_toks = map(_tok, (pred, gold))
    common = collections.Counter(p_toks) & collections.Counter(g_toks)
    same = sum(common.values())
    if same == 0:
        return 0.0
    precision = same / len(p_toks)
    recall    = same / len(g_toks)
    return 2 * precision * recall / (precision + recall)

# ‚îÄ‚îÄ 3. Í≤ÄÏ¶ù ÏÖã inference ‚Üí ÏòàÏ∏° Î¨∏ÏûêÏó¥ ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
start_logits, end_logits = trainer.predict(val_ds).predictions
pred_texts = []
for (s_log, e_log), sample in zip(zip(start_logits, end_logits), val_ds):
    s = int(np.argmax(s_log));  e = int(np.argmax(e_log))
    if e < s:                   e = s
    text = tokenizer.decode(sample["input_ids"][s:e+1],
                            skip_special_tokens=True).strip()
    pred_texts.append(text)

gold_texts = [ex["answer_text"] for ex in val_samples]

# ‚îÄ‚îÄ 4. ÏßÄÌëú ÏßëÍ≥Ñ ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
EM  = np.mean([exact_match(p, g) for p, g in zip_longest(pred_texts, gold_texts, fillvalue="")])
F1  = np.mean([f1_squad (p, g)   for p, g in zip_longest(pred_texts, gold_texts, fillvalue="")])
loss_val = trainer.evaluate(eval_dataset=val_ds).get("eval_loss", float("nan"))

print(f"üìä  Validation ‚îÇ Loss={loss_val:.4f} ‚îÇ EM={EM:.4f} ‚îÇ F1={F1:.4f}")

üìä  Validation ‚îÇ Loss=1.7350 ‚îÇ EM=0.4864 ‚îÇ F1=0.6076


In [16]:
# ‚îÄ‚îÄ 5. Îç∞Î™®: ÏûÑÏùò ÏÉòÌîåÎ°ú QA Ïù∏ÌÑ∞ÎûôÏÖò ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
def ask_demo(idx:int=None):
    """idx ÏóÜÏúºÎ©¥ Í≤ÄÏ¶ùÏÖã Ï≤´ Ìï≠Î™© ÏÇ¨Ïö©"""
    sample = val_samples[idx or 0]
    context = sample["context"]
    print("‚îÄ ÏßÄÎ¨∏ ‚îÄ")
    print(context[:400], "..." if len(context)>400 else "")  # Í∏∏Î©¥ ÏûòÎùºÏÑú ÌëúÏãú
    print("\n(‚Äª ÏúÑ ÏßÄÎ¨∏ ÏùºÎ∂ÄÎßå ÌëúÏãú - Ï†ÑÏ≤¥Îäî model Ïóê ÏûÖÎ†•Îê©ÎãàÎã§)\n")
    q = input("üó®Ô∏è  ÏßàÎ¨∏ÏùÑ ÏûÖÎ†•ÌïòÏÑ∏Ïöî: ").strip()
    enc = tokenizer(q, context,
                    truncation="only_second", max_length=512,
                    return_offsets_mapping=False, return_tensors="pt").to(trainer.model.device)
    with torch.no_grad():
        out = trainer.model(**enc)
    s = int(out.start_logits.argmax()); e = int(out.end_logits.argmax())
    if e < s: e = s
    answer = tokenizer.decode(enc["input_ids"][0][s:e+1],
                              skip_special_tokens=True).strip()
    print(f"ü§ñ  ÎãµÎ≥Ä: {answer}")

# ÏÇ¨Ïö© ÏòàÏãú
ask_demo(2)     # ÏÖÄ Ïã§Ìñâ ÌõÑ ÏΩòÏÜîÏóê ÏßàÎ¨∏ ÏûÖÎ†•

‚îÄ ÏßÄÎ¨∏ ‚îÄ
Ïñ¥Îäê ÎßàÏùÑÏóê Î¶¥Î¶¨ÏóîÌÉàÍ≥º Íµ¨Ïä§ÌÉÄÌîÑÎùºÎäî ÌòïÏ†úÍ∞Ä ÏÇ¥ÏïòÏñ¥. Î¶¥Î¶¨ÏóîÌÉàÍ≥º Íµ¨Ïä§ÌÉÄÌîÑÎäî Ïñ¥Î¶¥ ÎïåÎ∂ÄÌÑ∞ ÌïòÎäòÏùÑ ÏûêÏú†Î°≠Í≤å ÎÇ†ÏïÑÎã§ÎãàÎäî ÏÉàÎ•º Î∂ÄÎü¨ÏõåÌñàÏñ¥. Í∑∏ÎûòÏÑú ÎÇ†ÎßàÎã§ Ïñ∏ÎçïÏóê Ïò¨Îùº ÌïòÎäòÏùÑ ÎÇòÎäî ÏÉàÎ•º Íµ¨Í≤ΩÌïòÍ≥§ ÌñàÏßÄ. 

(‚Äª ÏúÑ ÏßÄÎ¨∏ ÏùºÎ∂ÄÎßå ÌëúÏãú - Ï†ÑÏ≤¥Îäî model Ïóê ÏûÖÎ†•Îê©ÎãàÎã§)

üó®Ô∏è  ÏßàÎ¨∏ÏùÑ ÏûÖÎ†•ÌïòÏÑ∏Ïöî: Î¶¥Î¶¨ÏóîÌÉàÏùÄ Î¨¥ÏóáÏùÑ Î∂ÄÎü¨ÏõåÌñàÎÇòÏöî?
ü§ñ  ÎãµÎ≥Ä: ÌïòÎäòÏùÑ ÏûêÏú†Î°≠Í≤å ÎÇ†ÏïÑÎã§ÎãàÎäî ÏÉà


### Save

In [32]:
# Î™®Îç∏Í≥º ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†ÄÎ•º ÏõêÌïòÎäî Í≤ΩÎ°úÏóê Ï†ÄÏû•
save_path = "/content/mymodel"

In [33]:
model.save_pretrained(save_path)

In [34]:
tokenizer.save_pretrained(save_path)

('/content/mymodel/tokenizer_config.json',
 '/content/mymodel/special_tokens_map.json',
 '/content/mymodel/vocab.txt',
 '/content/mymodel/added_tokens.json',
 '/content/mymodel/tokenizer.json')

In [35]:
import shutil
shutil.make_archive('mymodel', 'zip', './mymodel')

'/content/mymodel.zip'