In [1]:
from datasets import Dataset
from transformers import (
    AutoTokenizer, AutoModelForTokenClassification,
    TrainingArguments, Trainer
)
import json
import evaluate
import torch

# 1. 라벨 정의
LABEL_LIST = [
    "O",
    "B-이름", "I-이름",
    "B-주민번호", "I-주민번호",
    "B-전화번호", "I-전화번호",
    "B-이메일", "I-이메일",
    "B-카드번호", "I-카드번호"
]
label2id = {l: i for i, l in enumerate(LABEL_LIST)}
id2label = {i: l for l, i in label2id.items()}

# 2. 데이터 로딩 (doccano-like JSONL → BIO 변환)
def load_ner_dataset(path):
    texts, labels = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            item = json.loads(line)
            text = item["text"]
            bio = ["O"] * len(text)
            for s, e, lab in item["labels"]:
                bio[s] = f"B-{lab}"
                for i in range(s + 1, e):
                    bio[i] = f"I-{lab}"
            texts.append(list(text))
            labels.append([label2id[tag] for tag in bio])
    return Dataset.from_dict({"tokens": texts, "ner_tags": labels})

dataset = load_ner_dataset("ner_dataset_ko.jsonl").train_test_split(test_size=0.2, seed=42)
train_raw, eval_raw = dataset["train"], dataset["test"]

# 3. 토크나이저

# padding 값을 True 로 해서 다양한 길이 값 차이에 대비하게 해준다.
tok = AutoTokenizer.from_pretrained("klue/roberta-base")

def tokenize_and_align(batch):
    tokenized = tok(batch["tokens"], is_split_into_words=True, truncation=True, padding=True)
    labels = []
    for i, word_ids in enumerate(tokenized.word_ids(batch_index=i) for i in range(len(batch["tokens"]))):
        label_ids = []
        for wid in word_ids:
            if wid is None:
                label_ids.append(-100)
            else:
                label_ids.append(batch["ner_tags"][i][wid])
        labels.append(label_ids)
    tokenized["labels"] = labels
    return tokenized

train_dataset = train_raw.map(tokenize_and_align, batched=True)
eval_dataset = eval_raw.map(tokenize_and_align, batched=True)

# 4. 모델 로드화
model = AutoModelForTokenClassification.from_pretrained(
    "klue/roberta-base",
    num_labels=len(LABEL_LIST),
    id2label=id2label,
    label2id=label2id
)

# 5. 평가 지표
metric = evaluate.load("seqeval")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    preds = logits.argmax(-1)
    true_preds, true_labels = [], []
    for pred, lab in zip(preds, labels):
        cur_preds, cur_labels = [], []
        for p, l in zip(pred, lab):
            if l != -100:
                cur_preds.append(id2label[p])
                cur_labels.append(id2label[l])
        true_preds.append(cur_preds)
        true_labels.append(cur_labels)
    results = metric.compute(predictions=true_preds, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }

# 6. 학습 설정
args = TrainingArguments(
    output_dir="./ner_model",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",            
    save_strategy="epoch",
    logging_dir="./logs"
)

# 7. Trainer 정의
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tok,
    compute_metrics=compute_metrics
)

# 8. 학습 시작
trainer.train()

# 9. 학습 저장
trainer.save_model("./ner_model")  
tok.save_pretrained("./ner_model")


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 800/800 [00:00<00:00, 8531.19 examples/s]
Map: 100%|██████████| 200/200 [00:00<00:00, 7974.95 examples/s]
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.004774,1.0,1.0,1.0,1.0
2,No log,0.003191,1.0,1.0,1.0,1.0
3,No log,0.002883,1.0,1.0,1.0,1.0




('./ner_model\\tokenizer_config.json',
 './ner_model\\special_tokens_map.json',
 './ner_model\\vocab.txt',
 './ner_model\\added_tokens.json',
 './ner_model\\tokenizer.json')

In [7]:
import re
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from typing import List, Tuple

# 1. 라벨 매핑
LABEL_LIST = [
    "O",
    "B-이름", "I-이름",
    "B-주민번호", "I-주민번호",
    "B-전화번호", "I-전화번호",
    "B-이메일", "I-이메일",
    "B-카드번호", "I-카드번호"
]
id2label = {i: label for i, label in enumerate(LABEL_LIST)}

# 2. 모델 & 토크나이저 로드
model_path = "./ner_model"
model = AutoModelForTokenClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model.eval()

# 3. NER 추론 함수
def ner_predict(text: str) -> List[Tuple[str, str]]:
    tokens = list(text)
    tokenized = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", truncation=True)
    with torch.no_grad():
        output = model(**tokenized)
    predictions = output.logits.argmax(dim=-1).squeeze().tolist()
    word_ids = tokenized.word_ids()
    merged = []
    prev_word_id = None
    for idx, wid in enumerate(word_ids):
        if wid is None or wid == prev_word_id:
            continue
        merged.append((tokens[wid], id2label[predictions[idx]]))
        prev_word_id = wid
    return merged

# 4. 엔티티 병합 함수
def merge_entities(tagged_tokens: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    result = []
    current_tag = None
    current_text = ""
    for char, tag in tagged_tokens:
        if tag.startswith("B-"):
            if current_tag:
                result.append((current_text, current_tag))
            current_tag = tag[2:]
            current_text = char
        elif tag.startswith("I-") and current_tag == tag[2:]:
            current_text += char
        else:
            if current_tag:
                result.append((current_text, current_tag))
                current_tag = None
            current_text = ""
    if current_tag:
        result.append((current_text, current_tag))
    return result

# 5. 마스킹 포맷 정의
def mask_entity(text: str, label: str) -> str:
    if label == "이름":
        return text[0] + "**"
    elif label == "주민번호":
        return re.sub(r"\d{6}-\d{7}", lambda m: m.group(0)[:6] + "-*******", text)
    elif label == "전화번호":
        return re.sub(r"\d{2,3}-\d{3,4}-\d{4}", lambda m: m.group(0)[:3] + "-****-" + m.group(0)[-4:], text)
    elif label == "이메일":
        local, _, domain = text.partition("@")
        return local[0] + "***@" + domain
    elif label == "카드번호":
        return re.sub(r"\d{4}-\d{4}-\d{4}-\d{4}", lambda m: m.group(0)[:4] + "-****-****-" + m.group(0)[-4:], text)
    else:
        return text

# 6. 정규표현식 보완 마스킹 (NER 탐지 누락 대비)
def regex_based_mask(text: str) -> str:
    text = re.sub(r"\d{6}-\d{7}", lambda m: m.group(0)[:6] + "-*******", text)  # 주민번호
    text = re.sub(r"\d{2,3}-\d{3,4}-\d{4}", lambda m: m.group(0)[:3] + "-****-" + m.group(0)[-4:], text)  # 전화번호
    text = re.sub(r"\b([a-zA-Z0-9._%+-]+)@([a-zA-Z0-9.-]+\.[a-zA-Z]{2,})\b", lambda m: m.group(1)[0] + "***@" + m.group(2), text)  # 이메일
    text = re.sub(r"\b(\d{4})-(\d{4})-(\d{4})-(\d{4})\b", lambda m: f"{m.group(1)}-****-****-{m.group(4)}", text)  # 카드번호
    return text

# 7. 통합 마스킹 함수 (NER + 정규표현식)
def mask_text(text: str) -> str:
    tagged = ner_predict(text)
    entities = merge_entities(tagged)
    masked_text = text
    already_masked = set()
    for original, label in entities:
        if original in already_masked:
            continue
        masked = mask_entity(original, label)
        masked_text = masked_text.replace(original, masked, 1)
        already_masked.add(original)
    final_text = regex_based_mask(masked_text)
    return final_text


In [15]:
sample = "홍길동의 주민번호는 900101-1234567, 전화번호는 010-3443-7935, 이메일은 djawjdgml56@naver.com, 카드번호는 1234-5678-1234-3456입니다."
print("원문:", sample)
print("마스킹:", mask_text(sample))


원문: 홍길동의 주민번호는 900101-1234567, 전화번호는 010-3443-7935, 이메일은 djawjdgml56@naver.com, 카드번호는 1234-5678-1234-3456입니다.
마스킹: 홍**의 주민번호는 900101-*******, 전화번호는 010-****-7935, 이메일은 d***@naver.com, 카드번호는 1234-****-1234-3456입니다.
