In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from collections import Counter

In [None]:
from datasets import load_dataset

dataset = load_dataset("klue", "ner")

# 위치, 조직, 사람 이름
target_tags = {
    "B-LC": "B-LC",
    "I-LC": "I-LC",
    "B-OG": "B-OG",
    "I-OG": "I-OG",
    "B-PS": "B-PS",
    "I-PS": "I-PS",
}

# NER 태그 숫자 <-> 이름 매핑 가져오기
ner_labels = dataset['train'].features['ner_tags'].feature.names
tag_to_id = {tag: i for i, tag in enumerate(ner_labels)}

def modify_tags(example):
    new_tags = []
    for tag in example['ner_tags']:
        tag_name = ner_labels[tag]
        if tag_name in target_tags:
            new_tags.append(tag)
        else:
            new_tags.append(tag_to_id["O"])  # 'O'로 변경
    example['ner_tags'] = new_tags
    return example

modified_dataset = dataset.map(modify_tags)

print("수정된 데이터 확인:")
for i in range(3):
    print("문장:", modified_dataset['train'][i]['sentence'])
    print("NER 태그 (수정):", [ner_labels[tag] for tag in modified_dataset['train'][i]['ner_tags']])
    print()



  from .autonotebook import tqdm as notebook_tqdm


수정된 데이터 확인:
문장: 특히 <영동고속도로:LC> <강릉:LC> 방향 <문막휴게소:LC>에서 <만종분기점:LC>까지 <5㎞:QT> 구간에는 승용차 전용 임시 갓길차로제를 운영하기로 했다.
NER 태그 (수정): ['O', 'O', 'O', 'B-LC', 'I-LC', 'I-LC', 'I-LC', 'I-LC', 'I-LC', 'O', 'B-LC', 'I-LC', 'O', 'O', 'O', 'O', 'B-LC', 'I-LC', 'I-LC', 'I-LC', 'I-LC', 'O', 'O', 'O', 'B-LC', 'I-LC', 'I-LC', 'I-LC', 'I-LC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

문장: <한군데:QT>서 필름을 너무 낭비한 작품입니다.
NER 태그 (수정): ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

문장: 하지만 이영화에는 감히 별 <5개:QT>를 주고싶다
NER 태그 (수정): ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']



In [None]:
train_data = modified_dataset['train']

all_tags = []
for data in train_data:
    all_tags.extend(data['ner_tags'])

tag_counts = Counter(all_tags)
tag_distribution = {ner_labels[tag]: count for tag, count in tag_counts.items()}

print("수정된 데이터의 태그 분포:")
print(tag_distribution)


수정된 데이터의 태그 분포:
{'O': 1047647, 'B-LC': 6663, 'I-LC': 18506, 'B-OG': 8491, 'I-OG': 26879, 'B-PS': 14453, 'I-PS': 28227}


In [None]:
modified_dataset['train'].to_json("klue_ner_train_modified.json", force_ascii=False)
modified_dataset['validation'].to_json("klue_ner_validation_modified.json", force_ascii=False)


Creating json from Arrow format: 100%|██████████| 22/22 [00:00<00:00, 72.37ba/s]
Creating json from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 66.55ba/s]


3331501

KLUE-BERT 모델을 사용하여 수정된 train 및 validation 데이터를 훈련시키고, O로 분류되지 않은 클래스에 대해 해당 텍스트를 MASK로 대체하는 코드

In [4]:
cd /content/drive/MyDrive/ner

/content/drive/MyDrive/ner


In [5]:
import json

with open("klue_ner_train_fixed.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)

with open("klue_ner_validation_fixed.json", "r", encoding="utf-8") as f:
    validation_data  = json.load(f)

print("Train 데이터 샘플:", train_data[0])


Train 데이터 샘플: {'sentence': '특히 <영동고속도로:LC> <강릉:LC> 방향 <문막휴게소:LC>에서 <만종분기점:LC>까지 <5㎞:QT> 구간에는 승용차 전용 임시 갓길차로제를 운영하기로 했다.', 'tokens': ['특', '히', ' ', '영', '동', '고', '속', '도', '로', ' ', '강', '릉', ' ', '방', '향', ' ', '문', '막', '휴', '게', '소', '에', '서', ' ', '만', '종', '분', '기', '점', '까', '지', ' ', '5', '㎞', ' ', '구', '간', '에', '는', ' ', '승', '용', '차', ' ', '전', '용', ' ', '임', '시', ' ', '갓', '길', '차', '로', '제', '를', ' ', '운', '영', '하', '기', '로', ' ', '했', '다', '.'], 'ner_tags': [12, 12, 12, 2, 3, 3, 3, 3, 3, 12, 2, 3, 12, 12, 12, 12, 2, 3, 3, 3, 3, 12, 12, 12, 2, 3, 3, 3, 3, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12]}


In [7]:
pip install transformers datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import Dataset
import numpy as np

model_name = "klue/bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=7)  # 총 클래스 수 설정

old_to_new_mapping = {2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 12: 0}

def remap_labels(data):
    # ner_tags를 새로운 매핑에 따라 변환
    data["ner_tags"] = [old_to_new_mapping.get(tag, 0) for tag in data["ner_tags"]]
    return data

def preprocess_data(data):
    sentences = data["tokens"]
    ner_tags = data["ner_tags"]

    tokenized_inputs = tokenizer(
        sentences,
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=True
    )

    labels = []
    word_ids = tokenized_inputs.word_ids(batch_index=0)
    for word_id in word_ids:
        if word_id is None:
            labels.append(-100)
        else:
            labels.append(ner_tags[word_id])

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

train_dataset = Dataset.from_list(train_data)
validation_dataset = Dataset.from_list(validation_data)

train_dataset = train_dataset.map(remap_labels)
validation_dataset = validation_dataset.map(remap_labels)

train_dataset = train_dataset.map(preprocess_data, batched=False)
validation_dataset = validation_dataset.map(preprocess_data, batched=False)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
validation_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/21008 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/21008 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [None]:
print("수정된 Train 데이터 샘플:", train_dataset[0])


수정된 Train 데이터 샘플: {'input_ids': tensor([   2, 1813, 1969, 1437,  856,  594, 1283,  848,  991,  553, 1026, 1129,
        1904, 1091, 1037, 1956,  578, 1282, 1421, 1258, 1038, 1558, 1175,  645,
        1540,  653, 1583,   25,  207,  615,  545, 1421,  793, 1324, 1468, 1632,
        1537, 1468, 1510, 1325,  551,  647, 1632,  991, 1545, 1022, 1471, 1437,
        1889,  645,  991, 1902,  809,   18,    3,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1

In [None]:
for i in range(3):  # 첫 3개 샘플 확인
    print(f"샘플 {i} ner_tags:", train_dataset[i]["labels"])


샘플 0 ner_tags: tensor([-100,    0,    0,    1,    2,    2,    2,    2,    2,    1,    2,    0,
           0,    1,    2,    2,    2,    2,    0,    0,    1,    2,    2,    2,
           2,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100])
샘플 1 ner_tags: tensor([-100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,  

In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer
)


trainer.train()

save_directory = "./trained_model"

model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"모델이 {save_directory}에 저장되었습니다.")


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.1095,0.12808
2,0.0702,0.120672
3,0.0536,0.127283


모델이 ./trained_model에 저장되었습니다.


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

save_directory = "./trained_model"
model = AutoModelForTokenClassification.from_pretrained(save_directory)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

text = "서울특별시는 이민철의 집입니다."

inputs = tokenizer(
    text,
    truncation=True,
    padding="max_length",
    max_length=128,
    return_tensors="pt"
)

model.eval()
with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
predictions = torch.argmax(logits, dim=2)
predicted_tags = [model.config.id2label[p.item()] for p in predictions[0]]

tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
for token, tag in zip(tokens, predicted_tags):
    print(f"{token:10} {tag}")


[CLS]      LABEL_0
서울특별시      LABEL_0
##는        LABEL_0
이민         LABEL_5
##철        LABEL_6
##의        LABEL_0
집          LABEL_0
##입니다      LABEL_0
.          LABEL_0
[SEP]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_5
[PAD]      LABEL_6
[PAD]      LABEL_6
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_5
[PAD]      LABEL_6
[PAD]      LABEL_6
[PAD]      LABEL_5
[PAD]      LABEL_6
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_5
[PAD]      LABEL_6
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      LABEL_0
[PAD]      L

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

save_directory = "./trained_model"
model = AutoModelForTokenClassification.from_pretrained(save_directory)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

id2label = model.config.id2label

def mask_sensitive_info(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)

    predictions = torch.argmax(outputs.logits, dim=2)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    predicted_tags = [id2label[tag.item()] for tag in predictions[0]]

    # 민감한 정보를 [MASK] 처리 LABEL_1 ~ LABEL_6에 대해서만
    masked_tokens = [
        "[MASK]" if tag in [f"LABEL_{i}" for i in range(1, 7)] else token
        for token, tag in zip(tokens, predicted_tags)
    ]

    print("Tokens and Tags:")
    for token, tag in zip(tokens, predicted_tags):
        print(f"Token: {token}, Tag: {tag}")

    return tokenizer.convert_tokens_to_string(masked_tokens)

example_text = "서울특별시 양천구는 이민철씨의 집입니다."
masked_text = mask_sensitive_info(example_text)
print("마스킹된 문장:", masked_text)


Tokens and Tags:
Token: [CLS], Tag: LABEL_0
Token: 서울특별시, Tag: LABEL_1
Token: 양천구, Tag: LABEL_2
Token: ##는, Tag: LABEL_0
Token: 이민, Tag: LABEL_5
Token: ##철, Tag: LABEL_6
Token: ##씨, Tag: LABEL_0
Token: ##의, Tag: LABEL_0
Token: 집, Tag: LABEL_0
Token: ##입니다, Tag: LABEL_0
Token: ., Tag: LABEL_0
Token: [SEP], Tag: LABEL_0
마스킹된 문장: [CLS] [MASK] [MASK]는 [MASK] [MASK]씨의 집입니다. [SEP]
