In [1]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report
import numpy as np

class NERTrainer:
    def __init__(self, model_name='klue/bert-base', num_labels=3, label_names=['그 외', '매장명', '음식명']):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels)
        self.label_names = label_names

    def make_new_dataset(self, file_path):
        dataset = load_dataset('json', data_files=file_path)

        tokens_list = []
        ner_tags_list = []
        input_ids_list = []
        token_type_ids_list = []
        attention_mask_list = []
        labels_list = []

        file_name = file_path.split('.')[0] # 'train' 글자 추출 용도

        for i in range(len(dataset[file_name])):
            tokens = dataset[file_name][i]['tokens'] # dataset['train'][0]['tokens'] 형식
            tokens_list.append(tokens)

            ner_tags = dataset[file_name][i]['ner_tags']
            ner_tags_list.append(ner_tags)

            tokenized_inputs = self.tokenizer(tokens, truncation=True, is_split_into_words=True)
            # truncation=True : 시퀀스가 모델의 최대 길이를 초과할 경우, 초과 부분을 잘라냄
            input_ids = tokenized_inputs['input_ids']
            input_ids_list.append(input_ids)

            token_type_ids = tokenized_inputs['token_type_ids']
            token_type_ids_list.append(token_type_ids)

            attention_mask = tokenized_inputs['attention_mask']
            attention_mask_list.append(attention_mask)

            word_ids = tokenized_inputs.word_ids(batch_index=0)
            aligned_labels = [ner_tags[word_id] if word_id is not None else -100 for word_id in word_ids]
            labels_list.append(aligned_labels)

        data_dict = {
            'tokens' : tokens_list,
            'ner_tags' : ner_tags_list,
            'input_ids' : input_ids_list,
            'token_type_ids' : token_type_ids_list,
            'attention_mask' : attention_mask_list,
            'labels' : labels_list
        }

        new_dataset = Dataset.from_dict(data_dict)

        return new_dataset
    
    def split_training(self, new_dataset, test_size=0.2, lr=2e-5, batch_size=1, epoch=3, save_model=False):
        split_dataset = new_dataset.train_test_split(test_size=test_size)

        training_args = TrainingArguments(
            output_dir="./results",                 # 모델 학습 결과를 저장할 경로
            eval_strategy="epoch",                  # 각 epoch 이후에 평가
            learning_rate=lr,                     # 학습률
            per_device_train_batch_size=batch_size,          # 학습 시 배치 크기
            per_device_eval_batch_size=batch_size,           # 평가 시 배치 크기
            num_train_epochs=epoch,                     # 총 학습 epoch 수
            weight_decay=0.01,                      # 가중치 감소율 (과적합 방지)
            # seed=1                                  # 시드값
        )

        self.trainer = Trainer(
            model=self.model,                       # 훈련할 모델
            args=training_args,                     # 학습 파라미터
            train_dataset=split_dataset["train"],   # 훈련 데이터셋
            eval_dataset=split_dataset["test"],     # 평가 데이터셋
        )

        self.trainer.train()

        if save_model == True:
            self.model.save_pretrained("./path_to_save_model")
            self.tokenizer.save_pretrained("./path_to_save_tokenizer")

        return split_dataset
    
    def evaluate(self, split_dataset):
        predictions, label_ids, metrics = self.trainer.predict(split_dataset['train'])
        preds = np.argmax(predictions, axis=2)

        pred_list, target_list = [], []
        for pred, label in zip(preds, label_ids):
            pred_list.extend(pred)
            target_list.extend([0 if i==-100 else i for i in label])

        report = classification_report(target_list, pred_list, target_names=self.label_names)
        return report
    
    def pred_object(self, test_dataset):
        predictions, label_ids, metrics = self.trainer.predict(test_dataset)
        preds = np.argmax(predictions, axis=2)

        tokens = test_dataset['tokens']
        tokenized_inputs = self.tokenizer(tokens, is_split_into_words=True, truncation=True) # input_ids, token_type_ids, attention_mask 생성
        input_ids = tokenized_inputs['input_ids'] # BERT 모델의 내부 voca에 있는 단어에 매칭된 인덱스 값
        decode_tokens_list = []
        for i in input_ids:
            decode_tokens = self.tokenizer.convert_ids_to_tokens(i) # 인덱스 숫자 값을 다시 단어로 디코딩
            decode_tokens_list.append(decode_tokens)

        TOTAL_STORE, TOTAL_FOOD = [], []

        for index, decode_tokens in enumerate(decode_tokens_list):
            STORE, FOOD = [], []
            for idx, decode_token in enumerate(decode_tokens):
                if decode_token != '[CLS]' and decode_token != '[SEP]':
                    if preds[index][idx] == 1:
                        STORE.append(decode_token)
                    elif preds[index][idx] == 2:
                        FOOD.append(decode_token)

            # 리스트를 하나의 문자열로 만들고 ##를 없애기
            STORE_STR = ''
            FOOD_STR = ''
            for idx, token in enumerate(STORE):
                if idx == 0:
                    STORE_STR += token
                else:
                    if '##' in token:
                        STORE_STR += token.replace('##', '')
                    else:
                        STORE_STR += ' ' + token

            for idx, token in enumerate(FOOD):
                if idx == 0:
                    FOOD_STR += token
                else:
                    if '##' in token:
                        FOOD_STR += token.replace('##', '')
                    else:
                        FOOD_STR += ' ' + token
            
            TOTAL_STORE.append(STORE_STR)
            TOTAL_FOOD.append(FOOD_STR)

            TOTAL_LIST = list(zip(TOTAL_STORE, TOTAL_FOOD))

        return TOTAL_LIST

In [2]:
model_name = 'klue/bert-base'
num_labels = 3
label_names = ['그외', '매장명', '음식명']

ner = NERTrainer(model_name, num_labels, label_names)
new_dataset = ner.make_new_dataset('train.json')
split_dataset = ner.split_training(new_dataset=new_dataset, test_size=0.2, lr=2e-5, batch_size=1, epoch=10, save_model=True)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Generating train split: 0 examples [00:00, ? examples/s]

  0%|          | 0/280 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.021492358297109604, 'eval_runtime': 0.2265, 'eval_samples_per_second': 30.908, 'eval_steps_per_second': 30.908, 'epoch': 1.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.006075933575630188, 'eval_runtime': 0.1989, 'eval_samples_per_second': 35.2, 'eval_steps_per_second': 35.2, 'epoch': 2.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.008311931975185871, 'eval_runtime': 0.2053, 'eval_samples_per_second': 34.091, 'eval_steps_per_second': 34.091, 'epoch': 3.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.006871882360428572, 'eval_runtime': 0.2114, 'eval_samples_per_second': 33.115, 'eval_steps_per_second': 33.115, 'epoch': 4.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.010608138516545296, 'eval_runtime': 0.2914, 'eval_samples_per_second': 24.02, 'eval_steps_per_second': 24.02, 'epoch': 5.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.012683370150625706, 'eval_runtime': 0.2834, 'eval_samples_per_second': 24.701, 'eval_steps_per_second': 24.701, 'epoch': 6.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.012303105555474758, 'eval_runtime': 0.2882, 'eval_samples_per_second': 24.288, 'eval_steps_per_second': 24.288, 'epoch': 7.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.011857167817652225, 'eval_runtime': 0.2676, 'eval_samples_per_second': 26.157, 'eval_steps_per_second': 26.157, 'epoch': 8.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.011697419919073582, 'eval_runtime': 0.202, 'eval_samples_per_second': 34.652, 'eval_steps_per_second': 34.652, 'epoch': 9.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.01171762216836214, 'eval_runtime': 0.2369, 'eval_samples_per_second': 29.546, 'eval_steps_per_second': 29.546, 'epoch': 10.0}
{'train_runtime': 84.5773, 'train_samples_per_second': 3.311, 'train_steps_per_second': 3.311, 'train_loss': 0.05086778572627476, 'epoch': 10.0}


In [3]:
report = ner.evaluate(split_dataset)
print(report)

  0%|          | 0/28 [00:00<?, ?it/s]

              precision    recall  f1-score   support

          그외       1.00      0.99      1.00       313
         매장명       0.96      1.00      0.98        80
         음식명       1.00      1.00      1.00        83

    accuracy                           0.99       476
   macro avg       0.99      1.00      0.99       476
weighted avg       0.99      0.99      0.99       476



In [4]:
test_pred = ner.pred_object(split_dataset['test'])

  0%|          | 0/7 [00:00<?, ?it/s]

In [5]:
test_pred

[('도미노 피자', '디럭스 피자'),
 ('카페베네', '아메리카노'),
 ('서브웨이', '이탈리안 BMT 샌드위치'),
 ('김밥천국', '김밥 과 라면'),
 ('던킨도너츠', '글레이즈드 도넛'),
 ('빽다방', '카라멜 마키아토'),
 ('파리바게뜨', '크림빵')]