In [1]:
from datasets import load_dataset # 파일로부터 데이터를 로드
from transformers import AutoTokenizer # BERT 토크나이저를 자동으로 불러옴
from transformers import AutoModelForTokenClassification # NER와 같은 토큰 분류 모델 불러옴
from transformers import Trainer
from transformers import TrainingArguments
from datasets import Dataset
from sklearn.metrics import classification_report
import numpy as np

dataset = load_dataset('json', data_files='train.json')

tokenizer = AutoTokenizer.from_pretrained('klue/bert-base') # 한국어에 최적화된 BERT 모델

In [2]:
def make_dataset(dataset):
    tokens_list = []
    ner_tags_list = []
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    labels_list = []
    for i in range(len(dataset)):
        tokens = dataset[i]['tokens']
        tokens_list.append(tokens)

        ner_tags = dataset[i]['ner_tags']
        ner_tags_list.append(ner_tags)

        tokenized_inputs = tokenizer(tokens, truncation=True, is_split_into_words=True)
        # truncation=True : 시퀀스가 모델의 최대 길이를 초과할 경우, 초과 부분을 잘라냄
        input_ids = tokenized_inputs['input_ids']
        input_ids_list.append(input_ids)

        token_type_ids = tokenized_inputs['token_type_ids']
        token_type_ids_list.append(token_type_ids)

        attention_mask = tokenized_inputs['attention_mask']
        attention_mask_list.append(attention_mask)

        word_ids = tokenized_inputs.word_ids(batch_index=0)
        aligned_labels = [ner_tags[word_id] if word_id is not None else -100 for word_id in word_ids]
        labels_list.append(aligned_labels)

    data_dict = {
        'tokens' : tokens_list,
        'ner_tags' : ner_tags_list,
        'input_ids' : input_ids_list,
        'token_type_ids' : token_type_ids_list,
        'attention_mask' : attention_mask_list,
        'labels' : labels_list
    }

    dataset = Dataset.from_dict(data_dict)

    return dataset

In [3]:
# 학습
new_dataset = make_dataset(dataset['train'])
split_dataset = new_dataset.train_test_split(test_size=0.2)

model = AutoModelForTokenClassification.from_pretrained("klue/bert-base", num_labels=3)
# NER 작업을 위한 BERT 모델, num_labels=3 : 모델의 출력 레이어가 3개의 클래스를 분류

training_args = TrainingArguments(
    output_dir="./results",                 # 모델 학습 결과를 저장할 경로
    eval_strategy="epoch",                  # 각 epoch 이후에 평가
    learning_rate=2e-5,                     # 학습률
    per_device_train_batch_size=1,          # 학습 시 배치 크기
    per_device_eval_batch_size=1,           # 평가 시 배치 크기
    num_train_epochs=1,                     # 총 학습 epoch 수
    weight_decay=0.01,                      # 가중치 감소율 (과적합 방지)
    # seed=1                                  # 시드값
)

trainer = Trainer(
    model=model,                            # 훈련할 모델
    args=training_args,                     # 학습 파라미터
    train_dataset=split_dataset["train"],   # 훈련 데이터셋
    eval_dataset=split_dataset["test"],     # 평가 데이터셋
)

trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/28 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.12216877937316895, 'eval_runtime': 0.4383, 'eval_samples_per_second': 15.97, 'eval_steps_per_second': 15.97, 'epoch': 1.0}
{'train_runtime': 14.8926, 'train_samples_per_second': 1.88, 'train_steps_per_second': 1.88, 'train_loss': 0.4194725581577846, 'epoch': 1.0}


TrainOutput(global_step=28, training_loss=0.4194725581577846, metrics={'train_runtime': 14.8926, 'train_samples_per_second': 1.88, 'train_steps_per_second': 1.88, 'total_flos': 179132794542.0, 'train_loss': 0.4194725581577846, 'epoch': 1.0})

In [4]:
# 모델 저장
model.save_pretrained("./path_to_save_model")
tokenizer.save_pretrained("./path_to_save_tokenizer")

('./path_to_save_tokenizer/tokenizer_config.json',
 './path_to_save_tokenizer/special_tokens_map.json',
 './path_to_save_tokenizer/vocab.txt',
 './path_to_save_tokenizer/added_tokens.json',
 './path_to_save_tokenizer/tokenizer.json')

In [5]:
# 예측 수행 및 성능지표 확인
predictions, label_ids, metrics = trainer.predict(split_dataset['train'])

preds = np.argmax(predictions, axis=2)

pred_list, target_list = [], []
for pred, label in zip(preds, label_ids):
    pred_list.extend(pred)
    target_list.extend([0 if i==-100 else i for i in label])

report = classification_report(target_list, pred_list, target_names=['그 외', '매장명', '음식명'])
print(report)

  0%|          | 0/28 [00:00<?, ?it/s]

              precision    recall  f1-score   support

         그 외       1.00      0.98      0.99       282
         매장명       0.98      1.00      0.99        82
         음식명       0.93      0.98      0.95        84

    accuracy                           0.98       448
   macro avg       0.97      0.98      0.98       448
weighted avg       0.98      0.98      0.98       448



In [6]:
predictions, label_ids, metrics = trainer.predict(split_dataset['test'])
preds = np.argmax(predictions, axis=2)
preds

  0%|          | 0/7 [00:00<?, ?it/s]

array([[0, 1, 1, 1, 0, 2, 2, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0],
       [0, 1, 1, 1, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 1, 1, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2]])

In [7]:
def pred_object(dataset, index):
    predictions, label_ids, metrics = trainer.predict(dataset)
    preds = np.argmax(predictions, axis=2)

    tokens = dataset[index]['tokens']
    tokenized_inputs = tokenizer(tokens, is_split_into_words=True, truncation=True) # input_ids, token_type_ids, attention_mask 생성
    input_ids = tokenized_inputs['input_ids'] # BERT 모델의 내부 voca에 있는 단어에 매칭된 인덱스 값
    decode_tokens = tokenizer.convert_ids_to_tokens(input_ids) # 인덱스 숫자 값을 다시 단어로 디코딩
    # print(decode_tokens) # ex) ['[CLS]', '맥도날드', '에서', '빅', '##맥', '세트', '를', '구매', '##했', '##어요', '[SEP]']
    # print(preds[index]) # ex) [0 1 0 2 2 2 0 0 0 0 0 0 0 0 0 0 0]

    STORE = []
    FOOD = []
    for idx, decode_token in enumerate(decode_tokens):
        if preds[index][idx] == 1:
            STORE.append(decode_token)
        elif preds[index][idx] == 2:
            FOOD.append(decode_token)

    # 리스트를 하나의 문자열로 만들고 ##를 없애기
    STORE_STR = ''
    FOOD_STR = ''
    for idx, token in enumerate(STORE):
        if idx == 0:
            STORE_STR += token
        else:
            if '##' in token:
                STORE_STR += token.replace('##', '')
            else:
                STORE_STR += ' ' + token

    for idx, token in enumerate(FOOD):
        if idx == 0:
            FOOD_STR += token
        else:
            if '##' in token:
                FOOD_STR += token.replace('##', '')
            else:
                FOOD_STR += ' ' + token
    
    return STORE_STR, FOOD_STR

In [43]:
pred_object(split_dataset['test'], 5)

  0%|          | 0/7 [00:00<?, ?it/s]

('스타벅스', '카페라떼')

In [9]:
predictions, label_ids, metrics = trainer.predict(split_dataset['test'])

  0%|          | 0/7 [00:00<?, ?it/s]

In [12]:
preds = np.argmax(predictions, axis=2)
preds

array([[0, 1, 1, 1, 0, 2, 2, 2, 2, 0, 0, 0, 2, 0, 0, 0, 0],
       [0, 1, 1, 1, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 1, 1, 0, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2]])

In [20]:
tokenized_inputs = tokenizer(split_dataset['test']['tokens'], is_split_into_words=True, truncation=True)
input_ids = tokenized_inputs['input_ids']
decode_tokens_list = []
for i in input_ids:
    decode_tokens = tokenizer.convert_ids_to_tokens(i)
    decode_tokens_list.append(decode_tokens)

In [25]:
print(decode_tokens_list[0])

['[CLS]', '할리', '##스', '##커피', '에서', '카', '##푸', '##치', '##노', '를', '마셨', '##어요', '[SEP]']


In [32]:
TOTAL_STORE = []
TOTAL_FOOD = []
for index, decode_tokens in enumerate(decode_tokens_list):
    STORE, FOOD = [], []
    for idx, decode_token in enumerate(decode_tokens):
        if decode_token != '[CLS]' and decode_token != '[SEP]':
            if preds[index][idx] == 1:
                STORE.append(decode_token)
            elif preds[index][idx] == 2:
                FOOD.append(decode_token)

    # 리스트를 하나의 문자열로 만들고 ##를 없애기
    STORE_STR = ''
    FOOD_STR = ''
    for idx, token in enumerate(STORE):
        if idx == 0:
            STORE_STR += token
        else:
            if '##' in token:
                STORE_STR += token.replace('##', '')
            else:
                STORE_STR += ' ' + token

    for idx, token in enumerate(FOOD):
        if idx == 0:
            FOOD_STR += token
        else:
            if '##' in token:
                FOOD_STR += token.replace('##', '')
            else:
                FOOD_STR += ' ' + token
    
    TOTAL_STORE.append(STORE_STR)
    TOTAL_FOOD.append(FOOD_STR)

In [37]:
a = list(zip(TOTAL_STORE, TOTAL_FOOD))

In [39]:
a

[('할리스커피', '카푸치노'),
 ('김밥천국', '김밥 라면'),
 ('CU 편의점', '삼각김밥'),
 ('맥도날드', '빅맥 세트'),
 ('투썸플레이스', '뉴욕치즈케이크'),
 ('KFC', '핫윙'),
 ('크리스피크림 도넛', '오리지널 글레이즈드 도넛')]