In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tqdm import tqdm

import tensorflow_addons as tfa  # AdamW 옵티마이저 사용
from sklearn.utils import class_weight
import re

# Report the TensorFlow version and show the GPU status via nvidia-smi
print(f'TensorFlow version: {tf.__version__}')
!nvidia-smi


2024-11-29 13:40:10.264514: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
 The versions of TensorFlow you are currently using is 2.5.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


TensorFlow version: 2.5.0
Fri Nov 29 13:40:11 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.28.03              Driver Version: 560.28.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A5000               Off |   00000000:01:00.0 Off |                  Off |
| 30%   28C    P0             69W /  230W |     239MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                      

In [None]:
def load_data(data_path: str) -> pd.DataFrame:
    """Read the patent spreadsheet and build the two-column frame used downstream.

    The columns '발명의 명칭' (title), '요약' (abstract), '메인IPC2' and
    '대표청구항' are read from the Excel file. Title and abstract are joined
    with a blank line into a single text per row.

    Note that the original '대표청구항' column is discarded; the column of that
    name in the returned frame holds the joined title/abstract text.

    Args:
        data_path: Path of the Excel file to read.

    Returns:
        DataFrame with exactly two columns: '메인IPC2' and '대표청구항'.
    """
    source_columns = ['발명의 명칭', '요약', '메인IPC2', '대표청구항']
    frame = pd.read_excel(data_path, usecols=source_columns)

    def _join_title_and_abstract(record):
        # Title and abstract separated by one empty line.
        title, abstract = record['발명의 명칭'], record['요약']
        return f"{title}\n\n{abstract}"

    frame['input'] = frame.apply(_join_title_and_abstract, axis=1)

    trimmed = frame.drop(columns=['발명의 명칭', '요약', '대표청구항'])
    # Two columns are left ('메인IPC2' and the freshly appended 'input');
    # they are relabelled by position, so 'input' becomes '대표청구항'.
    trimmed.columns = ['메인IPC2', '대표청구항']
    return trimmed


# Spreadsheet that is loaded as the test split.
# NOTE(review): absolute, machine-specific path — adjust it for other environments.
xlsx_path_test = '/home/billy/rd/dataset/DS학술제-모델링경진대회_Valid.xlsx'

# vocab_path feeds the tokenizer and save_model_path is the model file that
# gets loaded for evaluation. checkpoint_path and pretrained_model_dir are not
# referenced by the cells visible here — presumably leftovers from training;
# TODO confirm before removing.
vocab_path = "./pretrained/korpat_vocab.txt"
checkpoint_path = "./pretrained/model.ckpt-381250"
pretrained_model_dir = "./pretrained/"
save_model_path = "./korpat_bert_centerloss_model_best.h5"


# MAX_SEQ_LEN is passed as max_len to tokenizer.encode;
# BATCH_SIZE is the batch size of the tf.data test pipeline.
MAX_SEQ_LEN = 256
BATCH_SIZE = 8

In [None]:
from korpat_tokenizer import Tokenizer

# Load the data
df_test = load_data(xlsx_path_test)

# Encode the '메인IPC2' labels as integer class ids.
# NOTE(review): the encoder is fitted on the evaluation labels themselves, so
# the ids only line up with the trained model if training used the same set
# of classes — confirm against the training notebook.
label_encoder = LabelEncoder()
df_test['label'] = label_encoder.fit_transform(df_test['메인IPC2'])

tokenizer = Tokenizer(vocab_path=vocab_path, cased=True)

# Build the (sentence, label) frame consumed by the preprocessing step
test_texts = df_test['대표청구항']
test_labels = df_test['label']
test_data = pd.DataFrame({'sentence': test_texts, 'label': test_labels})

# Number of distinct classes found by the encoder
num_classes = len(label_encoder.classes_)
print("레이블 클래스 수:", num_classes)

레이블 클래스 수: 3


In [None]:
# 데이터 전처리 함수
from tensorflow.keras.utils import to_categorical


def preprocessing_dataset(dataset, num_classes):
    """Normalize, tokenize and encode each sentence and one-hot encode its label.

    Uses the notebook-level ``tokenizer`` and ``MAX_SEQ_LEN``.

    Args:
        dataset: Object exposing 'label' and 'sentence' columns (e.g. a
            DataFrame). Sentences must be ``str``; labels must be integer
            class ids smaller than ``num_classes``.
        num_classes: Width of the one-hot label vectors.

    Returns:
        Tuple ``(tokens, x_data, y_data)``:
            tokens: list with the ``tokenizer.tokenize`` result per sentence.
            x_data: ``np.ndarray`` built from the ids returned by
                ``tokenizer.encode`` (assumes every id list has the same
                length, i.e. encode pads/truncates to MAX_SEQ_LEN — TODO confirm).
            y_data: one-hot label array with ``num_classes`` columns.
    """
    # Compile the normalisation patterns once instead of per sentence.
    non_text_chars = re.compile(r"[^가-힣a-zA-Z0-9\s]")   # anything but Hangul, ASCII alphanumerics, whitespace
    whitespace_run = re.compile(r"\s+")

    tokens, indices, raw_labels = [], [], []

    pairs = zip(dataset['label'], dataset['sentence'])
    # zip() has no length, so give tqdm the total explicitly; otherwise it can
    # only display a running counter without percentage or ETA.
    for label, sentence in tqdm(pairs, total=len(dataset['label']),
                                desc="데이터 전처리 진행중"):
        # Light normalisation: drop special characters, collapse whitespace
        sentence = non_text_chars.sub("", sentence)
        sentence = whitespace_run.sub(" ", sentence).strip()

        tokens.append(tokenizer.tokenize(sentence))
        ids, _ = tokenizer.encode(sentence, max_len=MAX_SEQ_LEN)
        indices.append(ids)
        raw_labels.append(label)

    x_data = np.array(indices)
    # One-hot encode all labels in a single call; this also yields a
    # (0, num_classes) array for empty input instead of a shapeless one.
    y_data = to_categorical(raw_labels, num_classes=num_classes)
    return tokens, x_data, y_data


# Preprocess the test frame into tokens, encoded ids and one-hot labels
print("\n===> 테스트데이터 전처리 시작")
test_tokens, test_x, test_y = preprocessing_dataset(test_data, num_classes)


===> 테스트데이터 전처리 시작


데이터 전처리 진행중: 602it [00:01, 323.03it/s]


In [None]:
import bert
from bert import BertModelLayer


class ClsToken(tf.keras.layers.Layer):
    """Keras layer that keeps only index 0 along the second axis of its input.

    It is passed as a custom object when the saved model is loaded, so its
    name and behaviour have to match the layer stored in that model.
    """

    def call(self, inputs):
        """Return ``inputs[:, 0, :]``.

        NOTE(review): presumably ``inputs`` is (batch, seq_len, hidden) and
        position 0 is the [CLS] token — confirm against the model definition.
        """
        return inputs[:, 0, :]


# Pair encoded inputs with their one-hot labels and serve them BATCH_SIZE rows at a time
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices((test_x, test_y))
    .batch(BATCH_SIZE)
)

2024-11-29 13:40:13.595696: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2024-11-29 13:40:13.654614: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-11-29 13:40:13.656311: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA RTX A5000 computeCapability: 8.6
coreClock: 1.695GHz coreCount: 64 deviceMemorySize: 23.58GiB deviceMemoryBandwidth: 715.34GiB/s
2024-11-29 13:40:13.656326: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2024-11-29 13:40:13.658915: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2024-11-29 13:40:13.658952: I tensorflow/stream_executor/platform/

In [None]:
# Load the saved model, registering the custom layer classes it references
model = tf.keras.models.load_model(
    save_model_path,
    custom_objects={'BertModelLayer': BertModelLayer, 'ClsToken': ClsToken}
)

# Run inference batch by batch, collecting predicted and true class ids
y_preds = []
y_trues = []
for x_batch_test, y_batch_test in test_dataset:
    # The model returns two outputs; only the logits are used for evaluation.
    logits, features = model(x_batch_test, training=False)
    y_pred = tf.argmax(logits, axis=1)
    # Labels are one-hot encoded, so argmax recovers the class id.
    y_true = tf.argmax(y_batch_test, axis=1)
    y_preds.extend(y_pred.numpy())
    y_trues.extend(y_true.numpy())

# Accuracy is derived from the same lists that feed the report below, so the
# two figures cannot drift apart.
test_correct_predictions = int(
    np.sum(np.asarray(y_preds) == np.asarray(y_trues)))
test_acc = test_correct_predictions / len(y_trues)
print(f'\n테스트 정확도: {test_acc:.4f}')

# Print the per-class classification report
print('\n분류 리포트:')
print(classification_report(y_trues, y_preds,
      target_names=label_encoder.classes_, digits=4))



2024-11-29 13:40:15.359282: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2024-11-29 13:40:15.601452: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
2024-11-29 13:40:15.601488: I tensorflow/stream_executor/cuda/cuda_blas.cc:1838] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.



테스트 정확도: 0.9103

분류 리포트:
              precision    recall  f1-score   support

        G06F     0.8588    0.7374    0.7935        99
        G06Q     0.9427    0.9554    0.9490       448
        G16H     0.7460    0.8545    0.7966        55

    accuracy                         0.9103       602
   macro avg     0.8492    0.8491    0.8464       602
weighted avg     0.9110    0.9103    0.9095       602

