In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tqdm import tqdm

import tensorflow_addons as tfa  # AdamW 옵티마이저 사용
from sklearn.utils import class_weight
import re

# Print the TensorFlow version and run `nvidia-smi` to show the GPU status.
print(f'TensorFlow version: {tf.__version__}')
!nvidia-smi

2024-11-28 21:56:39.681170: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
 The versions of TensorFlow you are currently using is 2.5.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


TensorFlow version: 2.5.0
Thu Nov 28 21:56:40 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.28.03              Driver Version: 560.28.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA RTX A5000               Off |   00000000:01:00.0 Off |                  Off |
| 30%   29C    P0             76W /  230W |     239MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                      

In [None]:
def load_data(data_path: str) -> pd.DataFrame:
    """Read a patent spreadsheet and build the text column used as model input.

    For every row the title ('발명의 명칭') and the abstract ('요약') are joined
    with one blank line between them. For backward compatibility the joined
    text is returned under the column name '대표청구항', which is the name the
    rest of the notebook reads its sentences from; the sheet's own '대표청구항'
    column is not part of the result.

    Args:
        data_path: Path of the Excel file to read.

    Returns:
        A DataFrame with exactly two columns, in this order: '메인IPC2'
        (taken from the sheet unchanged) and '대표청구항' (title, blank line,
        abstract). An empty sheet yields an empty frame with the same columns.
    """
    title_col, summary_col, label_col = '발명의 명칭', '요약', '메인IPC2'

    # Only the columns that end up in the result are requested from the file.
    df = pd.read_excel(data_path, usecols=[title_col, summary_col, label_col])

    # Missing cells are treated as empty text; formatting them directly would
    # put the literal string "nan" into the model input.
    title = df[title_col].fillna('').astype(str)
    summary = df[summary_col].fillna('').astype(str)

    # Column-wise concatenation instead of a per-row apply(): faster, and it
    # still produces a Series when the sheet has no rows.
    return pd.DataFrame({
        label_col: df[label_col],
        '대표청구항': title + "\n\n" + summary,
    })


# --- Input spreadsheets (train / held-out) ---
xlsx_path_train = '/home/billy/rd/dataset/DS학술제-모델링경진대회_Train.xlsx'
xlsx_path_test = '/home/billy/rd/dataset/DS학술제-모델링경진대회_Valid.xlsx'

# --- Pretrained assets and the file the fine-tuned model is saved to ---
pretrained_model_dir = "./pretrained/"
vocab_path = "./pretrained/korpat_vocab.txt"
checkpoint_path = "./pretrained/model.ckpt-381250"
save_model_path = "./korpat_bert_centerloss_model.h5"

# --- Hyperparameters ---
MAX_SEQ_LEN, BATCH_SIZE, EPOCHS = 256, 8, 5
LR = 3e-5        # initial optimizer learning rate
lambda_c = 1e-3  # weight of the center-loss term in the total loss

In [None]:
from korpat_tokenizer import Tokenizer

tokenizer = Tokenizer(vocab_path=vocab_path, cased=True)

# Read both spreadsheets.
df_train, df_test = load_data(xlsx_path_train), load_data(xlsx_path_test)

# Fit the label encoder on the training labels only.
label_encoder = LabelEncoder()
df_train['label'] = label_encoder.fit_transform(df_train['메인IPC2'])

# Show the class-name -> integer mapping.
print("레이블 인코딩 결과:")
encoded_classes = label_encoder.transform(label_encoder.classes_)
for label, index in zip(label_encoder.classes_, encoded_classes):
    print(f"{label}: {index}")

# Test rows whose label never occurred in training cannot be encoded; drop them.
unseen_labels = set(df_test['메인IPC2']).difference(label_encoder.classes_)
if unseen_labels:
    print("주의: 테스트 데이터셋에 훈련 데이터셋에 없는 레이블이 있습니다. 해당 샘플을 제거합니다.")
    print(f"제거될 레이블: {unseen_labels}")
    is_known = ~df_test['메인IPC2'].isin(unseen_labels)
    df_test = df_test[is_known].reset_index(drop=True)

# Encode the remaining test labels with the encoder fitted on training data.
df_test['label'] = label_encoder.transform(df_test['메인IPC2'])

레이블 인코딩 결과:
G06F: 0
G06Q: 1
G16H: 2


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows for validation, stratified by label.
train_texts_full, train_labels_full = df_train['대표청구항'], df_train['label']

train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts_full,
    train_labels_full,
    test_size=0.1,
    random_state=42,
    stratify=train_labels_full,
)

# The test split comes from the separately loaded df_test.
test_texts, test_labels = df_test['대표청구항'], df_test['label']

# Wrap every split as a DataFrame with 'sentence' and 'label' columns.
train_data = pd.DataFrame({'sentence': train_texts, 'label': train_labels})
val_data = pd.DataFrame({'sentence': val_texts, 'label': val_labels})
test_data = pd.DataFrame({'sentence': test_texts, 'label': test_labels})

num_classes = len(label_encoder.classes_)
print("레이블 클래스 수:", num_classes)

레이블 클래스 수: 3


In [None]:
# 데이터 전처리 함수
from tensorflow.keras.utils import to_categorical


def preprocessing_dataset(dataset, num_classes):
    """Clean, tokenize and encode every sentence of one data split.

    Each sentence is stripped of every character that is not Hangul, an ASCII
    letter, a digit or whitespace; whitespace runs are then collapsed to a
    single space. The cleaned text is passed to the module-level `tokenizer`
    (both `tokenize` and `encode`, the latter with `max_len=MAX_SEQ_LEN`).

    Args:
        dataset: Object exposing equally long 'label' and 'sentence' columns
            via `dataset['label']` / `dataset['sentence']`.
        num_classes: Width of the one-hot label vectors.

    Returns:
        A tuple `(tokens, x_data, y_data)`: the per-sentence token lists, a
        NumPy array of the encoded ids, and a NumPy array of one-hot labels.
    """
    # Compile once instead of re-resolving the patterns for every row.
    non_word = re.compile(r"[^가-힣a-zA-Z0-9\s]")
    whitespace_run = re.compile(r"\s+")

    tokens, indices, labels = [], [], []

    pairs = zip(dataset['label'], dataset['sentence'])
    # zip() has no length, so the row count is passed explicitly; without it
    # tqdm can only show a running counter, not a percentage or ETA.
    for label, sentence in tqdm(pairs, total=len(dataset['label']),
                                desc="데이터 전처리 진행중"):
        # Light normalisation: drop special characters, squeeze whitespace.
        sentence = non_word.sub("", sentence)
        sentence = whitespace_run.sub(" ", sentence).strip()

        tokens.append(tokenizer.tokenize(sentence))
        ids, _ = tokenizer.encode(sentence, max_len=MAX_SEQ_LEN)
        indices.append(ids)
        # Integer class id -> one-hot vector.
        labels.append(to_categorical(label, num_classes=num_classes))

    x_data = np.array(indices)
    y_data = np.array(labels)
    return tokens, x_data, y_data


# Training split
print("===> 학습데이터 전처리 시작")
train_tokens, train_x, train_y = preprocessing_dataset(
    dataset=train_data, num_classes=num_classes)

# Validation split
print("\n===> 검증데이터 전처리 시작")
val_tokens, val_x, val_y = preprocessing_dataset(
    dataset=val_data, num_classes=num_classes)

# Test split
print("\n===> 테스트데이터 전처리 시작")
test_tokens, test_x, test_y = preprocessing_dataset(
    dataset=test_data, num_classes=num_classes)

===> 학습데이터 전처리 시작


데이터 전처리 진행중: 2163it [00:06, 331.36it/s]



===> 검증데이터 전처리 시작


데이터 전처리 진행중: 241it [00:00, 360.08it/s]



===> 테스트데이터 전처리 시작


데이터 전처리 진행중: 602it [00:01, 326.41it/s]


In [None]:
import bert
from bert import BertModelLayer


class ClsToken(tf.keras.layers.Layer):
    """Keras layer that keeps only index 0 along the second axis of its input."""

    def call(self, inputs):
        # Position 0 of axis 1 -- presumably the [CLS] slot of a BERT sequence
        # output; where the tokenizer places [CLS] is not visible here.
        return inputs[:, 0, :]


# Input layer: one row of MAX_SEQ_LEN int32 token ids per sample.
input_ids = keras.layers.Input(
    shape=(MAX_SEQ_LEN,), dtype='int32', name='input_ids')

# BERT layer, configured from the parameters found in the pretrained directory.
bert_params = bert.params_from_pretrained_ckpt(pretrained_model_dir)
l_bert = BertModelLayer.from_params(bert_params, name="bert")

# BERT output for the input ids.
bert_output = l_bert(input_ids)
print("bert shape", bert_output.shape)

# Keep only the first position of the BERT output.
cls_out = ClsToken(name='cls_token')(bert_output)

# Dropout; `cls_out` is rebound here, so everything below (including the
# model's second output) sees the post-dropout tensor.
cls_out = keras.layers.Dropout(0.5)(cls_out)

# Classification head.
outputs = Dense(units=num_classes, activation='softmax',
                name='output')(cls_out)

# Two-output model: class scores and the feature vector used for center loss.
# NOTE(review): the first output has a softmax activation, so it carries
# probabilities rather than raw logits.
model = keras.Model(inputs=input_ids, outputs=[outputs, cls_out])
model.build(input_shape=(None, MAX_SEQ_LEN))

# Load the BERT weights directly from the KorPatBERT checkpoint.
bert.load_stock_weights(l_bert, checkpoint_path)


# Center-loss layer (operates on L2-normalised features and centers)
class CenterLoss(tf.keras.layers.Layer):
    """Mean squared distance between unit-length features and their class centers.

    One trainable center of size `feat_dim` is kept per class. Centers are
    drawn uniformly from [-1, 1] and L2-normalised once at construction; they
    are normalised again on every call before being compared with features.
    """

    def __init__(self, num_classes, feat_dim, **kwargs):
        super().__init__(**kwargs)
        self.num_classes = num_classes
        self.feat_dim = feat_dim
        # Random start, projected onto the unit sphere row by row.
        raw_centers = tf.keras.initializers.RandomUniform(minval=-1, maxval=1)(
            shape=(num_classes, feat_dim))
        self.centers = tf.Variable(
            tf.nn.l2_normalize(raw_centers, axis=1),
            trainable=True,
            name='centers')

    def call(self, features, labels):
        """Return the batch-mean squared L2 distance to each sample's class center.

        `labels` holds integer class indices (cast to int32 here), one per row
        of `features`.
        """
        class_ids = tf.cast(labels, tf.int32)
        unit_features = tf.nn.l2_normalize(features, axis=1)
        # Re-normalise so the comparison stays on the unit sphere even after
        # gradient updates have moved the raw centers.
        unit_centers = tf.nn.l2_normalize(self.centers, axis=1)
        offsets = unit_features - tf.gather(unit_centers, class_ids)
        # Squared L2 norm per sample, then the mean over the batch.
        return tf.reduce_mean(tf.reduce_sum(tf.square(offsets), axis=1))


# One center per class, sized to the last dimension of the feature output.
hidden_size = cls_out.shape[-1]
center_loss_layer = CenterLoss(feat_dim=hidden_size, num_classes=num_classes)

# AdamW optimizer from TensorFlow Addons.
optimizer = tfa.optimizers.AdamW(weight_decay=1e-4, learning_rate=LR)


# Learning-rate schedule
def scheduler(epoch, lr):
    """Return `lr` unchanged for the first 60% of EPOCHS, else `lr * exp(-0.1)`."""
    in_constant_phase = epoch < EPOCHS * 0.6
    return lr if in_constant_phase else lr * tf.math.exp(-0.1)


# Keras callback wrapping the schedule above.
lr_callback = tf.keras.callbacks.LearningRateScheduler(schedule=scheduler)

# Cross-entropy loss over one-hot targets.
loss_fn = keras.losses.CategoricalCrossentropy()

2024-11-28 21:56:50.771882: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2024-11-28 21:56:50.874713: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-11-28 21:56:50.877899: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA RTX A5000 computeCapability: 8.6
coreClock: 1.695GHz coreCount: 64 deviceMemorySize: 23.58GiB deviceMemoryBandwidth: 715.34GiB/s
2024-11-28 21:56:50.877911: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2024-11-28 21:56:50.880466: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2024-11-28 21:56:50.880506: I tensorflow/stream_executor/platform/

bert shape (None, 256, 768)
Done loading 196 BERT weights from: ./pretrained/model.ckpt-381250 into <bert.model.BertModelLayer object at 0x700bddaf67c0> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/embeddings/LayerNorm/beta/adam_m
	bert/embeddings/LayerNorm/beta/adam_v
	bert/embeddings/LayerNorm/gamma/adam_m
	bert/embeddings/LayerNorm/gamma/adam_v
	bert/embeddings/position_embeddings/adam_m
	bert/embeddings/position_embeddings/adam_v
	bert/embeddings/token_type_embeddings
	bert/embeddings/token_type_embeddings/adam_m
	bert/embeddings/token_type_embeddings/adam_v
	bert/embeddings/word_embeddings/adam_m
	bert/embeddings/word_embeddings/adam_v
	bert/encoder/layer_0/attention/output/LayerNorm/beta/adam_m
	bert/encoder/layer_0/attention/output/LayerNorm/beta/adam_v
	bert/encoder/layer_0/attention/output/LayerNorm/gamma/adam_m
	bert/encoder/layer_0/attention/output/LayerNorm/gamma/adam

In [None]:
# Build the tf.data pipelines.
train_dataset = tf.data.Dataset.from_tensor_slices((train_x, train_y))
# The shuffle buffer spans the whole training set: a buffer smaller than the
# dataset only mixes samples inside a sliding window, not across the full set.
# max(..., 1) keeps the call valid for an empty split.
train_dataset = train_dataset.shuffle(
    buffer_size=max(len(train_x), 1)).batch(BATCH_SIZE)

# Validation and test data keep their order; they are only batched.
val_dataset = tf.data.Dataset.from_tensor_slices((val_x, val_y))
val_dataset = val_dataset.batch(BATCH_SIZE)

test_dataset = tf.data.Dataset.from_tensor_slices((test_x, test_y))
test_dataset = test_dataset.batch(BATCH_SIZE)

In [None]:
# Training step
@tf.function
def train_step(x_batch, y_batch):
    """Run one optimisation step on a batch.

    The total loss is cross-entropy plus `lambda_c` times the center loss.
    Gradients are taken with respect to the model's trainable variables and
    the center-loss centers, and applied with the shared optimizer.

    Returns:
        (total loss, cross-entropy loss, center loss, model class scores).
    """
    with tf.GradientTape() as tape:
        logits, features = model(x_batch, training=True)
        loss_ce = loss_fn(y_batch, logits)
        # One-hot targets -> class indices for the center lookup.
        loss_center = center_loss_layer(features, tf.argmax(y_batch, axis=1))
        loss = loss_ce + lambda_c * loss_center

    # Model weights and class centers are updated together.
    trainable = model.trainable_variables + [center_loss_layer.centers]
    optimizer.apply_gradients(zip(tape.gradient(loss, trainable), trainable))
    return loss, loss_ce, loss_center, logits


# Validation step
@tf.function
def val_step(x_batch, y_batch):
    """Evaluate one batch without updating any weights.

    Returns:
        (total loss, model class scores), where the total loss is
        cross-entropy plus `lambda_c` times the center loss.
    """
    logits, features = model(x_batch, training=False)
    loss_ce = loss_fn(y_batch, logits)
    loss_center = center_loss_layer(features, tf.argmax(y_batch, axis=1))
    return loss_ce + lambda_c * loss_center, logits


# Custom training loop: one pass over train_dataset, one pass over val_dataset,
# then a learning-rate update, repeated EPOCHS times.
for epoch in range(EPOCHS):
    print(f'\n에폭 {epoch + 1}/{EPOCHS}')
    print('-' * 30)
    # Training pass: reset the per-epoch accumulators.
    total_loss = 0.
    total_loss_ce = 0.
    total_loss_center = 0.
    num_batches = 0
    correct_predictions = 0

    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        loss, loss_ce, loss_center, logits = train_step(
            x_batch_train, y_batch_train)
        # Update running metrics.
        # NOTE(review): total_loss_ce and total_loss_center are accumulated
        # but never printed or otherwise read in this loop.
        total_loss += loss.numpy()
        total_loss_ce += loss_ce.numpy()
        total_loss_center += loss_center.numpy()
        num_batches += 1
        # Count samples whose highest-scoring class matches the one-hot target.
        y_pred = tf.argmax(logits, axis=1)
        y_true = tf.argmax(y_batch_train, axis=1)
        correct_predictions += tf.reduce_sum(
            tf.cast(y_pred == y_true, tf.float32)).numpy()
    # Accuracy is per sample; the loss is the mean over batches (the printed
    # label says "총합", i.e. total, but the value is an average).
    train_acc = correct_predictions / len(train_x)
    train_loss = total_loss / num_batches
    print(f'훈련 손실 총합 {train_loss:.4f}, 정확도 {train_acc:.4f}')

    # Validation pass.
    total_val_loss = 0.
    num_val_batches = 0
    val_correct_predictions = 0
    for val_step_idx, (x_batch_val, y_batch_val) in enumerate(val_dataset):
        loss, logits = val_step(x_batch_val, y_batch_val)
        total_val_loss += loss.numpy()
        num_val_batches += 1
        y_pred = tf.argmax(logits, axis=1)
        y_true = tf.argmax(y_batch_val, axis=1)
        val_correct_predictions += tf.reduce_sum(
            tf.cast(y_pred == y_true, tf.float32)).numpy()
    val_acc = val_correct_predictions / len(val_x)
    val_loss = total_val_loss / num_val_batches
    print(f'검증 손실 총합 {val_loss:.4f}, 정확도 {val_acc:.4f}')

    # Apply the learning-rate schedule by calling scheduler() directly and
    # writing the result back into the optimizer.
    lr = scheduler(epoch, optimizer.lr)
    optimizer.lr.assign(lr)
    print(f'현재 학습률: {optimizer.lr.numpy():.6f}')


# Persist the trained model to disk.
model.save(filepath=save_model_path)

# Drop the in-memory model so that the test evaluation runs against the
# copy reloaded from the saved file.
del model


에폭 1/5
------------------------------
Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


2024-11-28 21:56:55.147650: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:176] None of the MLIR Optimization Passes are enabled (registered 2)
2024-11-28 21:56:55.169211: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 3187200000 Hz
2024-11-28 21:56:55.851365: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2024-11-28 21:56:56.138419: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
2024-11-28 21:56:56.138454: I tensorflow/stream_executor/cuda/cuda_blas.cc:1838] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


훈련 손실 총합 0.6252, 정확도 0.7748
검증 손실 총합 0.2680, 정확도 0.9046
현재 학습률: 0.000030

에폭 2/5
------------------------------
훈련 손실 총합 0.3270, 정확도 0.8803
검증 손실 총합 0.2434, 정확도 0.9129
현재 학습률: 0.000030

에폭 3/5
------------------------------
훈련 손실 총합 0.2289, 정확도 0.9168
검증 손실 총합 0.1771, 정확도 0.9378
현재 학습률: 0.000030

에폭 4/5
------------------------------
훈련 손실 총합 0.1482, 정확도 0.9482
검증 손실 총합 0.1748, 정확도 0.9378
현재 학습률: 0.000027

에폭 5/5
------------------------------
훈련 손실 총합 0.1186, 정확도 0.9593
검증 손실 총합 0.2560, 정확도 0.9212
현재 학습률: 0.000025


In [None]:
# Reload the trained model from the saved file; the custom layer classes
# must be supplied so Keras can rebuild them.
model = tf.keras.models.load_model(
    save_model_path,
    custom_objects={'BertModelLayer': BertModelLayer, 'ClsToken': ClsToken}
)

# Evaluate on the test split, collecting predictions for the report.
test_correct_predictions = 0
y_preds = []
y_trues = []
for test_step_idx, (x_batch_test, y_batch_test) in enumerate(test_dataset):
    logits, features = model(x_batch_test, training=False)
    # Highest-scoring class vs. the class encoded in the one-hot target.
    y_pred = tf.argmax(logits, axis=1)
    y_true = tf.argmax(y_batch_test, axis=1)
    test_correct_predictions += tf.reduce_sum(
        tf.cast(y_pred == y_true, tf.float32)).numpy()
    y_preds.extend(y_pred.numpy())
    y_trues.extend(y_true.numpy())

test_acc = test_correct_predictions / len(test_x)
print(f'\n테스트 정확도: {test_acc:.4f}')

# Classification report.
# `labels` is passed explicitly so that `target_names` always lines up with
# the encoder's classes; without it the call raises when some class appears
# in neither y_trues nor y_preds.
print('\n분류 리포트:')
print(classification_report(y_trues, y_preds,
      labels=list(range(len(label_encoder.classes_))),
      target_names=label_encoder.classes_, digits=4))


테스트 정확도: 0.9103

분류 리포트:
              precision    recall  f1-score   support

        G06F     0.8588    0.7374    0.7935        99
        G06Q     0.9427    0.9554    0.9490       448
        G16H     0.7460    0.8545    0.7966        55

    accuracy                         0.9103       602
   macro avg     0.8492    0.8491    0.8464       602
weighted avg     0.9110    0.9103    0.9095       602

