In [None]:
# Class names listed in label order: the position in this list is the integer
# label used for training and in the submission file.
_CLASS_NAMES = [
    "협박 대화",
    "갈취 대화",
    "직장 내 괴롭힘 대화",
    "기타 괴롭힘 대화",
    "일반 대화",
]

# Class name -> integer label.
class_map = {name: label for label, name in enumerate(_CLASS_NAMES)}
# Integer label -> class name (inverse of class_map).
label_to_class = dict(zip(class_map.values(), class_map.keys()))

In [None]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy
from datetime import datetime, timezone, timedelta

# Timestamp of this run in UTC+9; it names the checkpoint directory and the
# submission file so that separate runs never overwrite each other.
KST = timezone(timedelta(hours=9))
NOW = datetime.now(KST).strftime('%Y%m%d%H%M%S') # YYYYMMDDHHmmss
print(f"{NOW=}")

# Parameters
TRAIN_DATA_PATH = "data/train_with_normal_aihub.csv"
MODEL_NAME = "klue/bert-base"
SAVE_MODEL_PATH = f"models/bert_{NOW}"
MAX_LENGTH = 200
NUM_CLASSES = len(class_map)
BATCH_SIZE = 16
EPOCHS = 10
SEED = 42

# Seed TensorFlow so the freshly initialised classification head and the
# dataset shuffling are repeatable between runs (best effort: some GPU kernels
# remain non-deterministic).
tf.random.set_seed(SEED)

# `model_max_length` is the documented way to cap the tokenizer's sequence
# length. Truncation and padding are per-call options, so they are passed
# where the tokenizer is called rather than to `from_pretrained`.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME, model_max_length=MAX_LENGTH)

# `num_labels` sizes the classification head; `from_pt=True` loads the
# PyTorch checkpoint and converts it to TensorFlow weights.
model = TFBertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_CLASSES,
    from_pt=True,
)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from konlpy.tag import Okt
import re

# Shared Okt morphological analyzer; preprocess_conversation calls
# okt.morphs() on every conversation, so it is created once here.
okt = Okt()

def preprocess_conversation(conversation):
    """Normalise one conversation string for the classifier.

    The text is flattened to a single line, stripped of every character that
    is not Hangul, an ASCII letter, a digit, ``?``, ``.``, ``!`` or whitespace,
    lower-cased, split with ``okt.morphs`` and filtered against a small
    stopword list.

    Args:
        conversation: Raw conversation text.

    Returns:
        The remaining morphemes joined by single spaces.
    """
    flattened = conversation.replace("\n", " ")
    # Remove special characters
    cleaned = re.sub(r"[^ㄱ-ㅎㅏ-ㅣ가-힣0-9a-zA-Z?.!\s]", "", flattened)
    # Lower-case the English letters
    lowered = cleaned.lower()

    # Remove stopwords
    stopwords = ['은','는','이','가','을','를','에','이가','이는']
    kept_morphs = [morph for morph in okt.morphs(lowered) if morph not in stopwords]

    return " ".join(kept_morphs)


def tokenize(texts, labels, max_length):
    """Encode texts with the module-level tokenizer and pair them with labels.

    Args:
        texts: Strings to encode (callers pass a plain list).
        labels: Labels aligned one-to-one with ``texts``.
        max_length: Maximum sequence length handed to the tokenizer; longer
            inputs are truncated.

    Returns:
        A ``tf.data.Dataset`` whose elements are ``(features, label)`` pairs,
        where ``features`` is the dict of tokenizer outputs.
    """
    encoded = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
    features = dict(encoded)
    return tf.data.Dataset.from_tensor_slices((features, labels))

# Load the labelled conversations, attach the integer label, drop repeated
# conversations and normalise the text. Building the frame as one chain avoids
# assigning a column into the frame returned by drop_duplicates, which pandas
# may flag with SettingWithCopyWarning.
train_df = (
    pd.read_csv(TRAIN_DATA_PATH)
    .assign(class_num=lambda df: df["class"].map(class_map))
    .drop_duplicates(subset=["conversation"])
    .assign(conversation=lambda df: df["conversation"].apply(preprocess_conversation))
)

# A class name that is missing from class_map would become a NaN label and
# only fail later (in the stratified split or as a NaN loss); stop here instead.
unmapped_classes = train_df.loc[train_df["class_num"].isna(), "class"].unique().tolist()
if unmapped_classes:
    raise ValueError(f"Classes missing from class_map: {unmapped_classes}")

# Stratified 80/20 split so both parts keep the same class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    train_df["conversation"],
    train_df["class_num"],
    test_size=0.2,
    random_state=42,
    stratify=train_df["class_num"],
)

# Training batches are reshuffled over the whole training set every epoch.
train_dataset = (
    tokenize(X_train.tolist(), y_train, MAX_LENGTH)
    .shuffle(len(X_train))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Validation metrics do not depend on example order, so no shuffle is needed.
val_dataset = (
    tokenize(X_val.tolist(), y_val, MAX_LENGTH)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [None]:
# Fine-tuning setup: Adam with a small learning rate, sparse cross-entropy on
# the raw logits, and accuracy as the monitored metric.
optimizer = Adam(learning_rate=3e-5)
loss = SparseCategoricalCrossentropy(from_logits=True)
metrics = [SparseCategoricalAccuracy("accuracy")]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# One History object per epoch, in training order.
histories = []

# Train the model
# A ModelCheckpoint callback was tried first but raised an error, so fit() is
# called one epoch at a time and a checkpoint is saved after each call.
for epoch in range(1, EPOCHS + 1):
    history = model.fit(train_dataset, epochs=1, validation_data=val_dataset)
    histories.append(history)

    # Checkpoint directory is named "<epoch>_<val_loss>" (epoch is 1-based).
    epoch_val_loss = history.history['val_loss'][0]
    model.save_pretrained(f"{SAVE_MODEL_PATH}/{epoch}_{epoch_val_loss:.4f}")

In [None]:
import matplotlib.pyplot as plt

# Each History object covers exactly one epoch, so take entry 0 of every series.
curves = {
    key: [run.history[key][0] for run in histories]
    for key in ("loss", "accuracy", "val_loss", "val_accuracy")
}
loss, accuracy = curves["loss"], curves["accuracy"]
val_loss, val_accuracy = curves["val_loss"], curves["val_accuracy"]

epochs = range(1, len(loss) + 1)

# Two side-by-side panels: loss on the left, accuracy on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

panels = (
    (ax1, loss, 'train_loss', val_loss, 'val_loss',
     'Train and Validation Loss', 'Loss'),
    (ax2, accuracy, 'train_accuracy', val_accuracy, 'val_accuracy',
     'Train and Validation Accuracy', 'Accuracy'),
)
for ax, train_values, train_label, val_values, val_label, title, y_label in panels:
    ax.plot(epochs, train_values, color='blue', label=train_label)
    ax.plot(epochs, val_values, color='red', label=val_label)
    ax.set_title(title)
    ax.set_xlabel('Epochs')
    ax.set_ylabel(y_label)
    ax.grid()
    ax.legend()

plt.show()

## Evaluate

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt

# Select the NanumBarunGothic font for all figures; the confusion matrix uses
# the Korean class names as tick labels.
# NOTE(review): presumably this font must be installed on the machine — confirm.
plt.rc("font", family="NanumBarunGothic")

def predict(model, X):
    """Predict a class id for every text in ``X``, ``BATCH_SIZE`` texts at a time.

    Args:
        model: Callable that takes the tokenizer output and returns an object
            with a ``logits`` attribute of one row per input text.
        X: Sliceable collection of texts that supports ``len()`` and whose
            slices provide ``.tolist()`` (callers pass a pandas Series).

    Returns:
        A list of predicted class ids, in the same order as ``X``.
    """
    predictions = []
    for offset in range(0, len(X), BATCH_SIZE):
        chunk = X[offset:offset + BATCH_SIZE]
        encoded = tokenizer(
            chunk.tolist(),
            truncation=True,
            padding=True,
            max_length=MAX_LENGTH,
            return_tensors="tf",
        )
        # The arg-max over the class axis of the logits is the predicted label.
        logits = model(encoded).logits
        predictions.extend(np.argmax(logits, axis=1).tolist())

    return predictions

def plot_confusion_matrix(y_true, y_pred):
    """Plot the row-normalised confusion matrix for the classes in ``class_map``.

    Args:
        y_true: True integer class ids.
        y_pred: Predicted integer class ids, aligned with ``y_true``.
    """
    class_names = list(class_map.keys())
    class_ids = list(class_map.values())

    # Pass the full label set explicitly: without it a class that appears in
    # neither y_true nor y_pred is dropped from the matrix, which then no
    # longer lines up with the display labels.
    cm = confusion_matrix(y_true, y_pred, labels=class_ids, normalize="true")

    fig, ax = plt.subplots(figsize=(6, 6))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    ax.set_title("Normalized confusion matrix")
    plt.show()


In [None]:
# Pick the checkpoint with the lowest validation loss among those saved by the
# training loop of this run, instead of a hard-coded path from an earlier run
# that does not exist after Restart & Run All.
# (The originally reported run used "models/bert_20240626202534/3_0.2993".)
epoch_val_losses = [h.history["val_loss"][0] for h in histories]
best_epoch = min(range(len(epoch_val_losses)), key=epoch_val_losses.__getitem__) + 1
# Must match the "<epoch>_<val_loss>" directory name used when saving.
BEST_MODEL_PATH = f"{SAVE_MODEL_PATH}/{best_epoch}_{epoch_val_losses[best_epoch - 1]:.4f}"
print(f"{BEST_MODEL_PATH=}")

model = TFBertForSequenceClassification.from_pretrained(BEST_MODEL_PATH)
predictions = predict(model, X_val)
plot_confusion_matrix(y_val, predictions)

## Submission

In [None]:
import json

def load_test_df(path="data/test.json"):
    """Load the test conversations from a JSON file into a DataFrame.

    The file is expected to map each file name to an object with a ``"text"``
    entry holding the conversation.

    Args:
        path: Location of the JSON file. Defaults to ``"data/test.json"``.

    Returns:
        A DataFrame with columns ``file_name`` and ``conversation``, one row
        per entry, in the order the entries appear in the file.

    Raises:
        KeyError: If an entry has no ``"text"`` field.
    """
    # Read as UTF-8 explicitly: the conversations are Korean, and the platform
    # default encoding is not UTF-8 everywhere.
    with open(path, encoding="utf-8") as file:
        test_json = json.load(file)

    return pd.DataFrame({
        "file_name": list(test_json),
        "conversation": [entry["text"] for entry in test_json.values()],
    })
    
# Load the test conversations and display the resulting frame.
test_df = load_test_df()
test_df

In [None]:
# Apply the same text normalisation as for the training data. This overwrites
# the raw text in place, so re-running this cell preprocesses the text again.
test_df["conversation"] = test_df["conversation"].apply(preprocess_conversation)

# Encode the whole test set at once, padded to its longest sequence.
X_test = tokenizer(
    test_df["conversation"].tolist(),
    truncation=True,
    padding=True,
    max_length=MAX_LENGTH,
    return_tensors="tf",
)

In [None]:
import numpy as np

# Run inference in mini-batches instead of one forward pass over the entire
# test set, which can exhaust memory; the encoded tensors are sliced row-wise.
num_examples = int(X_test["input_ids"].shape[0])
batch_logits = []
for start in range(0, num_examples, BATCH_SIZE):
    batch = {name: tensor[start:start + BATCH_SIZE] for name, tensor in X_test.items()}
    batch_logits.append(model(batch).logits.numpy())

# The arg-max over the class axis gives the predicted label id for each row.
predictions = np.argmax(np.concatenate(batch_logits, axis=0), axis=1)
test_df["class"] = predictions
# Human-readable class name next to the numeric label, for inspection only.
test_df["class_str"] = test_df["class"].map(label_to_class)
test_df

In [None]:
import os

# Build the output path once and make sure its directory exists; to_csv does
# not create missing directories.
submission_path = f"submissions/bert_finetuning_{NOW}.csv"
os.makedirs(os.path.dirname(submission_path), exist_ok=True)

print(f"submission file: {submission_path}")
# Only the file name and the numeric class id go into the submission.
test_df[["file_name", "class"]].to_csv(submission_path, index=False)
# Read the file back to confirm what was actually written.
pd.read_csv(submission_path)

### Test score: 0.73