# Baseline

- Bidirectional LSTM

In [None]:
# Class names listed in label-id order (index in this tuple == integer label):
# threat, extortion, workplace harassment, other harassment, normal conversation.
_CLASS_NAMES = (
    "협박 대화",
    "갈취 대화",
    "직장 내 괴롭힘 대화",
    "기타 괴롭힘 대화",
    "일반 대화",
)

# Class name -> integer label.
class_map = {name: label for label, name in enumerate(_CLASS_NAMES)}
# Integer label -> class name (inverse of class_map).
label_to_class = dict(enumerate(_CLASS_NAMES))

## Training Model

In [None]:
import pandas as pd
from datetime import datetime, timezone, timedelta

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Timestamp of this run in UTC+9, embedded in artifact file names so that
# separate runs do not overwrite each other's outputs.
KST = timezone(timedelta(hours=9))
NOW = datetime.now(KST).strftime('%Y%m%d%H%M%S') # YYYYMMDDHHmmss

# parameters
RANDOM_SEED = 42
TRAIN_CSV_DATA_PATH = "data/train_with_normal_nikl.csv"
TEST_JSON_DATA_PATH = "data/test.json"
SUBMISSION_CSV_DATA_PATH = "data/submission.csv"
EPOCHS = 50
MODEL_PATH = f"models/baseline_{NOW}.keras"

VOCAB_SIZE = 10_000  # tokenizer `num_words` and embedding `input_dim`
SEQ_LEN = 100        # `maxlen` every token sequence is padded/truncated to

# fix random seed
# set_random_seed seeds Python's `random`, NumPy and TensorFlow in one call.
# Seeding TensorFlow alone leaves the other two generators unseeded, so runs
# are not reproducible wherever Keras draws from them.
tf.keras.utils.set_random_seed(RANDOM_SEED)

# Data preprocessing
# Read the training set from the configured path instead of repeating the
# literal, so changing TRAIN_CSV_DATA_PATH actually takes effect.
data = pd.read_csv(TRAIN_CSV_DATA_PATH)
data["class_label"] = data["class"].apply(lambda x: class_map[x]) # str -> int

# Tokenization
# Words outside the VOCAB_SIZE most frequent ones are mapped to "<UNK>".
# NOTE(review): the tokenizer is fitted on the full data set before the
# train/validation split below, so the vocabulary also reflects validation
# text. Confirm this is acceptable for the baseline.
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<UNK>")
tokenizer.fit_on_texts(data["conversation"])
train_sequences = tokenizer.texts_to_sequences(data["conversation"])
train_sequences = pad_sequences(train_sequences, padding='post', maxlen=SEQ_LEN)

# Split the data
# 80/20 split, stratified on the label so both parts keep the class balance.
X_train, X_val, y_train, y_val = train_test_split(train_sequences,
                                                  data["class_label"],
                                                  test_size=0.2,
                                                  random_state=RANDOM_SEED,
                                                  stratify=data["class_label"])

def get_model(X, vocab_size, num_classes):
    """Build and compile the baseline bidirectional-LSTM classifier.

    Args:
        X: Array of padded token-id sequences. Only the size of its last
            axis is read, to declare the model's input length.
        vocab_size: Size of the embedding table (``input_dim``).
        num_classes: Number of units in the final softmax layer.

    Returns:
        A ``tf.keras.Sequential`` model compiled with the Adam optimizer,
        sparse categorical cross-entropy loss and an accuracy metric.
    """
    seq_len = X.shape[-1]
    model = tf.keras.Sequential([
        # Declare the input shape with an explicit Input layer; the
        # Embedding `input_length` argument is deprecated in Keras 3.
        tf.keras.Input(shape=(seq_len,)),
        tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=64),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(num_classes, activation="softmax")
    ])

    # Sparse loss: targets are integer class ids, not one-hot vectors.
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])
    return model

# Build the baseline network; its input length comes from the training matrix.
model = get_model(X_train, VOCAB_SIZE, num_classes=len(class_map))

# Write the model to MODEL_PATH only when the validation loss improves.
best_checkpoint = tf.keras.callbacks.ModelCheckpoint(
    MODEL_PATH,
    monitor="val_loss",
    save_best_only=True,
)
callbacks = [best_checkpoint]

# Train for EPOCHS epochs, evaluating on the held-out split after each one.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    callbacks=callbacks,
)

In [None]:
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by `model.fit`.
history_dict = history.history

loss, val_loss = history_dict['loss'], history_dict['val_loss']
accuracy, val_accuracy = history_dict['accuracy'], history_dict['val_accuracy']
epochs = range(1, len(loss) + 1)  # 1-based epoch numbers for the x-axis

# Two side-by-side panels: loss on the left, accuracy on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# (axes, train curve, validation curve, train label, validation label, title, y-axis label)
panels = (
    (ax1, loss, val_loss,
     'train_loss', 'val_loss',
     'Train and Validation Loss', 'Loss'),
    (ax2, accuracy, val_accuracy,
     'train_accuracy', 'val_accuracy',
     'Train and Validation Accuracy', 'Accuracy'),
)
for axis, train_curve, val_curve, train_label, val_label, title, y_label in panels:
    axis.plot(epochs, train_curve, color='blue', label=train_label)
    axis.plot(epochs, val_curve, color='red', label=val_label)
    axis.set_title(title)
    axis.set_xlabel('Epochs')
    axis.set_ylabel(y_label)
    axis.grid()
    axis.legend()

plt.show()

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import numpy as np

# Select the NanumBarunGothic font family for all subsequent matplotlib text.
# Presumably needed so the Hangul class names used as axis labels render
# correctly; the font must be installed in the runtime (verify).
plt.rc("font", family="NanumBarunGothic")

def plot_confusion_matrix(y_true, y_pred):
    """Plot a row-normalized confusion matrix labelled with the class names.

    Args:
        y_true: Ground-truth integer class labels.
        y_pred: Predicted integer class labels, aligned with ``y_true``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    class_names = list(class_map.keys())
    class_ids = list(class_map.values())

    # Pass the label ids explicitly so the matrix always has one row/column
    # per class, in the same order as the tick labels, even if some class is
    # missing from both y_true and y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=class_ids, normalize="true")
    _, ax = plt.subplots(figsize=(6, 6))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot(cmap="Blues", values_format=".2f", ax=ax, colorbar=False)
    plt.title("Normalized confusion matrix")
    plt.show()
    
# Reload the checkpoint saved at MODEL_PATH and score the validation split.
model = tf.keras.models.load_model(MODEL_PATH)
val_probabilities = model.predict(X_val)
y_pred = val_probabilities.argmax(axis=1)  # most probable class id per sample
plot_confusion_matrix(y_val, y_pred)

# Observations from the confusion matrix:
# The model identifies normal conversations ("일반 대화") well, but its
# performance on the remaining classes is disappointing.
# Threat ("협박"): confused with "갈취" (extortion) and "기타 괴롭힘" (other harassment)
# Extortion ("갈취"): confused with "협박" (threat)
# Workplace harassment ("직장 내 괴롭힘"): confused with "협박" (threat)
# Other harassment ("기타 괴롭힘"): confused with "협박" (threat)

## Submission

In [None]:
import json

def load_test_df(path=None):
    """Load the test conversations into a DataFrame.

    The JSON file must contain a single object that maps each file name to
    an object with a ``"text"`` entry holding the conversation.

    Args:
        path: Location of the test JSON file. When omitted,
            ``TEST_JSON_DATA_PATH`` is used.

    Returns:
        A DataFrame with the columns ``file_name`` and ``conversation``,
        one row per entry, in the order the entries appear in the file.

    Raises:
        KeyError: If an entry has no ``"text"`` field.
    """
    if path is None:
        path = TEST_JSON_DATA_PATH

    # Decode explicitly as UTF-8: the platform default encoding is not
    # guaranteed to handle non-ASCII (e.g. Korean) text.
    with open(path, encoding="utf-8") as file:
        test_json = json.load(file)

    return pd.DataFrame({
        "file_name": list(test_json),
        "conversation": [entry["text"] for entry in test_json.values()],
    })
    
# Read the test set into a DataFrame; the trailing bare expression makes the
# notebook display the frame as the cell's output.
test_df = load_test_df()
test_df

In [None]:
# Tokenization
# Encode the test conversations with the already-fitted tokenizer and bring
# them to SEQ_LEN, using the same post-padding as the training sequences.
test_sequences = pad_sequences(
    tokenizer.texts_to_sequences(test_df["conversation"]),
    maxlen=SEQ_LEN,
    padding="post",
)

# Prediction
# Score the test set with the checkpoint stored at MODEL_PATH.
model = tf.keras.models.load_model(MODEL_PATH)
predictions_prob = model.predict(test_sequences)
predictions = predictions_prob.argmax(axis=1)  # most probable class id per row

# Store the integer label and its human-readable class name.
test_df["class"] = predictions
test_df["class_str"] = test_df["class"].map(label_to_class.__getitem__)
test_df

In [None]:
import os

# Build the timestamped output path once and reuse it below.
submission_path = f"submissions/baseline_{NOW}.csv"

# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the target folder exists before writing.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)

print(f"submission file: {submission_path}")
# Only the file name and the integer class label go into the submission.
test_df[["file_name", "class"]].to_csv(submission_path, index=False)
# Read the file back so the notebook displays exactly what was written.
pd.read_csv(submission_path)