# Baseline Code

## 필요한 라이브러리 임포트

In [1]:
import os

import pandas as pd
import torch
import yaml
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

from callback.logger import log_message, log_target_distribution, LoggingCallback
from callback.early_stopping import EarlyStopping
from callback.checkpoint import save_model
from callback.save_results import save_submission

## 모델 정보 불러오기

In [2]:
# Resolve which experiment to run: model_name.yaml only holds the key that
# selects the per-model config file loaded below.
# The encoding is pinned to UTF-8 so the YAML is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open("./config/model_name.yaml", "r", encoding="utf-8") as f:
    model_name_config = yaml.safe_load(f)
model_key = model_name_config["model_name"]

# Load the full config (paths, hyperparameters, names) for the selected model.
with open(f"./config/{model_key}.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Record which experiment this run belongs to via the project's logger helper.
log_message(f"Running experiment: {config['experiment_name']}", config["log_dir"])

[2025-05-02 04:31:47] Running experiment: krbert_exp1


## 토크나이저, 모델 구현

In [3]:
# Both the tokenizer and the classifier are resolved from names stored in the
# experiment config.
tokenizer = AutoTokenizer.from_pretrained(config["tokenizer_name"])

# num_labels is forwarded to from_pretrained together with the checkpoint name.
# The recorded cell output shows the checkpoint's pretraining-head weights are
# not used when the sequence-classification model is initialized.
model = AutoModelForSequenceClassification.from_pretrained(
    config["pretrained_model_name"], num_labels=config["num_labels"]
)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/389 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/101k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/379M [00:00<?, ?B/s]

Some weights of the model checkpoint at snunlp/KR-BERT-char16424 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkp

## 커스텀 데이터셋 클래스(토크나이징)

In [4]:
class ChatDataset(Dataset):
    """Map-style dataset that tokenizes every text up front and pairs it with its label.

    Args:
        texts: Sequence of input strings, one per example.
        labels: Sequence of labels aligned positionally with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(texts, padding='max_length',
            truncation=True, max_length=max_len, return_tensors="pt")``; it must
            return a mapping whose values can be indexed by example position.
        max_len: Length every example is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialize both inputs as plain lists so that lookups in
        # __getitem__ are always positional. A pandas Series with a
        # non-default index (e.g. after a shuffle/split) would otherwise be
        # indexed by label and return the wrong element or raise KeyError.
        texts = list(texts)
        labels = list(labels)
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )

        # All examples are tokenized once here; __getitem__ only slices.
        self.encodings = tokenizer(texts, padding='max_length', truncation=True,
                                   max_length=max_len, return_tensors="pt")
        self.labels = labels

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded fields of example ``idx`` plus its label under ``"labels"``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

## 데이터 불러오기

In [5]:
# Read the training CSV: `clean_text` is the model input column and `label`
# is the target column.
train_df = pd.read_csv(config["train_file"])
texts, labels = train_df["clean_text"].tolist(), train_df["label"].tolist()

# Tokenized view of the complete (unsplit) training set.
# NOTE(review): no later cell visible in this notebook reads `dataset`; confirm
# whether it is still needed, since building it tokenizes every row.
dataset = ChatDataset(
    texts=texts,
    labels=labels,
    tokenizer=tokenizer,
    max_len=config["max_length"],
)

In [6]:
# Hold out part of the labelled data for validation. The split is stratified on
# the labels and seeded, so it is reproducible across runs.
# Ratio and seed can be overridden from the experiment config; the fallbacks
# are the values this cell previously hard-coded.
val_split_ratio = config.get("val_split_ratio", 0.2)
split_seed = config.get("split_seed", 42)

train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=val_split_ratio, stratify=labels, random_state=split_seed
)

# Each partition is tokenized independently with the same max length.
train_dataset = ChatDataset(train_texts, train_labels, tokenizer, config["max_length"])
val_dataset = ChatDataset(val_texts, val_labels, tokenizer, config["max_length"])

In [7]:
# Per-class example counts of the training split, ordered by label id.
label_counts = pd.Series(train_labels).value_counts().sort_index()

# The message literal had been corrupted by a wrong-encoding round trip; it is
# restored to the intended UTF-8 text.
print("📊 train label 클래스 분포:")
print(label_counts)

📊 train label 클래스 분포:
0     717
1     785
2     783
3     875
4    1388
dtype: int64


In [8]:
# Per-class example counts of the validation split, ordered by label id.
# Note that this rebinds `label_counts`, which previously held the train counts.
label_counts = pd.Series(val_labels).value_counts().sort_index()

# The message literal had been corrupted by a wrong-encoding round trip; it is
# restored to the intended UTF-8 text.
print("📊 validation label 클래스 분포:")
print(label_counts)

📊 validation label 클래스 분포:
0    179
1    196
2    196
3    219
4    348
dtype: int64


## 매트릭 함수

In [9]:
def compute_metrics(pred):
    """Score one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores with the class
            axis last) and ``label_ids`` (the reference labels).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1_macro"``.
    """
    y_true = pred.label_ids
    # The predicted class is the index of the highest score along the last axis.
    y_pred = pred.predictions.argmax(-1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(y_true, y_pred)
    metrics["f1_macro"] = f1_score(y_true, y_pred, average="macro")
    return metrics

## 모델 준비

In [10]:
# Training hyperparameters are read from the experiment config.
training_args = TrainingArguments(
    output_dir=config["output_dir"],
    num_train_epochs=config["epochs"],
    per_device_train_batch_size=config["batch_size"],
    # Without this, evaluation silently falls back to the library default
    # batch size (the recorded run log shows 8) instead of the configured one.
    per_device_eval_batch_size=config.get("eval_batch_size", config["batch_size"]),
    evaluation_strategy=config["eval_strategy"],
    save_strategy=config["save_strategy"],
    # NOTE(review): float() presumably guards against the value being read from
    # YAML as a string (e.g. `5e-5` written without a decimal point) — confirm.
    learning_rate=float(config["learning_rate"]),
    weight_decay=config["weight_decay"],
    warmup_ratio=config["warmup_ratio"],
    logging_dir=config["log_dir"],
    save_total_limit=config["save_total_limit"],
    # Reload the checkpoint with the best validation macro-F1 once training ends.
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    # Stated explicitly: a higher macro-F1 is better.
    greater_is_better=True,
    logging_strategy="steps",
    logging_steps=config.get("logging_steps", 50),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[
        # Patience can be overridden from the config; 2 was the previously
        # hard-coded value.
        EarlyStoppingCallback(
            early_stopping_patience=config.get("early_stopping_patience", 2)
        ),
        LoggingCallback(),
    ]
)

## 모델 학습

In [11]:
# Fine-tune the model; evaluation, checkpointing and early stopping follow the
# TrainingArguments and callbacks configured on `trainer`.
trainer.train()

# Persist the model through the project's checkpoint helper.
# NOTE(review): the epoch passed here is the configured epoch count, not the
# epoch at which training actually stopped — confirm that is what save_model
# expects.
save_model(
    model,
    config["output_dir"],
    epoch=config["epochs"],
)

***** Running training *****
  Num examples = 4548
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 429


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.5614,0.275224,0.911248,0.897271
2,0.2226,0.230191,0.919156,0.907573
3,0.1333,0.230144,0.92355,0.912407


***** Running Evaluation *****
  Num examples = 1138
  Batch size = 8


[2025-05-02 04:34:56] Epoch 01 | Train Loss: 0.0000 | Val Loss: 0.2752 | Val Acc: 0.9112 | Val F1(Macro): 0.8973


Saving model checkpoint to ../ckpoints/krbert/checkpoint-143
Configuration saved in ../ckpoints/krbert/checkpoint-143/config.json
Model weights saved in ../ckpoints/krbert/checkpoint-143/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1138
  Batch size = 8


[2025-05-02 04:38:06] Epoch 02 | Train Loss: 0.0000 | Val Loss: 0.2302 | Val Acc: 0.9192 | Val F1(Macro): 0.9076


Saving model checkpoint to ../ckpoints/krbert/checkpoint-286
Configuration saved in ../ckpoints/krbert/checkpoint-286/config.json
Model weights saved in ../ckpoints/krbert/checkpoint-286/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1138
  Batch size = 8


[2025-05-02 04:41:19] Epoch 03 | Train Loss: 0.0000 | Val Loss: 0.2301 | Val Acc: 0.9236 | Val F1(Macro): 0.9124


Saving model checkpoint to ../ckpoints/krbert/checkpoint-429
Configuration saved in ../ckpoints/krbert/checkpoint-429/config.json
Model weights saved in ../ckpoints/krbert/checkpoint-429/pytorch_model.bin
Deleting older checkpoint [../ckpoints/krbert/checkpoint-143] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ../ckpoints/krbert/checkpoint-429 (score: 0.9124074846768455).


## 테스트

In [12]:
# 1. Select the device and put the model on it in inference mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# 2. Tokenize the test set with the same padding / truncation settings that
#    were used for training. The encodings stay on the CPU; only one
#    mini-batch at a time is moved to the device below.
test_df = pd.read_csv(config["test_file"])
test_encodings = tokenizer(
    test_df["clean_text"].tolist(),
    padding="max_length",
    truncation=True,
    max_length=config["max_length"],
    return_tensors="pt"
)

# 3. Predict in mini-batches. Pushing the whole test set through the model in
#    a single forward pass makes peak memory grow with the test-set size and
#    can exhaust device memory.
eval_batch_size = int(config.get("eval_batch_size", config["batch_size"]))
num_test_examples = test_encodings["input_ids"].size(0)

batch_preds = []
with torch.no_grad():
    for start in range(0, num_test_examples, eval_batch_size):
        stop = start + eval_batch_size
        # Input tensors must live on the same device as the model.
        batch = {k: v[start:stop].to(device) for k, v in test_encodings.items()}
        outputs = model(**batch)
        batch_preds.append(outputs.logits.argmax(dim=-1).cpu())

# Concatenate per-batch predictions into one array; fall back to an empty
# array when there were no examples, since torch.cat rejects an empty list.
if batch_preds:
    preds = torch.cat(batch_preds).numpy()
else:
    preds = torch.empty(0, dtype=torch.long).numpy()

In [13]:
# Write the predictions out through the project's submission helper.
# NOTE(review): the second argument looks like the path of a submission
# template and the third the destination — confirm against save_submission.
save_submission(
    preds,
    "../data/original_data/submission.csv",
    config["submission_file"],
)

# Mark the end of the run in the experiment log.
log_message("Experiment complete.", config["log_dir"])

[2025-05-02 04:41:55] Experiment complete.


In [14]:
# Reload the submission file that was just written.
df = pd.read_csv(filepath_or_buffer=config["submission_file"])

# Log the loaded submission's class distribution to the experiment log.
log_target_distribution(df, config["log_dir"])

[2025-05-02 04:41:55] 📊 Target 클래스 분포:
[2025-05-02 04:41:55] Label 0: 106
[2025-05-02 04:41:56] Label 1: 117
[2025-05-02 04:41:56] Label 2: 124
[2025-05-02 04:41:56] Label 3: 146
[2025-05-02 04:41:57] Label 4: 7
