# Baseline Code

## 필요한 라이브러리 임포트

In [1]:
import os

import pandas as pd
import torch
import yaml
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

from callback.logger import log_message, log_target_distribution, LoggingCallback
from callback.early_stopping import EarlyStopping
from callback.checkpoint import save_model
from callback.save_results import save_submission

## 모델 정보 불러오기

In [2]:
# Resolve which model this run uses: model_name.yaml holds the key, and that
# key selects the per-model config file read below.
# The encoding is pinned to UTF-8 so that parsing does not depend on the OS
# locale (open() otherwise falls back to the platform default encoding).
with open("./config/model_name.yaml", "r", encoding="utf-8") as f:
    model_name_config = yaml.safe_load(f)
model_key = model_name_config["model_name"]

# Load the full experiment config for that model.
with open(f"./config/{model_key}.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

# Announce the run through the project's log_message helper.
log_message(f"Running experiment: {config['experiment_name']}", config["log_dir"])

[2025-05-02 03:44:42] Running experiment: kcbert_exp1


## 토크나이저, 모델 불러오기

In [3]:
# The tokenizer and the classifier are resolved from separate config keys.
tokenizer = AutoTokenizer.from_pretrained(config["tokenizer_name"])

# Sequence-classification model; the number of output labels comes from config.
model = AutoModelForSequenceClassification.from_pretrained(
    config["pretrained_model_name"], num_labels=config["num_labels"]
)

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/244k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

## 커스텀 데이터셋 클래스(토크나이징)

In [4]:
class ChatDataset(Dataset):
    """Dataset that tokenizes all texts once, when it is constructed.

    Args:
        texts: The inputs handed to ``tokenizer`` in a single call.
        labels: Indexable collection of labels; its length is the dataset length.
        tokenizer: Callable invoked with ``padding='max_length'``,
            ``truncation=True``, ``max_length=max_len`` and
            ``return_tensors="pt"``; its result must support ``.items()``.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        tokenize_options = {
            "padding": 'max_length',
            "truncation": True,
            "max_length": max_len,
            "return_tensors": "pt",
        }
        self.encodings = tokenizer(texts, **tokenize_options)
        self.labels = labels

    def __len__(self):
        """Return the number of labels held by the dataset."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return row ``idx`` of every encoding field plus a ``labels`` tensor."""
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx]
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

## 데이터 불러오기

In [5]:
# Read the training CSV and pull the text and label columns out as plain lists.
train_df = pd.read_csv(config["train_file"])
texts, labels = train_df["clean_text"].tolist(), train_df["label"].tolist()

# Tokenized view of the complete (unsplit) training set.
# NOTE(review): no later cell visible in this notebook reads `dataset`; the
# train/validation datasets are built separately after the split.
dataset = ChatDataset(texts, labels, tokenizer, config["max_length"])

In [6]:
# Hold out 20% of the rows for validation. The split is stratified on the
# labels so both parts keep the same class proportions, and random_state is
# fixed so the same rows land in each part on every run.
# (train_test_split is imported in the notebook's top import cell.)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, stratify=labels, random_state=42
)

# Each part is tokenized independently with the shared tokenizer and max length.
train_dataset = ChatDataset(train_texts, train_labels, tokenizer, config["max_length"])
val_dataset = ChatDataset(val_texts, val_labels, tokenizer, config["max_length"])

In [7]:
# Per-class row counts of the training split, ordered by label value.
label_counts = pd.Series(train_labels).value_counts().sort_index()

# Heading and counts go out on consecutive lines, as two separate prints would.
print("📊 train label 클래스 분포:", label_counts, sep="\n")

📊 train label 클래스 분포:
0     717
1     785
2     783
3     875
4    1388
dtype: int64


In [8]:
# Per-class row counts of the validation split, ordered by label value.
# Note: this rebinds `label_counts`, replacing the training-split counts.
label_counts = pd.Series(val_labels).value_counts().sort_index()

# Heading and counts go out on consecutive lines, as two separate prints would.
print("📊 validation label 클래스 분포:", label_counts, sep="\n")

📊 validation label 클래스 분포:
0    179
1    196
2    196
3    219
4    348
dtype: int64


## 메트릭 함수

In [9]:
def compute_metrics(pred):
    """Score predictions with accuracy and macro-averaged F1.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (scores whose last axis is arg-maxed to obtain the
            predicted class).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1_macro"``.
    """
    y_true = pred.label_ids
    # The predicted class is the index of the highest score along the last axis.
    y_pred = pred.predictions.argmax(-1)

    accuracy = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    return {"accuracy": accuracy, "f1_macro": macro_f1}

## 모델 준비

In [10]:
# Run settings are read from the YAML config; only the best-model selection and
# the logging cadence are fixed in this cell.
training_args = TrainingArguments(
    # Output locations and checkpoint retention.
    output_dir=config["output_dir"],
    logging_dir=config["log_dir"],
    save_total_limit=config["save_total_limit"],
    # Training length and batch size.
    num_train_epochs=config["epochs"],
    per_device_train_batch_size=config["batch_size"],
    # Optimizer settings. float() converts the configured value explicitly, in
    # case the YAML loader delivered it as a string (e.g. written as 2e-5).
    learning_rate=float(config["learning_rate"]),
    weight_decay=config["weight_decay"],
    warmup_ratio=config["warmup_ratio"],
    # Evaluation and checkpoint cadence.
    evaluation_strategy=config["eval_strategy"],
    save_strategy=config["save_strategy"],
    # Reload the best checkpoint at the end, ranked by the "f1_macro" value
    # that compute_metrics returns.
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    # Log every 50 optimization steps.
    logging_strategy="steps",
    logging_steps=50,
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # Early stopping with patience 2, plus the project's LoggingCallback.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2), LoggingCallback()],
)

## 모델 학습

In [None]:
# Run fine-tuning with the Trainer built in the previous cell.
trainer.train()
# Hand the in-memory model to the project's save_model helper.
# NOTE(review): the epoch passed here is the configured epoch count, not the
# epoch training actually reached -- the EarlyStoppingCallback may stop the run
# sooner. Confirm how save_model uses this value.
save_model(model, config["output_dir"], epoch=config["epochs"])

***** Running training *****
  Num examples = 4548
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 429


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.3883,0.228812,0.920914,0.909012
2,0.16,0.214796,0.924429,0.913642


***** Running Evaluation *****
  Num examples = 1138
  Batch size = 8


[2025-05-02 03:47:48] Epoch 01 | Train Loss: 0.0000 | Val Loss: 0.2288 | Val Acc: 0.9209 | Val F1(Macro): 0.9090


Saving model checkpoint to ../ckpoints/kcbert/checkpoint-143
Configuration saved in ../ckpoints/kcbert/checkpoint-143/config.json
Model weights saved in ../ckpoints/kcbert/checkpoint-143/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1138
  Batch size = 8


[2025-05-02 03:50:55] Epoch 02 | Train Loss: 0.0000 | Val Loss: 0.2148 | Val Acc: 0.9244 | Val F1(Macro): 0.9136


Saving model checkpoint to ../ckpoints/kcbert/checkpoint-286
Configuration saved in ../ckpoints/kcbert/checkpoint-286/config.json
Model weights saved in ../ckpoints/kcbert/checkpoint-286/pytorch_model.bin


## 테스트

In [None]:
# 1. Device setup: use the GPU when available and put the model in eval mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# 2. Tokenize the test texts. The encodings stay on the CPU; only one
#    mini-batch at a time is moved to the model's device below.
test_df = pd.read_csv(config["test_file"])
test_encodings = tokenizer(
    test_df["clean_text"].tolist(),
    padding="max_length",
    truncation=True,
    max_length=config["max_length"],
    return_tensors="pt"
)

# 3. Predict in mini-batches. Sending the entire test set through the model in
#    a single forward pass can exhaust device memory, so the rows are processed
#    in slices of the configured training batch size instead.
inference_batch_size = config["batch_size"]
num_test_rows = len(test_df)
batch_preds = []
with torch.no_grad():
    for start in range(0, num_test_rows, inference_batch_size):
        stop = start + inference_batch_size
        # Inputs must live on the same device as the model.
        batch = {k: v[start:stop].to(device) for k, v in test_encodings.items()}
        logits = model(**batch).logits
        batch_preds.append(logits.argmax(dim=-1).cpu())

# Concatenate per-batch class ids into one NumPy array (empty if no rows ran).
if batch_preds:
    preds = torch.cat(batch_preds).numpy()
else:
    preds = torch.empty(0, dtype=torch.long).numpy()

In [None]:
# Pass the predictions to the project's save_submission helper, together with
# the submission.csv under ../data/original_data and the configured output path.
# NOTE(review): presumably the helper fills that file's layout with `preds` and
# writes the result to config["submission_file"] -- confirm in callback/save_results.
save_submission(preds, "../data/original_data/submission.csv", config["submission_file"])
# Record the end of the run through the project's log_message helper.
log_message("Experiment complete.", config["log_dir"])

In [None]:
# Read the written submission file back in.
df = pd.read_csv(config["submission_file"])

# Count how often each class appears in the "target" column, ordered by class.
target_counts = df["target"].value_counts().sort_index()

# Heading and counts go out on consecutive lines, as two separate prints would.
print("📊 Target 클래스 분포:", target_counts, sep="\n")

In [None]:
# Pass the submission frame and the log directory to the project's
# log_target_distribution helper (imported from callback.logger).
log_target_distribution(df, config["log_dir"])