# Baseline Code

## 필요한 라이브러리 임포트

In [21]:
import yaml
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.metrics import f1_score, accuracy_score
from torch.utils.data import Dataset
import torch
import os

from callback.logger import log_message
from callback.early_stopping import EarlyStopping
from callback.checkpoint import save_model
from callback.save_results import save_submission

## 모델 정보 불러오기

In [26]:
# Load model name
with open("./config/model_name.yaml", "r") as f:
    model_name_config = yaml.safe_load(f)
model_key = model_name_config["model_name"]

# Load full config for that model
with open(f"./config/{model_key}.yaml", "r") as f:
    config = yaml.safe_load(f)

log_message(f"Running experiment: {config['experiment_name']}", config["log_dir"])

[2025-05-02 02:38:05] Running experiment: klue_roberta_exp1


## 토크나이저, 모델 구현

In [8]:
tokenizer = AutoTokenizer.from_pretrained(config["tokenizer_name"])
model = AutoModelForSequenceClassification.from_pretrained(
    config["pretrained_model_name"],
    num_labels=config["num_labels"]
)

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classif

## 커스텀 데이터셋 클래스(토크나이징)

In [9]:
class ChatDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.encodings = tokenizer(texts, padding='max_length', truncation=True,
                                   max_length=max_len, return_tensors="pt")
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

## 데이터 불러오기

In [10]:
train_df = pd.read_csv(config["train_file"])
texts = train_df["clean_text"].tolist()
labels = train_df["label"].tolist()

dataset = ChatDataset(texts, labels, tokenizer, config["max_length"])

In [16]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, stratify=labels, random_state=42
)

train_dataset = ChatDataset(train_texts, train_labels, tokenizer, config["max_length"])
val_dataset = ChatDataset(val_texts, val_labels, tokenizer, config["max_length"])

In [36]:
label_counts = pd.Series(train_labels).value_counts().sort_index()

print("📊 train label 클래스 분포:")
print(label_counts)

📊 train label 클래스 분포:
0     717
1     785
2     783
3     875
4    1388
dtype: int64


In [37]:
label_counts = pd.Series(val_labels).value_counts().sort_index()

print("📊 validation label 클래스 분포:")
print(label_counts)

📊 validation label 클래스 분포:
0    179
1    196
2    196
3    219
4    348
dtype: int64


## 매트릭 함수

In [11]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro")
    }

## 모델 준비

In [19]:
training_args = TrainingArguments(
    output_dir=config["output_dir"],
    num_train_epochs=config["epochs"],
    per_device_train_batch_size=config["batch_size"],
    evaluation_strategy=config["eval_strategy"],
    save_strategy=config["save_strategy"],
    learning_rate=float(config["learning_rate"]),
    weight_decay=config["weight_decay"],
    warmup_ratio=config["warmup_ratio"],
    logging_dir=config["log_dir"],
    save_total_limit=config["save_total_limit"],
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


## 모델 학습

In [20]:
trainer.train()
save_model(model, config["output_dir"], epoch=config["epochs"])

***** Running training *****
  Num examples = 4548
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 429


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,No log,0.188305,0.947276,0.939119
2,No log,0.169094,0.953427,0.946375
3,No log,0.179714,0.952548,0.945795


***** Running Evaluation *****
  Num examples = 1138
  Batch size = 8
Saving model checkpoint to ../ckpoint/klue_roberta/checkpoint-143
Configuration saved in ../ckpoint/klue_roberta/checkpoint-143/config.json
Model weights saved in ../ckpoint/klue_roberta/checkpoint-143/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1138
  Batch size = 8
Saving model checkpoint to ../ckpoint/klue_roberta/checkpoint-286
Configuration saved in ../ckpoint/klue_roberta/checkpoint-286/config.json
Model weights saved in ../ckpoint/klue_roberta/checkpoint-286/pytorch_model.bin
Deleting older checkpoint [../ckpoint/klue_roberta/checkpoint-178] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 1138
  Batch size = 8
Saving model checkpoint to ../ckpoint/klue_roberta/checkpoint-429
Configuration saved in ../ckpoint/klue_roberta/checkpoint-429/config.json
Model weights saved in ../ckpoint/klue_roberta/checkpoint-429/pytorch_model.bin
Deleting older checkpoint [../ckpo

## 테스트

In [24]:
# 1. 디바이스 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# 2. 입력 텐서도 같은 디바이스로 이동
test_df = pd.read_csv(config["test_file"])
test_encodings = tokenizer(
    test_df["clean_text"].tolist(),
    padding="max_length",
    truncation=True,
    max_length=config["max_length"],
    return_tensors="pt"
)

# ⬇️ 여기가 핵심
test_encodings = {k: v.to(device) for k, v in test_encodings.items()}

# 3. 예측
with torch.no_grad():
    outputs = model(**test_encodings)
    preds = outputs.logits.argmax(dim=-1).cpu().numpy()

In [31]:
save_submission(preds, "../data/original_data/submission.csv", config["submission_file"])
log_message("Experiment complete.", config["log_dir"])

[2025-05-02 02:46:19] Experiment complete.


In [32]:
# 파일 로딩
df = pd.read_csv(config["submission_file"])

# target 컬럼의 분포 확인
target_counts = df["target"].value_counts().sort_index()

# 출력
print("📊 Target 클래스 분포:")
print(target_counts)

📊 Target 클래스 분포:
0    114
1    100
2    111
3    164
4     11
Name: target, dtype: int64
