In [1]:
import json
import pandas as pd
from datasets import Dataset
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr
import numpy as np





In [2]:
# 파인튜닝 데이터셋 읽어오기

FILE_PATH = "../fine-tuning-data/KLUE-sts"

def extract_essential_fields(data):
    rows = []
    for item in data:
        rows.append({
            "sentence1": item["sentence1"],
            "sentence2": item["sentence2"],
            "labels": float(item["labels"]["label"])
        })
    return pd.DataFrame(rows)

with open(f"{FILE_PATH}/klue-sts-v1.1_train.json", encoding='utf-8') as f:
    train_data = json.load(f)

with open(f"{FILE_PATH}/klue-sts-v1.1_dev.json", encoding='utf-8') as f:
    val_data = json.load(f)

train_df = extract_essential_fields(train_data)
val_df = extract_essential_fields(val_data)

train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [3]:
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")
model = AutoModelForSequenceClassification.from_pretrained(
    "klue/roberta-base",
    num_labels=1 
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
def preprocess(example):
    return tokenizer(
        example["sentence1"],
        example["sentence2"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

train_dataset = train_dataset.map(preprocess, batched=True)
val_dataset = val_dataset.map(preprocess, batched=True)

Map:   0%|          | 0/11668 [00:00<?, ? examples/s]

Map:   0%|          | 0/519 [00:00<?, ? examples/s]

In [5]:
# Trainer가 학습에 쓸 수 있도록 label 설정
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [6]:
def compute_metrics(pred):
    preds, labels = pred
    preds = preds.flatten()
    labels = labels.flatten()
    
    pearson = pearsonr(preds, labels)[0]
    mse = mean_squared_error(labels, preds)
    mae = mean_absolute_error(labels, preds)

    return {
        'pearson': pearson,
        'mse': mse,
        'mae': mae
    }


In [7]:
training_args = TrainingArguments(
    output_dir="C:/results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="pearson",
    save_total_limit=1
)




In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [9]:
# 1. 학습 전 평가
print("📊 학습 전 평가 결과:")
trainer.evaluate()

# 2. 학습
trainer.train()

# 3. 학습 후 평가
print("📊 학습 후 평가 결과:")
trainer.evaluate()


📊 학습 전 평가 결과:


Epoch,Training Loss,Validation Loss,Model Preparation Time,Pearson,Mse,Mae
1,0.524,0.519043,0.0,0.903155,0.519043,0.529355
2,0.1767,0.379416,0.0,0.922982,0.379416,0.451566
3,0.1074,0.487042,0.0,0.914861,0.487042,0.519524
4,0.0859,0.310401,0.0,0.926843,0.310401,0.421881
5,0.0567,0.402473,0.0,0.927653,0.402473,0.465382
6,0.0498,0.32285,0.0,0.932446,0.32285,0.41943
7,0.037,0.368098,0.0,0.925525,0.368098,0.452963
8,0.0319,0.320194,0.0,0.932796,0.320194,0.415787
9,0.025,0.371158,0.0,0.929914,0.371158,0.45389
10,0.0225,0.337652,0.0,0.932425,0.337652,0.431542


📊 학습 후 평가 결과:


{'eval_loss': 0.32019418478012085,
 'eval_model_preparation_time': 0.0,
 'eval_pearson': 0.9327959280897391,
 'eval_mse': 0.3201942443847656,
 'eval_mae': 0.41578707098960876,
 'eval_runtime': 1.2293,
 'eval_samples_per_second': 422.179,
 'eval_steps_per_second': 26.844,
 'epoch': 10.0}

In [10]:
predictions = trainer.predict(val_dataset)
preds = predictions.predictions.squeeze()
labels = predictions.label_ids.squeeze()

# 예시: 판다스로 저장
import pandas as pd

df_result = pd.DataFrame({
    "sentence1": val_df["sentence1"],
    "sentence2": val_df["sentence2"],
    "gold_label": labels,
    "predicted": preds
})

df_result.to_csv("klue-sts-predictions.csv", index=False)
