In [None]:
import torch
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import classification_report, mean_squared_error
import numpy as np
import json

In [None]:
# Mount Google Drive at /content/drive so that the files referenced under
# /content/drive/MyDrive by the later cells can be read.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Path configuration
# model_path: checkpoint file that is later read with torch.load.
# data_path: validation set that is later parsed with json.load.
model_path = "/content/drive/MyDrive/koBERT_model/kobert_model.pth"
data_path = "/content/drive/MyDrive/validation_immoral.json"

In [None]:
# Dataset class definition
class KoBERTDataset(Dataset):
    """Pairs each text with its label dict and tokenizes on access.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of mappings with the keys
            "is_immoral" and "intensity", aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors='pt')``
            whose result is indexed with 'input_ids' and 'attention_mask'.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of samples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors.

        The dict holds 'input_ids' and 'attention_mask' (tokenizer outputs with
        their leading dimension squeezed away) plus 'is_immoral' and
        'intensity' as float tensors built from the label mapping.
        """
        sentence = self.texts[idx]
        target = self.labels[idx]
        encoded = self.tokenizer(
            sentence,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading dimension per call; drop it so
        # that the DataLoader can stack samples into a batch itself.
        sample = {
            field: encoded[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        for field in ('is_immoral', 'intensity'):
            sample[field] = torch.tensor(target[field], dtype=torch.float)
        return sample

In [None]:
# Model definition
class CustomKoBERT(torch.nn.Module):
    """BERT encoder with two single-output heads on the pooled output.

    ``forward`` returns a raw, unactivated score for the immoral / not-immoral
    decision together with an intensity value mapped into the 1–3 range.
    """

    def __init__(self, bert_model_name):
        """Create the encoder and both linear heads.

        Args:
            bert_model_name: Identifier passed to ``BertModel.from_pretrained``.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        # Size the heads from the encoder configuration instead of assuming a
        # 768-wide pooled output, so encoders of other widths also work.
        hidden_size = self.bert.config.hidden_size
        self.fc_is_immoral = torch.nn.Linear(hidden_size, 1)
        self.fc_intensity = torch.nn.Linear(hidden_size, 1)  # intensity

    def forward(self, input_ids, attention_mask):
        """Compute both head outputs for a batch.

        Args:
            input_ids: Token id tensor forwarded to the encoder
                (assumes shape (batch, seq_len) — TODO confirm).
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            Tuple ``(is_immoral, intensity)``; each head's trailing size-1
            dimension is squeezed away. ``is_immoral`` is the raw linear
            output, ``intensity`` is ``1 + 2 * sigmoid(raw)``.
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        is_immoral = self.fc_is_immoral(pooled_output).squeeze(-1)
        intensity = self.fc_intensity(pooled_output).squeeze(-1)
        intensity = 1 + 2 * torch.sigmoid(intensity)  # constrain intensity to the 1–3 range
        return is_immoral, intensity

In [None]:
# Load the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CustomKoBERT("monologg/kobert")

# Load the .pth file
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True if the installed torch supports it).
state_dict = torch.load(model_path, map_location=device)

# strict=False tolerates key mismatches between checkpoint and model. Report
# them instead of discarding the result: parameters listed as missing keep
# their initial values, which would make the evaluation below meaningless.
load_result = model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys:
    print(f"[warning] parameters not found in checkpoint: {load_result.missing_keys}")
if load_result.unexpected_keys:
    print(f"[warning] checkpoint entries ignored by the model: {load_result.unexpected_keys}")
model.to(device)

In [None]:
# Load the validation data
# The file is iterated as a sequence of records, each read through the keys
# 'text', 'is_immoral' and 'intensity'.
with open(data_path, mode="r", encoding="utf-8") as file:
    validation_data = json.loads(file.read())

# Input sentences, in file order
texts = [record['text'] for record in validation_data]

# One label dict per record:
#   is_immoral -> 1 when the record's flag is truthy, otherwise 0
#   intensity  -> the record's intensity clamped to [1, 3] for immoral records,
#                 0 for records that are not immoral
labels = [
    {
        "is_immoral": int(bool(record['is_immoral'])),
        "intensity": max(1, min(3, record['intensity'])) if record['is_immoral'] else 0,
    }
    for record in validation_data
]

In [None]:
# Define `max_len` and `batch_size`
max_len = 64     # forwarded to the tokenizer as max_length (padding / truncation target)
batch_size = 16  # number of samples per evaluation batch

# NOTE(review): the tokenizer here should match the one used when the
# checkpoint was trained — confirm against the training notebook.
validation_dataset = KoBERTDataset(
    texts=texts,
    labels=labels,
    tokenizer=BertTokenizer.from_pretrained("monologg/kobert"),
    max_len=max_len,
)

# shuffle=False keeps the batches in the same order as `texts` / `labels`.
validation_loader = DataLoader(
    dataset=validation_dataset,
    batch_size=batch_size,
    shuffle=False,
)

In [None]:
# Evaluation function
def evaluate_model(model, data_loader, device):
    """Run ``model`` over every batch and gather predictions and labels.

    Args:
        model: Called as ``model(input_ids, attention_mask)`` and expected to
            return an ``(is_immoral_score, intensity)`` pair of tensors. It is
            switched to eval mode here.
        data_loader: Iterable of batches, each indexed with the keys
            'input_ids', 'attention_mask', 'is_immoral' and 'intensity'.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        Tuple of four NumPy arrays, in this order: predicted is_immoral flags
        (1 where the score is > 0), predicted intensities, true is_immoral
        flags, true intensities.
    """
    model.eval()
    pred_flags, pred_levels = [], []
    gold_flags, gold_levels = [], []

    batch_fields = ('input_ids', 'attention_mask', 'is_immoral', 'intensity')

    # Gradients are not needed for evaluation
    with torch.no_grad():
        for batch in data_loader:
            ids, mask, gold_flag, gold_level = (
                batch[field].to(device) for field in batch_fields
            )

            scores, levels = model(ids, mask)

            # A strictly positive raw score is counted as "immoral"
            pred_flags.extend((scores > 0).long().cpu().numpy())
            pred_levels.extend(levels.cpu().numpy())

            # Ground truth, converted the same way as the predictions
            gold_flags.extend(gold_flag.long().cpu().numpy())
            gold_levels.extend(gold_level.cpu().numpy())

    collected = (pred_flags, pred_levels, gold_flags, gold_levels)
    return tuple(np.array(values) for values in collected)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Run inference over the validation set. Without this call the arrays used
# below are never defined and the cell fails with NameError on a fresh kernel
# (Restart & Run All).
preds_is_immoral, preds_intensity, true_is_immoral, true_intensity = evaluate_model(
    model, validation_loader, device
)

# NOTE(review): labels use intensity 0 for non-immoral samples while the model
# output is bounded to the 1–3 range, so those samples always contribute an
# error of about 1 or more — confirm that whole-dataset metrics are intended.

# Per-sample absolute error of the intensity prediction
errors = np.abs(true_intensity - preds_intensity)

tolerance = 1.0  # allowed error margin

# Share of samples whose absolute error lies within the tolerance
accurate_predictions = (errors <= tolerance).sum()
total_predictions = len(errors)
accuracy = accurate_predictions / total_predictions

# MSE and MAE
mse = mean_squared_error(true_intensity, preds_intensity)
mae = mean_absolute_error(true_intensity, preds_intensity)

print(f"전체 데이터에서 허용 오차(±{tolerance}) 내에서 정확히 예측한 비율: {accuracy * 100:.2f}%")
print(f"전체 데이터의 Mean Squared Error (MSE): {mse:.4f}")
print(f"전체 데이터의 Mean Absolute Error (MAE): {mae:.4f}")