In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer

dataset = load_dataset("csv", data_files="final_set.csv")
tokenizer = AutoTokenizer.from_pretrained("monologg/koelectra-base-discriminator")

def tokenize_fn(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize_fn, batched=True)
tokenized_dataset = tokenized_dataset["train"].train_test_split(test_size=0.2, seed=42)


In [2]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "monologg/koelectra-base-discriminator", num_labels=2
)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

training_args = TrainingArguments(
    output_dir="./koelectra-romance-scam",
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=12,
    num_train_epochs=10,
    logging_dir="./logs"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [4]:
import torch
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

In [5]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=400, training_loss=0.03800376653671265, metrics={'train_runtime': 476.9078, 'train_samples_per_second': 10.065, 'train_steps_per_second': 0.839, 'total_flos': 1262933065728000.0, 'train_loss': 0.03800376653671265, 'epoch': 10.0})

In [9]:
trainer.save_model("./koelectra-romance-scam")
tokenizer.save_pretrained("./koelectra-romance-scam")
predictions = trainer.predict(tokenized_dataset["test"])


In [10]:
trainer.evaluate()
predictions = trainer.predict(tokenized_dataset["test"])
from sklearn.metrics import classification_report
labels = predictions.label_ids
preds = np.argmax(predictions.predictions, axis=1)
print(classification_report(labels, preds, digits=4))

              precision    recall  f1-score   support

           0     0.9655    1.0000    0.9825        56
           1     1.0000    0.9688    0.9841        64

    accuracy                         0.9833       120
   macro avg     0.9828    0.9844    0.9833       120
weighted avg     0.9839    0.9833    0.9833       120



In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

model_path = "./koelectra-romance-scam"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

def predict_romance_scam(text: str):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        probs = F.softmax(outputs.logits, dim=1)
        pred = torch.argmax(probs, dim=1).item()
    label = "로맨스 스캠" if pred == 1 else "정상 대화"
    return label

text = """안녕하세요, 저는 미군 장교로 현재 시리아에 파병 중입니다. 우연히 당신의 프로필을 보고 메시지를 보냅니다. 
너무 인상 깊었습니다. [SEP] 안녕하세요. 시리아에 계시다니, 존경스러워요!!! [SEP] 사실 하나 말씀드릴 게 있어요."""
label = predict_romance_scam(text)
print(f"예측: {label}")

In [35]:
from transformers.trainer_callback import TrainerCallback
import os
import json

class LossRecorderCallback(TrainerCallback):
    def __init__(self):
        self.train_loss = []
        self.eval_loss = []

    def on_log(self, args, state, control, logs=None, **kwargs):
        if "loss" in logs:
            self.train_loss.append(logs["loss"])
        if "eval_loss" in logs:
            self.eval_loss.append(logs["eval_loss"])

loss_recorder = LossRecorderCallback()

trainer.add_callback(loss_recorder)
