In [None]:
!pip install transformers datasets scikit-learn --quiet


In [None]:
import inspect

import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.metrics import f1_score, precision_score, recall_score
from transformers import AutoTokenizer, BertForSequenceClassification, TrainingArguments, Trainer


In [None]:
# Read the JSON file and strip surrounding whitespace from every column name.
df = pd.read_json("a - 복사본.json").rename(columns=lambda name: name.strip())

# Every column except the sentence column ("문장") is treated as a label.
unsmile_labels = [name for name in df.columns if name != "문장"]
num_labels = len(unsmile_labels)

# Convert to a Hugging Face Dataset and hold out 10% of the rows
# (fixed seed) as the validation split.
holdout_split = Dataset.from_pandas(df).train_test_split(test_size=0.1, seed=42)
dataset = DatasetDict(
    {
        "train": holdout_split["train"],
        "valid": holdout_split["test"],
    }
)


In [None]:
# Load the pretrained tokenizer for the "bert-base-multilingual-cased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "bert-base-multilingual-cased",
)

def preprocess_function(examples):
    """Tokenize a batch of sentences and attach a multi-hot label vector per row.

    Written for `Dataset.map(..., batched=True)`: `examples` is a
    column-oriented mapping (column name -> list of values), so rows must be
    rebuilt by index rather than by iterating over `examples` (which would
    yield column names, not rows).

    Args:
        examples: Mapping with a "문장" column holding the sentences and one
            column per entry in the module-level `unsmile_labels`.

    Returns:
        The tokenizer output with an added "labels" entry: one list of floats
        per row, ordered like `unsmile_labels`.
    """
    sentences = examples["문장"]
    inputs = tokenizer(sentences, truncation=True, padding="max_length", max_length=128)

    # Fetch each label column once, then transpose columns into per-row vectors.
    label_columns = [examples[label] for label in unsmile_labels]
    # Labels are cast to float because the multi-label loss
    # (problem_type="multi_label_classification") expects float targets.
    inputs["labels"] = [
        [float(column[row]) for column in label_columns]
        for row in range(len(sentences))
    ]
    return inputs

# Apply the preprocessing to every split, one batch of rows at a time.
encoded_dataset = dataset.map(function=preprocess_function, batched=True)


In [None]:
# Sequence-classification head with one output per label column, configured
# for multi-label targets.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased",
    problem_type="multi_label_classification",
    num_labels=num_labels,
)


In [None]:
def compute_metrics(pred, threshold=0.5):
    """Compute macro-averaged F1, precision and recall for multi-label output.

    The model emits raw logits, so they are passed through a sigmoid before
    `threshold` is applied; thresholding the logits directly at 0.5 would
    correspond to a probability cut-off of roughly 0.62.

    Args:
        pred: Object exposing `predictions` (logits, or a tuple whose first
            element is the logits) and `label_ids`, as handed over by `Trainer`.
        threshold: Probability above which a label counts as predicted.

    Returns:
        Dict with the keys 'f1', 'precision' and 'recall'.
    """
    logits = pred.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits, dtype=np.float64)

    # Numerically stable sigmoid: exp(-log(1 + exp(-x))).
    probs = np.exp(-np.logaddexp(0.0, -logits))
    preds = (probs > threshold).astype(int)

    # Labels may arrive as floats (multi-label targets); metrics need 0/1 ints.
    labels = np.asarray(pred.label_ids).astype(int)
    return {
        'f1': f1_score(labels, preds, average='macro', zero_division=0),
        'precision': precision_score(labels, preds, average='macro', zero_division=0),
        'recall': recall_score(labels, preds, average='macro', zero_division=0),
    }


In [None]:
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and the Trainer's `tokenizer` argument to `processing_class`. Pick whichever
# keyword the installed version accepts so the cell runs on old and new releases.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_kw = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    # Evaluate once per epoch, matching the checkpoint schedule above.
    **{_eval_strategy_kw: "epoch"},
)

_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_kw = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["valid"],
    compute_metrics=compute_metrics,
    **{_tokenizer_kw: tokenizer},
)


In [None]:
# Run fine-tuning with the `trainer` configured above.
trainer.train()


In [None]:
# Predict on the validation split. `predictions` holds raw logits, so apply a
# sigmoid before thresholding at probability 0.5.
preds = trainer.predict(encoded_dataset["valid"])
pred_logits = np.asarray(preds.predictions, dtype=np.float64)
pred_probs = np.exp(-np.logaddexp(0.0, -pred_logits))  # numerically stable sigmoid
pred_labels = (pred_probs > 0.5).astype(int)

# Print a few examples: sentence, predicted labels, and true labels.
# Cap the preview at the split size so small validation sets do not raise.
for i in range(min(5, len(pred_labels))):
    example = dataset["valid"][i]
    print("문장:", example["문장"])
    print("예측 라벨:", {label: int(pred_labels[i][j]) for j, label in enumerate(unsmile_labels)})
    print("실제 라벨:", {label: example[label] for label in unsmile_labels})
    print()
