<a href="https://colab.research.google.com/github/karellen-kim/training-ml/blob/main/ch_bert/kobert_finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import warnings
warnings.filterwarnings(action='ignore')

# 1. 데이터 로드 (질병명 + 증상 질문 데이터)
df = pd.read_csv("data.csv")  # "질병명", "증상 문의 내용" 컬럼이 있어야 함
disease_labels = {disease: i for i, disease in enumerate(df["질병명"].unique())}
disease_labels

{'수족냉증': 0, '식중독': 1, '소화불량': 2, '질염': 3, '비염': 4}

In [5]:
# 2. 질병명을 정수 라벨(Label)로 변환
disease_labels = {disease: i for i, disease in enumerate(df["질병명"].unique())}
df["label"] = df["질병명"].map(disease_labels)
df.head()

Unnamed: 0,질병명,증상 문의 내용,label
0,수족냉증,수족냉증이 있으면 손발 외에도 몸에 다른 증상이 나타날 수 있나요?,0
1,수족냉증,수족냉증이 스트레스와 관련이 있을 수도 있나요?,0
2,수족냉증,손발이 차가운 것 외에도 마비 증상이 나타날 수도 있나요?,0
3,수족냉증,수족냉증이 있으면 손발 외에도 몸에 다른 증상이 나타날 수 있나요?,0
4,수족냉증,수족냉증이 있으면 감기에 더 자주 걸리나요?,0


In [8]:
# 3. 데이터셋 분할 (학습 80%, 테스트 20%)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["증상 문의 내용"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)
train_texts[:5]

['소화불량이 심하면 구역질이 나거나 구토를 할 수도 있나요?',
 '비염이 있으면 눈 가려움과 관련이 있을까요?',
 '수족냉증이 심하면 저림이나 통증이 동반될 수 있나요?',
 '질염이 심하면 냄새가 심해질 수도 있나요?',
 '질염이 심하면 냄새가 심해질 수도 있나요?']

In [9]:
train_labels[:5]

[2, 4, 0, 3, 3]

In [11]:
# 4. BERT 토크나이저 로드
model_name = "klue/bert-base"
tokenizer = BertTokenizer.from_pretrained(model_name)

# 5. 데이터 토큰화
def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

train_encodings = tokenize_function(train_texts)
test_encodings = tokenize_function(test_texts)

In [12]:
train_encodings

{'input_ids': [[2, 6053, 2588, 2199, 2052, 7090, 2460, 5480, 2431, 2052, 717, 9253, 19903, 2138, 1892, 4438, 1513, 2075, 2182, 35, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 20445, 2052, 1513, 6076, 780, 26692, 2145, 3700, 2052, 1513, 16809, 2182, 35, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [2, 19261, 2576, 2304, 2052, 7090, 2460, 1535, 2323, 15351, 6595, 2052, 5904, 2651, 1295, 1513, 2075, 2182, 35, 3, 0, 0

In [13]:
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [14]:
# 6. PyTorch Dataset 생성
class DiseaseDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = DiseaseDataset(train_encodings, train_labels)
test_dataset = DiseaseDataset(test_encodings, test_labels)
item = train_dataset.__getitem__(0)
item

{'input_ids': tensor([    2,  6053,  2588,  2199,  2052,  7090,  2460,  5480,  2431,  2052,
           717,  9253, 19903,  2138,  1892,  4438,  1513,  2075,  2182,    35,
             3,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [15]:
item.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [None]:
# 7. BERT 모델 불러오기 (다중 클래스 분류)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(disease_labels))

# 8. 평가 함수 정의 (정확도 계산)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="weighted")
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# 9. 훈련 설정 (하이퍼파라미터)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True
)

# 10. Trainer 객체 생성
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# 11. 모델 학습
trainer.train()

# 12. 학습된 모델 저장
model.save_pretrained("./bert_disease_classifier")
tokenizer.save_pretrained("./bert_disease_classifier")

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

In [None]:
model = BertForSequenceClassification.from_pretrained("_kobert/bert_disease_classifier")
tokenizer = BertTokenizer.from_pretrained("_kobert/bert_disease_classifier")

df = pd.read_csv("data.csv")  # "질병명", "증상 문의 내용" 컬럼이 있어야 함
disease_labels = {disease: i for i, disease in enumerate(df["질병명"].unique())}

def predict_disease(symptom_text):
    inputs = tokenizer(symptom_text, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
    outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits).item()

    # 라벨을 질병명으로 변환
    predicted_disease = {v: k for k, v in disease_labels.items()}[predicted_class]
    return predicted_disease


# 예측 예제
symptom = "콧물이 계속 나오고 코막힘이 심해요."
predicted_disease = predict_disease(symptom)
print(f"예측된 질병: {predicted_disease}")