# 셀 1: 데이터 불러오기 및 컬럼명 변경

In [1]:
import pandas as pd

# Raw strings keep the Windows backslashes literal. In a regular string,
# sequences such as "\T", "\d" or "\s" are invalid escapes that newer Python
# versions report with a warning.
TRAIN_CSV_PATH = r"C:\TP\모델학습\data\symptom-disease-train-dataset-Translated.csv"
TEST_CSV_PATH = r"C:\TP\모델학습\data\학습데이터(증상).csv"


def _read_csv_with_fallback(path, encodings=("utf-8", "cp949")):
    """Read a CSV file, trying each encoding in order until one decodes.

    Args:
        path: Location of the CSV file.
        encodings: Encodings to attempt, in priority order.

    Returns:
        The DataFrame parsed with the first encoding that succeeds.

    Raises:
        ValueError: If ``encodings`` is empty.
        UnicodeDecodeError: If none of the encodings can decode the file.
    """
    if not encodings:
        raise ValueError("at least one encoding is required")
    last_error = None
    for encoding in encodings:
        try:
            return pd.read_csv(path, encoding=encoding)
        except UnicodeDecodeError as err:
            # Remember the failure and fall through to the next candidate.
            last_error = err
    raise last_error


# Load the training data
train_df = _read_csv_with_fallback(TRAIN_CSV_PATH)

# Load the test data
test_df = _read_csv_with_fallback(TEST_CSV_PATH)

# Unify column names. Only the text column is renamed for the test frame;
# presumably it already ships a `label` column (later cells read it) — verify.
train_df = train_df.rename(columns={"라벨": "label", "증상_번역": "text"})
test_df = test_df.rename(columns={"증상_번역": "text"})

# 셀 2: LabelEncoder로 라벨 통일 인코딩

In [2]:
from sklearn.preprocessing import LabelEncoder

# Keep an untouched string copy of the original labels. Encoding always starts
# from this copy, so re-running the cell does not re-encode the integer codes
# written into `label` by a previous run.
for frame in (train_df, test_df):
    if "label_raw" not in frame.columns:
        frame["label_raw"] = frame["label"].astype(str)

# Report test labels that never occur in the training data: the model gets no
# examples of those classes, so it cannot predict them correctly.
unseen_test_labels = sorted(set(test_df["label_raw"]) - set(train_df["label_raw"]))
if unseen_test_labels:
    print(
        f"경고: 훈련 데이터에 없는 테스트 라벨 {len(unseen_test_labels)}개 "
        f"(예: {unseen_test_labels[:5]})"
    )

# Fit on the union of both splits so every label receives an integer id
all_labels = pd.concat([train_df["label_raw"], test_df["label_raw"]], axis=0)
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

# Apply the encoding
train_df["label"] = label_encoder.transform(train_df["label_raw"])
test_df["label"] = label_encoder.transform(test_df["label_raw"])

# 셀 3: 라벨 텐서로 변환

In [3]:
import torch

# Build one integer label tensor per split from the encoded `label` column.
train_labels, test_labels = (
    torch.tensor(frame["label"].tolist()) for frame in (train_df, test_df)
)

# Show the largest label id of each split next to the number of known classes.
print("train_labels 최대값:", train_labels.max().item())
print("test_labels 최대값:", test_labels.max().item())
print("전체 라벨 수:", len(label_encoder.classes_))

train_labels 최대값: 865
test_labels 최대값: 889
전체 라벨 수: 890


# 셀 4: 토크나이저 및 데이터셋 준비

In [4]:
from transformers import AutoTokenizer
from torch.utils.data import Dataset

# Hugging Face checkpoint identifier; the model-loading cell reuses `model_name`.
model_name = "beomi/KcELECTRA-base-v2022"
# Tokenizer for the same checkpoint; TextDataset reads this module-level name.
tokenizer = AutoTokenizer.from_pretrained(model_name)


class TextDataset(Dataset):
    """Tokenized texts paired with labels, served as per-sample tensor dicts.

    All texts are tokenized once at construction time with the module-level
    ``tokenizer``.
    """

    def __init__(self, texts, labels):
        """Tokenize ``texts`` and keep ``labels`` for index-based lookup.

        Args:
            texts: List of input strings.
            labels: Indexable collection of labels aligned with ``texts``
                (the notebook passes a ``torch.Tensor``).
        """
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=256)
        self.labels = labels

    def __getitem__(self, idx):
        """Return the encoded fields and the label of sample ``idx`` as tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        label = self.labels[idx]
        if isinstance(label, torch.Tensor):
            # Copy-constructing with torch.tensor(tensor) emits a UserWarning on
            # every access; clone().detach() is the recommended equivalent.
            item["labels"] = label.clone().detach()
        else:
            item["labels"] = torch.tensor(label)
        return item

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)


# Pull each split's text column into a plain list, then pair it with the
# matching label tensor.
train_corpus = train_df["text"].tolist()
train_dataset = TextDataset(train_corpus, train_labels)

eval_corpus = test_df["text"].tolist()
test_dataset = TextDataset(eval_corpus, test_labels)

  from .autonotebook import tqdm as notebook_tqdm


# 셀 5: 모델 불러오기 및 학습 준비

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import os
import shutil
import torch
from sklearn.metrics import accuracy_score

# 1) Reset the model_output directory and recreate its logs subdirectory
output_path = os.path.abspath("C:/TP/model_output")
logging_path = os.path.join(output_path, "logs")


def _reset_dir(path):
    """Remove whatever exists at ``path`` and leave an empty directory there."""
    if os.path.isfile(path) or os.path.islink(path):
        os.remove(path)
    elif os.path.isdir(path):
        shutil.rmtree(path)
    os.makedirs(path, exist_ok=True)


# WARNING: this deletes everything under output_path, including checkpoints
# written by a previous run.
_reset_dir(output_path)

# logging_path lives inside output_path, so the reset above already removed it;
# it only needs to be created again.
os.makedirs(logging_path, exist_ok=True)

print("output_path exists and is dir:", os.path.isdir(output_path))
print("logging_path exists and is dir:", os.path.isdir(logging_path))

# 2) Create the model (assumes label_encoder and model_name are already defined)
num_labels = len(label_encoder.classes_)

# Store the id <-> class-name mapping in the model config so that a saved model
# can decode its own predictions without the in-memory LabelEncoder.
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)


# 3) compute_metrics 함수 정의
def compute_metrics(p):
    """Compute accuracy for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores, reduced with
            argmax over dim 1) and ``label_ids`` (reference labels).

    Returns:
        A dict with a single ``"accuracy"`` entry.
    """
    scores = torch.tensor(p.predictions)
    gold = torch.tensor(p.label_ids)
    predicted = scores.argmax(dim=1)
    return {"accuracy": accuracy_score(gold.cpu(), predicted.cpu())}


# 4) TrainingArguments configuration, collected in a dict for easy inspection
training_config = {
    "output_dir": output_path,
    # Evaluate, checkpoint and log once per epoch
    # NOTE(review): newer transformers releases may spell this `eval_strategy`
    # — confirm against the installed version.
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "logging_dir": logging_path,
    "logging_strategy": "epoch",  # logging re-enabled (could also stay "no")
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 16,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    # Reload the checkpoint with the highest accuracy once training ends
    "load_best_model_at_end": True,
    "metric_for_best_model": "accuracy",
    "greater_is_better": True,
}
training_args = TrainingArguments(**training_config)

# 5) Assemble the Trainer
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    # The test split doubles as the evaluation set
    "eval_dataset": test_dataset,
    # Accuracy callback so that `metric_for_best_model` can be resolved
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

output_path exists and is dir: True
logging_path exists and is dir: True


Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 셀 6: 모델 학습

In [11]:
# Run fine-tuning with the configured Trainer; the returned TrainOutput is
# displayed as the cell result.
trainer.train()

  item["labels"] = torch.tensor(self.labels[idx])
                                                  
 33%|███▎      | 353/1059 [1:13:11<41:53,  3.56s/it]

{'loss': 5.328, 'grad_norm': 16.07264518737793, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                    

[A[A                                         
 33%|███▎      | 353/1059 [1:13:46<41:53,  3.56s/it]
[A
[A

{'eval_loss': 7.335833549499512, 'eval_accuracy': 0.0, 'eval_runtime': 34.9574, 'eval_samples_per_second': 34.327, 'eval_steps_per_second': 2.145, 'epoch': 1.0}


  item["labels"] = torch.tensor(self.labels[idx])
                                                    
 33%|███▎      | 353/1059 [1:40:39<41:53,  3.56s/it]

{'loss': 3.28, 'grad_norm': 16.976343154907227, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                    
[A                                               

 33%|███▎      | 353/1059 [1:41:14<41:53,  3.56s/it]
[A
[A

{'eval_loss': 7.191505432128906, 'eval_accuracy': 0.0, 'eval_runtime': 35.5775, 'eval_samples_per_second': 33.729, 'eval_steps_per_second': 2.108, 'epoch': 2.0}


  item["labels"] = torch.tensor(self.labels[idx])
                                                    
 33%|███▎      | 353/1059 [2:08:07<41:53,  3.56s/it] 

{'loss': 2.529, 'grad_norm': 10.564023971557617, 'learning_rate': 0.0, 'epoch': 3.0}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                    

[A[A                                         
 33%|███▎      | 353/1059 [2:08:43<41:53,  3.56s/it] 
[A
[A

{'eval_loss': 7.513923168182373, 'eval_accuracy': 0.0, 'eval_runtime': 36.2089, 'eval_samples_per_second': 33.141, 'eval_steps_per_second': 2.071, 'epoch': 3.0}


                                                    
100%|██████████| 1059/1059 [1:22:30<00:00,  4.68s/it]

{'train_runtime': 4950.8373, 'train_samples_per_second': 3.414, 'train_steps_per_second': 0.214, 'train_loss': 3.7123362367394064, 'epoch': 3.0}





TrainOutput(global_step=1059, training_loss=3.7123362367394064, metrics={'train_runtime': 4950.8373, 'train_samples_per_second': 3.414, 'train_steps_per_second': 0.214, 'total_flos': 2241279891154944.0, 'train_loss': 3.7123362367394064, 'epoch': 3.0})

# 셀 7: 테스트 데이터 예측 및 결과 확인

In [12]:
# Run inference on the test split and take the highest-scoring class per sample
preds = trainer.predict(test_dataset)
pred_labels = torch.argmax(torch.tensor(preds.predictions), dim=1)

# Decode integer ids back to the original label strings. NumPy arrays are
# passed explicitly instead of relying on sklearn converting torch tensors.
decoded_preds = label_encoder.inverse_transform(pred_labels.cpu().numpy())
decoded_targets = label_encoder.inverse_transform(test_labels.cpu().numpy())

# Preview at most 10 samples; the cap avoids an IndexError on smaller test sets
for i in range(min(10, len(test_df))):
    print(f"입력: {test_df['text'].iloc[i]}")
    print(f"예측: {decoded_preds[i]} | 실제: {decoded_targets[i]}")
    print("---")

  item["labels"] = torch.tensor(self.labels[idx])
100%|██████████| 75/75 [00:33<00:00,  2.23it/s]

입력: I have been experiencing a skin rash on my arms, legs, and torso for the past few weeks. It is red, itchy, and covered in dry, scaly patches.
예측: 822 | 실제: Psoriasis
---
입력: My skin has been peeling, especially on my knees, elbows, and scalp. This peeling is often accompanied by a burning or stinging sensation.
예측: 822 | 실제: Psoriasis
---
입력: I have been experiencing joint pain in my fingers, wrists, and knees. The pain is often achy and throbbing, and it gets worse when I move my joints.
예측: 193 | 실제: Psoriasis
---
입력: There is a silver like dusting on my skin, especially on my lower back and scalp. This dusting is made up of small scales that flake off easily when I scratch them.
예측: 193 | 실제: Psoriasis
---
입력: My nails have small dents or pits in them, and they often feel inflammatory and tender to the touch. Even there are minor rashes on my arms.
예측: 394 | 실제: Psoriasis
---
입력: The skin on my palms and soles is thickened and has deep cracks. These cracks are painful and bleed 




In [13]:
# Evaluate with the Trainer's configured evaluation set (test_dataset was passed
# as eval_dataset) and print the resulting metrics dict.
eval_result = trainer.evaluate()
print(eval_result)

  item["labels"] = torch.tensor(self.labels[idx])
100%|██████████| 75/75 [00:34<00:00,  2.18it/s]

{'eval_loss': 7.335833549499512, 'eval_accuracy': 0.0, 'eval_runtime': 34.7436, 'eval_samples_per_second': 34.539, 'eval_steps_per_second': 2.159, 'epoch': 3.0}





In [14]:
import json

final_model_dir = "C:/TP/final_model"
trainer.save_model(final_model_dir)

# The Trainer was constructed without a tokenizer, so save_model() does not
# write tokenizer files; save them next to the weights so the directory can be
# loaded on its own.
tokenizer.save_pretrained(final_model_dir)

# Persist the class order of the LabelEncoder; without it the predicted ids
# cannot be mapped back to label names in a new session.
with open(f"{final_model_dir}/label_classes.json", "w", encoding="utf-8") as fp:
    json.dump([str(name) for name in label_encoder.classes_], fp, ensure_ascii=False, indent=2)

In [15]:
test_texts = ["열이 나고 기침이 심해요"]
tokens = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt")
# The Trainer may have moved the model to an accelerator; inputs must be on
# the same device as the weights.
tokens = tokens.to(model.device)

# Inference only: switch off dropout and skip gradient tracking
model.eval()
with torch.no_grad():
    outputs = model(**tokens)

preds = torch.argmax(outputs.logits, dim=1)
print("예측 결과:", label_encoder.inverse_transform(preds.cpu().numpy()))

예측 결과: ['186']


In [None]:
test_texts = ["열이 나고 기침이 심해요"]
tokens = tokenizer(test_texts, truncation=True, padding=True, return_tensors="pt")
# The Trainer may have moved the model to an accelerator; inputs must be on
# the same device as the weights.
tokens = tokens.to(model.device)

# Inference only: switch off dropout and skip gradient tracking
model.eval()
with torch.no_grad():
    outputs = model(**tokens)

preds = torch.argmax(outputs.logits, dim=1)

# Decode by indexing directly into the encoder's class list
pred_label = label_encoder.classes_[preds.item()]
print("예측 병명:", pred_label)

예측 병명: 186


In [18]:
# Peek at the first encoded training labels (integer ids after label encoding).
train_df["label"].head()

0    260
1    300
2    696
3    115
4    516
Name: label, dtype: int64