In [2]:
!pip install transformers datasets scikit-learn --quiet


In [5]:
import pandas as pd
import json

# Read the raw records from the JSON file
with open("sympton_dataset.json", encoding="utf-8") as f:
    raw_data = json.load(f)

df = pd.DataFrame(raw_data)

# Give every distinct text label a consecutive integer id (sorted label order),
# then derive the reverse lookup from it
id2label = dict(enumerate(sorted(df["label"].unique())))
label2id = {label: i for i, label in id2label.items()}
df = df.assign(label_id=df["label"].map(label2id))

# Preview the first rows
df.head()


Unnamed: 0,text,label,label_id
0,昨天開始就一直拉肚子，幾乎每小時跑一次廁所。,腹瀉,20
1,今天上了五次廁所，大便都是水水的。,腹瀉,20
2,肚子咕嚕咕嚕叫，一吃東西就想拉肚子。,腹瀉,20
3,這幾天拉肚子，還伴有點噁心感。,腹瀉,20
4,大便變得很稀，還帶一點泡泡。,腹瀉,20


In [10]:
from datasets import Dataset
from transformers import BertTokenizer

# Load the tokenizer that matches the bert-base-chinese checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

# Build a Hugging Face Dataset from the text and integer-label columns
dataset = Dataset.from_pandas(df[["text", "label_id"]])

def tokenize_fn(example):
    """Tokenize the ``text`` field, truncating/padding every sequence to 64 tokens.

    ``example["text"]`` is forwarded to the tokenizer unchanged, so this works
    both for a single example (a string) and for a batch (a list of strings).
    """
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=64)

# Tokenize in batches: the resulting columns are the same as with per-example
# mapping (padding is fixed-length), but the tokenizer is called far fewer times.
tokenized_dataset = dataset.map(tokenize_fn, batched=True)
# Trainer expects the target column to be called "labels"
tokenized_dataset = tokenized_dataset.rename_column("label_id", "labels")
# Hold out 10% of the examples for validation (fixed seed for reproducibility)
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)


Map:   0%|          | 0/460 [00:00<?, ? examples/s]

In [18]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision / recall / F1 from model outputs.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    # zero_division=0 keeps the same numeric result as the default ("warn")
    # but does not emit UndefinedMetricWarning when a class is never predicted
    # or never present in the evaluated split.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


In [19]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer

# Model initialisation: one output class per entry in label2id
num_labels = len(label2id)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-chinese",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"
)

# Trainer
# `processing_class` replaces the deprecated `tokenizer` argument, which
# triggers a FutureWarning when the Trainer is constructed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)


# Start training
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

import os

# Run the model on the held-out split
predictions = trainer.predict(split_dataset["test"])
pred_labels = predictions.predictions.argmax(axis=-1)
true_labels = predictions.label_ids

# Aggregate metrics
test_results = compute_metrics((predictions.predictions, true_labels))

# Print the evaluation results
print("測試集結果:")
print(f"準確率: {test_results['accuracy']:.4f}")
print(f"F1 分數: {test_results['f1']:.4f}")
print(f"精確率: {test_results['precision']:.4f}")
print(f"召回率: {test_results['recall']:.4f}")

# Confusion matrix
# Pass the full list of class ids explicitly: without `labels=`, classes that
# are absent from this small split are dropped from the matrix and the tick
# labels below would no longer line up with its rows/columns.
class_ids = sorted(id2label)
class_names = [id2label[i] for i in class_ids]
cm = confusion_matrix(true_labels, pred_labels, labels=class_ids)
plt.figure(figsize=(12, 10))
# NOTE(review): the tick labels and titles are Chinese; matplotlib's default
# font may not contain these glyphs — confirm a CJK-capable font is configured.
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_names,
            yticklabels=class_names)
plt.xlabel("預測標籤")
plt.ylabel("實際標籤")
plt.title("混淆矩陣")
plt.tight_layout()
# The target directory is not created anywhere else in the notebook, so make
# sure it exists before saving; otherwise savefig raises FileNotFoundError.
os.makedirs("./trained_model", exist_ok=True)
plt.savefig("./trained_model/confusion_matrix.png")
print("混淆矩陣已保存至: ./trained_model/confusion_matrix.png")


Model training is complete. Change the input text in the cell below to try a prediction:

In [17]:
import torch

def predict_symptom(text: str) -> str:
    """Classify a free-text symptom description and return the predicted label.

    Args:
        text: The description to classify; it is truncated to 64 tokens.

    Returns:
        The ``id2label`` entry for the highest-scoring class.
    """
    # Disable dropout so repeated calls give the same answer; the model can
    # still be in training mode after trainer.train().
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=64)
    # Trainer may have moved the model to an accelerator; the inputs must live
    # on the same device as the model weights.
    inputs = inputs.to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
        pred = outputs.logits.argmax(dim=-1).item()
    return id2label[pred]

# Enter the symptom description here
predict_symptom("最近作業好多喔好煩")


'睡眠品質不佳'