加cc.zh.100.ve

In [22]:
!apt-get -y install fonts-noto-cjk
!pip install jieba
!pip install torch jieba numpy scikit-learn matplotlib seaborn

'apt-get' 不是內部或外部命令、可執行的程式或批次檔。




In [23]:
import torch
import torch.nn as nn
import jieba
from collections import Counter
import numpy as np
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from matplotlib import rcParams
from matplotlib.font_manager import FontProperties



# ========== 加載 FastText 嵌入 ==========
def load_fasttext_embedding(path, vocab, embed_dim):
    """Build an embedding matrix for ``vocab`` from a fastText-style text file.

    The file is expected to start with one header line, followed by one
    ``word v1 v2 ... v_embed_dim`` line per word. Only words present in
    ``vocab`` are kept, so memory use is bounded by the vocabulary size.

    Args:
        path: Path to the text embedding file.
        vocab: Mapping ``token -> row index``; may contain the special
            tokens ``"<PAD>"`` and ``"<UNK>"``.
        embed_dim: Expected vector length; lines with a different number of
            fields are skipped.

    Returns:
        A float32 ``torch.Tensor`` of shape ``(len(vocab), embed_dim)``.
        The ``"<PAD>"`` row is all zeros, rows found in the file hold the
        pretrained vector, and every other row is drawn from N(0, 0.6^2).
    """
    print("Loading filtered fastText embedding...")

    special_tokens = ("<PAD>", "<UNK>")
    needed_words = set(vocab)
    embedding_dict = {}

    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        f.readline()  # skip the header line
        for line in f:
            values = line.strip().split()
            if len(values) != embed_dim + 1:
                continue
            word = values[0]
            if word not in needed_words:
                continue
            try:
                embedding_dict[word] = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # errors='ignore' may leave a damaged number behind; drop that
                # line instead of aborting the whole load.
                continue

    matrix = np.zeros((len(vocab), embed_dim))
    unk_count = 0
    total_count = 0  # number of real (non-special) vocabulary entries

    for word, idx in vocab.items():
        is_special = word in special_tokens
        if not is_special:
            total_count += 1
        if word == "<PAD>":
            # Keep the padding row at zero: Embedding.from_pretrained does not
            # reset it, so a random row would feed noise into every padded step.
            continue
        if word in embedding_dict:
            matrix[idx] = embedding_dict[word]
        else:
            matrix[idx] = np.random.normal(scale=0.6, size=(embed_dim,))
            if not is_special:
                unk_count += 1

    # Guard against a vocabulary that only holds the special tokens.
    oov_rate = unk_count / total_count if total_count else 0.0
    print(f"OOV rate: {oov_rate:.4f}")
    return torch.tensor(matrix, dtype=torch.float)


def build_vocab(texts, min_freq=1):
    """Create a token -> index vocabulary from jieba-segmented texts.

    Indices 0 and 1 are reserved for ``"<PAD>"`` and ``"<UNK>"``. Every
    token seen at least ``min_freq`` times gets the next free index, in
    order of first appearance.

    Args:
        texts: Iterable of raw strings.
        min_freq: Minimum number of occurrences for a token to be kept.

    Returns:
        dict mapping token to integer index.
    """
    frequencies = Counter()
    for sentence in texts:
        frequencies.update(jieba.lcut(sentence))

    # Counter preserves insertion order, so kept tokens stay in first-seen order.
    kept_tokens = (token for token, count in frequencies.items() if count >= min_freq)

    vocab = {"<PAD>": 0, "<UNK>": 1}
    vocab.update((token, position) for position, token in enumerate(kept_tokens, start=2))
    return vocab

def text_to_indices(text, vocab, max_len=50):
    """Segment ``text`` with jieba and encode it as exactly ``max_len`` ids.

    Unknown tokens map to ``vocab["<UNK>"]``; short sequences are right-padded
    with ``vocab["<PAD>"]`` and long ones are truncated.
    """
    token_ids = [vocab.get(token, vocab["<UNK>"]) for token in jieba.lcut(text)]
    shortfall = max_len - len(token_ids)
    if shortfall > 0:
        return token_ids + [vocab["<PAD>"]] * shortfall
    return token_ids[:max_len]

class SymptomDataset(Dataset):
    """Torch dataset over records that carry a ``"text"`` and a ``"label"`` key.

    Each item is returned as ``(token_ids, label_id)``, both ``torch.long``
    tensors; ``token_ids`` is produced by ``text_to_indices`` and therefore has
    length ``max_len``.
    """

    def __init__(self, data, vocab, label_to_idx, max_len=50):
        self.data, self.vocab = data, vocab
        self.label_to_idx, self.max_len = label_to_idx, max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        text, label = record["text"], record["label"]
        token_ids = text_to_indices(text, self.vocab, self.max_len)
        features = torch.tensor(token_ids, dtype=torch.long)
        target = torch.tensor(self.label_to_idx[label], dtype=torch.long)
        return features, target

class SymptomGRUAttention(nn.Module):
    """Bidirectional GRU text classifier with additive attention pooling.

    Args:
        embedding_matrix: 2-D float tensor ``(vocab_size, embed_dim)`` used to
            initialise a trainable embedding layer (row 0 is the padding index).
        hidden_dim: GRU hidden size per direction.
        num_classes: Number of output logits.
        dropout_rate: Dropout probability applied to the GRU outputs and to the
            pooled context vector.
    """

    def __init__(self, embedding_matrix, hidden_dim, num_classes, dropout_rate=0.5):
        super().__init__()
        _, embed_dim = embedding_matrix.shape
        gru_width = hidden_dim * 2  # forward + backward states are concatenated
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, padding_idx=0, freeze=False)
        self.gru = nn.GRU(embed_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.attn = nn.Linear(gru_width, 1)
        self.classifier = nn.Linear(gru_width, num_classes)

    def forward(self, x):
        """Return ``(logits, attn_weights)`` for a batch of token-id rows ``x``.

        ``attn_weights`` is the softmax over the sequence axis (dim 1) of the
        per-step attention scores.
        """
        states, _ = self.gru(self.embedding(x))
        states = self.dropout(states)
        attn_weights = torch.softmax(self.attn(states), dim=1)
        pooled = (attn_weights * states).sum(dim=1)
        logits = self.classifier(self.dropout(pooled))
        return logits, attn_weights

# ==========畫圖 ==========
def plot_confusion_matrix(y_true, y_pred, labels):
    """Draw a confusion-matrix heatmap with CJK-capable tick labels.

    Args:
        y_true: Ground-truth class indices.
        y_pred: Predicted class indices.
        labels: Class names ordered by class index; ``labels[i]`` names class
            ``i`` and is used for both axes.

    A module-level ``my_font`` (a ``FontProperties``) is used for the tick
    labels when one has been defined; otherwise a CJK font family list is used,
    so the function no longer raises ``NameError`` when ``my_font`` is missing.
    """
    font = globals().get("my_font")
    if font is None:
        # Families are tried in order; matplotlib falls back to its default
        # sans-serif font when none of the CJK families is installed.
        font = FontProperties(
            family=["Noto Sans CJK TC", "Noto Sans CJK JP", "Microsoft JhengHei", "sans-serif"]
        )

    cm = confusion_matrix(y_true, y_pred, labels=list(range(len(labels))))
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels, cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")
    plt.xticks(rotation=45, fontproperties=font)
    plt.yticks(fontproperties=font)
    plt.tight_layout()
    plt.show()

def train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=10, patience=3):
    """Train ``model`` with early stopping on the validation loss.

    After every epoch the validation loss is computed with ``evaluate_loss``.
    Whenever it improves, the weights are written to ``best_model.pt``; after
    ``patience`` consecutive epochs without improvement, training stops.

    Args:
        model: Module whose forward returns ``(logits, extra)``.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of validation batches, passed to ``evaluate_loss``.
        criterion: Loss function called as ``criterion(logits, labels)``.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device the batches are moved to.
        num_epochs: Maximum number of epochs.
        patience: Number of non-improving epochs tolerated before stopping.

    Returns:
        ``model`` with the best checkpoint of this run loaded (or unchanged
        when no checkpoint was written during this run).
    """
    best_val_loss = float('inf')
    patience_counter = 0
    checkpoint_saved = False
    for epoch in range(num_epochs):
        # Validation switches the model to eval mode, so training mode must be
        # re-enabled every epoch; otherwise dropout is off from epoch 2 on.
        model.train()
        total_loss = 0
        for texts, labels in train_loader:
            texts, labels = texts.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs, _ = model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        val_loss = evaluate_loss(model, val_loader, criterion, device)
        # print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {total_loss/len(train_loader):.4f}, Val Loss: {val_loss:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), "best_model.pt")
            checkpoint_saved = True
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered")
                break

    # Only reload what this run wrote: a leftover best_model.pt from an earlier
    # run may belong to a different vocabulary or architecture.
    if checkpoint_saved:
        model.load_state_dict(torch.load("best_model.pt", map_location=device))
    return model

def evaluate_loss(model, val_loader, criterion, device):
    """Return the mean per-batch loss of ``model`` over ``val_loader``.

    The model is evaluated in eval mode without gradient tracking, and its
    previous train/eval mode is restored afterwards so that a caller in the
    middle of training does not silently continue with dropout disabled.

    Args:
        model: Module whose forward returns ``(logits, extra)``.
        val_loader: Sized iterable of ``(inputs, labels)`` batches.
        criterion: Loss function called as ``criterion(logits, labels)``.
        device: Device the batches are moved to.

    Returns:
        Sum of batch losses divided by ``len(val_loader)``.
    """
    was_training = model.training
    model.eval()
    total_loss = 0
    try:
        with torch.no_grad():
            for texts, labels in val_loader:
                texts, labels = texts.to(device), labels.to(device)
                outputs, _ = model(texts)
                loss = criterion(outputs, labels)
                total_loss += loss.item()
    finally:
        # Put the model back into whichever mode the caller had it in.
        model.train(was_training)
    return total_loss / len(val_loader)

def evaluate_model(model, data_loader, device, idx_to_label):
    """Run ``model`` over ``data_loader`` and return its predictions and accuracy.

    Previously the predictions were collected and then discarded because all
    reporting was commented out. They are now returned so the caller can print
    a ``classification_report`` or call ``plot_confusion_matrix`` on them.

    Args:
        model: Module whose forward returns ``(logits, extra)``.
        data_loader: Iterable of ``(inputs, labels)`` batches.
        device: Device the inputs are moved to.
        idx_to_label: Mapping from class index to class name.

    Returns:
        dict with:
            ``"accuracy"``: fraction of correct predictions (``nan`` when the
                loader yields no samples),
            ``"y_true"`` / ``"y_pred"``: lists of Python ints,
            ``"used_indices"``: sorted class indices that occur in either list,
            ``"used_labels"``: the names of those indices.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for texts, labels in data_loader:
            texts = texts.to(device)
            outputs, _ = model(texts)
            # .tolist() yields plain Python ints, which are safe dict keys.
            all_preds.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    used_indices = sorted(set(all_preds) | set(all_labels))
    used_labels = [idx_to_label[i] for i in used_indices]

    if all_labels:
        correct = sum(1 for pred, true in zip(all_preds, all_labels) if pred == true)
        accuracy = correct / len(all_labels)
    else:
        accuracy = float("nan")

    return {
        "accuracy": accuracy,
        "y_true": all_labels,
        "y_pred": all_preds,
        "used_indices": used_indices,
        "used_labels": used_labels,
    }


def predict_with_scores(model, text, vocab, idx_to_label, max_len=50, device='cpu', threshold=0.2):
    """Classify one text and print the attention weight of each token.

    Args:
        model: Module whose forward returns ``(logits, attn_weights)``.
        text: Raw input string.
        vocab: Token -> index mapping used by ``text_to_indices``.
        idx_to_label: Class index -> class name mapping.
        max_len: Sequence length passed to ``text_to_indices``.
        device: Device the input tensor is moved to.
        threshold: Not used by this function; kept so existing callers that
            pass it keep working. Thresholding is done by
            ``run_inference_with_score``.

    Returns:
        dict with ``"prediction"`` (top class name), ``"confidence"`` (its
        softmax probability) and ``"scores"`` (class name -> probability).
    """
    model.eval()
    with torch.no_grad():
        indices = text_to_indices(text, vocab, max_len)
        indices = torch.tensor([indices], dtype=torch.long).to(device)
        outputs, attn_weights = model(indices)
        probs = torch.softmax(outputs, dim=1)[0]
        confidence, pred_idx = torch.max(probs, dim=0)
        pred_label = idx_to_label[pred_idx.item()]
        score_dict = {idx_to_label[i]: float(probs[i]) for i in range(len(probs))}

        words = jieba.lcut(text)[:max_len]
        # Take the single batch row and flatten it. A bare squeeze() would
        # collapse a length-1 sequence to a 0-d array that cannot be sliced.
        attn_weights = attn_weights[0].reshape(-1).cpu().numpy()[:len(words)]
        print("\n注意力權重：")
        for word, weight in zip(words, attn_weights):
            print(f"{word}: {weight:.4f}")

        return {
            "prediction": pred_label,
            "confidence": float(confidence),
            "scores": score_dict
        }

def run_inference_with_score(text, model, vocab, idx_to_label, max_len, device, threshold=0.35):
    """Predict the class of ``text`` and print the result with all class scores.

    The top class is reported only when its confidence reaches ``threshold``;
    otherwise the fallback string "無法判斷" is printed instead. Class scores are
    listed from highest to lowest.
    """
    result = predict_with_scores(model, text, vocab, idx_to_label, max_len, device)

    if result["confidence"] >= threshold:
        prediction = result["prediction"]
    else:
        prediction = "無法判斷"

    print(f"\n【輸入症狀】：{text}")
    print(f"【預測結果】：{prediction}")

    ranked = list(result["scores"].items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    for label, score in ranked:
        print(f"{label:<15} : {score:.4f}")

def main():
    """Train the GRU+attention symptom classifier and save its lookup tables.

    Steps: read ``sympton_dataset.json`` (records with ``"text"`` and
    ``"label"``), build the vocabulary and label maps, load the pretrained
    embeddings, split 80/20 into train/validation, train with early stopping,
    run ``evaluate_model`` on the validation split, then write ``vocab.json``
    and ``label_to_idx.json``.
    """
    # Hyper-parameters
    max_len, embed_dim, hidden_dim = 50, 300, 16
    num_epochs, batch_size, patience = 10, 16, 3
    dropout_rate, weight_decay = 0.3, 1e-4
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    with open("sympton_dataset.json", "r", encoding="utf-8") as fh:
        dataset = json.load(fh)

    texts = [record["text"] for record in dataset]
    labels = [record["label"] for record in dataset]

    vocab = build_vocab(texts, min_freq=2)

    # Class indices follow the sorted order of the label names.
    label_set = sorted(set(labels))
    label_to_idx, idx_to_label = {}, {}
    for position, name in enumerate(label_set):
        label_to_idx[name] = position
        idx_to_label[position] = name

    # Inverse-frequency class weights for the loss.
    class_counts = Counter(labels)
    inverse_freq = [1.0 / class_counts[name] for name in label_set]
    class_weights = torch.tensor(inverse_freq, dtype=torch.float).to(device)

    embedding_matrix = load_fasttext_embedding("zh_wiki_fasttext_300.txt", vocab, embed_dim)

    full_dataset = SymptomDataset(dataset, vocab, label_to_idx, max_len)
    train_size = int(0.8 * len(full_dataset))
    split_sizes = [train_size, len(full_dataset) - train_size]
    train_dataset, val_dataset = torch.utils.data.random_split(full_dataset, split_sizes)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)

    model = SymptomGRUAttention(embedding_matrix, hidden_dim, len(label_to_idx), dropout_rate)
    model = model.to(device)

    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0025, weight_decay=weight_decay)

    model = train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs, patience)
    evaluate_model(model, val_loader, device, idx_to_label)

    # Persist the lookup tables next to best_model.pt for the inference cell.
    for filename, mapping in (("vocab.json", vocab), ("label_to_idx.json", label_to_idx)):
        with open(filename, "w", encoding="utf-8") as fh:
            json.dump(mapping, fh, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    main()


Building prefix dict from the default dictionary ...


Dumping model to file cache C:\Users\annie\AppData\Local\Temp\jieba.cache
Loading model cost 0.655 seconds.
Prefix dict has been built successfully.


Loading filtered fastText embedding...


FileNotFoundError: [Errno 2] No such file or directory: 'zh_wiki_fasttext_300.txt'

In [None]:
# ====== 第二區塊：playground ======
import torch, json, jieba, numpy as np


class SymptomGRUAttention(torch.nn.Module):
    """Inference-side copy of the bidirectional GRU + attention classifier.

    The submodule names (``embedding``, ``gru``, ``dropout``, ``attn``,
    ``classifier``) must stay as they are: they form the keys of the
    ``state_dict`` that this cell loads from ``best_model.pt``.

    Args:
        embedding_matrix: 2-D float tensor ``(vocab_size, embed_dim)``.
        hidden_dim: GRU hidden size per direction.
        num_classes: Number of output logits.
        dropout_rate: Dropout probability (inactive in eval mode).
    """

    def __init__(self, embedding_matrix, hidden_dim, num_classes, dropout_rate=0.3):
        super().__init__()
        _, embed_dim = embedding_matrix.shape
        gru_width = hidden_dim * 2  # forward + backward states are concatenated
        self.embedding = torch.nn.Embedding.from_pretrained(embedding_matrix, padding_idx=0, freeze=False)
        self.gru = torch.nn.GRU(embed_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.attn = torch.nn.Linear(gru_width, 1)
        self.classifier = torch.nn.Linear(gru_width, num_classes)

    def forward(self, x):
        """Return ``(logits, attn_weights)``; weights are softmaxed over dim 1."""
        states, _ = self.gru(self.embedding(x))
        states = self.dropout(states)
        attn_weights = torch.softmax(self.attn(states), dim=1)
        pooled = (attn_weights * states).sum(dim=1)
        logits = self.classifier(self.dropout(pooled))
        return logits, attn_weights


def text_to_indices(text, vocab, max_len=50):
    """Encode ``text`` as exactly ``max_len`` vocabulary ids.

    The text is segmented with jieba, unknown tokens map to ``vocab["<UNK>"]``,
    and the result is truncated or right-padded with ``vocab["<PAD>"]``.
    """
    token_ids = []
    for token in jieba.lcut(text):
        token_ids.append(vocab.get(token, vocab["<UNK>"]))
    pad_id = vocab["<PAD>"]
    missing = max_len - len(token_ids)
    padding = [pad_id] * missing if missing > 0 else []
    return token_ids[:max_len] + padding


# Load the lookup tables written by main() in the training cell; this cell
# fails with FileNotFoundError if training has not produced them yet.
with open("vocab.json", "r", encoding="utf-8") as f:
    vocab = json.load(f)
with open("label_to_idx.json", "r", encoding="utf-8") as f:
    label_to_idx = json.load(f)
# Invert the label map (class index -> class name) for decoding predictions.
idx_to_label = {int(v): k for k, v in label_to_idx.items()}

def load_embedding_matrix(path, vocab, embed_dim):
    """Build an embedding matrix for ``vocab`` from a fastText-style text file.

    Mirrors the training-side loader: the first line is skipped as a header,
    lines whose field count is not ``embed_dim + 1`` are ignored, and
    undecodable bytes are dropped (``errors='ignore'``) so the same file that
    loaded during training also loads here.

    Args:
        path: Path to the text embedding file.
        vocab: Mapping ``token -> row index``.
        embed_dim: Expected vector length.

    Returns:
        A float32 ``torch.Tensor`` of shape ``(len(vocab), embed_dim)``; the
        ``"<PAD>"`` row is zero, known words hold their pretrained vector, and
        all other rows are drawn from N(0, 0.6^2).
    """
    embedding_dict = {}
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        f.readline()  # skip header
        for line in f:
            parts = line.strip().split()
            if len(parts) != embed_dim + 1:
                continue
            word = parts[0]
            # Check membership first so vectors of unneeded words are never parsed.
            if word not in vocab:
                continue
            try:
                embedding_dict[word] = np.array(parts[1:], dtype='float32')
            except ValueError:
                # A damaged number on this line; skip it instead of aborting.
                continue

    matrix = np.zeros((len(vocab), embed_dim))
    for word, idx in vocab.items():
        if word == "<PAD>":
            continue  # padding row stays all-zero
        vector = embedding_dict.get(word)
        if vector is None:
            # Draw a random vector only for words that actually need one.
            vector = np.random.normal(scale=0.6, size=(embed_dim,))
        matrix[idx] = vector
    return torch.tensor(matrix, dtype=torch.float)

# Rebuild the network with the same sizes used in the training cell's main()
# (embed_dim=300, hidden_dim=16, max_len=50); load_state_dict below requires
# matching parameter shapes.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
embed_dim = 300
hidden_dim = 16
max_len = 50
# This matrix is passed to the constructor; load_state_dict below then loads the
# checkpoint's parameters into the model.
embedding_matrix = load_embedding_matrix("zh_wiki_fasttext_300.txt", vocab, embed_dim)

model = SymptomGRUAttention(embedding_matrix, hidden_dim, len(label_to_idx)).to(device)
# map_location lets a checkpoint saved on GPU load on a CPU-only machine.
model.load_state_dict(torch.load("best_model.pt", map_location=device))
model.eval()


def predict_symptom(text, threshold=0.35):
    """Classify ``text`` with the module-level GRU model.

    Reads the globals ``vocab``, ``max_len``, ``device``, ``model`` and
    ``idx_to_label``.

    Args:
        text: Raw input string.
        threshold: Minimum top-class probability required to report a class;
            below it the result is a "cannot decide, please retry" message.

    Returns:
        dict with the input text, the reported label, the top-class
        probability rounded to 4 decimals, and the three highest-scoring
        ``(label, probability)`` pairs.
    """
    token_ids = text_to_indices(text, vocab, max_len)
    batch = torch.tensor([token_ids], dtype=torch.long).to(device)

    with torch.no_grad():
        logits, _ = model(batch)
        probs = torch.softmax(logits, dim=1)[0]
        pred_idx = torch.argmax(probs).item()
        confidence = probs[pred_idx].item()

        if confidence >= threshold:
            label = idx_to_label[pred_idx]
        else:
            label = "無法判斷，請再輸入一次"

        scores = {}
        for class_idx, prob in enumerate(probs):
            scores[idx_to_label[class_idx]] = float(prob)
        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

        return {
            "輸入": text,
            "預測結果": label,
            "信心分數": round(confidence, 4),
            "前三高分類別": ranked[:3]
        }

# Smoke test: classify one sample sentence and print the input and the
# reported label.
test_text = "我想下課"
result = predict_symptom(test_text)
print(f"\n【輸入】：{result['輸入']}")
print(f"【預測結果】：{result['預測結果']}")
# Uncomment to also list the three highest-scoring classes:
# print("【前三分類分數】：")
# for label, score in result["前三高分類別"]:
#     print(f"{label:<10}: {score:.4f}")


FileNotFoundError: [Errno 2] No such file or directory: 'vocab.json'

從這邊以下開始run就好

In [None]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset
from transformers import BertTokenizer
import pandas as pd
import json

# Read the labelled examples; the code below uses the "text" and "label"
# fields of each record.
with open("sympton_dataset.json", encoding="utf-8") as f:
    raw_data = json.load(f)

df = pd.DataFrame(raw_data)

# Class ids follow the sorted order of the distinct label names.
label2id = {label: i for i, label in enumerate(sorted(df['label'].unique()))}
id2label = {i: label for label, i in label2id.items()}
df['label_id'] = df['label'].map(label2id)

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
# Keep only the columns that are used downstream: raw text and numeric label.
dataset = Dataset.from_pandas(df[["text", "label_id"]])
def tokenize_fn(example):
    """Tokenize one record's ``"text"`` with the module-level ``tokenizer``.

    The text is truncated and padded to a fixed length of 64.
    """
    encode_kwargs = {"truncation": True, "padding": "max_length", "max_length": 64}
    return tokenizer(example["text"], **encode_kwargs)

# Tokenize every record, then rename the label column to "labels".
# NOTE(review): presumably "labels" is the column name Trainer expects — confirm.
tokenized_dataset = dataset.map(tokenize_fn)
tokenized_dataset = tokenized_dataset.rename_column("label_id", "labels")
# Hold out 10% for evaluation; the fixed seed keeps the split reproducible.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for the Trainer.

    Args:
        eval_pred: A ``(logits, labels)`` pair; ``logits`` has one row per
            example and one column per class.

    Returns:
        dict with ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    # zero_division=0 yields the same scores as the default but without an
    # UndefinedMetricWarning for classes that receive no predictions, which
    # happens with this small evaluation split.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


# Build a bert-base-chinese classifier with one output per label and store the
# label maps in the model config.
num_labels = len(label2id)
model = BertForSequenceClassification.from_pretrained(
    "bert-base-chinese",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)

# Evaluate once per epoch; report_to="none" disables external loggers.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"
)

# Trainer
# NOTE(review): the recorded output shows a warning at `trainer = Trainer(`;
# possibly the `tokenizer=` argument being deprecated in newer transformers
# releases — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()
# Save the model and tokenizer side by side so the inference cell can reload
# both from "my_model/".
model.save_pretrained("my_model/")
tokenizer.save_pretrained("my_model/")

import json
# JSON object keys are always strings, so the integer ids are written as
# strings here; the inference cell converts them back with int().
with open("my_model/id2label.json", "w", encoding="utf-8") as f:
    json.dump(id2label, f, ensure_ascii=False)


  from .autonotebook import tqdm as notebook_tqdm





Map: 100%|██████████| 468/468 [00:00<00:00, 2773.07 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,2.485,2.239037,0.446809,0.339539,0.446809,0.351095
2,1.4797,1.324282,0.744681,0.706434,0.744681,0.696454
3,1.0476,0.923804,0.829787,0.879433,0.829787,0.822796
4,0.5666,0.726023,0.765957,0.848582,0.765957,0.764894
5,0.4201,0.677718,0.829787,0.881206,0.829787,0.824316


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


INPUT改這!

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn.functional as F
import json


# Reload the fine-tuned model and tokenizer saved by the training cell.
model = BertForSequenceClassification.from_pretrained("my_model/")
tokenizer = BertTokenizer.from_pretrained("my_model/")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

with open("my_model/id2label.json", encoding="utf-8") as f:
    id2label = json.load(f)
# JSON stored the class ids as strings; convert them back to ints so they can
# be looked up with the argmax index.
id2label = {int(k): v for k, v in id2label.items()}
def predict_symptom_new(text, threshold=0.3):  # lower the threshold if too many inputs come back as "無法判斷"
    """Classify ``text`` with the module-level fine-tuned BERT model.

    Reads the globals ``tokenizer``, ``model``, ``device`` and ``id2label``.

    Args:
        text: Raw input string.
        threshold: Minimum top-class probability required to report a class;
            below it the label is "無法判斷".

    Returns:
        dict with the input text, the reported label, the top-class
        probability rounded to 4 decimals, and the three highest-scoring
        ``(label, probability)`` pairs.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=64)
    inputs = {}
    for key, tensor in encoded.items():
        inputs[key] = tensor.to(device)

    with torch.no_grad():
        logits = model(**inputs).logits
        probs = F.softmax(logits, dim=1)[0]
        pred_idx = torch.argmax(probs).item()
        confidence = probs[pred_idx].item()

        if confidence >= threshold:
            label = id2label[pred_idx]
        else:
            label = "無法判斷"

        scores = {}
        for class_idx, prob in enumerate(probs):
            scores[id2label[class_idx]] = float(prob)
        ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

        return {
            "輸入": text,
            "預測結果": label,
            "信心分數": round(confidence, 4),
            "前三高分類別": ranked[:3]
        }

# Change the sentence below to try another input. The output lists the
# reported label followed by the three highest-scoring classes.
result = predict_symptom_new("我頭好痛")
print(f"【輸入】：{result['輸入']}")
print(f"【預測結果】：{result['預測結果']}")
for label, score in result["前三高分類別"]:
    print(f"{label:<10}: {score:.4f}")


【輸入】：我頭好痛
【預測結果】：無法判斷
流行性感冒     : 0.2380
關節痛       : 0.0939
肌肉痠痛      : 0.0786
