# 安裝套件

In [1]:
# 套件安裝指令
!pip install --extra-index-url https://download.pytorch.org/whl/cu125 torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 accelerate==1.8.1 transformers datasets evaluate scikit-learn google.colab pandas

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu125
Collecting torch==2.4.1
  Downloading torch-2.4.1-cp312-cp312-manylinux1_x86_64.whl.metadata (26 kB)
Collecting torchvision==0.19.1
  Downloading torchvision-0.19.1-cp312-cp312-manylinux1_x86_64.whl.metadata (6.0 kB)
Collecting torchaudio==2.4.1
  Downloading torchaudio-2.4.1-cp312-cp312-manylinux1_x86_64.whl.metadata (6.4 kB)
Collecting accelerate==1.8.1
  Downloading accelerate-1.8.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.4.1)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.4.1)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.4.1)
  Downloading nvidia_cu

# 掛載 Google Drive

In [2]:
# 會需要使用者同意授權/存取 Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
# 切換目錄 (Colab 預設目錄為 /content，使用 %cd 切換目錄)
%cd /content/drive/MyDrive/nlp_bert

/content/drive/MyDrive/nlp_bert


# 觀看系統設定

In [4]:
!lsb_release -a

No LSB modules are available.
Distributor ID:	Ubuntu
Description:	Ubuntu 22.04.5 LTS
Release:	22.04
Codename:	jammy


In [5]:
!nvidia-smi

Thu Feb 12 08:01:07 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 580.82.07              Driver Version: 580.82.07      CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   40C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [6]:
!nvcc -V

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Fri_Feb_21_20:23:50_PST_2025
Cuda compilation tools, release 12.8, V12.8.93
Build cuda_12.8.r12.8/compiler.35583870_0


In [7]:
!python -V

Python 3.12.12


# 微調模型

In [8]:
import torch
print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.device(0))

True
0
<torch.cuda.device object at 0x7b3c82bbce30>


# 匯入套件

In [9]:
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

import random
import json
import pandas as pd
from sklearn.metrics import f1_score

# 基本參數設定

In [10]:
model_name = 'google-bert/bert-base-chinese' # 預訓練模型名稱
max_seq_length = 512 # 可訓練的序列最大長度
num_labels = 8 # 多元分類 (8 種情緒)
output_dir = './output' # 輸出模型資料夾

# 載入 tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

# 讀取 HuggingFace 資料集

In [12]:
# 設定隨機種子，確保實驗可重現
seed = 42

# 從 HuggingFace 讀取資料集
from datasets import load_dataset
dataset_hf = load_dataset("Johnson8187/Chinese_Multi-Emotion_Dialogue_Dataset", split='train')
df = dataset_hf.to_pandas()

# 查看資料集基本資訊
print(f"資料集大小: {len(df)}")
print(f"欄位: {df.columns.tolist()}")
print(f"情緒類別: {df['emotion'].unique().tolist()}")
print(f"各類別數量:\n{df['emotion'].value_counts()}")

# 建立 label 對應表 (文字 -> 整數)
label_list = sorted(df['emotion'].unique().tolist())
label2id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for label, idx in label2id.items()}

print(f"\nLabel 對應表:")
for label, idx in label2id.items():
    print(f"  {label} -> {idx}")

# 將文字 label 轉換為整數
df['labels'] = df['emotion'].map(label2id)

# 洗牌
df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

sentences = df['text'].tolist()
labels = df['labels'].tolist()

README.md: 0.00B [00:00, ?B/s]



data.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/4159 [00:00<?, ? examples/s]

資料集大小: 4159
欄位: ['text', 'emotion']
情緒類別: ['平淡語氣', '開心語調', '悲傷語調', '憤怒語調', '驚奇語調', '厭惡語調', '疑問語調', '關切語調']
各類別數量:
emotion
平淡語氣    705
開心語調    592
關切語調    560
憤怒語調    527
驚奇語調    499
悲傷語調    486
厭惡語調    404
疑問語調    386
Name: count, dtype: int64

Label 對應表:
  厭惡語調 -> 0
  平淡語氣 -> 1
  悲傷語調 -> 2
  憤怒語調 -> 3
  疑問語調 -> 4
  開心語調 -> 5
  關切語調 -> 6
  驚奇語調 -> 7


# 轉換成 huggingface trainer 可以使用的 datasets

In [13]:
# 建立 Dataset
dataset = Dataset.from_dict({
    'sentences': sentences,
    'labels': labels
})

# 回傳切分資料 (訓練 和 驗證)
dataset = dataset.train_test_split(test_size=0.2)
'''
print(dataset) 的內容範例如下 (實際筆數以資料集為準):

DatasetDict({
    train: Dataset({
        features: ['sentences', 'labels'],
        num_rows: 3327  # 4159 * 0.8
    })
    test: Dataset({
        features: ['sentences', 'labels'],
        num_rows: 832   # 4159 * 0.2
    })
})
'''

# 預處理資料
def preprocess_data(dataset):
    # 將句子轉換為 token (tokenization)
    return tokenizer(
        dataset['sentences'],
        truncation=True,
        padding=True,
        return_tensors='pt',
        max_length=max_seq_length
    )

# 轉換資料
train_data = dataset['train'].map(preprocess_data, batched=True)
valid_data = dataset['test'].map(preprocess_data, batched=True)

# 建立 DatasetDict
dataset_dict = DatasetDict({
    'train': train_data,
    'test': valid_data
})

Map:   0%|          | 0/3327 [00:00<?, ? examples/s]

Map:   0%|          | 0/832 [00:00<?, ? examples/s]

# 設定模型評估指標

In [14]:
# 計算模型評估指標
def compute_metrics(predicted_results):
    # 取得真實標籤
    labels = predicted_results.label_ids

    # 取得預測結果
    preds = predicted_results.predictions.argmax(-1)

    # 計算 F1 score (多元分類使用 weighted)
    # 可參考: https://blog.csdn.net/qq_40671063/article/details/130447922
    f1 = f1_score(labels, preds, average='weighted')
    return {
        'f1': f1,
    }

# 微調模型

In [22]:
# 讀取模型 (設定 id2label 和 label2id，讓模型知道每個 id 對應的標籤名稱)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# 設定訓練參數
training_args = TrainingArguments(
    output_dir=output_dir, # 輸出資料夾
    # overwrite_output_dir=True,
    num_train_epochs=4, # 訓練回合數
    per_device_train_batch_size=16, # 批次大小
    per_device_eval_batch_size=16, # 批次大小
    gradient_accumulation_steps=2, # 梯度累積步數，主要是為了讓較小的 GPU 也能訓練較大的 batch size
    learning_rate=5e-5, # 預設會將每個小 batch 的 loss 除以累積步數（loss / 2），讓最終梯度等同於單一大 batch，但這會讓梯度變小（等於將 learning rate 除以 2），所以要手動將 learning rate 乘以累積步數來抵消
    warmup_steps=31, # 預熱步數，主要是讓 learning rate 從 0 緩慢增加到設定的 learning rate，避免一開始學習率過大導致模型不穩定
    weight_decay=0.01, # 權重衰減，主要是為了正則化，避免過擬合，讓 loss 曲線較平滑
    eval_strategy="epoch", # epoch, steps, no
    save_strategy="epoch", # epoch, steps, no
    save_total_limit=2, # 最多儲存模型數量
    metric_for_best_model="f1", # 用 F1 挑 best
    greater_is_better=True, # F1 越大越好
    load_best_model_at_end=True, # Trainer 會追蹤 metric_for_best_model（預設 eval_loss，越小越好）來判斷最佳模型，訓練結束時自動將該最佳檢查點載入記憶體覆蓋最終模型
    logging_strategy="steps",
    logging_steps=50,
    seed=seed, # 隨機種子，主要是為了確保實驗可重現
    lr_scheduler_type="linear", # https://blog.csdn.net/muyao987/article/details/139319466
    report_to="none", # 關閉內建的實驗追蹤功能 (如 wandb, tensorboard)
)

# 設定 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['test'],
    data_collator=DataCollatorWithPadding(tokenizer), # 修正: 使用 DataCollatorWithPadding
    compute_metrics=compute_metrics,
)

# 開始訓練
trainer.train()

# 儲存模型
trainer.save_model(output_dir)

# 儲存 tokenizer
tokenizer.save_pretrained(output_dir)

# 儲存 label 對應表 (方便預測時使用)
import json
with open(f'{output_dir}/label_mapping.json', 'w', encoding='utf-8') as f:
    json.dump({'label2id': label2id, 'id2label': {str(k): v for k, v in id2label.items()}}, f, ensure_ascii=False, indent=2)

print(f"模型已儲存至 {output_dir}")
print(f"Label 對應表已儲存至 {output_dir}/label_mapping.json")

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: google-bert/bert-base-chinese
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss,F1
1,1.22934,0.495043,0.849107
2,0.708316,0.419692,0.863442
3,0.342139,0.435488,0.870713
4,0.117301,0.462694,0.89422


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

模型已儲存至 ./output
Label 對應表已儲存至 ./output/label_mapping.json
