In [1]:
!pip install peft

Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.13.0->peft)
  Using cached nvidia_cudnn_cu12-8.9.2.26

In [2]:
import torch
print("PyTorch 的版本為: {}".format(torch.__version__))

import transformers as T
print("Hugging Face Transformers 的版本為: {}".format(T.__version__))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from torch.utils.data import Dataset, DataLoader
from peft import get_peft_model, LoraConfig

PyTorch 的版本為: 2.2.1+cu121
Hugging Face Transformers 的版本為: 4.40.2


# 二分類任務
## 準備資料集 (需先下載)
- SST 資料集
    - https://dl.fbaipublicfiles.com/glue/data/SST-2.zip


In [3]:
#@title  { display-mode: "form" }
DATA_NAME = "SST-2"         #@param ["SST-2", "RTE"] {type:"string"}
MODEL_NAME = "microsoft/deberta-base" #@param  {type:"string"}
PEFT_TYPE = "full-finetune"          #@param ["lora", "bitfit", "full-finetune"] {type:"string"}
RANDOM_SEED = 50            #@param  {type:"integer"}

In [4]:
# 對資料集產生對應的下載網址
filename = f"https://dl.fbaipublicfiles.com/glue/data/{DATA_NAME}.zip"

# 解壓縮後的檔名
entry = "SST-2" if DATA_NAME == "SST-2" else "RTE"

# splits -> 資料集的分割名稱
splits = ["train", "dev"]

# 設定執行環境 (CPU or GPU)
device = "cpu" if torch.cuda.is_available() else "cuda"
dataset_dict = dict()

In [5]:
# 下載資料集
os.system(f"wget {filename}")
# 解壓縮
os.system(f"unzip {DATA_NAME}.zip")

0

In [6]:
# 讀取所有 splits 資料
for split_type in splits:
    dataset_dict[split_type] = pd.read_csv(
        os.path.join(entry, (split_type + ".tsv")),
        sep="\t"
    )

## Task 1: 資料載入


In [7]:
class CustomedDataset(Dataset):

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = { key: torch.tensor(val[idx]) for key, val in self.encodings.items() }
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def process_dataset(df, tokenizer):
    texts = []
    labels = []

    if DATA_NAME == "SST-2":
        sentences = df["sentence"].tolist()
        texts = [f"[CLS] {sentence.strip()} [SEP]" for sentence in sentences]
        labels = df["label"].tolist()
    else:
        q1_list = df["question1"].tolist()
        q2_list = df["question2"].tolist()
        texts = [f"[CLS] {q1.strip()} [SEP] {q2.strip()} [SEP]" for q1, q2 in zip(q1_list, q2_list)]
        labels = df["is_duplicate"].tolist()

    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        add_special_tokens=False
    )

    return {"encodings": encodings, "labels": labels}


In [8]:
# load model and tokenizer
model = T.AutoModelForSequenceClassification.from_pretrained( MODEL_NAME )
tokenizer = T.AutoTokenizer.from_pretrained( MODEL_NAME )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [9]:
# 輸出資料集數量
print("training size:  ", len(dataset_dict["train"]))
print("evaluation size:", len(dataset_dict["dev"]))

training size:   67349
evaluation size: 872


In [10]:
# 從 dataframe 轉換到 torch dataset
train_dataset = CustomedDataset( **process_dataset(dataset_dict["train"], tokenizer) )
dev_dataset = CustomedDataset( **process_dataset(dataset_dict["dev"], tokenizer) )

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


## Task2: 模型驗證


In [11]:
from sklearn.metrics import accuracy_score, f1_score


def compute_metrics(pred):
    # 請參考GLUE benchmark的官方網頁，使用和資料集對應的evaluation matrics
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {
        'accuracy': accuracy,
        'f1': f1
    }


## Task3: PEFT

以下是各個資料集的baseline:

|dataset|metrics|baseline|
|----|----|----|
|CoLA|Matthew's Corr|0.6|
|SST2|Accuracy|0.88|
|MRPC|Accuracy|0.8|
|STSB|Pearson-Spearman Corr|0.8|
|QQP|F1 / Accuracy|0.8/0.8|
|MNLI_Matched|Accuracy|0.8|
|MNLI_Mismatched|Accuracy|0.8|
|QNLI|Accuracy|0.85|
|RTE|Accuracy|0.7|
|WNLI|Accuracy|0.8|

In [12]:
lora_config = dict()

In [13]:
#@title  { display-mode: "form" }
lora_config["r"]               = 8      #@param  {type:"integer"}
lora_config["lora_alpha"]      = 32     #@param  {type:"number"}
lora_config["lora_dropout"]    = 0.1    #@param  {type:"number"}

In [14]:
def show_trainable_ratio(model):
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    all_params = sum(p.numel() for p in model.parameters())
    trainable_percentage = trainable_params / all_params * 100
    return f"trainable params: {trainable_params:,} || all params: {all_params:,} || trainable%: {trainable_percentage:.6f}"


In [15]:
if PEFT_TYPE == "lora":
    # LoRA -> 採用 peft 套件的設定
    model = get_peft_model(model, LoraConfig(**lora_config))

elif PEFT_TYPE == "bitfit":
    # 凍結非 bias 的參數
    for name, param in model.named_parameters():
        if "bias" not in name:
            param.requires_grad = False
else:
    # full finetune 不用做任何改動
    pass

print(show_trainable_ratio(model))

trainable params: 139,193,858 || all params: 139,193,858 || trainable%: 100.000000


In [16]:
training_args = {
    "output_dir":   DATA_NAME,      # 設定輸出位置
    "seed":         RANDOM_SEED,    # 設定亂數種子
}

In [17]:
#@title  { display-mode: "form" }
num_train_epochs                = 10         #@param  {type:"integer"}
learning_rate                   = 3e-5      #@param  {type:"number"}
per_device_train_batch_size     = 16        #@param  {type:"integer"}
per_device_eval_batch_size      = 16        #@param  {type:"integer"}
gradient_accumulation_steps     = 1         #@param  {type:"integer"}
warmup_steps                    = 50      #@param  {type:"integer"}
weight_decay                    = 0      #@param  {type:"number"}
evaluation_strategy             = "steps"   #@param ["steps", "no", "epoch"] {type:"string"}
save_strategy                   = "steps"   #@param ["steps", "no", "epoch"] {type:"string"}
save_steps                      = 50      #@param  {type:"integer"}
eval_steps                      = 50      #@param  {type:"integer"}
adam_epsilon                    = 1e-6      #@param  {type:"number"}
save_total_limit                = 2         #@param  {type:"integer"}
logging_steps                   = 50      #@param  {type:"integer"}

In [18]:
training_args = {
    "output_dir"                    :   DATA_NAME,                  # 設定輸出位置
    "seed"                          :   RANDOM_SEED,                # 設定亂數種子
    "label_names"                   :   ["labels"],
    "num_train_epochs"              :   num_train_epochs,
    "learning_rate"                 :   learning_rate,
    "per_device_train_batch_size"   :   per_device_train_batch_size,
    "per_device_eval_batch_size"    :   per_device_eval_batch_size,
    "gradient_accumulation_steps"   :   gradient_accumulation_steps,
    "warmup_steps"                  :   warmup_steps,
    "weight_decay"                  :   weight_decay,
    "evaluation_strategy"           :   evaluation_strategy,
    "save_strategy"                 :   save_strategy,
    "save_steps"                    :   save_steps,
    "eval_steps"                    :   eval_steps,
    "save_total_limit"              :   save_total_limit,
    "logging_steps"                 :   logging_steps
}

In [None]:
trainer = T.Trainer(
    model=model,                                        # 🤗 的模型
    args=T.TrainingArguments(**training_args),          # Trainer 所需要的引數
    train_dataset=train_dataset,                        # 訓練集 (注意是 PyTorch Dataset)
    eval_dataset=dev_dataset,                           # 驗證集 (注意是 PyTorch Dataset)，可使 Trainer 在進行訓練時也進行驗證
    compute_metrics=compute_metrics,                    # 自定的評估的指標
)

# 指定使用 1 個 GPU 進行訓練
trainer.args._n_gpu=1

# 開始進行模型訓練
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
50,0.688,0.690233,0.625,0.459504
100,0.5782,0.499353,0.832569,0.814721
150,0.4327,0.352376,0.885321,0.876847
200,0.4188,0.303031,0.900229,0.906952
250,0.3178,0.266111,0.91055,0.908665
300,0.3289,0.263486,0.912844,0.917391
350,0.3,0.253659,0.917431,0.920705
400,0.3212,0.243656,0.927752,0.930233
450,0.3139,0.286115,0.905963,0.905312
500,0.279,0.22706,0.911697,0.911596


In [None]:
# 測試模型
trainer.predict(test_dataset)