In [None]:
!pip install peft

Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/199.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/199.1 kB[0m [31m2.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from peft)
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nv

In [None]:
import torch
print("PyTorch 的版本為: {}".format(torch.__version__))

import transformers as T
print("Hugging Face Transformers 的版本為: {}".format(T.__version__))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

from torch.utils.data import Dataset, DataLoader
from peft import get_peft_model, LoraConfig

PyTorch 的版本為: 2.2.1+cu121
Hugging Face Transformers 的版本為: 4.40.2


# 二分類任務
## 準備資料集 (需先下載)
- SST 資料集
    - https://dl.fbaipublicfiles.com/glue/data/SST-2.zip


In [None]:
#@title  { display-mode: "form" }
# Experiment-wide settings. The `#@param` annotations are Colab form metadata
# and must stay on the same line as the value they describe.
# DATA_NAME: dataset to download; also reused as the Trainer output directory.
DATA_NAME = "SST-2"         #@param ["SST-2", "QQP-clean"] {type:"string"}
# MODEL_NAME: checkpoint identifier passed to `from_pretrained` for model and tokenizer.
MODEL_NAME = "google-bert/bert-base-uncased" #@param  {type:"string"}
# PEFT_TYPE: which fine-tuning strategy the PEFT cell applies to the model.
PEFT_TYPE = "lora"        #@param ["lora", "bitfit", "full-finetune"] {type:"string"}
# RANDOM_SEED: forwarded to the training arguments as `seed`.
RANDOM_SEED = 42            #@param  {type:"integer"}

In [None]:
# Build the download URL for the selected dataset
filename = f"https://dl.fbaipublicfiles.com/glue/data/{DATA_NAME}.zip"

# Name of the directory the archive extracts to
entry = "SST-2" if DATA_NAME == "SST-2" else "QQP"

# Names of the dataset splits to load
splits = ["train", "dev"]

# Select the execution device: use the GPU when CUDA is available, otherwise
# fall back to the CPU (the original condition had the two branches swapped).
device = "cuda" if torch.cuda.is_available() else "cpu"

# Will map each split name to its DataFrame
dataset_dict = dict()

### 下載資料與解壓縮

In [None]:
# Download the archive only when it is not already on disk, so re-running the
# cell does not fetch a duplicate copy. A non-zero exit status is surfaced as
# an error instead of being silently ignored.
archive_name = f"{DATA_NAME}.zip"
if not os.path.exists(archive_name):
    if os.system(f"wget {filename}") != 0:
        raise RuntimeError(f"Failed to download {filename}")

# Extract only when the target directory is missing; this keeps the cell
# re-runnable without unzip having to deal with already-extracted files.
if not os.path.isdir(entry):
    if os.system(f"unzip {archive_name}") != 0:
        raise RuntimeError(f"Failed to extract {archive_name}")

0

### 讀取資料
包括 train, dev 兩種分割資料

In [None]:
# Read the TSV file of every split into a DataFrame, keyed by split name
dataset_dict.update({
    split_name: pd.read_csv(os.path.join(entry, f"{split_name}.tsv"), sep="\t")
    for split_name in splits
})

## Task 1: 資料載入


In [None]:
class CustomedDataset(Dataset):
    """Torch dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence
            (anything exposing ``.items()`` whose values support ``[idx]``).
        labels: Per-example labels. May be a list, a NumPy array, a tensor or
            a pandas Series.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Materialise the labels as a plain list so that ``[idx]`` is always
        # positional. A pandas Series indexes by *label*, which raises or
        # returns the wrong row as soon as its index is not 0..n-1
        # (e.g. after filtering or shuffling the DataFrame).
        self.labels = labels.tolist() if hasattr(labels, "tolist") else list(labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors, including ``'labels'``."""
        item = { key: torch.tensor(val[idx]) for key, val in self.encodings.items() }
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

def process_dataset(df, tokenizer):
    """Tokenize one split of the dataset selected by ``DATA_NAME``.

    For SST-2 each ``sentence`` is encoded on its own; otherwise every
    (``question1``, ``question2``) row is encoded as a sentence pair.

    The tokenizer inserts the special tokens itself instead of the text being
    wrapped in literal ``[CLS]``/``[SEP]`` strings. This keeps the closing
    separator when ``truncation`` shortens a long input and lets the tokenizer
    produce its own segment information for the second sentence of a pair.

    Args:
        df: DataFrame holding the text column(s) and the label column.
        tokenizer: Hugging Face tokenizer, called on the batch of texts.

    Returns:
        dict with ``"encodings"`` (the tokenizer output, padded to the longest
        example of the split) and ``"labels"`` (the label column of ``df``).
    """
    if DATA_NAME == "SST-2":
        first_texts = [sentence.strip() for sentence in df["sentence"]]
        second_texts = None
        labels = df["label"]
    else:
        # Iterate over the column *values*; iterating a DataFrame directly
        # yields its column names, which is what broke the original loop.
        first_texts = [question.strip() for question in df["question1"]]
        second_texts = [question.strip() for question in df["question2"]]
        labels = df["is_duplicate"]

    encodings = tokenizer(
        first_texts, text_pair=second_texts, truncation=True, padding=True
    )
    return {
        "encodings": encodings,
        "labels": labels
    }

In [None]:
# Seed the RNGs before the model is built: the classification head is not part
# of the checkpoint and is initialised randomly, so without this the run is not
# reproducible even though the Trainer later receives the same seed.
T.set_seed(RANDOM_SEED)

# Load the pretrained model (with a sequence-classification head) and its tokenizer
model = T.AutoModelForSequenceClassification.from_pretrained( MODEL_NAME )
tokenizer = T.AutoTokenizer.from_pretrained( MODEL_NAME )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Report how many examples each split contains
for caption, split_name in (("training size:  ", "train"), ("evaluation size:", "dev")):
    print(caption, len(dataset_dict[split_name]))

training size:   67349
evaluation size: 872


In [None]:
# Convert each DataFrame into a torch Dataset: tokenize first, then wrap
train_features = process_dataset(dataset_dict["train"], tokenizer)
train_dataset = CustomedDataset(
    encodings=train_features["encodings"],
    labels=train_features["labels"],
)

dev_features = process_dataset(dataset_dict["dev"], tokenizer)
dev_dataset = CustomedDataset(
    encodings=dev_features["encodings"],
    labels=dev_features["labels"],
)

## Task2: 模型驗證


In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (scores whose last axis is
            arg-maxed to get the predicted class) and ``label_ids``.

    Returns:
        dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    predicted_classes = pred.predictions.argmax(-1)
    references = pred.label_ids

    metrics = {}
    metrics['accuracy'] = accuracy_score(references, predicted_classes)
    metrics['f1'] = f1_score(references, predicted_classes)
    return metrics

## Task3: PEFT

以下是各個資料集的baseline:

|dataset|metrics|baseline|
|----|----|----|
|CoLA|Matthews Corr|0.6|
|SST2|Accuracy|0.88|
|MRPC|Accuracy|0.8|
|STSB|Pearson-Spearman Corr|0.8|
|QQP|F1 / Accuracy|0.8/0.8|
|MNLI_Matched|Accuracy|0.8|
|MNLI_Mismatched|Accuracy|0.8|
|QNLI|Accuracy|0.85|
|RTE|Accuracy|0.7|
|WNLI|Accuracy|0.8|

In [None]:
# Keyword arguments for LoraConfig; filled in by the form cell below
lora_config = {}

In [None]:
#@title  { display-mode: "form" }
# LoRA hyperparameters. Each entry is later expanded into LoraConfig(**lora_config);
# the `#@param` annotations are Colab form metadata.
lora_config["r"]               = 8      #@param  {type:"integer"}
lora_config["lora_alpha"]      = 32     #@param  {type:"number"}
lora_config["lora_dropout"]    = 0.1    #@param  {type:"number"}

In [None]:
def show_trainable_ratio(model):
    """Summarise how many of the model's parameters are trainable.

    Args:
        model: Object exposing ``num_parameters(only_trainable=...)``.

    Returns:
        A one-line string with the trainable count, the total count and the
        trainable share as a percentage with six decimals.
    """
    trainable_count = model.num_parameters(only_trainable=True)
    total_count = model.num_parameters(only_trainable=False)
    trainable_pct = trainable_count / total_count * 100
    return (
        f"trainable params: {trainable_count:,} || "
        f"all params: {total_count:,} || "
        f"trainable%: {trainable_pct:.6f}"
    )

In [None]:
if PEFT_TYPE == "lora":
    # LoRA: wrap the model with the peft package. Declaring the task as
    # sequence classification makes peft keep the classification head
    # trainable; without a task type only the LoRA matrices are trained and
    # the freshly initialised head stays frozen at its random values.
    # A `task_type` already present in `lora_config` takes precedence.
    model = get_peft_model(
        model, LoraConfig(**{"task_type": "SEQ_CLS", **lora_config})
    )

elif PEFT_TYPE == "bitfit":
    # BitFit: freeze every non-bias parameter, except for the classification
    # head, which is newly initialised and therefore has to be trained.
    for name, param in model.named_parameters():
        is_head = name.startswith(("classifier.", "score."))
        if "bias" not in name and not is_head:
            param.requires_grad = False
else:
    # Full fine-tuning: leave every parameter trainable
    pass

print(show_trainable_ratio(model))

trainable params: 294,912 || all params: 109,778,690 || trainable%: 0.268642


In [None]:
# Base keyword arguments for T.TrainingArguments; the form cell below adds the rest
training_args = dict(
    output_dir=DATA_NAME,       # where the Trainer writes its outputs
    seed=RANDOM_SEED,           # random seed
    label_names=["labels"],
)

In [None]:
#@title  { display-mode: "form" }
# Training hyperparameters. Every entry is stored in `training_args`, which is
# later expanded into T.TrainingArguments(**training_args); the `#@param`
# annotations are Colab form metadata.
training_args["num_train_epochs"]               = 3         #@param  {type:"integer"}
training_args["learning_rate"]                  = 1e-4      #@param  {type:"number"}
training_args["per_device_train_batch_size"]    = 8         #@param  {type:"integer"}
training_args["per_device_eval_batch_size"]     = 8         #@param  {type:"integer"}
training_args["gradient_accumulation_steps"]    = 1         #@param  {type:"integer"}
training_args["warmup_steps"]                   = 50        #@param  {type:"integer"}
training_args["weight_decay"]                   = 1e-4      #@param  {type:"number"}
# Evaluation, checkpointing and logging all use the same 500-step interval.
training_args["evaluation_strategy"]            = "steps"   #@param ["steps", "no", "epoch"] {type:"string"}
training_args["save_strategy"]                  = "steps"   #@param ["steps", "no", "epoch"] {type:"string"}
training_args["save_steps"]                     = 500       #@param  {type:"integer"}
training_args["eval_steps"]                     = 500       #@param  {type:"integer"}
training_args["save_total_limit"]               = 2         #@param  {type:"integer"}
training_args["logging_steps"]                  = 500       #@param  {type:"integer"}

In [None]:
# Materialise the argument dict into the object the Trainer expects
hf_training_args = T.TrainingArguments(**training_args)

trainer = T.Trainer(
    model=model,                        # the Hugging Face model
    args=hf_training_args,              # arguments required by the Trainer
    train_dataset=train_dataset,        # training set (a PyTorch Dataset)
    eval_dataset=dev_dataset,           # validation set (a PyTorch Dataset); enables evaluation during training
    compute_metrics=compute_metrics,    # custom evaluation metrics
)

# Restrict training to a single GPU
trainer.args._n_gpu = 1

# Start training; the returned TrainOutput is displayed as the cell result
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1
500,0.5259,0.356915,0.862385,0.866962
1000,0.3598,0.319193,0.873853,0.881974
1500,0.3316,0.29714,0.887615,0.890869
2000,0.3224,0.292822,0.888761,0.893524
2500,0.2851,0.327975,0.879587,0.88746
3000,0.296,0.287346,0.891055,0.897075
3500,0.2814,0.276239,0.900229,0.904918
4000,0.2858,0.28664,0.90367,0.907285
4500,0.2763,0.306699,0.893349,0.900107
5000,0.2559,0.304793,0.901376,0.906522


TrainOutput(global_step=25257, training_loss=0.24395249806235067, metrics={'train_runtime': 2035.8708, 'train_samples_per_second': 99.244, 'train_steps_per_second': 12.406, 'total_flos': 6876355387561704.0, 'train_loss': 0.24395249806235067, 'epoch': 3.0})

In [None]:
# Show the training/evaluation log as a table instead of dumping the raw list
# of dicts; the notebook front end truncates long frames, so the output stays short.
pd.DataFrame(trainer.state.log_history)

[{'loss': 0.5259,
  'grad_norm': 9.038320541381836,
  'learning_rate': 9.821478160828342e-05,
  'epoch': 0.05938947618482005,
  'step': 500},
 {'eval_loss': 0.35691511631011963,
  'eval_accuracy': 0.8623853211009175,
  'eval_f1': 0.8669623059866963,
  'eval_runtime': 2.8692,
  'eval_samples_per_second': 303.921,
  'eval_steps_per_second': 37.99,
  'epoch': 0.05938947618482005,
  'step': 500},
 {'loss': 0.3598,
  'grad_norm': 2.472405433654785,
  'learning_rate': 9.623120561748721e-05,
  'epoch': 0.1187789523696401,
  'step': 1000},
 {'eval_loss': 0.3191927969455719,
  'eval_accuracy': 0.8738532110091743,
  'eval_f1': 0.8819742489270387,
  'eval_runtime': 3.0119,
  'eval_samples_per_second': 289.514,
  'eval_steps_per_second': 36.189,
  'epoch': 0.1187789523696401,
  'step': 1000},
 {'loss': 0.3316,
  'grad_norm': 5.850961685180664,
  'learning_rate': 9.4247629626691e-05,
  'epoch': 0.17816842855446016,
  'step': 1500},
 {'eval_loss': 0.29713982343673706,
  'eval_accuracy': 0.8876146788

In [None]:
# Snapshot the exact package versions installed in this runtime
!pip freeze > requirements.txt
# Print the snapshot into the cell output
!cat requirements.txt
# Colab-only helper: hand the snapshot file to the browser for download
from google.colab import files
files.download('requirements.txt')

absl-py==1.4.0
accelerate==0.30.1
aiohttp==3.9.5
aiosignal==1.3.1
alabaster==0.7.16
albumentations==1.3.1
altair==4.2.2
annotated-types==0.6.0
anyio==3.7.1
appdirs==1.4.4
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
array_record==0.5.1
arviz==0.15.1
astropy==5.3.4
astunparse==1.6.3
async-timeout==4.0.3
atpublic==4.1.0
attrs==23.2.0
audioread==3.0.1
autograd==1.6.2
Babel==2.15.0
backcall==0.2.0
beautifulsoup4==4.12.3
bidict==0.23.1
bigframes==1.5.0
bleach==6.1.0
blinker==1.4
blis==0.7.11
blosc2==2.0.0
bokeh==3.3.4
bqplot==0.12.43
branca==0.7.2
build==1.2.1
CacheControl==0.14.0
cachetools==5.3.3
catalogue==2.0.10
certifi==2024.2.2
cffi==1.16.0
chardet==5.2.0
charset-normalizer==3.3.2
chex==0.1.86
click==8.1.7
click-plugins==1.1.1
cligj==0.7.2
cloudpathlib==0.16.0
cloudpickle==2.2.1
cmake==3.27.9
cmdstanpy==1.2.2
colorcet==3.1.0
colorlover==0.3.0
colour==0.1.5
community==1.0.0b1
confection==0.1.4
cons==0.4.6
contextlib2==21.6.0
contourpy==1.2.1
cryptography==42.0.7
cuda-python==12.2.1

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>