In [None]:
!pip install peft

Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.10.0


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader

import transformers as T

from peft import get_peft_model, LoraConfig

In [None]:
#@title  { display-mode: "form" }
# Experiment switches, rendered as a Colab form through the #@param annotations.
# GLUE task name; also used to build the download URL, the data directory name
# and the Trainer output directory.
DATA_NAME = "STS-B"         #@param ["RTE", "STS-B"] {type:"string"}
# Checkpoint name handed to from_pretrained() for both model and tokenizer.
MODEL_NAME = "roberta-base" #@param  {type:"string"}
# Fine-tuning strategy: LoRA adapters, bias-only (BitFit) or all parameters.
PEFT_TYPE = "full-finetune"        #@param ["lora", "bitfit", "full-finetune"] {type:"string"}
# Random seed forwarded to TrainingArguments.
RANDOM_SEED = 42            #@param  {type:"integer"}

In [None]:
# Download URL of the GLUE archive for the selected task
filename = f"https://dl.fbaipublicfiles.com/glue/data/{DATA_NAME}.zip"

# Directory name the archive unpacks into
entry =  DATA_NAME

# splits -> names of the dataset splits to load
splits = ["train", "dev"]

# Execution device: use the GPU when CUDA is available, otherwise fall back to CPU.
# (The condition used to be inverted and selected "cpu" exactly when CUDA was present.)
device = "cuda" if torch.cuda.is_available() else "cpu"
# Maps split name -> DataFrame; filled when the splits are read from disk
dataset_dict = dict()

In [None]:
# Download the archive only when it is not already on disk.
if not os.path.isfile(f"{DATA_NAME}.zip"):
    # os.system returns the command's exit status; fail loudly instead of
    # continuing with a missing or partial archive.
    if os.system(f"wget {filename}") != 0:
        raise RuntimeError(f"failed to download {filename}")

# Extract independently of the download step: the zip may already exist while the
# split files do not (e.g. an earlier run was interrupted before unzipping).
# The directory itself is not a reliable marker because the Trainer also writes
# its checkpoints into a directory named DATA_NAME, so check the split files.
if not all(os.path.isfile(os.path.join(DATA_NAME, f"{split}.tsv")) for split in splits):
    # -o overwrites without prompting, so a re-run cannot block on user input.
    if os.system(f"unzip -o {DATA_NAME}.zip") != 0:
        raise RuntimeError(f"failed to unzip {DATA_NAME}.zip")

Archive:  STS-B.zip
   creating: STS-B/
  inflating: STS-B/LICENSE.txt       
  inflating: STS-B/dev.tsv           
   creating: STS-B/original/
  inflating: STS-B/original/sts-dev.tsv  
  inflating: STS-B/original/sts-test.tsv  
  inflating: STS-B/original/sts-train.tsv  
  inflating: STS-B/readme.txt        
  inflating: STS-B/test.tsv          
  inflating: STS-B/train.tsv         


--2024-05-15 07:28:06--  https://dl.fbaipublicfiles.com/glue/data/STS-B.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.163.189.108, 3.163.189.51, 3.163.189.96, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.163.189.108|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 802872 (784K) [application/zip]
Saving to: 'STS-B.zip'

     0K .......... .......... .......... .......... ..........  6% 3.86M 0s
    50K .......... .......... .......... .......... .......... 12% 20.6M 0s
   100K .......... .......... .......... .......... .......... 19% 13.1M 0s
   150K .......... .......... .......... .......... .......... 25% 25.6M 0s
   200K .......... .......... .......... .......... .......... 31% 22.5M 0s
   250K .......... .......... .......... .......... .......... 38% 26.2M 0s
   300K .......... .......... .......... .......... .......... 44% 23.8M 0s
   350K .......... .......... .......... .......... .......... 51% 49.5M 0s
  

In [None]:
# Read every split into dataset_dict, keyed by split name.
for split_type in splits:
    dataset_dict[split_type] = pd.read_csv(
        os.path.join(entry, (split_type + ".tsv")),
        sep="\t",
        # The GLUE TSV files are not CSV-quoted: a '"' inside a sentence is literal
        # text. With pandas' default quoting such a character opens a quoted field
        # that swallows the following tabs/newlines, yielding malformed rows that
        # on_bad_lines='skip' then discarded silently.
        quoting=3,  # csv.QUOTE_NONE
        # Report anything that is still malformed instead of dropping it unseen.
        on_bad_lines="warn",
    )

In [None]:
class CustomedDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    `encodings` is a mapping from feature name to a collection indexable per
    example; `labels` is a sized, indexable collection aligned with it.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label collection defines how many examples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        # One example: every encoding field at position `idx`, plus its label,
        # each converted to a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

def process_dataset(df, tokenizer):
    """Turn one GLUE split into tokenizer encodings and training labels.

    Args:
        df: DataFrame with "sentence1" and "sentence2" columns and either a
            "score" column (regression, e.g. STS-B) or a "label" column
            (classification, e.g. RTE).
        tokenizer: Hugging Face style tokenizer; it is called as
            tokenizer(first_sentences, second_sentences, truncation=True, padding=True).

    Returns:
        dict with "encodings" (the tokenizer output) and "labels" (a pandas Series
        indexed 0..n-1 holding float32 scores or int64 class ids).

    Raises:
        KeyError: if df has neither a "score" nor a "label" column.
    """
    # Rows with a missing field cannot be used; ignore_index renumbers the rows
    # 0..n-1 so that labels[idx] lines up with the encodings positionally.
    df = df.dropna(ignore_index=True)

    first = [s.strip() for s in df["sentence1"]]
    second = [s.strip() for s in df["sentence2"]]

    # Hand the sentences over as a pair and let the tokenizer insert the special
    # tokens of its own model. Hand-writing "<s> ... </s></s> ... </s>" only fits
    # RoBERTa-style vocabularies and puts extra spaces next to the special tokens.
    encodings = tokenizer(first, second, truncation=True, padding=True)

    if "score" in df.columns:
        # Regression target (similarity score).
        labels = df["score"].astype(np.float32)
    elif "label" in df.columns:
        # Classification target: map class names to ids in sorted order, so the id
        # assignment is deterministic. Every split must contain all classes for the
        # ids to agree across splits.
        class_to_id = {name: i for i, name in enumerate(sorted(df["label"].unique()))}
        labels = df["label"].map(class_to_id).astype(np.int64)
    else:
        raise KeyError("expected a 'score' or 'label' column in the dataset")

    return {
        "encodings": encodings,
        "labels": labels
    }

In [None]:
# Output size of the classification head depends on the dataset:
# STS-B is a regression task (1 output -> MSELoss); RTE is two-class classification.
num_labels = 1 if DATA_NAME == "STS-B" else 2

# Load the Hugging Face model and its matching tokenizer
model = T.AutoModelForSequenceClassification.from_pretrained( MODEL_NAME, num_labels=num_labels )
tokenizer = T.AutoTokenizer.from_pretrained( MODEL_NAME )

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Report the number of rows loaded for each split
print("training size:  ", len(dataset_dict["train"].index))
print("evaluation size:", len(dataset_dict["dev"].index))

training size:   5706
evaluation size: 1465


In [None]:
# Convert each DataFrame split into a torch Dataset: tokenize it, then wrap the
# resulting encodings/labels. The generator is consumed in order (train, then dev).
train_dataset, dev_dataset = (
    CustomedDataset(**process_dataset(dataset_dict[split], tokenizer))
    for split in ("train", "dev")
)

In [None]:
from sklearn.metrics import accuracy_score
from scipy.stats import spearmanr

def compute_metrics(pred):
    """Compute the GLUE evaluation metric(s) for the selected dataset.

    Args:
        pred: prediction object exposing `label_ids` and `predictions`
            (as handed to `compute_metrics` by the Trainer).

    Returns:
        {"accuracy": ...} for classification datasets, or
        {"pearson": ..., "spearmanr": ...} for STS-B.
    """
    labels = pred.label_ids

    if DATA_NAME != "STS-B":
        # Classification: the predicted class is the arg-max over the logits.
        preds = pred.predictions.argmax(-1)
        return {"accuracy": accuracy_score(labels, preds)}

    # Local import keeps this function self-contained; the notebook's import
    # cell only brings in spearmanr.
    from scipy.stats import pearsonr

    # Regression: the single output column is the predicted score.
    preds = pred.predictions[:, 0]

    # GLUE scores STS-B with Pearson and Spearman correlation. The value stored
    # under "pearson" used to be the Spearman coefficient; report each statistic
    # under its own name.
    return {
        "pearson": pearsonr(labels, preds).statistic,
        "spearmanr": spearmanr(labels, preds).statistic,
    }


In [None]:
#@title  { display-mode: "form" }
# LoRA hyperparameters (Colab form). They are collected into `lora_config` and
# expanded into peft's LoraConfig as r / lora_alpha / lora_dropout.
lora_rank       = 8      #@param  {type:"integer"}
lora_alpha      = 32     #@param  {type:"number"}
lora_dropout    = 0.1    #@param  {type:"number"}

In [None]:
# Keyword arguments for peft.LoraConfig, taken from the form values
lora_config = dict(
    r=lora_rank,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
)

In [None]:
def show_trainable_ratio(model):
    """Return a one-line summary of trainable vs. total parameter counts.

    `model` must provide `num_parameters(only_trainable=...)`. The summary lists
    the trainable count, the total count and the trainable share in percent.
    """
    total = model.num_parameters(only_trainable=False)
    trainable = model.num_parameters(only_trainable=True)
    percent = trainable / total * 100
    return f"trainable params: {trainable:,} || all params: {total:,} || trainable%: {percent:.6f}"

In [None]:
if PEFT_TYPE == "lora":
    # LoRA -> configured through the peft package.
    # task_type="SEQ_CLS" makes peft treat the model as a sequence classifier, so
    # the newly initialised classification head stays trainable and is saved with
    # the adapter. Without it, only the LoRA matrices are trained and the head
    # remains frozen at its random initial values.
    # The dict merge lets an explicit "task_type" in lora_config take precedence.
    model = get_peft_model(model, LoraConfig(**{"task_type": "SEQ_CLS", **lora_config}))

elif PEFT_TYPE == "bitfit":
    # BitFit: freeze every non-bias parameter of the pretrained encoder.
    # The classifier head is newly initialised (not pretrained), so its weights are
    # left trainable; freezing them would pin the head to random values.
    for name, param in model.named_parameters():
        if "bias" not in name and not name.startswith("classifier."):
            param.requires_grad = False
else:
    # Full fine-tuning needs no changes to the model
    pass

print(show_trainable_ratio(model))

trainable params: 124,646,401 || all params: 124,646,401 || trainable%: 100.000000


In [None]:
#@title  { display-mode: "form" }
# Training hyperparameters (Colab form). Each value is forwarded under the same
# name to transformers.TrainingArguments through the `training_args` dict.
num_train_epochs                = 10         #@param  {type:"integer"}
learning_rate                   = 2e-5      #@param  {type:"number"}
per_device_train_batch_size     = 16        #@param  {type:"integer"}
per_device_eval_batch_size      = 16        #@param  {type:"integer"}
gradient_accumulation_steps     = 1         #@param  {type:"integer"}
warmup_steps                    = 200      #@param  {type:"integer"}
weight_decay                    = 0.1      #@param  {type:"number"}
evaluation_strategy             = "steps"   #@param ["steps", "no", "epoch"] {type:"string"}
save_strategy                   = "steps"   #@param ["steps", "no", "epoch"] {type:"string"}
save_steps                      = 150      #@param  {type:"integer"}
eval_steps                      = 150      #@param  {type:"integer"}
adam_epsilon                    = 1e-6      #@param  {type:"number"}
save_total_limit                = 2         #@param  {type:"integer"}
logging_steps                   = 150      #@param  {type:"integer"}

In [None]:
# Keyword arguments later expanded into transformers.TrainingArguments
training_args = dict(
    # Output location, seed and reporting
    output_dir=DATA_NAME,
    seed=RANDOM_SEED,
    report_to="none",
    label_names=["labels"],
    # Optimisation schedule
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    adam_epsilon=adam_epsilon,
    # Evaluation, checkpointing and logging cadence
    evaluation_strategy=evaluation_strategy,
    save_strategy=save_strategy,
    save_steps=save_steps,
    eval_steps=eval_steps,
    save_total_limit=save_total_limit,
    logging_steps=logging_steps,
)

In [None]:
# Assemble the Trainer: model, run arguments, the two torch Datasets (the eval
# set lets the Trainer validate during training) and the custom metric function.
trainer = T.Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=dev_dataset,
    train_dataset=train_dataset,
    args=T.TrainingArguments(**training_args),
    model=model,
)

# Restrict training to a single GPU
setattr(trainer.args, "_n_gpu", 1)

# Start training; the returned TrainOutput is displayed as the cell result
trainer.train()

2024-05-15 07:28:13.714380: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-15 07:28:13.714480: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-15 07:28:13.840599: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Pearson
150,4.4119,0.965465,0.789221
300,0.7881,0.66197,0.872731
450,0.5299,0.703243,0.891105
600,0.4892,0.528469,0.890592
750,0.4197,0.528584,0.895045
900,0.2968,0.692268,0.900986
1050,0.3027,0.45292,0.898377
1200,0.2341,0.425704,0.905187
1350,0.2161,0.455258,0.899627
1500,0.1829,0.443159,0.903503


TrainOutput(global_step=3570, training_loss=0.3936915181264156, metrics={'train_runtime': 2983.4131, 'train_samples_per_second': 19.116, 'train_steps_per_second': 1.197, 'total_flos': 1.339321399204986e+16, 'train_loss': 0.3936915181264156, 'epoch': 10.0})

In [None]:
# Drop the notebook-level references to the model and tokenizer, then ask
# PyTorch to release its cached CUDA memory.
# NOTE(review): `trainer` was built with model=model and presumably still holds
# its own reference, so the weights may stay allocated until it is deleted too.
del model
del tokenizer
torch.cuda.empty_cache()