# Fine-Tuning Language Models for Text Classification
用于文本分类的微调语言模型

## Preparation for Google Colab

In [None]:
import os
from google.colab import drive

#  挂载 google 云盘
drive.mount("/content/drive")

print(os.getcwd())  # /content

# print(os.listdir("/content/drive/MyDrive/"))

# print(os.listdir("/content/drive/MyDrive/Colab Notebooks"))

# if os.getcwd() != "/content/drive/MyDrive":
#     os.chdir("/content/drive/MyDrive")

# print(os.getcwd())

In [None]:
# 提前将 requirements.txt 放在 google 云盘上
!pip install -r /content/drive/MyDrive/requirements.txt

In [3]:
subdir = "ch05d"
work_path = "/content/drive/MyDrive/" + subdir
if not os.path.exists(work_path):
    os.mkdir(work_path)
os.chdir(work_path)
print(os.getcwd())

/content/drive/MyDrive/ch05d


In [4]:
!apt-get install tree && tree -a "./"

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tree is already the newest version (2.0.2-1).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
[01;34m./[0m

0 directories, 0 files


## Loading Pre-Trained Model

In [5]:
from transformers import (
    DistilBertConfig,
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
)

MODEL_PATH = "distilbert-base-uncased"
config = DistilBertConfig.from_pretrained(MODEL_PATH, num_labels=1)
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_PATH)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH, config=config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Loading Dataset

In [6]:
import datasets
from datasets import load_dataset

stsb_train = load_dataset("glue", "stsb", split="train")
print("stsb_train:", stsb_train)

stsb_validation = load_dataset("glue", "stsb", split="validation")
print("stsb_validation:", stsb_validation)
stsb_validation = stsb_validation.shuffle(seed=42)
print("stsb_validation:", stsb_validation)

stsb_val = datasets.Dataset.from_dict(stsb_validation[:750])
print("stsb_val:", stsb_val)

stsb_test = datasets.Dataset.from_dict(stsb_validation[750:])
print("stsb_test:", stsb_test)

stsb_train: Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 5749
})
stsb_validation: Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 1500
})
stsb_validation: Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 1500
})
stsb_val: Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 750
})
stsb_test: Dataset({
    features: ['sentence1', 'sentence2', 'label', 'idx'],
    num_rows: 750
})


In [7]:
stsb_train.shape, stsb_val.shape, stsb_test.shape

((5749, 4), (750, 4), (750, 4))

In [8]:
stsb_train[0]

{'sentence1': 'A plane is taking off.',
 'sentence2': 'An air plane is taking off.',
 'label': 5.0,
 'idx': 0}

In [9]:
import pandas as pd

pd.DataFrame(stsb_train)

Unnamed: 0,sentence1,sentence2,label,idx
0,A plane is taking off.,An air plane is taking off.,5.00,0
1,A man is playing a large flute.,A man is playing a flute.,3.80,1
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,3.80,2
3,Three men are playing chess.,Two men are playing chess.,2.60,3
4,A man is playing the cello.,A man seated is playing the cello.,4.25,4
...,...,...,...,...
5744,Severe Gales As Storm Clodagh Hits Britain,Merkel pledges NATO solidarity with Latvia,0.00,5744
5745,Dozens of Egyptians hostages taken by Libyan t...,Egyptian boat crash death toll rises as more b...,0.00,5745
5746,President heading to Bahrain,President Xi: China to continue help to fight ...,0.00,5746
5747,"China, India vow to further bilateral ties",China Scrambles to Reassure Jittery Stock Traders,0.00,5747


In [10]:
enc_train = stsb_train.map(
    lambda e: tokenizer(e["sentence1"], e["sentence2"], padding=True, truncation=True),
    batched=True,
    batch_size=1000,
)
enc_val = stsb_val.map(
    lambda e: tokenizer(e["sentence1"], e["sentence2"], padding=True, truncation=True),
    batched=True,
    batch_size=1000,
)
enc_test = stsb_test.map(
    lambda e: tokenizer(e["sentence1"], e["sentence2"], padding=True, truncation=True),
    batched=True,
    batch_size=1000,
)

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

In [11]:
enc_train.shape, enc_val.shape, enc_test.shape

((5749, 6), (750, 6), (750, 6))

In [12]:
enc_train[0]

{'sentence1': 'A plane is taking off.',
 'sentence2': 'An air plane is taking off.',
 'label': 5.0,
 'idx': 0,
 'input_ids': [101,
  1037,
  4946,
  2003,
  2635,
  2125,
  1012,
  102,
  2019,
  2250,
  4946,
  2003,
  2635,
  2125,
  1012,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [13]:
import pandas as pd

pd.DataFrame(enc_train)

Unnamed: 0,sentence1,sentence2,label,idx,input_ids,attention_mask
0,A plane is taking off.,An air plane is taking off.,5.00,0,"[101, 1037, 4946, 2003, 2635, 2125, 1012, 102,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,A man is playing a large flute.,A man is playing a flute.,3.80,1,"[101, 1037, 2158, 2003, 2652, 1037, 2312, 8928...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,3.80,2,"[101, 1037, 2158, 2003, 9359, 14021, 5596, 209...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,Three men are playing chess.,Two men are playing chess.,2.60,3,"[101, 2093, 2273, 2024, 2652, 7433, 1012, 102,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,A man is playing the cello.,A man seated is playing the cello.,4.25,4,"[101, 1037, 2158, 2003, 2652, 1996, 10145, 101...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...,...,...
5744,Severe Gales As Storm Clodagh Hits Britain,Merkel pledges NATO solidarity with Latvia,0.00,5744,"[101, 5729, 14554, 2015, 2004, 4040, 18856, 13...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
5745,Dozens of Egyptians hostages taken by Libyan t...,Egyptian boat crash death toll rises as more b...,0.00,5745,"[101, 9877, 1997, 23437, 19323, 2579, 2011, 19...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
5746,President heading to Bahrain,President Xi: China to continue help to fight ...,0.00,5746,"[101, 2343, 5825, 2000, 15195, 102, 2343, 8418...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
5747,"China, India vow to further bilateral ties",China Scrambles to Reassure Jittery Stock Traders,0.00,5747,"[101, 2859, 1010, 2634, 19076, 2000, 2582, 177...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


## Training with Trainer Class

In [14]:
from transformers import TrainingArguments, Trainer
from torch import cuda

training_args = TrainingArguments(
    # The output directory where the model predictions and checkpoints will be written
    output_dir="./stsb-model",
    do_train=True,
    do_eval=True,
    #  The number of epochs, defaults to 3.0
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Number of steps used for a linear warmup
    warmup_steps=100,
    weight_decay=0.01,
    # TensorBoard log directory
    logging_strategy="steps",
    logging_dir="./logs",
    logging_steps=50,
    # other options : no, steps
    evaluation_strategy="steps",
    save_strategy="epoch",
    fp16=cuda.is_available(),
    report_to=["tensorboard"]
)
training_args



TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=50,
eval_strategy=IntervalStrategy.STEPS,
eval_use

In [15]:
import numpy as np
from scipy.stats import pearsonr
from scipy.stats import spearmanr


def compute_metrics(pred):
    preds = np.squeeze(pred.predictions)
    return {
        "MSE": ((preds - pred.label_ids) ** 2).mean().item(),
        "RMSE": (np.sqrt(((preds - pred.label_ids) ** 2).mean())).item(),
        "MAE": (np.abs(preds - pred.label_ids)).mean().item(),
        "Pearson": pearsonr(preds, pred.label_ids)[0],
        "Spearman's Rank": spearmanr(preds, pred.label_ids)[0],
    }

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=enc_train,
    eval_dataset=enc_val,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)
trainer

  trainer = Trainer(


<transformers.trainer.Trainer at 0x7aeabb4a4ed0>

In [17]:
train_result = trainer.train()

Step,Training Loss,Validation Loss,Mse,Rmse,Mae,Pearson,Spearman's rank
50,5.8273,2.443084,2.443084,1.563037,1.286075,0.394141,0.417262
100,1.4472,0.695558,0.695558,0.834001,0.683476,0.834711,0.83838
150,0.7875,0.60465,0.60465,0.777592,0.601106,0.854914,0.855888
200,0.6755,0.595684,0.595684,0.771806,0.612848,0.863073,0.857392
250,0.5322,0.573226,0.573226,0.757117,0.595859,0.860448,0.855006
300,0.4844,0.546013,0.546013,0.738927,0.574077,0.869681,0.865049
350,0.458,0.549105,0.549105,0.741016,0.570889,0.866739,0.860722
400,0.345,0.566414,0.566414,0.752605,0.570922,0.867127,0.860671
450,0.2834,0.543244,0.543244,0.737051,0.562696,0.869544,0.864308
500,0.2957,0.550302,0.550302,0.741823,0.562045,0.868784,0.864132


In [18]:
!tree -a ./

[01;34m./[0m
├── [01;34mlogs[0m
│   └── [00mevents.out.tfevents.1740807585.f7f3b038796b.3894.0[0m
└── [01;34mstsb-model[0m
    ├── [01;34mcheckpoint-180[0m
    │   ├── [00mconfig.json[0m
    │   ├── [00mmodel.safetensors[0m
    │   ├── [00moptimizer.pt[0m
    │   ├── [00mrng_state.pth[0m
    │   ├── [00mscheduler.pt[0m
    │   ├── [00mspecial_tokens_map.json[0m
    │   ├── [00mtokenizer_config.json[0m
    │   ├── [00mtokenizer.json[0m
    │   ├── [00mtrainer_state.json[0m
    │   ├── [00mtraining_args.bin[0m
    │   └── [00mvocab.txt[0m
    ├── [01;34mcheckpoint-360[0m
    │   ├── [00mconfig.json[0m
    │   ├── [00mmodel.safetensors[0m
    │   ├── [00moptimizer.pt[0m
    │   ├── [00mrng_state.pth[0m
    │   ├── [00mscheduler.pt[0m
    │   ├── [00mspecial_tokens_map.json[0m
    │   ├── [00mtokenizer_config.json[0m
    │   ├── [00mtokenizer.json[0m
    │   ├── [00mtrainer_state.json[0m
    │   ├── [00mtraining_args.bin[0m
    │   └── 

In [19]:
device = "cuda" if cuda.is_available() else "cpu"
s1, s2 = "A plane is taking off.", "An air plane is taking off."
encoding = tokenizer(
    s1, s2, return_tensors="pt", padding=True, truncation=True, max_length=512
)
input_ids = encoding["input_ids"].to(device)
attention_mask = encoding["attention_mask"].to(device)
outputs = model(input_ids, attention_mask=attention_mask)
outputs.logits.item()

4.3046875

In [20]:
encoding = tokenizer(
    "hey how are you there",
    "hey how are you",
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=512,
)
input_ids = encoding["input_ids"].to(device)
attention_mask = encoding["attention_mask"].to(device)
outputs = model(input_ids, attention_mask=attention_mask)
outputs.logits.item()

3.12890625

In [21]:
q = [trainer.evaluate(eval_dataset=data) for data in [enc_train, enc_val, enc_test]]
pd.DataFrame(q, index=["train", "val", "test"]).iloc[:, :6]

Unnamed: 0,eval_loss,eval_MSE,eval_RMSE,eval_MAE,eval_Pearson,eval_Spearman's Rank
train,0.173064,0.173064,0.41601,0.319452,0.958919,0.952253
val,0.544952,0.544952,0.738209,0.563143,0.869364,0.86462
test,0.521219,0.521219,0.721955,0.549136,0.879338,0.877605


In [22]:
!tree -a ./

[01;34m./[0m
├── [01;34mlogs[0m
│   ├── [00mevents.out.tfevents.1740807585.f7f3b038796b.3894.0[0m
│   └── [00mevents.out.tfevents.1740807669.f7f3b038796b.3894.1[0m
└── [01;34mstsb-model[0m
    ├── [01;34mcheckpoint-180[0m
    │   ├── [00mconfig.json[0m
    │   ├── [00mmodel.safetensors[0m
    │   ├── [00moptimizer.pt[0m
    │   ├── [00mrng_state.pth[0m
    │   ├── [00mscheduler.pt[0m
    │   ├── [00mspecial_tokens_map.json[0m
    │   ├── [00mtokenizer_config.json[0m
    │   ├── [00mtokenizer.json[0m
    │   ├── [00mtrainer_state.json[0m
    │   ├── [00mtraining_args.bin[0m
    │   └── [00mvocab.txt[0m
    ├── [01;34mcheckpoint-360[0m
    │   ├── [00mconfig.json[0m
    │   ├── [00mmodel.safetensors[0m
    │   ├── [00moptimizer.pt[0m
    │   ├── [00mrng_state.pth[0m
    │   ├── [00mscheduler.pt[0m
    │   ├── [00mspecial_tokens_map.json[0m
    │   ├── [00mtokenizer_config.json[0m
    │   ├── [00mtokenizer.json[0m
    │   ├── [00mtraine

In [23]:
model_path = "sentence-pair-regression-model"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

('sentence-pair-regression-model/tokenizer_config.json',
 'sentence-pair-regression-model/special_tokens_map.json',
 'sentence-pair-regression-model/vocab.txt',
 'sentence-pair-regression-model/added_tokens.json',
 'sentence-pair-regression-model/tokenizer.json')

In [24]:
!tree -a ./

[01;34m./[0m
├── [01;34mlogs[0m
│   ├── [00mevents.out.tfevents.1740807585.f7f3b038796b.3894.0[0m
│   └── [00mevents.out.tfevents.1740807669.f7f3b038796b.3894.1[0m
├── [01;34msentence-pair-regression-model[0m
│   ├── [00mconfig.json[0m
│   ├── [00mmodel.safetensors[0m
│   ├── [00mspecial_tokens_map.json[0m
│   ├── [00mtokenizer_config.json[0m
│   ├── [00mtokenizer.json[0m
│   ├── [00mtraining_args.bin[0m
│   └── [00mvocab.txt[0m
└── [01;34mstsb-model[0m
    ├── [01;34mcheckpoint-180[0m
    │   ├── [00mconfig.json[0m
    │   ├── [00mmodel.safetensors[0m
    │   ├── [00moptimizer.pt[0m
    │   ├── [00mrng_state.pth[0m
    │   ├── [00mscheduler.pt[0m
    │   ├── [00mspecial_tokens_map.json[0m
    │   ├── [00mtokenizer_config.json[0m
    │   ├── [00mtokenizer.json[0m
    │   ├── [00mtrainer_state.json[0m
    │   ├── [00mtraining_args.bin[0m
    │   └── [00mvocab.txt[0m
    ├── [01;34mcheckpoint-360[0m
    │   ├── [00mconfig.json[0m
    