# Fine-Tuning Language Models for Text Classification
用于文本分类的微调语言模型

##### Fine-Tuning with Native Pytorch

## Preparation for Google Colab

In [None]:
import os
from google.colab import drive

#  挂载 google 云盘
drive.mount("/content/drive")

print(os.getcwd())  # /content

# print(os.listdir("/content/drive/MyDrive/"))

# print(os.listdir("/content/drive/MyDrive/Colab Notebooks"))

# if os.getcwd() != "/content/drive/MyDrive":
#     os.chdir("/content/drive/MyDrive")

# print(os.getcwd())

In [None]:
# 提前将 requirements.txt 放在 google 云盘上
!pip install -r /content/drive/MyDrive/requirements.txt

In [3]:
subdir = "ch05b"
work_path = "/content/drive/MyDrive/" + subdir
if not os.path.exists(work_path):
    os.mkdir(work_path)
os.chdir(work_path)
print(os.getcwd())

/content/drive/MyDrive/ch05b


In [4]:
!apt-get install tree && tree -a "./"

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tree is already the newest version (2.0.2-1).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
[01;34m./[0m

0 directories, 0 files


## Loading pre-trained model and tokenizer

In [5]:
from transformers import DistilBertForSequenceClassification

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
model

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [6]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained("bert-base-uncased")
tokenizer

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizerFast'.


DistilBertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [7]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=1e-3)
optimizer



AdamW (
Parameter Group 0
    betas: (0.9, 0.999)
    correct_bias: True
    eps: 1e-06
    lr: 0.001
    weight_decay: 0.0
)

### 验证

In [8]:
# one step forward
import torch

texts = ["this is a good example", "this is a bad example", "this is a good one"]
print("texts:", texts)
encoding = tokenizer(
    texts, return_tensors="pt", padding=True, truncation=True, max_length=512
)
print("encoding:", encoding)

input_ids = encoding["input_ids"]
attention_mask = encoding["attention_mask"]
print("input_ids:", input_ids)
print("input_ids:", input_ids.shape)
print("attention_mask:", attention_mask)
print("attention_mask:", attention_mask.shape)

labels = [1, 0, 1]
labels = torch.tensor(labels).unsqueeze(0)
print("labels:", labels)
print("labels:", labels.shape)

texts: ['this is a good example', 'this is a bad example', 'this is a good one']
encoding: {'input_ids': tensor([[ 101, 2023, 2003, 1037, 2204, 2742,  102],
        [ 101, 2023, 2003, 1037, 2919, 2742,  102],
        [ 101, 2023, 2003, 1037, 2204, 2028,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1]])}
input_ids: tensor([[ 101, 2023, 2003, 1037, 2204, 2742,  102],
        [ 101, 2023, 2003, 1037, 2919, 2742,  102],
        [ 101, 2023, 2003, 1037, 2204, 2028,  102]])
input_ids: torch.Size([3, 7])
attention_mask: tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1]])
attention_mask: torch.Size([3, 7])
labels: tensor([[1, 0, 1]])
labels: torch.Size([1, 3])


In [9]:
outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
outputs

SequenceClassifierOutput(loss=tensor(0.6774, grad_fn=<NllLossBackward0>), logits=tensor([[-0.0815,  0.0293],
        [-0.0779,  0.0238],
        [-0.0296,  0.0636]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [10]:
outputs.logits, outputs.logits.shape

(tensor([[-0.0815,  0.0293],
         [-0.0779,  0.0238],
         [-0.0296,  0.0636]], grad_fn=<AddmmBackward0>),
 torch.Size([3, 2]))

In [11]:
loss = outputs.loss
loss, loss.shape

(tensor(0.6774, grad_fn=<NllLossBackward0>), torch.Size([]))

In [12]:
loss.backward()
optimizer.step()

In [13]:
# Manually calculate loss
from torch.nn import functional

labels = torch.tensor([1, 0, 1])
outputs = model(input_ids, attention_mask=attention_mask)
outputs, outputs.logits.shape

(SequenceClassifierOutput(loss=None, logits=tensor([[-0.5801,  0.6060],
         [-0.5247,  0.5688],
         [-0.5666,  0.5986]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None),
 torch.Size([3, 2]))

In [14]:
loss = functional.cross_entropy(outputs.logits, labels)
loss, loss.shape

(tensor(0.6401, grad_fn=<NllLossBackward0>), torch.Size([]))

In [15]:
loss.backward()
optimizer.step()

## Training the model from entire dataset with Native PyTorch

In [16]:
from torch.utils.data import Dataset


class MyDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [17]:
from transformers import DistilBertForSequenceClassification
from transformers import DistilBertTokenizerFast

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# import datasets
from datasets import load_dataset
from evaluate import load

sst2 = load_dataset("glue", "sst2")
print("sst2:", sst2)

metric = load("glue", "sst2")
print("metric:", metric)

sst2: DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})
metric: EvaluationModule(name: "glue", module_type: "metric", features: {'predictions': Value(dtype='int64', id=None), 'references': Value(dtype='int64', id=None)}, usage: """
Compute GLUE evaluation metric associated to each GLUE dataset.
Args:
    predictions: list of predictions to score.
        Each translation should be tokenized into a list of tokens.
    references: list of lists of references for each translation.
        Each reference should be tokenized into a list of tokens.
Returns: depending on the GLUE subset, one or several of:
    "accuracy": Accuracy
    "f1": F1 score
    "pearson": Pearson Correlation
    "spearmanr": Spearman Correlation
    "mat

In [19]:
texts = sst2["train"]["sentence"]
labels = sst2["train"]["label"]
val_texts = sst2["validation"]["sentence"]
val_labels = sst2["validation"]["label"]

In [20]:
len(texts), len(labels), len(val_texts), len(val_labels)

(67349, 67349, 872, 872)

In [21]:
sst2["train"][0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0}

In [22]:
# I will take small portion
K = 10000
train_dataset = MyDataset(
    tokenizer(texts[:K], truncation=True, padding=True), labels[:K]
)
val_dataset = MyDataset(tokenizer(val_texts, truncation=True, padding=True), val_labels)

In [23]:
from torch.utils.data import DataLoader
from transformers import AdamW

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("device:", device)

model.to(device)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=True)

optimizer = AdamW(model.parameters(), lr=1e-5)

for epoch in range(3):
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
    model.eval()
    for batch in val_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        predictions = outputs.logits.argmax(dim=-1)
        metric.add_batch(
            predictions=predictions,
            references=batch["labels"],
        )
    eval_metric = metric.compute()
    print(f"epoch {epoch}: {eval_metric}")

device: cuda
epoch 0: {'accuracy': 0.8853211009174312}
epoch 1: {'accuracy': 0.8704128440366973}
epoch 2: {'accuracy': 0.8956422018348624}
