#### - BERT/GPT/Transfoer is suitable for classification, but it is very big. We do fine-tuning at most, barely train a new one.
#### - backbone for feature extraction, pass the task model (usually neuron network)

In [14]:
from datasets import load_from_disk
from transformers import BertTokenizer
import torch

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
tokenizer

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

BertTokenizer(name_or_path='bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [10]:
#trail to encode
out = tokenizer.batch_encode_plus(
    batch_text_or_text_pairs = ['心态是最正规的。人生是很权威的。'],
    truncation = True,
    padding = 'max_length',
    max_length = 15,
    return_tensors = 'pt',
    return_length = True
)

In [11]:
for k, v in out.items():
    print(k, v)

input_ids tensor([[ 101, 2552, 2578, 3221, 3297, 3633, 6226, 4638,  511,  782, 4495, 3221,
         2523, 3326,  102]])
token_type_ids tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
length tensor([15])
attention_mask tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [13]:
tokenizer.decode(out['input_ids'][0])

'[CLS] 心 态 是 最 正 规 的 。 人 生 是 很 权 [SEP]'

## Define dataset

In [20]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, split):
        self.dataset = load_from_disk('../ChnSentiCorp/')[split]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        text = self.dataset[i]['text']
        label = self.dataset[i]['label']

        return text, label
        

In [21]:
dataset = Dataset('train')

In [22]:
len(dataset)

9600

In [23]:
dataset[20]

('非常不错，服务很好，位于市中心区，交通方便，不过价格也高！', 1)

In [24]:
dataset

<__main__.Dataset at 0x129701f60>

In [25]:
type(dataset)

__main__.Dataset

Define device

In [30]:
device = 'mps' if torch.backends.mps.is_available() else 'cpu'

## Function to organize data

In [63]:
def collate_fn(data):
    sents = [i[0] for i in data]
    labels = [i[1] for i in data]

    data = tokenizer.batch_encode_plus(batch_text_or_text_pairs = sents,
                                       truncation = True,
                                       padding = 'max_length',
                                       max_length = 500,
                                       return_tensors = 'pt', # 'pt' means PyTorch, it could be 'np' or 'tf' as well
                                       return_length = True)

    input_ids = data['input_ids']
    attention_mask = data['attention_mask']
    token_type_ids = data['token_type_ids']
    labels = torch.LongTensor(labels)

    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    token_type_ids = token_type_ids.to(device)
    labels = labels.to(device)

    return input_ids, attention_mask, token_type_ids, labels

In [64]:
data = [
    ('科比就像奥尼尔的正宫大老婆，虽然不是很满意，但好歹是患难与共，也一起收获过很多快乐和成就的。', 1),
    ('韦德可以说是后来的续弦，虽然也是明媒正娶，年龄、地位和气势总归差距大了，合作愉快，但内心总觉得缺少了什么。', 0),
    ('至于便士，那妥妥就是鲨鱼心中的白月光了。', 1),
    ('奥尼尔这两年上镜，很容易流泪，声音也很低沉，语速很慢，我怀疑他有抑郁症。', 0)
]

In [65]:
input_ids, attention_mask, token_type_ids, labels = collate_fn(data)

In [66]:
labels.shape

torch.Size([4])

In [67]:
labels

tensor([1, 0, 1, 0], device='mps:0')

In [72]:
print(input_ids.shape, attention_mask.shape, token_type_ids.shape)

torch.Size([4, 500]) torch.Size([4, 500]) torch.Size([4, 500])


In [80]:
#dataloader, only for practice purpose
loader = torch.utils.data.DataLoader(dataset = dataset,
                                     batch_size = 16,
                                     collate_fn = collate_fn,
                                     shuffle = True,
                                     drop_last = True)

In [81]:
len(loader)

600

In [83]:
# for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
#     break

input_ids.shape, attention_mask.shape, token_type_ids.shape, labels

(torch.Size([16, 500]),
 torch.Size([16, 500]),
 torch.Size([16, 500]),
 tensor([1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1], device='mps:0'))

In [84]:
#load pretrained model
from transformers import BertModel

pretrained = BertModel.from_pretrained('bert-base-chinese')

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

In [85]:
#see the parameter amount
sum(i.numel() for i in pretrained.parameters())

102267648

The `.numel()` method in PyTorch (and also available in some other frameworks like TensorFlow and NumPy via `.size` or `.size().prod()`) returns the total number of elements in a tensor.

## Freeze parameters

In [89]:
for param in pretrained.parameters():
    param.requires_grad_(False)
#cannot do derevatives and back-propagation any more

In [90]:
pretrained.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [93]:
out = pretrained(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids)

In [95]:
out.last_hidden_state.shape

torch.Size([16, 500, 768])

- 16 in this batch
- 500 Sequence length	Each sequence was padded or truncated to 500 tokens long
- 768 Hidden size	Each token is represented by a 768-dimensional embedding vector

In [102]:
out.last_hidden_state[:,0, :].shape

torch.Size([16, 768])

In [104]:
out.last_hidden_state[0,:, 0].shape

torch.Size([500])

## Define a model

In [127]:
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(in_features = 768, out_features = 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        # with torch.no_grad():
        #     out = pretrained(input_ids = input_ids,
        #                      attention_mask = attention_mask,
        #                      token_type_ids = token_type_ids)
        #     out = self.fc(out.last_hidden_state[:,0]) #equal to out.last_hidden_state[:, 0, :], Linear layer expects 2 dimensional data
        #     out = out.softmax(dim = 1)
        #     return out

        out = pretrained(input_ids = input_ids,
                         attention_mask = attention_mask,
                         token_type_ids = token_type_ids)
        out = self.fc(out.last_hidden_state[:,0]) #equal to out.last_hidden_state[:, 0, :], Linear layer expects 2 dimensional data
        out = out.softmax(dim = 1)
        return out

Why last_hidden_state[:,0]?
- In models like BERT, the first token (position 0) is a synthetic token called [CLS], short for Classification.
- It’s not part of the original input sentence—it’s added by the tokenizer.
- During training, the model learns to pack a summary of the entire sequence into the [CLS] token's embedding.
- [CLS] becomes the attention magnet During training for tasks like classification, the model is explicitly trained to make the [CLS] vector capture all the important information. Because the loss function is applied directly to it—it’s what feeds the classifier head.
- It’s trained to absorb and summarize information from the whole sequence. The loss function in classification tasks is directly tied to its output during pretraining and fine-tuning. It's the one doing the most listening when trained for classification.

In [128]:
model = Model()

In [129]:
model.to(device)

Model(
  (fc): Linear(in_features=768, out_features=2, bias=True)
)

In [130]:
#trail
model(input_ids, attention_mask, token_type_ids)

tensor([[0.6226, 0.3774],
        [0.5881, 0.4119],
        [0.4956, 0.5044],
        [0.6835, 0.3165],
        [0.6196, 0.3804],
        [0.7332, 0.2668],
        [0.6828, 0.3172],
        [0.6107, 0.3893],
        [0.5979, 0.4021],
        [0.5717, 0.4283],
        [0.5741, 0.4259],
        [0.6400, 0.3600],
        [0.5127, 0.4873],
        [0.7230, 0.2770],
        [0.6481, 0.3519],
        [0.6398, 0.3602]], device='mps:0', grad_fn=<SoftmaxBackward0>)

## Training

In [131]:
from transformers import AdamW
from transformers.optimization import get_scheduler

In [134]:
def train():
    optimizer =  AdamW(model.parameters(), lr = 0.0001)
    
    #loss function
    criterion = torch.nn.CrossEntropyLoss()
    
    scheduler = get_scheduler(name = 'linear',
                              num_warmup_steps = 0,
                              num_training_steps = len(loader),
                              optimizer = optimizer)
    model.train()

    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader):
        out = model(input_ids, attention_mask, token_type_ids)
        loss = criterion(out, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        if i%10 ==0:
            out = out.argmax(dim = 1)
            accuracy = (out==labels).sum().item()/len(labels)
            lr = optimizer.state_dict()['param_groups'][0]['lr']
            print(i, loss.item(), lr, accuracy)

In [135]:
train()

0 0.7071318626403809 9.983333333333334e-05 0.4375
10 0.7115724086761475 9.816666666666668e-05 0.375
20 0.7273609638214111 9.65e-05 0.3125
30 0.6518293023109436 9.483333333333334e-05 0.8125
40 0.6847739815711975 9.316666666666666e-05 0.5625
50 0.6582504510879517 9.15e-05 0.75
60 0.6269950270652771 8.983333333333334e-05 0.8125
70 0.6505166888237 8.816666666666668e-05 0.5625
80 0.6002245545387268 8.65e-05 0.9375
90 0.6611295938491821 8.483333333333334e-05 0.625
100 0.6146636009216309 8.316666666666666e-05 0.75
110 0.6305767893791199 8.15e-05 0.75
120 0.6338016986846924 7.983333333333334e-05 0.8125
130 0.6113054752349854 7.816666666666666e-05 0.8125
140 0.591750979423523 7.65e-05 0.8125
150 0.5957347750663757 7.483333333333333e-05 0.875
160 0.6201099157333374 7.316666666666668e-05 0.75
170 0.6292654871940613 7.15e-05 0.625
180 0.5875069499015808 6.983333333333334e-05 0.875
190 0.5883765816688538 6.816666666666667e-05 0.8125
200 0.5737649202346802 6.65e-05 0.875
210 0.5999044179916382 6.483

## Evaluation

In [138]:
def test():
    loader_test = torch.utils.data.DataLoader(dataset = Dataset('test'),
                                                        batch_size = 32,
                                                        collate_fn = collate_fn,
                                                        shuffle = True,
                                                        drop_last = True
                                              )
    model.eval()
    correct = 0
    total = 0
    for i, (input_ids, attention_mask, token_type_ids, labels) in enumerate(loader_test):
        if i == 5:
            break
        print(i)
        with torch.no_grad():
            out = model(input_ids, attention_mask, token_type_ids)

        out = out.argmax(dim = 1)
        correct +=(out==labels).sum().item()
        total += len(labels)
        print(correct/total)

In [139]:
test()

0
0.78125
1
0.8125
2
0.84375
3
0.875
4
0.875
