## Prepare dataset

In [38]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(
    'hfl/rbt3'
)

In [39]:
tokenizer

BertTokenizerFast(name_or_path='hfl/rbt3', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [40]:
#trail
tokenizer.batch_encode_plus(['春天是吃冰淇凌的时节', '我在喜马拉雅山的上空折纸飞机'], truncation = True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': [[101, 3217, 1921, 3221, 1391, 1102, 3899, 1119, 4638, 3198, 5688, 102], [101, 2769, 1762, 1599, 7716, 2861, 7414, 2255, 4638, 677, 4958, 2835, 5291, 7607, 3322, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [41]:
from datasets import load_from_disk
dataset = load_from_disk('../ChnSentiCorp/')

In [42]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9600
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 0
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1200
    })
})

In [43]:
dataset['train'] = dataset['train'].shuffle().select(range(2000))
dataset['test'] = dataset['test'].shuffle().select(range(100))

In [44]:
dataset['train'][0]

{'text': '作者有一种专业的谨慎，若能有幸学习原版也许会更好，简体版的书中的印刷错误比较多，影响学者理解，全书结构简单，但内容详实，学起来如鱼得水非常轻松。这只是一项技术而已，若可以结合本专业，将会得到更高的学习快乐，家财万贯不如一技在身，一技在身不如一念在心，本书有不仅有技，而且有念。书中佳品。',
 'label': 1}

In [45]:
def f(data, tokenizer):
    return tokenizer.batch_encode_plus(data['text'], truncation = True)

In [46]:
dataset = dataset.map(f,
                      batched = True,
                      batch_size = 1000,
                      num_proc = 4,
                      remove_columns = ['text'],
                      fn_kwargs = {'tokenizer':tokenizer})

Map (num_proc=4):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

In [47]:
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2000
    })
    validation: Dataset({
        features: ['label'],
        num_rows: 0
    })
    test: Dataset({
        features: ['label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 100
    })
})

In [70]:
dataset['train'][0]

{'label': 1,
 'input_ids': [101,
  868,
  5442,
  3300,
  671,
  4905,
  683,
  689,
  4638,
  6474,
  2708,
  8024,
  5735,
  5543,
  3300,
  2401,
  2110,
  739,
  1333,
  4276,
  738,
  6387,
  833,
  3291,
  1962,
  8024,
  5042,
  860,
  4276,
  4638,
  741,
  704,
  4638,
  1313,
  1170,
  7231,
  6428,
  3683,
  6772,
  1914,
  8024,
  2512,
  1510,
  2110,
  5442,
  4415,
  6237,
  8024,
  1059,
  741,
  5310,
  3354,
  5042,
  1296,
  8024,
  852,
  1079,
  2159,
  6422,
  2141,
  8024,
  2110,
  6629,
  3341,
  1963,
  7824,
  2533,
  3717,
  7478,
  2382,
  6768,
  3351,
  511,
  6821,
  1372,
  3221,
  671,
  7555,
  2825,
  3318,
  5445,
  2347,
  8024,
  5735,
  1377,
  809,
  5310,
  1394,
  3315,
  683,
  689,
  8024,
  2199,
  833,
  2533,
  1168,
  3291,
  7770,
  4638,
  2110,
  739,
  2571,
  727,
  8024,
  2157,
  6568,
  674,
  6581,
  679,
  1963,
  671,
  2825,
  1762,
  6716,
  8024,
  671,
  2825,
  1762,
  6716,
  679,
  1963,
  671,
  2573,
  1762,
  2552,
 

In [49]:
# delete long sentences
def f2(data):
    return [len(i) <=  512 for i in data['input_ids']]

dataset = dataset.filter(
    f2,
    batched = True,
    batch_size = 1000,
    num_proc = 4
)

Filter (num_proc=4):   0%|          | 0/2000 [00:00<?, ? examples/s]

Filter (num_proc=4):   0%|          | 0/100 [00:00<?, ? examples/s]

`batch_size` doesn’t change what gets filtered (the output result will be the same) —it just controls how efficiently the filtering happens.
- It affects memory usage: Larger batches use more RAM.
- It affects speed: Larger batches usually mean fewer function calls and faster processing.
- It affects parallelism: With num_proc=4, each of the 4 processes will handle batches of 1000 examples independently.

In [77]:
from transformers import AutoModelForSequenceClassification
from transformers import BertForSequenceClassification

In [76]:
!pip show transformers

Name: transformers
Version: 4.41.0
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /opt/anaconda3/envs/Advanced_AI/lib/python3.10/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 


In [75]:
!pip show accelerate

Name: accelerate
Version: 0.30.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: /opt/anaconda3/envs/Advanced_AI/lib/python3.10/site-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: 


In [79]:
# model = AutoModelForSequenceClassification.from_pretrained('hfl/rbt3', num_labels=2)
model = BertForSequenceClassification.from_pretrained('hfl/rbt3', num_labels=2)

pytorch_model.bin:   0%|          | 0.00/156M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [80]:
sum([i.nelement() for i in model.parameters()])

38478338

There are about 38 millions parameters in the model

In [81]:
import torch

In [82]:
#trail
data = {
    'input_ids': torch.ones(4,10, dtype = torch.long),
    'token_type_ids': torch.ones(4,10, dtype = torch.long),
    'attention_mask':torch.ones(4,10, dtype = torch.long),
    'labels':torch.ones(4, dtype = torch.long),
}

In [83]:
data

{'input_ids': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'token_type_ids': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'labels': tensor([1, 1, 1, 1])}

In [84]:
out = model(**data)

In [85]:
out

SequenceClassifierOutput(loss=tensor(0.7646, grad_fn=<NllLossBackward0>), logits=tensor([[ 0.0253, -0.1128],
        [ 0.0253, -0.1128],
        [ 0.0253, -0.1128],
        [ 0.0253, -0.1128]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [86]:
out['loss']

tensor(0.7646, grad_fn=<NllLossBackward0>)

In [87]:
out['logits']

tensor([[ 0.0253, -0.1128],
        [ 0.0253, -0.1128],
        [ 0.0253, -0.1128],
        [ 0.0253, -0.1128]], grad_fn=<AddmmBackward0>)

## Evaluation metrics

In [88]:
from evaluate import load as load_metric
metric = load_metric('accuracy')

In [89]:
import numpy as np
from transformers.trainer_utils import EvalPrediction

In [90]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    pred = logits.argmax(axis = 1)
    print (pred)
    return metric.compute(predictions = pred, references = labels)
    # return {'accuracy' : (pred == labels).mean()}
    

In [91]:
eval_pred = EvalPrediction(
    predictions = np.array([[1,0],[9,3],[0.4,0.3],[6,7]]),
    label_ids = np.array([1,1,0,1])
)
compute_metrics(eval_pred)

[0 0 0 1]


{'accuracy': 0.5}

In [92]:
eval_pred

<transformers.trainer_utils.EvalPrediction at 0x3031157b0>

## Parameter in training
In Huggingface, the parameters are packed in a **class -- TrainingArguments**

In [93]:
!pip install --no-cache-dir "transformers==4.41.0" "accelerate==0.30.0"



I was using transformers 4.50.0 and accelerate 1.8.1, but they are not doing well with the below code.

So I swithed to transfomers 4.41.0 and accelerate 0.30.0 


In [94]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

In [95]:
args = TrainingArguments(
    output_dir = './output_dir', # temporary output data's saving path
    # evaluation_strategy = 'steps',
    
    eval_steps = 30,#how many steps to perform an evaluation
    save_strategy = 'steps', # it can be 'no', 'epoch', 'steps'
    save_steps = 30,

    num_train_epochs = 2,

    learning_rate = 0.001,

    weight_decay = 0.01, #𝜆 to multiply on the sum of square of weights in loss function, to reduce overfitting

    per_device_eval_batch_size = 16,
    per_device_train_batch_size = 16,

    no_cuda = False #It will us MPS if available
)

In [96]:
#define the trainer
from transformers import Trainer
from transformers.data.data_collator import DataCollatorWithPadding 

In [98]:
trainer = Trainer(
    model = model,
    args = args,
    train_dataset = dataset['train'],
    eval_dataset = dataset['test'],
    compute_metrics = compute_metrics,
    data_collator = DataCollatorWithPadding(tokenizer)
)

Data collator standadize the input data and make them the same length. Check out the test below.

In [99]:
#test
data_collator = DataCollatorWithPadding(tokenizer)

In [101]:
data = dataset['train'][:5]
print(data)

{'label': [1, 1, 0, 0, 1], 'input_ids': [[101, 868, 5442, 3300, 671, 4905, 683, 689, 4638, 6474, 2708, 8024, 5735, 5543, 3300, 2401, 2110, 739, 1333, 4276, 738, 6387, 833, 3291, 1962, 8024, 5042, 860, 4276, 4638, 741, 704, 4638, 1313, 1170, 7231, 6428, 3683, 6772, 1914, 8024, 2512, 1510, 2110, 5442, 4415, 6237, 8024, 1059, 741, 5310, 3354, 5042, 1296, 8024, 852, 1079, 2159, 6422, 2141, 8024, 2110, 6629, 3341, 1963, 7824, 2533, 3717, 7478, 2382, 6768, 3351, 511, 6821, 1372, 3221, 671, 7555, 2825, 3318, 5445, 2347, 8024, 5735, 1377, 809, 5310, 1394, 3315, 683, 689, 8024, 2199, 833, 2533, 1168, 3291, 7770, 4638, 2110, 739, 2571, 727, 8024, 2157, 6568, 674, 6581, 679, 1963, 671, 2825, 1762, 6716, 8024, 671, 2825, 1762, 6716, 679, 1963, 671, 2573, 1762, 2552, 8024, 3315, 741, 3300, 679, 788, 3300, 2825, 8024, 5445, 684, 3300, 2573, 511, 741, 704, 881, 1501, 511, 102], [101, 3296, 3301, 1351, 6370, 4638, 2791, 7313, 8024, 2945, 6432, 6820, 679, 7231, 1568, 8024, 671, 678, 1726, 8024, 5632, 2

In [102]:
for i in data['input_ids']:
    print(len(i))

145
27
35
37
48


In [103]:
#test the data_collator
data = data_collator(data)

In [104]:
data

{'input_ids': tensor([[  101,   868,  5442,  3300,   671,  4905,   683,   689,  4638,  6474,
          2708,  8024,  5735,  5543,  3300,  2401,  2110,   739,  1333,  4276,
           738,  6387,   833,  3291,  1962,  8024,  5042,   860,  4276,  4638,
           741,   704,  4638,  1313,  1170,  7231,  6428,  3683,  6772,  1914,
          8024,  2512,  1510,  2110,  5442,  4415,  6237,  8024,  1059,   741,
          5310,  3354,  5042,  1296,  8024,   852,  1079,  2159,  6422,  2141,
          8024,  2110,  6629,  3341,  1963,  7824,  2533,  3717,  7478,  2382,
          6768,  3351,   511,  6821,  1372,  3221,   671,  7555,  2825,  3318,
          5445,  2347,  8024,  5735,  1377,   809,  5310,  1394,  3315,   683,
           689,  8024,  2199,   833,  2533,  1168,  3291,  7770,  4638,  2110,
           739,  2571,   727,  8024,  2157,  6568,   674,  6581,   679,  1963,
           671,  2825,  1762,  6716,  8024,   671,  2825,  1762,  6716,   679,
          1963,   671,  2573,  1762,  

In [105]:
for k, v in data.items():
    print(k, v.shape)

input_ids torch.Size([5, 145])
token_type_ids torch.Size([5, 145])
attention_mask torch.Size([5, 145])
labels torch.Size([5])


In [107]:
tokenizer.decode(data['input_ids'][1])

'[CLS] 替 朋 友 订 的 房 间 ， 据 说 还 不 错 啦 ， 一 下 回 ， 自 己 去 试 试 。 [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'