## 数据集与模型
- 这些数据集都可以直接用：https://huggingface.co/datasets
- 咱们今天玩这个（GLUE）：https://gluebenchmark.com/

In [1]:
import warnings
# Suppress all Python warnings; this runs before the import below so it is already in effect at that point.
warnings.filterwarnings('ignore')
from datasets import load_dataset # https://github.com/huggingface/datasets

# Load the 'mrpc' configuration of the 'glue' dataset.
raw_datasets=load_dataset('glue','mrpc')
# Bare last expression: the notebook displays the loaded object (train / validation / test splits).
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

看看数据长啥样子

In [2]:
# Select the training split and display a single example (row 100).
raw_train_datasets=raw_datasets['train']
raw_train_datasets[100]

{'sentence1': 'The Nasdaq composite index inched up 1.28 , or 0.1 percent , to 1,766.60 , following a weekly win of 3.7 percent .',
 'sentence2': 'The technology-laced Nasdaq Composite Index .IXIC was off 24.44 points , or 1.39 percent , at 1,739.87 .',
 'label': 0,
 'idx': 114}

In [3]:
# Show the column schema; `label` is a ClassLabel whose names are not_equivalent (0) and equivalent (1).
raw_train_datasets.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

## 使用AutoTokenizer来处理数据

In [4]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; the same name is used later to load the model.
checkpoint='bert-base-uncased'
# Load the tokenizer that belongs to this checkpoint.
tokenizer=AutoTokenizer.from_pretrained(checkpoint)

不是所有模型返回结果都是一样的，得看你选择模型训练的时候人家咋设置的

In [5]:
# Passing two strings encodes them together as one sentence pair.
inputs=tokenizer('This is first sentence.','This is the second one.')
# Display the encoding: input_ids, token_type_ids and attention_mask.
inputs

{'input_ids': [101, 2023, 2003, 2034, 6251, 1012, 102, 2023, 2003, 1996, 2117, 2028, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [6]:
# Map the ids back to tokens to see the [CLS] / [SEP] markers the tokenizer inserted around the two sentences.
tokenizer.convert_ids_to_tokens(inputs['input_ids'])

['[CLS]',
 'this',
 'is',
 'first',
 'sentence',
 '.',
 '[SEP]',
 'this',
 'is',
 'the',
 'second',
 'one',
 '.',
 '[SEP]']

## 对所有数据进行处理

In [7]:
def tokenizer_function(example):
    """Tokenize the sentence pair(s) held in ``example``.

    Args:
        example: Mapping with ``'sentence1'`` and ``'sentence2'`` entries.

    Returns:
        Whatever the notebook-level ``tokenizer`` returns for the pair,
        called with ``truncation=True``.
    """
    first, second = example['sentence1'], example['sentence2']
    return tokenizer(first, second, truncation=True)

In [8]:
# Apply tokenizer_function to every split; with batched=True the function is called on batches of examples rather than one example at a time.
tokenized_datasets=raw_datasets.map(tokenizer_function,batched=True)
# Display the result: the original columns plus the new tokenizer outputs.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [9]:
from transformers import DataCollatorWithPadding

# Padding collator built from the tokenizer; later cells use it to turn a group of samples into one batch.
data_collator=DataCollatorWithPadding(tokenizer=tokenizer)

In [10]:
# Inspect the first tokenized training example (raw text columns plus tokenizer outputs).
tokenized_datasets['train'][0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0,
 'input_ids': [101,
  2572,
  3217,
  5831,
  5496,
  2010,
  2567,
  1010,
  3183,
  2002,
  2170,
  1000,
  1996,
  7409,
  1000,
  1010,
  1997,
  9969,
  4487,
  23809,
  3436,
  2010,
  3350,
  1012,
  102,
  7727,
  2000,
  2032,
  2004,
  2069,
  1000,
  1996,
  7409,
  1000,
  1010,
  2572,
  3217,
  5831,
  5496,
  2010,
  2567,
  1997,
  9969,
  4487,
  23809,
  3436,
  2010,
  3350,
  1012,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
 

In [11]:
# Take the first 8 training rows, with every column included.
samples = tokenized_datasets['train'][:8]
# Keep only the columns the collator should receive; drop the index and raw-text columns.
samples = {
    name: column
    for name, column in samples.items()
    if name not in ['idx', 'sentence1', 'sentence2']
}
# Token count of each sample before padding.
[len(ids) for ids in samples['input_ids']]

[50, 59, 47, 67, 59, 50, 62, 32]

经过data_collator处理之后，同一个batch内的所有样本都会被填充（padding）到该batch中最长样本的长度（这里是67）

In [12]:
# Collate the selected samples into a single batch.
batch = data_collator(samples)
# Shape of every entry in the batch.
{name: tensor.shape for name, tensor in batch.items()}

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

## 训练模块

In [13]:
from transformers import TrainingArguments

# Only the first positional argument is given ('test-trainer', the output directory per the TrainingArguments docs); every other setting keeps its default.
training_args=TrainingArguments('test-trainer')

API文档：实际用的时候一定对应着来
- https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments

In [14]:
# Display the full configuration, including all default values.
training_args

TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_la

In [15]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a 2-label classification head. The classifier weights are newly initialized
# (see the warning printed below), so the model has to be fine-tuned before it is useful for prediction.
model=AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
from transformers import Trainer

# Bundle the model, training configuration, datasets and batching logic into one Trainer.
trainer = Trainer(
    model=model,                                    # model to fine-tune
    args=training_args,                             # training configuration
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    data_collator=data_collator,                    # builds the padded batches
    tokenizer=tokenizer,
)

In [17]:
# Start fine-tuning with the `trainer` built in the previous cell.
trainer.train()

 15%|█▍        | 206/1377 [15:40<1:40:57,  5.17s/it]

: 

: 

In [None]:
# Run inference on the validation split. The returned object exposes
# `.predictions` (the raw model outputs) and `.label_ids` (the reference labels).
predictions=trainer.predict(tokenized_datasets['validation'])
# Print the shape of each array. The returned object itself has no `.shape`,
# so the attributes must be accessed explicitly.
print(predictions.predictions.shape,predictions.label_ids.shape)

In [None]:
import numpy as np

# `predictions.predictions` holds the model scores for each example; the index of
# the largest score along the last axis is the predicted class id.
preds=np.argmax(predictions.predictions,axis=-1)

In [None]:
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases in favour of
# the separate `evaluate` package — confirm against the installed version.
from datasets import load_metric

# Load the metric associated with the GLUE MRPC task.
metric=load_metric('glue','mrpc')
# Compare the predicted class ids with the reference labels returned by trainer.predict().
metric.compute(predictions=preds,references=predictions.label_ids)

## 训练过程中也可以指定好评估方法

In [None]:
def compute_metrics(eval_preds):
    """Score one evaluation pass with the GLUE MRPC metric.

    Args:
        eval_preds: A pair ``(logits, labels)``.

    Returns:
        The result of the metric's ``compute`` call, using the argmax of the
        logits over the last axis as the predicted classes.
    """
    glue_metric = load_metric('glue', 'mrpc')
    scores, references = eval_preds
    predicted_classes = np.argmax(scores, axis=-1)
    return glue_metric.compute(predictions=predicted_classes, references=references)

In [None]:
# Evaluate at the end of every epoch instead of never (the default shown earlier is `no`).
training_args=TrainingArguments('test-trainer',evaluation_strategy='epoch')
# Reload the model from the checkpoint so this run does not start from the weights trained above.
model=AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=2)

trainer=Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],   # was misspelled `eval_dataser`
    data_collator=data_collator,                     # was misspelled `data_colator`
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,                 # called on each evaluation pass
)

会把每一个epoch的评估结果进行返回

In [None]:
# Run training with the `trainer` defined in the previous cell.
trainer.train()