In [1]:
from datasets import load_dataset


# Fine-tune BERT on the "regular" configuration of the SWAG dataset: given a
# context and several candidate endings, the model learns to pick the best one.

# Placeholder locations -- replace them with the real dataset directory and
# the real checkpoint directory before running the notebook.
data_file = "your data path/data/regular"
model_tokenizer_path = "your model path"

# Load the dataset stored at `data_file`; the bare `dataset` expression
# displays its splits and column names.
dataset = load_dataset(data_file)
dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['video-id', 'fold-ind', 'startphrase', 'sent1', 'sent2', 'gold-source', 'ending0', 'ending1', 'ending2', 'ending3', 'label'],
        num_rows: 73546
    })
    validation: Dataset({
        features: ['video-id', 'fold-ind', 'startphrase', 'sent1', 'sent2', 'gold-source', 'ending0', 'ending1', 'ending2', 'ending3', 'label'],
        num_rows: 20006
    })
    test: Dataset({
        features: ['video-id', 'fold-ind', 'startphrase', 'sent1', 'sent2', 'gold-source', 'ending0', 'ending1', 'ending2', 'ending3', 'label'],
        num_rows: 20005
    })
})

In [2]:
# Inspect the first training example to see the raw fields of one record.
dataset["train"][0]

{'video-id': 'anetv_jkn6uvmqwh4',
 'fold-ind': '3416',
 'startphrase': 'Members of the procession walk down the street holding small horn brass instruments. A drum line',
 'sent1': 'Members of the procession walk down the street holding small horn brass instruments.',
 'sent2': 'A drum line',
 'gold-source': 'gold',
 'ending0': 'passes by walking down the street playing their instruments.',
 'ending1': 'has heard approaching them.',
 'ending2': "arrives and they're outside dancing and asleep.",
 'ending3': 'turns the lead singer watches the performance.',
 'label': 0}

In [3]:
from transformers import AutoTokenizer

# Load the tokenizer from `model_tokenizer_path` (the same path the model is
# loaded from later) and display its configuration.
tokenizer = AutoTokenizer.from_pretrained(model_tokenizer_path)
tokenizer

BertTokenizerFast(name_or_path='D:/Desktop/learn/instance/model/bert', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [4]:
ending_names = ["ending0", "ending1", "ending2", "ending3"]


def process_function(examples):
    """Tokenize a batch of examples for multiple-choice training.

    Every example is expanded into ``len(ending_names)`` sentence pairs
    ``(sent1, "<sent2> <ending>")``. All pairs of the batch are tokenized in a
    single flat call and then regrouped so that each example owns one list of
    encodings per candidate ending.

    Args:
        examples: Batched mapping of column name to list of values. The
            columns ``sent1``, ``sent2`` and every name in ``ending_names``
            are read.

    Returns:
        dict: For each key returned by the tokenizer, a list with one entry
        per example; each entry is a list of ``len(ending_names)`` per-choice
        values.
    """
    # Derive the number of choices from `ending_names` instead of repeating a
    # hard-coded 4, so the two can never drift apart.
    num_choices = len(ending_names)

    # Repeat each context once per candidate ending. The flat list is built
    # directly, which avoids the quadratic `sum(list_of_lists, [])` flattening.
    first_sentences = [
        context for context in examples["sent1"] for _ in range(num_choices)
    ]

    # Pair each question header with each of its candidate endings, in the
    # same example-major order as `first_sentences`.
    second_sentences = [
        f"{header} {examples[end][i]}"
        for i, header in enumerate(examples["sent2"])
        for end in ending_names
    ]

    # The tokenizer takes two parallel flat lists of strings (it cannot
    # consume the nested per-example structure), hence the flat layout above.
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

    # Regroup the flat encodings into chunks of `num_choices`, so each value is
    # nested as [example][choice][token].
    return {
        k: [v[i : i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }


In [5]:
import torch
# Run `process_function` over the dataset in batches.
tokenized_dataset = dataset.map(process_function, batched=True)

# Only a single example is fetched here, so its tokenizer fields are nested
# two levels deep (choice -> tokens) instead of three.
tokenized_example = tokenized_dataset["train"][0]
tokenized_example, tokenized_dataset

Map: 100%|██████████| 20006/20006 [00:06<00:00, 3259.97 examples/s]


({'video-id': 'anetv_jkn6uvmqwh4',
  'fold-ind': '3416',
  'startphrase': 'Members of the procession walk down the street holding small horn brass instruments. A drum line',
  'sent1': 'Members of the procession walk down the street holding small horn brass instruments.',
  'sent2': 'A drum line',
  'gold-source': 'gold',
  'ending0': 'passes by walking down the street playing their instruments.',
  'ending1': 'has heard approaching them.',
  'ending2': "arrives and they're outside dancing and asleep.",
  'ending3': 'turns the lead singer watches the performance.',
  'label': 0,
  'input_ids': [[101,
    2372,
    1997,
    1996,
    14385,
    3328,
    2091,
    1996,
    2395,
    3173,
    2235,
    7109,
    8782,
    5693,
    1012,
    102,
    1037,
    6943,
    2240,
    5235,
    2011,
    3788,
    2091,
    1996,
    2395,
    2652,
    2037,
    5693,
    1012,
    102],
   [101,
    2372,
    1997,
    1996,
    14385,
    3328,
    2091,
    1996,
    2395,
    3173,
  

In [6]:
# Quick check of how the tokenizer handles a list of [first, second] string
# pairs: each inner list is encoded as one sentence pair.
# NOTE(review): `a` and `b` are identical, so both calls return the same
# encoding -- presumably one of them was meant to use a different layout.
a = [["this is test","a"], ["this is a example","b"]]
b = [["this is test","a"], ["this is a example","b"]]
tokenizer(a), tokenizer(b)

({'input_ids': [[101, 2023, 2003, 3231, 102, 1037, 102], [101, 2023, 2003, 1037, 2742, 102, 1038, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 0, 0, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]},
 {'input_ids': [[101, 2023, 2003, 3231, 102, 1037, 102], [101, 2023, 2003, 1037, 2742, 102, 1038, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 1, 1], [0, 0, 0, 0, 0, 0, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1]]})

In [7]:
from transformers import DataCollatorForMultipleChoice
# Batch collator for multiple-choice inputs, built around `tokenizer`; the
# bare expression displays its settings.
collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
collator

DataCollatorForMultipleChoice(tokenizer=BertTokenizerFast(name_or_path='D:/Desktop/learn/instance/model/bert', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
), padding=True, max_length=None, pad_to_multiple_of=None, return_ten

In [8]:
from evaluate import load

# Load the accuracy metric from a local script path rather than by name.
accuracy = load("../../evaluate/accuracy.py")

accuracy

EvaluationModule(name: "accuracy", module_type: "metric", features: {'predictions': Value(dtype='int32', id=None), 'references': Value(dtype='int32', id=None)}, usage: """
Args:
    predictions (`list` of `int`): Predicted labels.
    references (`list` of `int`): Ground truth labels.
    normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
    sample_weight (`list` of `float`): Sample weights Defaults to None.

Returns:
    accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.

Examples:

    Example 1-A simple example
        >>> accuracy_metric = evaluate.load("accuracy")
        >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
        >>> print(results)
    

In [9]:
# First, find out what the model's output looks like.

from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

# Load the checkpoint as a multiple-choice model; the bare tuple displays the
# architecture and its config.
model = AutoModelForMultipleChoice.from_pretrained(model_tokenizer_path)
model, model.config

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at D:/Desktop/learn/instance/model/bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(BertForMultipleChoice(
   (bert): BertModel(
     (embeddings): BertEmbeddings(
       (word_embeddings): Embedding(30522, 768, padding_idx=0)
       (position_embeddings): Embedding(512, 768)
       (token_type_embeddings): Embedding(2, 768)
       (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (encoder): BertEncoder(
       (layer): ModuleList(
         (0-11): 12 x BertLayer(
           (attention): BertAttention(
             (self): BertSdpaSelfAttention(
               (query): Linear(in_features=768, out_features=768, bias=True)
               (key): Linear(in_features=768, out_features=768, bias=True)
               (value): Linear(in_features=768, out_features=768, bias=True)
               (dropout): Dropout(p=0.1, inplace=False)
             )
             (output): BertSelfOutput(
               (dense): Linear(in_features=768, out_features=768, bias=True)
               (LayerNorm): LayerNorm(

**注意**：这里先使用手动推断的代码查看一下模型的输出是什么样子的，从而判断 `compute_metrics` 中的 `predictions` 应该如何处理（例如是否需要对 logits 沿最后一维取 argmax）。

In [10]:
import numpy as np
def compute_metrics(pred):
    """Turn raw evaluation outputs into the accuracy metric.

    Args:
        pred: A pair ``(predictions, labels)``; ``predictions`` holds one
            score per choice along its last axis.

    Returns:
        Whatever ``accuracy.compute`` returns for the arg-max predictions
        against ``labels``.
    """
    scores, references = pred
    # The index of the highest score along the last axis is the chosen answer.
    chosen = np.argmax(scores, axis=-1)
    return accuracy.compute(predictions=chosen, references=references)

    

In [11]:
# Hyper-parameters for the fine-tuning run, grouped by purpose.
training_args = TrainingArguments(
    # Checkpointing: where checkpoints go, how often, and how many are kept.
    output_dir="./checkpoint",
    save_strategy="epoch",
    save_total_limit=2,
    # Optimisation.
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    # Evaluation.
    eval_strategy="epoch",
    per_device_eval_batch_size=4,
    # Logging.
    logging_strategy="steps",
    logging_steps=3,
)

In [12]:
# Train and evaluate on small fixed-size subsets so the demo runs quickly.
# The shuffles are seeded with the run's seed so that a fresh kernel selects
# the same rows every time; an unseeded `.shuffle()` picks a different subset
# on each run, which makes the reported metrics irreproducible.
train_subset = tokenized_dataset["train"].shuffle(seed=training_args.seed).select(range(100))
eval_subset = tokenized_dataset["validation"].shuffle(seed=training_args.seed).select(range(20))

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

In [13]:
# Run the fine-tuning loop with the trainer built in the previous cell.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.4759,1.275363,0.55
2,0.416,1.094547,0.7
3,0.2858,1.30931,0.7


TrainOutput(global_step=150, training_loss=0.7843694820006688, metrics={'train_runtime': 35.6013, 'train_samples_per_second': 8.427, 'train_steps_per_second': 4.213, 'total_flos': 22763013017136.0, 'train_loss': 0.7843694820006688, 'epoch': 3.0})

In [14]:
import os

# NOTE(review): debugging aid that makes CUDA kernel launches synchronous.
# The CUDA runtime normally reads this variable when it is initialised, so
# setting it after training has already used the GPU is unlikely to have any
# effect -- move it to the top of the notebook if it is really needed.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Score a seeded 25-example sample of the validation split so the reported
# numbers are reproducible across kernel restarts (an unseeded `.shuffle()`
# evaluates a different sample on every run).
trainer.evaluate(
    tokenized_dataset["validation"].shuffle(seed=training_args.seed).select(range(25))
)

{'eval_loss': 0.9289669990539551,
 'eval_accuracy': 0.68,
 'eval_runtime': 0.747,
 'eval_samples_per_second': 33.465,
 'eval_steps_per_second': 9.37,
 'epoch': 3.0}

In [15]:
import torch
import numpy as np

# Manual inference on one hand-written example, to see the model's raw output.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

prompt = "France has a bread law, Le Décret Pain, with strict rules on what is allowed in a traditional baguette."
candidate1 = "The law does not apply to croissants and brioche."
candidate2 = "The law applies to baguettes."

# Encode one (prompt, candidate) pair per choice; padding makes both rows the
# same length so they can be stacked into a single tensor.
inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="pt", padding=True)
inputs = {k: v.to(device) for k, v in inputs.items()}
labels = torch.tensor(0).unsqueeze(0).to(device)

# Make sure the model sits on the same device as the inputs, and switch off
# dropout and gradient tracking: this is inference, not training.
model.to(device)
model.eval()
with torch.no_grad():
    # unsqueeze(0) adds the leading batch dimension in front of the two choices.
    outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels)

logits = outputs.logits

# Index of the highest-scoring candidate. The previous code passed the builtin
# `locals` to np.argmax instead of `logits`, which always evaluated to 0
# regardless of what the model predicted.
category = logits.argmax(dim=-1).item()
category


0