In [2]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

In [3]:
# Load the training rows from CSV and wrap them in a Hugging Face Dataset.
train_df = pd.read_csv('train.csv')
train_ds = Dataset.from_pandas(train_df)

# Checkpoint identifier of the BERT base (cased) model; the tokenizer loaded
# from it is what later cells use to encode the text.
moder_dir = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(moder_dir)




In [4]:
# One-to-one correspondence between answer letters and integer class ids.
options = 'ABCDE'
# Derived from `options` so the two can never disagree in length.
indices = list(range(len(options)))
# Letter -> class id, e.g. 'A' -> 0.
option_to_index = dict(zip(options, indices))
# Class id -> letter, e.g. 0 -> 'A'.
index_to_option = dict(zip(indices, options))
# Backward-compatible alias: this mapping was originally exposed under the
# misleading name `index_to_index`; both names refer to the same dict.
index_to_index = index_to_option

In [5]:
def preprocess(example):
    """Tokenize one multiple-choice row as one (prompt, option) pair per choice.

    Args:
        example: A single dataset row. The keys read are 'prompt', one key per
            letter in the module-level `options`, and 'answer'.

    Returns:
        The tokenizer output for the pairs, with an extra 'label' entry holding
        the integer id that `option_to_index` assigns to `example['answer']`.

    Raises:
        KeyError: If a required key is missing from `example`, or if
            `example['answer']` is not a key of `option_to_index`.
    """
    # The prompt has to be paired with every candidate answer, so repeat it
    # once per option (tied to `options` rather than a hard-coded 5).
    first_sentence = [example['prompt']] * len(options)
    # The candidate answers, in the same order as `options`.
    second_sentence = [example[option] for option in options]

    tokenized_example = tokenizer(first_sentence, second_sentence, truncation=True)
    # Attach the gold answer as an integer class id.
    tokenized_example['label'] = option_to_index[example['answer']]
    return tokenized_example


In [6]:
# Tokenize row by row (preprocess handles a single example) and drop the raw
# text columns that preprocess has already consumed.
tokenized_train_ds = train_ds.map(
    preprocess,
    batched=False,
    remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'],
)
# Show one encoded example as a sanity check.
print(tokenized_train_ds[1])

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

{'id': 1, 'input_ids': [[101, 5979, 1104, 1103, 1378, 1110, 1126, 8026, 5754, 1104, 9652, 188, 7867, 1158, 1107, 2191, 118, 1861, 2344, 136, 102, 141, 27500, 188, 7867, 1158, 4431, 1106, 1103, 7243, 1104, 2191, 118, 1861, 2344, 117, 1187, 2233, 3836, 1121, 22675, 12217, 1116, 1120, 4275, 1551, 10877, 15213, 1106, 1103, 7514, 2233, 1678, 1121, 22675, 12217, 1116, 1104, 1251, 2206, 1137, 1224, 1159, 119, 1188, 15213, 1110, 7289, 1118, 170, 2218, 1159, 118, 7449, 188, 2430, 7147, 5668, 7898, 193, 119, 102], [101, 5979, 1104, 1103, 1378, 1110, 1126, 8026, 5754, 1104, 9652, 188, 7867, 1158, 1107, 2191, 118, 1861, 2344, 136, 102, 141, 27500, 188, 7867, 1158, 4431, 1106, 1103, 1664, 118, 7243, 1104, 2191, 118, 1861, 2344, 117, 1187, 2233, 3836, 1121, 22675, 12217, 1116, 1120, 4275, 1551, 1110, 1861, 1106, 1103, 7514, 2233, 1678, 1121, 22675, 12217, 1116, 1104, 1251, 2206, 1137, 1224, 1159, 119, 1188, 15213, 1110, 7289, 1118, 170, 2218, 1159, 118, 7449, 188, 2430, 7147, 5668, 7898, 193, 119, 1

In [7]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase,PaddingStrategy
from typing import Optional,Union
import torch

In [8]:
from typing import Any


@dataclass
class DataCollatorForMultipleChoice:
    """Pad a batch of multiple-choice examples and stack them into tensors.

    Every feature is a mapping with a 'label' (or 'labels') entry plus entries
    whose values are indexable by choice position (for example 'input_ids'
    holding one token-id list per choice). The choices of all examples are
    flattened, padded together with ``tokenizer.pad`` and then reshaped to
    ``(batch_size, num_choices, -1)``.

    Attributes:
        tokenizer: Object whose ``pad`` method is used to pad the flattened
            choices.
        padding: Forwarded to ``tokenizer.pad`` as ``padding``.
        max_length: Forwarded to ``tokenizer.pad`` as ``max_length``.
        pad_to_multiple_of: Forwarded to ``tokenizer.pad`` as
            ``pad_to_multiple_of``.
    """

    # The transformers type names are quoted (forward references) so that
    # defining the class does not need them to be resolvable at that moment.
    tokenizer: "PreTrainedTokenizerBase"
    padding: Union[bool, str, "PaddingStrategy"] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate `features` into a dict of padded tensors.

        Args:
            features: Non-empty list of per-example mappings as described on
                the class. The mappings are not modified.

        Returns:
            A dict with every tensor returned by ``tokenizer.pad`` reshaped to
            ``(batch_size, num_choices, -1)``, plus a 'labels' int64 tensor
            with one entry per example.
        """
        # The label key is decided from the first example only.
        label_name = 'label' if 'label' in features[0].keys() else 'labels'
        # Read (rather than pop) the labels so the caller's dicts stay intact.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        # The number of choices is taken from the first example.
        num_choices = len(features[0]['input_ids'])

        # One flat entry per (example, choice); the label is left out because
        # it is per-example rather than per-choice.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        # Restore the (example, choice) structure after padding.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch



In [10]:
# Fine-tune the model using the training utilities from the transformers library.
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

model = AutoModelForMultipleChoice.from_pretrained(moder_dir)

output_dir = 'finetuned_bert'

# Evaluate and checkpoint once per epoch; the best checkpoint is reloaded
# when training finishes.
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to='none',
)

# NOTE(review): eval_dataset is the same dataset as train_dataset, so the
# per-epoch evaluation (and the "best model" it selects) is measured on data
# the model was trained on -- consider a held-out split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_train_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
)

trainer.train()

ConnectTimeout: HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /bert-base-cased/resolve/main/tf_model.h5 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000024DA025B8E0>, 'Connection to huggingface.co timed out. (connect timeout=10)'))