In [5]:
# Mount Google Drive at /content/drive; later cells read and write files under this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
# Download the SQuAD v2.0 train and dev sets into the Colab_Notebooks folder on Drive.
!wget -P /content/drive/MyDrive/Colab_Notebooks/ https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
!wget -P /content/drive/MyDrive/Colab_Notebooks/ https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json



--2024-05-22 09:06:15--  https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 42123633 (40M) [application/json]
Saving to: ‘/content/drive/MyDrive/Colab_Notebooks/train-v2.0.json’


2024-05-22 09:06:16 (59.2 MB/s) - ‘/content/drive/MyDrive/Colab_Notebooks/train-v2.0.json’ saved [42123633/42123633]

--2024-05-22 09:06:16--  https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
Resolving rajpurkar.github.io (rajpurkar.github.io)... 185.199.108.153, 185.199.109.153, 185.199.110.153, ...
Connecting to rajpurkar.github.io (rajpurkar.github.io)|185.199.108.153|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4370528 (4.2M) [application/json]
Saving to: ‘/content/drive/MyDrive/Colab_Notebooks

In [21]:
# Show information about the GPU(s) in the current environment.
!nvidia-smi
# Install the transformers library:
!pip install transformers


Wed May 22 09:06:19 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   76C    P0              33W /  72W |   1093MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [22]:
# Select the device (GPU or CPU) and set the random seeds so that experiments are reproducible.
import csv
import json
import random

import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
from transformers import AdamW, BertForQuestionAnswering, BertTokenizerFast, get_scheduler

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

def same_seeds(seed):
    """Seed Python's ``random``, NumPy and PyTorch, and set the cuDNN determinism flags.

    Args:
        seed: Integer seed applied to every generator (CPU and, when CUDA is
            available, the CUDA generators as well).
    """
    # cuDNN flags: benchmark mode off, deterministic mode on.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Host-side generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators: CPU first, then the current and all CUDA devices.
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        for seed_cuda in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            seed_cuda(seed)

# Fix every seed to 0 before the model is created and the training data is shuffled.
same_seeds(0)

Using device: cuda


In [23]:
# Install the Hugging Face Hub client and log in interactively from the notebook.
!pip install huggingface_hub
from huggingface_hub import notebook_login

# Renders a login widget in the cell output.
notebook_login()



VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
# Load the model and the tokenizer:
# the bert-base-uncased checkpoint with a question-answering head, moved to `device`,
# plus the matching fast tokenizer.
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased").to(device)
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
def read_squad_data(file):
    """Load the answerable (context, question, answer) triples from a SQuAD-format JSON file.

    Args:
        file: Path to a JSON file whose top-level ``"data"`` list holds articles,
            each with ``"paragraphs"`` containing a ``"context"`` string and a
            ``"qas"`` list.

    Returns:
        Three parallel lists ``(contexts, questions, answers)``. Each answer is
        the first entry of the question's ``"answers"`` list (the dict loaded
        from the file, not a copy). Questions flagged ``is_impossible`` and
        questions without any answer are skipped.
    """
    with open(file, 'r', encoding="utf-8") as reader:
        data = json.load(reader)["data"]

    contexts, questions, answers = [], [], []
    for group in data:
        for passage in group['paragraphs']:
            context = passage['context']
            for qa in passage['qas']:
                # Files without the 'is_impossible' flag (e.g. SQuAD v1.1 layout)
                # are treated as answerable instead of raising KeyError.
                if qa.get('is_impossible', False):
                    continue
                # An empty answer list cannot be indexed; skip such questions.
                candidate_answers = qa.get('answers') or []
                if not candidate_answers:
                    continue
                contexts.append(context)
                questions.append(qa['question'])
                answers.append(candidate_answers[0])
    return contexts, questions, answers

In [26]:
# Read the answerable (context, question, answer) triples of the train and dev splits from Drive.
train_contexts, train_questions, train_answers = read_squad_data("/content/drive/MyDrive/Colab_Notebooks/train-v2.0.json")
dev_contexts, dev_questions, dev_answers = read_squad_data("/content/drive/MyDrive/Colab_Notebooks/dev-v2.0.json")


In [27]:
def add_end_idx(answers, contexts):
    """Add an exclusive ``answer_end`` character offset to every answer dict, in place.

    The end offset is ``answer_start + len(text)``. When the context does not
    contain the answer text at the annotated offset, the span shifted one and
    then two characters to the left is tried; on the first match
    ``answer_start`` is corrected as well. If no candidate matches, the
    annotated offsets are kept so that ``answer_end`` is always present.

    Args:
        answers: List of dicts with ``'text'`` and ``'answer_start'`` keys;
            each dict gains an ``'answer_end'`` key.
        contexts: Context string for each answer, in the same order.
    """
    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        start_idx = answer['answer_start']
        end_idx = start_idx + len(gold_text)

        # Always record an end offset; without this, an answer that matches no
        # candidate span would be left without 'answer_end' and break later
        # lookups of that key.
        answer['answer_end'] = end_idx
        if context[start_idx:end_idx] == gold_text:
            continue

        for n in (1, 2):
            shifted_start = start_idx - n
            # A negative start would wrap around to the end of the string.
            if shifted_start < 0:
                break
            if context[shifted_start:end_idx - n] == gold_text:
                answer['answer_start'] = shifted_start
                answer['answer_end'] = end_idx - n
                # Keep the smallest shift that matches.
                break


In [28]:
# Add 'answer_end' offsets to the train and dev answers (the answer dicts are modified in place).
add_end_idx(train_answers, train_contexts)
add_end_idx(dev_answers, dev_contexts)


In [29]:
# Tokenize each (context, question) pair with truncation and padding enabled;
# the context is passed as the first sequence and the question as the second.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
dev_encodings = tokenizer(dev_contexts, dev_questions, truncation=True, padding=True)


In [30]:
def add_token_positions(encodings, answers):
    """Translate character-level answer spans into token indices and store them on ``encodings``.

    Args:
        encodings: Tokenizer output offering ``char_to_token(batch_index, char_index)``
            and ``update``; it receives ``start_positions`` and ``end_positions``.
        answers: Answer dicts with ``answer_start`` and ``answer_end`` character
            offsets. The end token is looked up at ``answer_end - 1``.
    """
    def token_index(sample_idx, char_idx):
        # A lookup that yields None (presumably because the character was
        # truncated away - confirm) is replaced by the tokenizer's model_max_length.
        token_idx = encodings.char_to_token(sample_idx, char_idx)
        return tokenizer.model_max_length if token_idx is None else token_idx

    start_positions, end_positions = [], []
    for sample_idx, answer in enumerate(answers):
        start_positions.append(token_index(sample_idx, answer['answer_start']))
        end_positions.append(token_index(sample_idx, answer['answer_end'] - 1))

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})


In [31]:
# Attach token-level start_positions / end_positions to the train and dev encodings.
add_token_positions(train_encodings, train_answers)
add_token_positions(dev_encodings, dev_answers)


In [32]:
class SquadDataset(Dataset):
    """Torch ``Dataset`` over tokenizer encodings that yields one example as a dict of tensors."""

    def __init__(self, encodings):
        # Mapping-like object: field name -> per-example sequence.
        self.encodings = encodings

    def __len__(self):
        # One example per row of ``input_ids``.
        return len(self.encodings.input_ids)

    def __getitem__(self, idx):
        """Return a tensor for every encoding field of example ``idx``."""
        example = {}
        for field, column in self.encodings.items():
            example[field] = torch.tensor(column[idx])
        return example


In [33]:
# Wrap the train and dev encodings as torch datasets.
train_dataset = SquadDataset(train_encodings)
dev_dataset = SquadDataset(dev_encodings)


In [34]:
# Batches of 16: training data is shuffled, dev data keeps its original order.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=16, shuffle=False)


In [35]:
def evaluate(model, data_loader):
    """Run ``model`` over ``data_loader`` and collect predicted and reference answer spans.

    The model is switched to eval mode and is left in that mode on return.

    Args:
        model: Model whose output exposes ``start_logits`` and ``end_logits``.
        data_loader: Iterable of batches holding ``input_ids``, ``attention_mask``,
            ``start_positions`` and ``end_positions`` tensors.

    Returns:
        ``(preds, answers)``: two lists of ``(start, end)`` pairs, one pair per
        example, with the arg-max predicted positions and the reference positions.
    """
    preds, answers = [], []
    model.eval()
    with torch.no_grad():
        for batch in tqdm(data_loader):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            gold_starts = batch['start_positions'].to(device)
            gold_ends = batch['end_positions'].to(device)

            # Forward pass without labels: only the logits are needed here.
            logits = model(ids, attention_mask=mask)
            pred_starts = logits.start_logits.argmax(dim=-1)
            pred_ends = logits.end_logits.argmax(dim=-1)

            preds += zip(pred_starts.cpu().numpy(), pred_ends.cpu().numpy())
            answers += zip(gold_starts.cpu().numpy(), gold_ends.cpu().numpy())
    return preds, answers


In [36]:
# Training hyperparameters.
num_epochs = 3  # number of passes over the training loader
logging_step = 100  # the averaged training loss is printed every this many steps
learning_rate = 1e-4  # initial learning rate handed to the optimizer


In [38]:
# Use PyTorch's AdamW: the AdamW class exported by transformers is deprecated in
# favour of torch.optim.AdamW. eps and weight_decay are set explicitly to the
# defaults of the transformers implementation so the update rule stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
# The scheduler is stepped once per batch, so it spans every batch of every epoch.
total_steps = len(train_loader) * num_epochs
# Linear learning-rate schedule over total_steps with no warm-up steps.
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=total_steps)


In [39]:
# Switch the model to training mode before the training loop starts.
model.train()
print("Start Training ...")


Start Training ...


In [40]:
# Fine-tune the model for num_epochs passes over train_loader.
for epoch in range(num_epochs):
    # Running sum of the loss since the last log line.
    train_loss = 0
    for step, batch in enumerate(tqdm(train_loader)):
        # Move the batch tensors to the training device.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)

        # The loss is read from the model output when the target positions are passed in.
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        # The learning-rate scheduler advances once per batch.
        scheduler.step()
        optimizer.zero_grad()
        train_loss += loss.item()

        # Every logging_step steps, print the mean loss over that window and reset the sum.
        if (step + 1) % logging_step == 0:
            print(f"Epoch {epoch + 1} | Step {step + 1} | loss = {train_loss / logging_step:.3f}")
            train_loss = 0


  0%|          | 0/5427 [00:00<?, ?it/s]

Epoch 1 | Step 100 | loss = 3.719
Epoch 1 | Step 200 | loss = 2.361
Epoch 1 | Step 300 | loss = 1.945
Epoch 1 | Step 400 | loss = 1.727
Epoch 1 | Step 500 | loss = 1.796
Epoch 1 | Step 600 | loss = 1.589
Epoch 1 | Step 700 | loss = 1.664
Epoch 1 | Step 800 | loss = 1.555
Epoch 1 | Step 900 | loss = 1.470
Epoch 1 | Step 1000 | loss = 1.524
Epoch 1 | Step 1100 | loss = 1.433
Epoch 1 | Step 1200 | loss = 1.450
Epoch 1 | Step 1300 | loss = 1.467
Epoch 1 | Step 1400 | loss = 1.545
Epoch 1 | Step 1500 | loss = 1.435
Epoch 1 | Step 1600 | loss = 1.449
Epoch 1 | Step 1700 | loss = 1.388
Epoch 1 | Step 1800 | loss = 1.460
Epoch 1 | Step 1900 | loss = 1.338
Epoch 1 | Step 2000 | loss = 1.405
Epoch 1 | Step 2100 | loss = 1.337
Epoch 1 | Step 2200 | loss = 1.362
Epoch 1 | Step 2300 | loss = 1.278
Epoch 1 | Step 2400 | loss = 1.312
Epoch 1 | Step 2500 | loss = 1.268
Epoch 1 | Step 2600 | loss = 1.366
Epoch 1 | Step 2700 | loss = 1.351
Epoch 1 | Step 2800 | loss = 1.320
Epoch 1 | Step 2900 | loss = 

  0%|          | 0/5427 [00:00<?, ?it/s]

Epoch 2 | Step 100 | loss = 0.820
Epoch 2 | Step 200 | loss = 0.846
Epoch 2 | Step 300 | loss = 0.836
Epoch 2 | Step 400 | loss = 0.767
Epoch 2 | Step 500 | loss = 0.877
Epoch 2 | Step 600 | loss = 0.864
Epoch 2 | Step 700 | loss = 0.866
Epoch 2 | Step 800 | loss = 0.839
Epoch 2 | Step 900 | loss = 0.795
Epoch 2 | Step 1000 | loss = 0.857
Epoch 2 | Step 1100 | loss = 0.863
Epoch 2 | Step 1200 | loss = 0.799
Epoch 2 | Step 1300 | loss = 0.860
Epoch 2 | Step 1400 | loss = 0.840
Epoch 2 | Step 1500 | loss = 0.814
Epoch 2 | Step 1600 | loss = 0.821
Epoch 2 | Step 1700 | loss = 0.809
Epoch 2 | Step 1800 | loss = 0.800
Epoch 2 | Step 1900 | loss = 0.840
Epoch 2 | Step 2000 | loss = 0.827
Epoch 2 | Step 2100 | loss = 0.823
Epoch 2 | Step 2200 | loss = 0.835
Epoch 2 | Step 2300 | loss = 0.854
Epoch 2 | Step 2400 | loss = 0.792
Epoch 2 | Step 2500 | loss = 0.777
Epoch 2 | Step 2600 | loss = 0.807
Epoch 2 | Step 2700 | loss = 0.896
Epoch 2 | Step 2800 | loss = 0.834
Epoch 2 | Step 2900 | loss = 

  0%|          | 0/5427 [00:00<?, ?it/s]

Epoch 3 | Step 100 | loss = 0.480
Epoch 3 | Step 200 | loss = 0.458
Epoch 3 | Step 300 | loss = 0.460
Epoch 3 | Step 400 | loss = 0.504
Epoch 3 | Step 500 | loss = 0.487
Epoch 3 | Step 600 | loss = 0.486
Epoch 3 | Step 700 | loss = 0.438
Epoch 3 | Step 800 | loss = 0.484
Epoch 3 | Step 900 | loss = 0.495
Epoch 3 | Step 1000 | loss = 0.485
Epoch 3 | Step 1100 | loss = 0.453
Epoch 3 | Step 1200 | loss = 0.431
Epoch 3 | Step 1300 | loss = 0.450
Epoch 3 | Step 1400 | loss = 0.472
Epoch 3 | Step 1500 | loss = 0.449
Epoch 3 | Step 1600 | loss = 0.485
Epoch 3 | Step 1700 | loss = 0.442
Epoch 3 | Step 1800 | loss = 0.472
Epoch 3 | Step 1900 | loss = 0.454
Epoch 3 | Step 2000 | loss = 0.452
Epoch 3 | Step 2100 | loss = 0.473
Epoch 3 | Step 2200 | loss = 0.418
Epoch 3 | Step 2300 | loss = 0.475
Epoch 3 | Step 2400 | loss = 0.456
Epoch 3 | Step 2500 | loss = 0.478
Epoch 3 | Step 2600 | loss = 0.458
Epoch 3 | Step 2700 | loss = 0.463
Epoch 3 | Step 2800 | loss = 0.440
Epoch 3 | Step 2900 | loss = 

In [41]:
    # Validation: exact-match accuracy of the predicted (start, end) token positions on dev_loader.
    # NOTE(review): this cell is indented as if it belonged inside the epoch loop, but it is a
    # separate cell; it reuses `epoch` left over from the training loop, so only the last epoch
    # is evaluated - confirm whether per-epoch validation was intended.
    model.eval()
    preds, answers = evaluate(model, dev_loader)
    correct = 0
    for (pred_start, pred_end), (true_start, true_end) in zip(preds, answers):
        # A prediction counts only when both boundaries match the reference.
        if pred_start == true_start and pred_end == true_end:
            correct += 1
    accuracy = correct / len(preds)
    print(f"Validation | Epoch {epoch + 1} | accuracy = {accuracy:.3f}")
    # Restore training mode after evaluation.
    model.train()


  0%|          | 0/371 [00:00<?, ?it/s]

Validation | Epoch 3 | accuracy = 0.551


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [51]:
# Save the fine-tuned model and the tokenizer to the same directory on Drive.
model_save_dir = "/content/drive/MyDrive/Colab_Notebooks/saved_model"
model.save_pretrained(model_save_dir)
tokenizer.save_pretrained(model_save_dir)

('/content/drive/MyDrive/Colab_Notebooks/saved_model/tokenizer_config.json',
 '/content/drive/MyDrive/Colab_Notebooks/saved_model/special_tokens_map.json',
 '/content/drive/MyDrive/Colab_Notebooks/saved_model/vocab.txt',
 '/content/drive/MyDrive/Colab_Notebooks/saved_model/added_tokens.json',
 '/content/drive/MyDrive/Colab_Notebooks/saved_model/tokenizer.json')

In [44]:
# Build the loader used for the final predictions.
# NOTE(review): this re-tokenizes the dev split (dev_contexts / dev_questions), so the
# "test" set is the same data as the validation set - confirm this is intended.
print("Evaluating Test Set ...")
test_encodings = tokenizer(dev_contexts, dev_questions, truncation=True, padding=True)
add_token_positions(test_encodings, dev_answers)
test_dataset = SquadDataset(test_encodings)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


Evaluating Test Set ...


In [45]:
# Collect the arg-max (start, end) token positions for every example in test_loader.
result = []
model.eval()
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        # Start and end positions are chosen independently of each other.
        start_preds = torch.argmax(outputs.start_logits, dim=-1)
        end_preds = torch.argmax(outputs.end_logits, dim=-1)
        result.extend(zip(start_preds.cpu().numpy(), end_preds.cpu().numpy()))


  0%|          | 0/371 [00:00<?, ?it/s]

In [46]:
result_file = "/content/drive/MyDrive/Colab_Notebooks/result.csv"
# newline='' hands line-ending control to the csv module; the explicit encoding keeps
# the file independent of the platform default.
with open(result_file, 'w', newline='', encoding='utf-8') as f:
    # csv.writer quotes answers that contain commas, quotes or line breaks, which would
    # otherwise corrupt the two-column layout. '\n' keeps the original line terminator.
    writer = csv.writer(f, lineterminator='\n')
    writer.writerow(["ID", "Answer"])
    for i, (start, end) in enumerate(result):
        # Decode the predicted token span (end token included) back into text.
        answer = tokenizer.decode(test_encodings.input_ids[i][start:end + 1])
        writer.writerow([i, answer])


In [47]:
# Report where the predictions were written.
print(f"Completed! Result is in {result_file}")


Completed! Result is in /content/drive/MyDrive/Colab_Notebooks/result.csv


In [52]:
from transformers import BertForQuestionAnswering, BertTokenizerFast
import torch

# Directory that holds the saved model
model_save_dir = "/content/drive/MyDrive/Colab_Notebooks/saved_model"

# Load the saved model and tokenizer
model = BertForQuestionAnswering.from_pretrained(model_save_dir)
tokenizer = BertTokenizerFast.from_pretrained(model_save_dir)

# Select the device and move the model to it
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [55]:
import matplotlib.pyplot as plt
def answer_question(question, context):
    """Extract the answer to ``question`` from ``context`` with the loaded QA model.

    Args:
        question: Question text.
        context: Passage that is searched for the answer.

    Returns:
        Tuple ``(answer, start_scores, end_scores, input_ids)``: the decoded
        answer string, the start and end logits, and the encoded input ids
        (on ``device``). The answer is empty when the predicted end token
        precedes the predicted start token.
    """
    # Encode as (context, question): this is the pair order used to build the training
    # encodings, so the model sees inputs laid out the way it was fine-tuned.
    # truncation=True mirrors training and keeps long inputs within the model's limit.
    inputs = tokenizer(context, question, truncation=True, return_tensors="pt")

    # Move the encoded inputs to the device
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Run the model without tracking gradients
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        start_scores = outputs.start_logits
        end_scores = outputs.end_logits

    # Most likely start and end tokens; +1 because the slice end is exclusive
    start_index = torch.argmax(start_scores)
    end_index = torch.argmax(end_scores) + 1

    # Decode the selected token span back into a string
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[0][start_index:end_index]))

    return answer, start_scores, end_scores, input_ids

# 示例问题和上下文
# Example question and context
# NOTE(review): the example text is Chinese, while the model in this notebook was fine-tuned
# from bert-base-uncased on SQuAD - confirm that this tokenizer and model handle this input.
context = "BERT是由Google开发的一种用于自然语言处理的预训练模型。它可以用于各种任务，如问答、文本分类等。"
question = "BERT是由谁开发的？"

# Get the answer and the scores
answer, start_scores, end_scores, input_ids = answer_question(question, context)

# Print the question and the answer
print(f"Question: {question}")
print(f"Answer: {answer}")




Question: BERT是由谁开发的？
Answer: bert
