# Source & about the dataset

The code in this notebook is based on https://github.com/AMontgomerie/question_generator. The code is under an MIT license.

Race source: https://www.cs.cmu.edu/~glai1/data/race/ 

RACE is a large-scale reading comprehension dataset with more than 28,000 passages and nearly 100,000 questions. The dataset is collected from English examinations in China, which are designed for middle school and high school students. The dataset can serve as training and test sets for machine comprehension.

The dataset itself is loaded from the Hugging Face Hub: https://huggingface.co/datasets/race

# Imports

In [1]:
import os
import sys
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import spacy
from tqdm.notebook import tqdm

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

In [3]:
from datasets import load_dataset

# Load the "high" configuration of the `race` dataset through Hugging Face `datasets`.
dataset = load_dataset("race", "high")

Reusing dataset race (C:\Users\Kevin\.cache\huggingface\datasets\race\high\0.1.0\5a80ba2d003e023fdce95d01c1b02f5a70d5eb2375465bee162baf9824c91474)


  0%|          | 0/3 [00:00<?, ?it/s]

# Options

In [4]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Using device:', device)

Using device: cuda


In [5]:
PRETRAINED_MODEL = 't5-base'
# Output directory for checkpoints. Keep the trailing slash: save paths are
# built by plain string concatenation (DIR + file name).
DIR = "question_generator/"
BATCH_SIZE = 1
SEQ_LENGTH = 512  # max_length used when padding/truncating tokenized text
EPOCHS = 3
USE_ANSWER = False  # when True, make_text prepends the answer to the article
# NOTE(review): these file names say "withanswer" although USE_ANSWER is False
# above -- confirm the names still describe what is being saved.
BEST = "race_finetune_withanswer_epoch3.pt"
BEST_HF = "race_finetune_withanswer_epoch3"

# Make sure the output directory exists. exist_ok=True makes this a no-op when
# the directory is already there, so no separate existence check is needed.
os.makedirs(DIR, exist_ok=True)


tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_MODEL)
# Register the two marker tokens used when building model inputs. Left as the
# last expression so the cell displays the call's return value.
tokenizer.add_special_tokens(
    {'additional_special_tokens': ['<answer>', '<context>']}
)


2

In [6]:
def _answer_text(row):
    """Return the text to place after the '<answer>' marker for one example.

    NOTE(review): on the Hugging Face RACE dataset `answer` appears to be an
    option letter ('A'-'D') indexing into `options` -- confirm against the
    dataset card. When the row matches that shape the option text is returned;
    in every other case the raw `answer` value is returned unchanged.
    """
    answer = row['answer']
    options = row['options'] if 'options' in row else None
    if options and isinstance(answer, str) and len(answer) == 1:
        index = ord(answer) - ord('A')
        if 0 <= index < len(options):
            return options[index]
    return answer


def make_text(row):
    """Tokenize one example into fixed-length model inputs and question labels.

    Args:
        row: mapping with at least 'article' and 'question'; 'answer' is also
            read when the module-level USE_ANSWER flag is set.

    Returns:
        dict with 'input_ids' and 'attention_mask' for the source text and
        'input_ids_question' for the target question. Each is padded or
        truncated to SEQ_LENGTH tokens with the leading batch axis removed.
    """
    if USE_ANSWER:
        source_text = '<answer> ' + _answer_text(row) + ' <context> ' + row['article']
    else:
        source_text = row['article']

    # padding='max_length' replaces the deprecated pad_to_max_length=True.
    encoded_text = tokenizer(
        source_text,
        padding='max_length',
        max_length=SEQ_LENGTH,
        truncation=True,
        return_tensors="pt"
    )
    encoded_question = tokenizer(
        row['question'],
        padding='max_length',
        max_length=SEQ_LENGTH,
        truncation=True,
        return_tensors='pt'
    )

    # The tokenizer returns (1, SEQ_LENGTH) tensors for a single string; drop
    # the batch axis so each stored example is one-dimensional.
    return {
        'input_ids': torch.squeeze(encoded_text['input_ids'], 0),
        'attention_mask': torch.squeeze(encoded_text['attention_mask'], 0),
        'input_ids_question': torch.squeeze(encoded_question['input_ids'], 0),
    }

# Tokenize every example; the keys returned by make_text become new columns.
dataset = dataset.map(make_text)
# Expose only the tensors the model consumes, as torch tensors.
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'input_ids_question'])
train_loader = DataLoader(dataset["train"], batch_size=BATCH_SIZE, shuffle=True)
# Validation loss is averaged over the whole split, so shuffling adds nothing;
# keep the order fixed so evaluation runs are deterministic.
valid_loader = DataLoader(dataset["validation"], batch_size=BATCH_SIZE, shuffle=False)

Loading cached processed dataset at C:\Users\Kevin\.cache\huggingface\datasets\race\high\0.1.0\5a80ba2d003e023fdce95d01c1b02f5a70d5eb2375465bee162baf9824c91474\cache-dde5553f59d1442b.arrow
Loading cached processed dataset at C:\Users\Kevin\.cache\huggingface\datasets\race\high\0.1.0\5a80ba2d003e023fdce95d01c1b02f5a70d5eb2375465bee162baf9824c91474\cache-aee9e17dc574fd1a.arrow
Loading cached processed dataset at C:\Users\Kevin\.cache\huggingface\datasets\race\high\0.1.0\5a80ba2d003e023fdce95d01c1b02f5a70d5eb2375465bee162baf9824c91474\cache-27aafa5353895ecf.arrow


In [7]:
LR = 0.001
EPOCHS = 20  # NOTE: overrides the EPOCHS value assigned in the options cell
LOG_INTERVAL = 5000  # number of batches between training-loss log lines

# from_pretrained is a classmethod, so it must be called on the class itself.
# Calling it on an instance built from a hand-made config allocates a throwaway
# randomly initialised model and silently discards that config.
model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_MODEL)
# Apply the intended setting to the config the model actually uses.
model.config.decoder_start_token_id = tokenizer.pad_token_id
config = model.config
model.resize_token_embeddings(len(tokenizer)) # to account for new special tokens
model = model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=LR)

In [8]:
# NOTE(review): not referenced by the visible cells -- the training loop saves
# to DIR + BEST instead. Confirm whether this path is still needed.
SAVED_MODEL_PATH = "question_generator/qg_pretrained_t5_model_trained.pth"

def train(epoch, best_val_loss):
    """Run one optimisation pass over ``train_loader``.

    Uses the module-level ``model``, ``optimizer``, ``device`` and
    ``LOG_INTERVAL``. Prints the mean training loss of the last
    ``LOG_INTERVAL`` batches each time that many batches have been processed.

    Args:
        epoch: epoch number, used only in the log line.
        best_val_loss: not used by this function; kept so existing calls work.
    """
    model.train()
    running_loss = 0.
    num_batches = len(train_loader)
    # Wrap the loader itself (not the enumerate object) so tqdm can read its
    # length and show real progress instead of an open-ended counter.
    for batch_index, batch in enumerate(tqdm(train_loader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Padding positions in the target are replaced so the loss skips them.
        masked_labels = mask_label_padding(batch['input_ids_question'].to(device))

        optimizer.zero_grad()
        output = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=masked_labels
        )
        loss = output[0]  # first element of the model output is the loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        running_loss += loss.item()
        # Count completed batches so every logged window holds exactly
        # LOG_INTERVAL losses. Testing the zero-based index instead would put
        # LOG_INTERVAL + 1 losses in the first window.
        batches_done = batch_index + 1
        if batches_done % LOG_INTERVAL == 0:
            cur_loss = running_loss / LOG_INTERVAL
            print('| epoch {:3d} | '
                  '{:5d}/{:5d} batches | '
                  'loss {:5.2f}'.format(
                    epoch,
                    batches_done, num_batches,
                    cur_loss))
            running_loss = 0.

def evaluate(eval_model, data_loader):
    """Return the mean per-batch loss of ``eval_model`` over ``data_loader``.

    The model is switched to eval mode and gradients are disabled for the pass.

    Args:
        eval_model: callable taking ``input_ids``, ``attention_mask`` and
            ``labels`` keyword arguments, whose output has the loss at index 0.
        data_loader: sized iterable of batches with 'input_ids',
            'attention_mask' and 'input_ids_question' tensors.

    Returns:
        Sum of the batch losses divided by ``len(data_loader)``.

    Raises:
        ZeroDivisionError: if ``data_loader`` is empty.
    """
    eval_model.eval()
    total_loss = 0.
    with torch.no_grad():
        # Wrap the loader itself so tqdm knows the total and shows progress.
        for batch in tqdm(data_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            # Padding positions in the target are replaced so the loss skips them.
            masked_labels = mask_label_padding(batch['input_ids_question'].to(device))
            output = eval_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=masked_labels
            )
            total_loss += output[0].item()
    return total_loss / len(data_loader)

def mask_label_padding(labels, pad_token_id=None):
    """Return a copy of ``labels`` with padding positions replaced by -100.

    -100 is the index the loss ignores, so padded target positions do not
    contribute to training or evaluation loss. The input tensor is left
    untouched, so the caller's batch is not modified as a side effect.

    Args:
        labels: integer tensor of target token ids.
        pad_token_id: id treated as padding. Defaults to the module-level
            ``tokenizer.pad_token_id`` when omitted.

    Returns:
        A new tensor with the same shape and dtype as ``labels``.
    """
    MASK_ID = -100
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id
    # masked_fill (no trailing underscore) is the out-of-place variant.
    return labels.masked_fill(labels == pad_token_id, MASK_ID)

def load(path, map_location=None):
    """Load an object previously written with ``torch.save``.

    Args:
        path: file path (or file-like object) of the saved checkpoint.
        map_location: forwarded to ``torch.load``; pass e.g. ``'cpu'`` or a
            ``torch.device`` to remap tensors saved on a device that is not
            available here. The default ``None`` keeps the saved locations.

    Returns:
        Whatever object was stored in the file.

    NOTE: ``torch.load`` unpickles data; only load files from trusted sources.
    """
    return torch.load(path, map_location=map_location)

def print_line():
    """Print a horizontal rule of 60 dashes to separate log sections."""
    width = 60
    rule = "".join(["-"] * width)
    print(rule)

In [None]:
best_val_loss = float("inf")
best_model = None

print_line()

for epoch in range(1, EPOCHS + 1):
    # Train for one epoch, then score on the validation split. The CUDA cache
    # is emptied after each phase.
    train(epoch, best_val_loss)
    torch.cuda.empty_cache()
    val_loss = evaluate(model, valid_loader)
    torch.cuda.empty_cache()

    print_line()
    print('| end of epoch {:3d} | valid loss {:5.2f}'.format(epoch, val_loss))
    print_line()

    # Only checkpoint when validation loss improved on the best seen so far.
    if not (val_loss < best_val_loss):
        continue

    best_val_loss = val_loss
    # Full training state as a torch checkpoint ...
    torch.save(
        dict(
            epoch=epoch,
            model_state_dict=model.state_dict(),
            optimizer_state_dict=optimizer.state_dict(),
            best_loss=best_val_loss,
            using_answer=USE_ANSWER,
        ),
        DIR + BEST,
    )
    # ... plus the model in Hugging Face save_pretrained format.
    model.save_pretrained(DIR + BEST_HF)
    print("Model saved.\n")

------------------------------------------------------------


0it [00:00, ?it/s]

| epoch   1 |  5000/62445 batches | loss  3.75
| epoch   1 | 10000/62445 batches | loss  3.27
| epoch   1 | 15000/62445 batches | loss  3.07
| epoch   1 | 20000/62445 batches | loss  2.93
| epoch   1 | 25000/62445 batches | loss  2.84
| epoch   1 | 30000/62445 batches | loss  2.75
| epoch   1 | 35000/62445 batches | loss  2.66
| epoch   1 | 40000/62445 batches | loss  2.58
| epoch   1 | 45000/62445 batches | loss  2.53
