## RoBERTa trained on RACE dataset

In [31]:
# Import dependencies
import os
import json
import torch
import torch.nn as nn
import transformers
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForMultipleChoice
transformers.logging.set_verbosity_error()

In [32]:
import time
# Smoke test: drive a tqdm progress bar over 10 iterations, sleeping 1 second in each.
for i in tqdm(range(10)):
    time.sleep(1)


100%|██████████| 10/10 [00:10<00:00,  1.00s/it]


### RACE Dataset

In [18]:
class RACEDataset(Dataset):
    """RACE reading-comprehension questions tokenized for RoBERTa multiple choice.

    Every file found under ``data_dir`` is parsed as a JSON document with the
    keys ``article``, ``questions``, ``options`` and ``answers``. Each question
    of an article becomes one dataset item. For every item the article is
    paired with "question + option" once per candidate option, so the item
    carries one encoded sequence per option, which is the layout
    ``RobertaForMultipleChoice`` consumes.
    """

    def __init__(self, data_dir):
        """Read, tokenize and cache every question found under ``data_dir``.

        Args:
            data_dir: Directory that is walked recursively; every file in it
                is read as UTF-8 encoded JSON.
        """
        self.data = []
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

        for root, _dirs, files in os.walk(data_dir):
            for file_name in files:
                file_path = os.path.join(root, file_name)
                with open(file_path, 'r', encoding='utf-8') as f:
                    record = json.load(f)

                article = record['article']
                for question, options, answer in zip(
                    record['questions'], record['options'], record['answers']
                ):
                    contexts, endings = self._build_choice_pairs(
                        article, question, options
                    )
                    # Encode all options of this question as one batch so the
                    # result is stacked along a leading "choice" dimension.
                    # The option list must NOT be passed positionally after the
                    # text pair: that slot is `add_special_tokens`, which would
                    # silently drop the options from the model input.
                    encoded = self.tokenizer(
                        contexts,
                        endings,
                        max_length=512,
                        padding='max_length',
                        truncation=True,
                        return_tensors='pt',
                    )
                    # Answers are letters ('A', 'B', ...); map them to 0-based indices.
                    label = ord(answer) - ord('A')
                    self.data.append(
                        (encoded['input_ids'], encoded['attention_mask'], label)
                    )

    @staticmethod
    def _build_choice_pairs(article, question, options):
        """Return parallel (first text, second text) lists, one pair per option.

        The first list repeats ``article``; the second holds the question
        followed by a space and the respective option.
        """
        endings = [f'{question} {option}' for option in options]
        return [article] * len(endings), endings

    def __len__(self):
        """Return the number of cached (article, question) items."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask, answer)`` for one question.

        One item corresponds to one article, one question and one answer.
        ``input_ids`` and ``attention_mask`` hold one row per candidate option;
        ``answer`` is the 0-based index of the correct option.
        """
        input_ids, attention_mask, answer = self.data[index]
        return input_ids, attention_mask, answer


### RoBERTa model

In [7]:
class RoBERTaClassifier(nn.Module):
    """Thin wrapper around a pretrained ``roberta-base`` multiple-choice model."""

    def __init__(self):
        """Load the pretrained ``roberta-base`` multiple-choice weights."""
        super().__init__()
        self.roberta = RobertaForMultipleChoice.from_pretrained('roberta-base')

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return only its ``logits`` attribute.

        Args:
            input_ids: Token ids forwarded to the wrapped model.
            attention_mask: Attention mask forwarded to the wrapped model.

        Returns:
            The ``logits`` field of the wrapped model's output.
        """
        return self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).logits


### Training function

In [34]:
def train_model(model, train_loader, criterion, optimizer, device):
    """Run a single optimization pass over ``train_loader``.

    The function iterates the loader once; repeating it for several epochs is
    left to the caller.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` to get logits.
        train_loader: Iterable of ``(input_ids, attention_mask, answer)`` batches.
        criterion: Loss callable applied to ``(logits, answer)``.
        optimizer: Optimizer whose ``zero_grad``/``step`` drive the update.
        device: Device every batch tensor is moved to before the forward pass.
    """
    model.train()

    for batch in tqdm(train_loader):
        # Move the three batch tensors to the target device in one go.
        input_ids, attention_mask, answer = (tensor.to(device) for tensor in batch)

        # Standard step: reset gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        logits = model(input_ids, attention_mask)
        print(f'Logits: {logits}')
        batch_loss = criterion(logits, answer)
        batch_loss.backward()
        optimizer.step()

        print(f'Batch size loss: {batch_loss.item()}')

        


### Evaluation function

In [9]:
def evaluate_model(model, eval_loader, device):
    """Compute the classification accuracy of ``model`` over ``eval_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` to get logits.
        eval_loader: Iterable of ``(input_ids, attention_mask, answer)`` batches.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        The fraction of examples whose highest-scoring class equals ``answer``,
        or ``0.0`` when the loader yields no examples.
    """
    model.eval()
    correct = 0
    total = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for input_ids, attention_mask, answer in eval_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            answer = answer.to(device)

            logits = model(input_ids, attention_mask)
            # The index of the largest logit along dim 1 is the predicted class.
            _, predicted = torch.max(logits, dim=1)
            total += answer.size(0)
            correct += (predicted == answer).sum().item()

    # An empty loader would otherwise raise ZeroDivisionError.
    if total == 0:
        return 0.0
    return correct / total



### Main

In [35]:
# Hyperparameters.
# Use the GPU when one is available; otherwise fall back to the CPU instead of
# failing when the model is moved to a non-existent CUDA device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
data_dir = './RACE/'
batch_size = 8
num_epochs = 5
learning_rate = 2e-5

# Load the RACE train and dev splits.
train_dataset = RACEDataset(os.path.join(data_dir, 'train'))
print('Train dataset done')
dev_dataset = RACEDataset(os.path.join(data_dir, 'dev'))
print('Dev dataset done')

# Build the DataLoaders; only the training split is shuffled.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
print('Train dataloader done')
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)
print('Dev dataloader done')

# Create the RoBERTa model on the selected device.
model = RoBERTaClassifier().to(device)

# Loss function and optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Train for num_epochs passes, evaluating on the dev split after each one.
for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}/{num_epochs}')
    train_model(model, train_loader, criterion, optimizer, device)
    accuracy = evaluate_model(model, dev_loader, device)
    print(f'Epoch {epoch+1}/{num_epochs}: Accuracy: {accuracy}')


KeyboardInterrupt: 

In [28]:
# Inspect the answer label (third tuple element) of the first dev example.
dev_dataset[0][2]

2