## RoBERTa trained on RACE dataset

In [1]:
# 导入环境
import os
import json
import torch
import torch.nn as nn
import transformers
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForMultipleChoice
transformers.logging.set_verbosity_error()

  from .autonotebook import tqdm as notebook_tqdm


In [32]:
import time
for i in tqdm(range(10)):
    time.sleep(1)


100%|██████████| 10/10 [00:10<00:00,  1.00s/it]


### RACE Dataset

In [12]:
class RACEDataset(Dataset):
    def __init__(self, data_dir):
        # 处理数据集，主要包含RoBERTa的分词器
        self.data = []
        # print('Load RoBERTa tokenizer.')
        self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

        # _tqdm_idx = -1
        # print('Extract data from file of .txt.')
        for root, dirs, files in os.walk(data_dir):
            # _tqdm_idx += 1
            for file in tqdm(files, desc=f"Process dataset"):
                file_path = os.path.join(root, file)
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = f.read()
                    data = json.loads(content)

                    article = data['article']
                    questions = data['questions']
                    options = data['options']
                    answers = data['answers']

                    
                    for i in range(len(questions)):
                        encoded_inputs = self.tokenizer(
                            [article + ' ' + questions[i] for _ in range(len(options[i]))],
                            options[i],
                            # add_special_tokens=True,
                            max_length=512,
                            padding='max_length',
                            truncation=True,
                            # TODO: what
                            return_tensors='pt'
                        )
                        input_ids = encoded_inputs['input_ids']
                        attention_mask = encoded_inputs['attention_mask']
                        answer = torch.tensor(ord(answers[i]) - ord('A'))

                        self.data.append((input_ids, attention_mask, answer))
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # 一个Item代表一个文章、一个问题、一个答案
        # 说明文章的每个问题，拆分成了一个对应的Item
        input_ids, attention_mask, answer = self.data[index]
        return input_ids, attention_mask, answer


In [11]:
type(ord('B') - ord('A'))

int

In [13]:
data_dir = './RACE/'
dev_dataset = RACEDataset(os.path.join(data_dir, 'dev'))


Process dataset: 0it [00:00, ?it/s]
Process dataset: 100%|██████████| 368/368 [00:06<00:00, 56.20it/s]
Process dataset: 100%|██████████| 1021/1021 [00:23<00:00, 43.66it/s]


In [14]:
dev_dataset[0][0].shape

torch.Size([4, 512])

### RoBERTa model

In [15]:
class RoBERTaClassifier(nn.Module):
    def __init__(self):
        super(RoBERTaClassifier, self).__init__()
        self.roberta = RobertaForMultipleChoice.from_pretrained('roberta-base')

    def forward(self, input_ids, attention_mask):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # TODO: what symbol
        logits = outputs.logits
        return logits


### Train process function

In [18]:
def train_model(model, train_loader, criterion, optimizer, device):
    model.train()
    # model.to(device)
    
    # TODO: where is epochs
    for input_ids, attention_mask, answer in tqdm(train_loader, desc='Train RoBERTa'):
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        answer = answer.to(device)

        # 标准化流程
        optimizer.zero_grad()
        # print(f'input_ids {input_ids.shape}')
        # print(f'attention_mask {attention_mask.shape}')

        logits = model(input_ids, attention_mask)
        print(f'Logits shape: {logits.shape}')
        loss = criterion(logits, answer)
        loss.backward()
        optimizer.step()

        print(f'Batch size loss: {loss.item()}')

        


### Evaluation process function

In [19]:
def evaluate_model(model, eval_loader, device):
    model.eval()
    # model.to(device)
    correct = 0
    total = 0

    with torch.no_grad():
        for input_ids, attention_mask, answer in tqdm(eval_loader,desc='Eval RoBERTa'):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            answer = answer.to(device)

            logits = model(input_ids, attention_mask)
            _, predicted = torch.max(logits, dim=1)
            total += answer.size(0)
            correct += (predicted == answer).sum().item()
    
    accuracy = correct / total
    return accuracy



### Main

In [20]:
# 定义超参数
device = torch.device('cuda')
data_dir = './RACE/'
batch_size = 4
num_epochs = 5
learning_rate = 2e-5

# 加载RACE数据集
# train_dataset = RACEDataset(os.path.join(data_dir, 'train'))
# print('Train dataset done')
dev_dataset = RACEDataset(os.path.join(data_dir, 'dev'))
print('Dev dataset done')

# 创建DataLoader
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# print('Train dataloader done')
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)
print('Dev dataloader done')




Process dataset: 0it [00:00, ?it/s]
Process dataset: 100%|██████████| 368/368 [00:06<00:00, 55.41it/s]
Process dataset: 100%|██████████| 1021/1021 [00:23<00:00, 43.62it/s]

Dev dataset done
Dev dataloader done





In [21]:

# 创建RoBERTa模型
model = RoBERTaClassifier().to(device)

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [22]:
# 训练和评估模型
for epoch in range(num_epochs):
    print(f'Epoch {epoch+1}/{num_epochs}')
    train_model(model, dev_loader, criterion, optimizer, device)
    accuracy = evaluate_model(model, dev_loader, device)
    print(f'Epoch {epoch+1}/{num_epochs}: Accuracy: {accuracy}')

Epoch 1/5


Train RoBERTa:   0%|          | 0/1222 [00:00<?, ?it/s]

Logits shape: torch.Size([4, 4])


Train RoBERTa:   0%|          | 1/1222 [00:01<29:12,  1.44s/it]

Batch size loss: 1.4180974960327148
Logits shape: torch.Size([4, 4])


Train RoBERTa:   0%|          | 2/1222 [00:01<16:13,  1.25it/s]

Batch size loss: 1.444307804107666
Logits shape: torch.Size([4, 4])


Train RoBERTa:   0%|          | 3/1222 [00:02<12:00,  1.69it/s]

Batch size loss: 1.3969972133636475
Logits shape: torch.Size([4, 4])


Train RoBERTa:   0%|          | 4/1222 [00:02<09:59,  2.03it/s]

Batch size loss: 1.3857412338256836
Logits shape: torch.Size([4, 4])


Train RoBERTa:   0%|          | 5/1222 [00:02<08:53,  2.28it/s]

Batch size loss: 1.3712937831878662
Logits shape: torch.Size([4, 4])


Train RoBERTa:   0%|          | 6/1222 [00:03<08:14,  2.46it/s]

Batch size loss: 1.3881642818450928
Logits shape: torch.Size([4, 4])


Train RoBERTa:   1%|          | 7/1222 [00:03<07:48,  2.59it/s]

Batch size loss: 1.4206831455230713
Logits shape: torch.Size([4, 4])


Train RoBERTa:   1%|          | 8/1222 [00:03<07:31,  2.69it/s]

Batch size loss: 1.3801727294921875
Logits shape: torch.Size([4, 4])


Train RoBERTa:   1%|          | 9/1222 [00:04<07:20,  2.76it/s]

Batch size loss: 1.377102017402649
Logits shape: torch.Size([4, 4])


Train RoBERTa:   1%|          | 10/1222 [00:04<07:12,  2.80it/s]

Batch size loss: 1.3573033809661865
Logits shape: torch.Size([4, 4])


Train RoBERTa:   1%|          | 11/1222 [00:04<07:07,  2.83it/s]

Batch size loss: 1.3728522062301636
Logits shape: torch.Size([4, 4])


Train RoBERTa:   1%|          | 12/1222 [00:05<07:04,  2.85it/s]

Batch size loss: 1.321674108505249
Logits shape: torch.Size([4, 4])


Train RoBERTa:   1%|          | 13/1222 [00:05<07:00,  2.87it/s]

Batch size loss: 1.4113768339157104
Logits shape: torch.Size([4, 4])


Train RoBERTa:   1%|          | 14/1222 [00:05<06:59,  2.88it/s]

Batch size loss: 1.3483734130859375
Logits shape: torch.Size([4, 4])


Train RoBERTa:   1%|          | 15/1222 [00:06<06:59,  2.88it/s]

Batch size loss: 1.3091610670089722
Logits shape: torch.Size([4, 4])


Train RoBERTa:   1%|▏         | 16/1222 [00:06<06:56,  2.90it/s]

Batch size loss: 1.372544288635254
Logits shape: torch.Size([4, 4])


Train RoBERTa:   1%|▏         | 17/1222 [00:06<06:55,  2.90it/s]

Batch size loss: 1.3784289360046387
Logits shape: torch.Size([4, 4])


Train RoBERTa:   1%|▏         | 18/1222 [00:07<06:56,  2.89it/s]

Batch size loss: 1.3782680034637451
Logits shape: torch.Size([4, 4])


Train RoBERTa:   2%|▏         | 19/1222 [00:07<06:55,  2.90it/s]

Batch size loss: 1.3525745868682861
Logits shape: torch.Size([4, 4])


Train RoBERTa:   2%|▏         | 20/1222 [00:07<06:52,  2.91it/s]

Batch size loss: 1.53678560256958
Logits shape: torch.Size([4, 4])


Train RoBERTa:   2%|▏         | 21/1222 [00:08<06:54,  2.90it/s]

Batch size loss: 1.3562309741973877
Logits shape: torch.Size([4, 4])


Train RoBERTa:   2%|▏         | 22/1222 [00:08<06:53,  2.90it/s]

Batch size loss: 1.178386926651001
Logits shape: torch.Size([4, 4])


Train RoBERTa:   2%|▏         | 23/1222 [00:09<06:53,  2.90it/s]

Batch size loss: 1.221478819847107
Logits shape: torch.Size([4, 4])


Train RoBERTa:   2%|▏         | 24/1222 [00:09<06:52,  2.91it/s]

Batch size loss: 1.3405234813690186
Logits shape: torch.Size([4, 4])


Train RoBERTa:   2%|▏         | 25/1222 [00:09<06:51,  2.91it/s]

Batch size loss: 1.6706300973892212
Logits shape: torch.Size([4, 4])


Train RoBERTa:   2%|▏         | 26/1222 [00:10<06:51,  2.91it/s]

Batch size loss: 1.564473271369934
Logits shape: torch.Size([4, 4])


Train RoBERTa:   2%|▏         | 27/1222 [00:10<06:52,  2.90it/s]

Batch size loss: 1.616933822631836
Logits shape: torch.Size([4, 4])


Train RoBERTa:   2%|▏         | 28/1222 [00:10<06:49,  2.91it/s]

Batch size loss: 1.5064396858215332
Logits shape: torch.Size([4, 4])


Train RoBERTa:   2%|▏         | 28/1222 [00:11<07:51,  2.53it/s]


KeyboardInterrupt: 

In [13]:
input_ids = train_dataset[100][0].unsqueeze(0).to(device)
attention_masks = train_dataset[100][1].unsqueeze(0).to(device)


RuntimeError: shape '[-1, 512]' is invalid for input of size 1

In [18]:
attention_masks.shape

torch.Size([1, 512])

In [16]:
logits = model(input_ids, attention_masks)

RuntimeError: shape '[-1, 512]' is invalid for input of size 1

In [24]:
train_dataset[100][0]

tensor([    0,  3762,   325, 10226,     8,   664,  3362,   198,     5,   232,
           32,    11,     5,   652,     9,  2086,    49,  1576,    30,  6288,
            7,  7337,   930,     4,   152,    16,   309,     7,     5,   623,
         1309,  6481,     4, 50118, 41026,   383,    64,   146,    82,  1372,
            8,   455,     9,  1007,   101,   205,   930,     4,  1876,    82,
          679, 25731,    16,   357,   114,    47,    32,  6288,     7,  3152,
            8,  3825,     4,   125,   114,    47,   269,  4161,     7,     5,
          930, 24120,     6,   190,   269,   205,   930,     6,    24,    64,
         2581,   110,  1576,  7340,     4, 50118,  1106,    10,   621,  1239,
           10, 15604,     7,   213,    31,    65,   317,     7,     5,    97,
           13,   457,    41,  1946,    11,     5,   662,     8,    10,   457,
           41,  1946,    11,     5,  1559,     6,     8,   358,   183,    34,
            7, 18134,    15,    39,  6086,  2187,   142,    89, 