In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m91.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m70.7 MB/s[0m eta [36m0:00:0

In [57]:
import json
import torch
from torch.utils.data import Dataset


class myDataset(Dataset):
  def __init__(self, data_path, tokenizer) -> None:
    super().__init__()
    self.json_data = []
    with open(data_path, 'r') as f:
        for line in f:
            self.json_data.append(json.loads(line))

    # special token = ['SPEC'] 을 추가한 부분은 구현되어 있음
    special_tokens_dict = {'additional_special_tokens': ['[SPEC]']}
    self.tokenizer = tokenizer
    self.tokenizer.add_special_tokens(special_tokens_dict)

  def __len__(self):
    return len(self.json_data)

  def __getitem__(self, index):
    data=self.json_data[index]
    answerKey = ord(data["answerKey"])-65 # [A, B, C, D, E]의 answerKey를 [0, 1, 2, 3, 4]로 변환
    question = data["question"]["stem"]
    choicesText=[]
    for i in range(len(data["question"]["choices"])):
        choicesText.append(data["question"]["choices"][i]["text"])

    # Tokenize
    '''
    배포된 자료를 참고하여 input의 형식을 구성하고
    tokenizer를 이용하여 tokenize (max_len=100)
    *** question과 choice에 대한 text, 그리고 special token "[SPEC]"을 모두 사용하여
    하나의 string을 만든 뒤에 그것을 tokenizer에 넣는 것임을 기억하기 ***
    '''
    string = question
    for choice in choicesText:
      string += ' [SPEC] ' + choice
    input_data = self.tokenizer(string, max_length=100, padding='max_length', truncation=True)

    # Conver to tensor
    input_ids=torch.IntTensor(input_data["input_ids"])
    token_type_ids=torch.IntTensor(input_data["token_type_ids"])
    attention_mask=torch.IntTensor(input_data["attention_mask"])

    # Store the index(position) of [SPEC] tokens
    spec_token_id=self.tokenizer.convert_tokens_to_ids("[SPEC]")
    spec_tokens_index = list(filter(lambda x: input_ids[x] == spec_token_id, range(len(input_ids))))
    spec_tokens_index = torch.LongTensor(spec_tokens_index)

    target=answerKey

    return input_ids, token_type_ids, attention_mask, spec_tokens_index, target

In [101]:
import torch
from torch import nn
from transformers import BertModel


class myModel(nn.Module):
    def __init__(self, tokenizer) -> None:
        super().__init__()
        self.bert =  BertModel.from_pretrained('bert-base-uncased')
        self.bert.resize_token_embeddings(len(tokenizer))
        self.linear = nn.Linear(768, 1)

    def forward(self, input_ids, token_type_ids, attention_mask, spec_tokens_index):
        # bert model에 입력하여 output 도출
        output = self.bert(input_ids, token_type_ids, attention_mask)

        '''
        이중 for문에 대한 설명:
        special token의 위치가 batch안의 한 데이터마다 모두 다르기 때문에
        해당하는 위치(special token의 position)의 값만 가져오는 부분
        (special token의 위치가 batch안의 한 데이터마다 다른 이유는 주어진 question과 answer를 tokenize 했을 때 몇 개의 token으로 tokenize되는지, 그 길이가 다르기 때문)

        i는 batch 안의 한 데이터에 접근하기 위함이고
        j는 dataset에서 넘겨준 special token의 위치(index)를 한개씩 가져오기 위함
        output의 last hidden state에서 각 special token index에 해당하는 값들을 logits list에 append (logits의 shape는 (batch_size*5, dim)이 됨)
        '''
        logits=[]
        for i in range(input_ids.shape[0]): # batch 1개씩
            for j in range(len(spec_tokens_index[i])): # choice 1개씩
                logits.append(output.last_hidden_state[i,spec_tokens_index[i][j],:])
        logits=torch.stack(logits)

        # batch processing을 위해 [batch_size*5, dim]의 logits tensor를 한꺼번에 linear에 통과시킴
        output = self.linear(logits)

        # shape이 [batch_size, 5, dim]이 되도록 reshape
        output = output.reshape(-1, 5, 1)

        return output

In [127]:
import torch
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

# from dataset import myDataset
# from model import myModel


# Dataset
train_data_path = "/content/drive/MyDrive/nlp-open-tutorial/과제/MultipleChoiceQA_blank/data/train_rand_split.jsonl" # train dataset의 경로 입력
test_data_path = "/content/drive/MyDrive/nlp-open-tutorial/과제/MultipleChoiceQA_blank/data/dev_rand_split.jsonl" # test dataset의 경로 입력

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
train_dataset = myDataset(train_data_path, tokenizer)
test_dataset = myDataset(test_data_path, tokenizer)

batch_size = 16 # Out of memory error가 뜬다면 batch size를 줄여서 다시 실행시켜보기
train_dataloader = DataLoader(train_dataset, batch_size = batch_size, drop_last=True)
test_dataloader = DataLoader(test_dataset, batch_size = batch_size, drop_last=True)

# Model
model = myModel(tokenizer).cuda()

# Optimizer/Loss function
optimizer = Adam(model.parameters(), lr=0.00001)
lf = CrossEntropyLoss()

# Train 10 epoch
for e in range(10):
  print("\nepoch ", e)
  epoch_loss = 0
  train_correct = 0

  model.train()

  for batch in tqdm(train_dataloader):
    optimizer.zero_grad()

    input_ids, token_type_ids, attention_mask, spec_tokens_index, target = batch
    input_ids = input_ids.cuda()
    token_type_ids = token_type_ids.cuda()
    attention_mask = attention_mask.cuda()
    spec_tokens_index = spec_tokens_index.cuda()
    target = target.cuda()

    output = model(input_ids, token_type_ids, attention_mask, spec_tokens_index)
    pred_label = torch.argmax(output, dim=1)
    train_correct += sum(pred_label == target.reshape(-1,1))

    loss = lf(output, target.reshape(-1, 1))

    loss.backward()

    optimizer.step()

    epoch_loss += loss.item()

  print("train loss", epoch_loss/len(train_dataloader))
  print("train acc", train_correct/len(train_dataset))

  # Test at every epoch
  test_loss = 0
  test_correct = 0

  model.eval()
  with torch.no_grad():
      for batch in tqdm(test_dataloader):
        input_ids, token_type_ids, attention_mask, spec_tokens_index, target = batch
        input_ids = input_ids.cuda()
        token_type_ids = token_type_ids.cuda()
        attention_mask = attention_mask.cuda()
        spec_tokens_index = spec_tokens_index.cuda()
        target = target.cuda()

        output = model(input_ids, token_type_ids, attention_mask, spec_tokens_index)
        pred_label = torch.argmax(output, dim=1)
        test_correct += sum(pred_label == target.reshape(-1,1))

        loss = lf(output, target.reshape(-1, 1))

        test_loss += loss.item()

  print("test loss", test_loss/len(test_dataloader))
  print("test acc", test_correct/len(test_dataset))

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embeding dimension will be 28997. This might induce some performance reduction as *Tensor Cores* will not be available. For more details  about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc



epoch  0


100%|██████████| 608/608 [03:12<00:00,  3.15it/s]


train loss 1.611810151291521
train acc tensor([0.1987], device='cuda:0')


100%|██████████| 76/76 [00:07<00:00,  9.72it/s]


test loss 1.6094292292469425
test acc tensor([0.1990], device='cuda:0')

epoch  1


100%|██████████| 608/608 [03:10<00:00,  3.19it/s]


train loss 1.6132166801314605
train acc tensor([0.2032], device='cuda:0')


100%|██████████| 76/76 [00:08<00:00,  9.10it/s]


test loss 1.6094363300423873
test acc tensor([0.2080], device='cuda:0')

epoch  2


100%|██████████| 608/608 [03:10<00:00,  3.19it/s]


train loss 1.6111124706895728
train acc tensor([0.1959], device='cuda:0')


100%|██████████| 76/76 [00:08<00:00,  9.34it/s]


test loss 1.6094382671933425
test acc tensor([0.2105], device='cuda:0')

epoch  3


100%|██████████| 608/608 [03:10<00:00,  3.19it/s]


train loss 1.6108104706202682
train acc tensor([0.2029], device='cuda:0')


100%|██████████| 76/76 [00:07<00:00,  9.77it/s]


test loss 1.6094393918388767
test acc tensor([0.2088], device='cuda:0')

epoch  4


100%|██████████| 608/608 [03:10<00:00,  3.19it/s]


train loss 1.6123395481784093
train acc tensor([0.1982], device='cuda:0')


100%|██████████| 76/76 [00:08<00:00,  9.46it/s]


test loss 1.6094387330506976
test acc tensor([0.1974], device='cuda:0')

epoch  5


100%|██████████| 608/608 [03:10<00:00,  3.19it/s]


train loss 1.6104097015371448
train acc tensor([0.1992], device='cuda:0')


100%|██████████| 76/76 [00:08<00:00,  9.07it/s]


test loss 1.6094378640777187
test acc tensor([0.2064], device='cuda:0')

epoch  6


100%|██████████| 608/608 [03:09<00:00,  3.20it/s]


train loss 1.6098372610776048
train acc tensor([0.1984], device='cuda:0')


100%|██████████| 76/76 [00:08<00:00,  9.11it/s]


test loss 1.6094375691915814
test acc tensor([0.2064], device='cuda:0')

epoch  7


100%|██████████| 608/608 [03:10<00:00,  3.19it/s]


train loss 1.6099116739473844
train acc tensor([0.2052], device='cuda:0')


100%|██████████| 76/76 [00:08<00:00,  8.55it/s]


test loss 1.609437854666459
test acc tensor([0.2105], device='cuda:0')

epoch  8


100%|██████████| 608/608 [03:10<00:00,  3.19it/s]


train loss 1.6102877714132007
train acc tensor([0.2033], device='cuda:0')


100%|██████████| 76/76 [00:08<00:00,  8.56it/s]


test loss 1.6094379519161426
test acc tensor([0.2048], device='cuda:0')

epoch  9


100%|██████████| 608/608 [03:09<00:00,  3.20it/s]


train loss 1.6097230211292441
train acc tensor([0.1985], device='cuda:0')


100%|██████████| 76/76 [00:08<00:00,  8.79it/s]

test loss 1.6094381495525962
test acc tensor([0.2080], device='cuda:0')



