<a href="https://colab.research.google.com/github/highway92/machine_learning/blob/main/year_dream/file1_PretrainingDataProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Transformer & BERT 실습

- 기존에 우리는 nn.Transformer 혹은 transoformers 라이브러리를 이용해 transformer 모델과 BERT를 구현했었습니다.
- 이번에는 직접 내부에 구현되는 것들을 하나하나 코드를 작성해보며 이해해봅시다.
- 오늘 진행할 과정은 아래의 5가지로 이루어져있습니다.
  - BERT 실습 (Masked LM, NSP)
    1. 데이터 전처리
    2. model 구현 
    3. BERT 모델 pretraining
  - Transformer 실습 (Text 번역)
    1. transformer 모델 짜기
    2. transformer 실습

In [1]:
!pip install tokenizers

Collecting tokenizers
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 16.8 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.11.4


In [2]:
# 데이터 저장 및 불러오기 관련 라이브러리
import json 
 
# 파이토치 관련 라이브러리
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchtext

# string 데이터 처리 라이브러리
from tokenizers import Tokenizer
from tokenizers.models import BPE # byte-pair encoding
from tokenizers.trainers import BpeTrainer
from spacy.lang.en import English

# iteration visualization 라이브러리
from tqdm.auto import tqdm

# 수학 관련 라이브러리
import numpy as np

In [3]:
dataset = torchtext.datasets.WikiText2(root='.data', split='train')

100%|██████████| 4.48M/4.48M [00:00<00:00, 39.9MB/s]


In [4]:
dataset = [d.strip() for d in dataset if len(d)>10] # 길이가 짧아 무의미한 데이터는 제외하고, strip을 통해 데이터를 정리해주겠습니다.

In [5]:
# dataset 예시 
dataset[1000]

"The album was completed on November 2013 , and a bonus disc was also made for the album , containing the leftover material from the main album as well as songs from Ghost 2 , the unreleased compilation of leftover tracks from Ghost . Originally in 2012 , Townsend stated that this album will be the sixth and the last album in the Devin Townsend Project series , but he ultimately confirmed that Casualties of Cool is its own project . Townsend also started a <unk> campaign through <unk> to support the release of the album . The funding quickly reached its goal , and all additional funds were put directly to Townsend 's upcoming projects . Casualties of Cool was released on May 14 , 2014 . The album was re @-@ issued worldwide on January 15 , 2016 containing an additional DVD with live footage from the 2014 concert at the Union Chapel in London ."

In [6]:
len(dataset)

23627

In [7]:
# 문장 단위로 잘라주는 라이브러리
# next sentence prediction을 하기위해 한 문장씩 잘라줍니다.
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
doc = nlp(dataset[1000])


In [8]:
# 문장 단위로 잘려진 예시
sentences = [sent.string.strip() for sent in doc.sents]
sentences

['The album was completed on November 2013 , and a bonus disc was also made for the album , containing the leftover material from the main album as well as songs from Ghost 2 , the unreleased compilation of leftover tracks from Ghost .',
 'Originally in 2012 , Townsend stated that this album will be the sixth and the last album in the Devin Townsend Project series , but he ultimately confirmed that Casualties of Cool is its own project .',
 'Townsend also started a <unk> campaign through <unk> to support the release of the album .',
 "The funding quickly reached its goal , and all additional funds were put directly to Townsend 's upcoming projects .",
 'Casualties of Cool was released on May 14 , 2014 .',
 'The album was re @-@ issued worldwide on January 15 , 2016 containing an additional DVD with live footage from the 2014 concert at the Union Chapel in London .']

In [9]:
# 모든 데이터를 sentence 별로 분리
data=[]
data_length = 0
data_index = {}
sentence_lengths = []

for i,context in enumerate(tqdm(dataset)):
    doc = nlp(context)
    sentences = [sent.string.strip() for sent in doc.sents]
    if len(sentences)<=3: # positive, negative pair 를 모두 얻을 수 없는 경우에 제외
        continue
        # 바로 다음 문장 존재해야 next sentence prediction이 가능하기 때문에 아래의 코드에서 -1 이 붙어있다.
    data.append(sentences)
    for j in range(len(sentences)-1): # positive pair가 없는 마지막 문장 제외
        data_index[j+data_length] = (len(data)-1,j) 
    data_length += len(sentences)-1
    sentence_lengths.append(len(sentences))

  0%|          | 0/23627 [00:00<?, ?it/s]

In [10]:
data_length

52523

In [11]:
data_index

{0: (0, 0),
 1: (0, 1),
 2: (0, 2),
 3: (0, 3),
 4: (1, 0),
 5: (1, 1),
 6: (1, 2),
 7: (1, 3),
 8: (2, 0),
 9: (2, 1),
 10: (2, 2),
 11: (2, 3),
 12: (2, 4),
 13: (3, 0),
 14: (3, 1),
 15: (3, 2),
 16: (3, 3),
 17: (3, 4),
 18: (3, 5),
 19: (3, 6),
 20: (4, 0),
 21: (4, 1),
 22: (4, 2),
 23: (4, 3),
 24: (4, 4),
 25: (4, 5),
 26: (4, 6),
 27: (4, 7),
 28: (4, 8),
 29: (5, 0),
 30: (5, 1),
 31: (5, 2),
 32: (6, 0),
 33: (6, 1),
 34: (6, 2),
 35: (6, 3),
 36: (7, 0),
 37: (7, 1),
 38: (7, 2),
 39: (7, 3),
 40: (7, 4),
 41: (8, 0),
 42: (8, 1),
 43: (8, 2),
 44: (8, 3),
 45: (8, 4),
 46: (9, 0),
 47: (9, 1),
 48: (9, 2),
 49: (9, 3),
 50: (9, 4),
 51: (9, 5),
 52: (9, 6),
 53: (10, 0),
 54: (10, 1),
 55: (10, 2),
 56: (10, 3),
 57: (10, 4),
 58: (10, 5),
 59: (10, 6),
 60: (10, 7),
 61: (10, 8),
 62: (10, 9),
 63: (11, 0),
 64: (11, 1),
 65: (11, 2),
 66: (11, 3),
 67: (11, 4),
 68: (11, 5),
 69: (11, 6),
 70: (11, 7),
 71: (11, 8),
 72: (11, 9),
 73: (11, 10),
 74: (12, 0),
 75: (12, 1)

In [12]:
# Next sentence prediction 을 위해 positive sample 과 negative sample 을 추출하는 코드

index = 125 #예시

def get_negative_sample(index):
  # 본인, 다음 문장이 아닌 문장 뽑기
    i, j = data_index[index] # i번째 문단, j번째 문장
    max_j = sentence_lengths[i] # 문단 안에 문장이 몇개 있는지
    while True:
        random_j = np.random.randint(0,max_j) # randint를 통해서 랜덤문장 인덱스
        if random_j != (j+1) and random_j != j : # 만약, random index가 자기자신도 아니고, 다음 문장도 아니면 -> 통과
            break
    return data[i][random_j]

def get_positive_sample(index):
    i, j = data_index[index]
    return data[i][j+1]

i,j = data_index[index]

sentence_a = data[i][j]
sentence_b = get_positive_sample(index)
sentence_c = get_negative_sample(index)

print(sentence_a, '\n\n')
print(sentence_b,'\n\n') # positive sample은 sentence_a의 바로 다음 문장으로 고정되어야 함
print(sentence_c) # negative sample은 sentence_a 와 같은 context안에 존재하는 sentence_a, sentence_b 가 아닌 문장



John <unk> Walker , a builder for the Federal Government , supervised the construction . 


Originally $ 14 @,@ 000 was allocated for the construction of the arsenal , but proved inadequate . 


Being originally constructed to store ammunition , the building was designed with 3 @-@ foot @-@ thick ( 0 @.@ 91 m ) exterior walls .


In [13]:
# tokenizer 학습을 위해 데이터를 저장해둡시다.
save_data = '\n'.join(['\n'.join(d) for d in data])
with open('data.txt','w') as f:
  f.write(save_data)

In [14]:
save_data



In [15]:
tokenizer = Tokenizer(BPE(unk_token="<unk>"))
trainer = BpeTrainer(special_tokens=["[PAD]", "<unk>", "[CLS]", "[SEP]", "[MASK]"])


files = ['data.txt']
tokenizer.train(files, trainer)

In [16]:
tokenizer.get_vocab()# vocabulary 단어 예시

{'olph ': 22447,
 'body and ': 17508,
 'Jupiter ': 10919,
 'ir , ': 15998,
 'However , the ': 4186,
 'kidn': 17681,
 'Run ': 10107,
 'Asomtav': 16460,
 'separately ': 21790,
 'Finn': 23551,
 '1 @,@ 400 ': 24325,
 'monopol': 27129,
 'Hard ': 20135,
 'Iguanodon ': 9287,
 'stay ': 4318,
 'would become ': 16665,
 'the <unk> of the ': 15656,
 'abundance of ': 23368,
 'eding ': 3319,
 'estate ': 10804,
 'joint ': 10084,
 'compilation ': 17617,
 's : ': 25042,
 'Polka Party ': 26534,
 'terrorist ': 19129,
 'home and ': 24183,
 'ed over ': 7015,
 'message ': 9276,
 'divisional ': 24679,
 'tow': 1406,
 'know , ': 28067,
 'tic , ': 7868,
 'ound': 1527,
 'tail': 10757,
 'iTun': 17766,
 'Scratch': 24520,
 '" , ': 753,
 'shocked ': 25296,
 '250 @,@ 000 ': 24892,
 'mounts ': 18131,
 'Charlie ': 9559,
 'described the ': 4301,
 ',': 18,
 'incredi': 13925,
 'contr': 16600,
 'was directed by ': 28796,
 'copy': 29672,
 'known as the " ': 14380,
 'working ': 2767,
 'suggest that ': 24614,
 'ained ': 2496,

In [17]:
# 문장 i go home.

# -> tokenizer(단순처리) ->
# token 'i' 'go' 'home'

In [18]:
# token화 적용 예시
tokens_a = tokenizer.encode(sentence_a).tokens
tokens_b = tokenizer.encode(sentence_b).tokens

print(tokens_a,'\n\n')
print(tokens_b)

['John ', '<unk>', ' ', 'Wal', 'k', 'er , a ', 'buil', 'der ', 'for the ', 'Federal ', 'Government ', ', ', 'supervis', 'ed the ', 'construction ', '.'] 


['Originally ', '$ ', '14 @,@ 000 ', 'was ', 'allocated ', 'for the ', 'construction of the ', 'arsen', 'al , but ', 'proved ', 'in', 'adequate ', '.']


In [35]:
# token의 최대 길이가 사전에 정해진 length보다 크지 않도록 조절함
# [CLS], [SEP], [SEP] 의 3개 token 이 추가되므로 최대 길이보다 3 이상 작아야 함
# 두개의 문장이 합쳐져서 input이 되므로 둘 중 긴 것을 잘라내는 방식으로 길이를 줄임
max_len = 128

# -3  for special tokens [CLS], [SEP], [SEP]
while len(tokens_a)+len(tokens_b) > max_len-3:
    if len(tokens_a)>len(tokens_b) : 
        tokens_a = tokens_a[:-1] # 마지막 토큰 제거
    else:
        tokens_b = tokens_b[:-1] # 마지막 토큰 제거

In [20]:
tokens = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]'] # input tokens
segment_ids = [0]*(len(tokens_a)+2) + [1]*(len(tokens_b)+1) # 뒷 문장의 위치를 표시한 변수
input_mask = [1]*len(tokens) # 향후 padding 한 부분을 제거해주기 위해 input 길이 만큼을 1로 표시한 변수

In [21]:
tokens

['[CLS]',
 'John ',
 '<unk>',
 ' ',
 'Wal',
 'k',
 'er , a ',
 'buil',
 'der ',
 'for the ',
 'Federal ',
 'Government ',
 ', ',
 'supervis',
 'ed the ',
 'construction ',
 '.',
 '[SEP]',
 'Originally ',
 '$ ',
 '14 @,@ 000 ',
 'was ',
 'allocated ',
 'for the ',
 'construction of the ',
 'arsen',
 'al , but ',
 'proved ',
 'in',
 'adequate ',
 '.',
 '[SEP]']

In [22]:
print(tokens)
print(segment_ids)
print(input_mask)

['[CLS]', 'John ', '<unk>', ' ', 'Wal', 'k', 'er , a ', 'buil', 'der ', 'for the ', 'Federal ', 'Government ', ', ', 'supervis', 'ed the ', 'construction ', '.', '[SEP]', 'Originally ', '$ ', '14 @,@ 000 ', 'was ', 'allocated ', 'for the ', 'construction of the ', 'arsen', 'al , but ', 'proved ', 'in', 'adequate ', '.', '[SEP]']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [23]:
# hyperparameters
mask_prob = 0.15 # mask 로 바꿀 token의 전체 길이 대비 비율
max_pred = 20 # mask 혹은 다른 단어로 바뀔 수 있는 최대 token 갯수

In [24]:
#masked LM 을 위한 preprocessing


# mask 혹은 다른 단어로 바꿀 token 갯수
n_pred = min(max_pred, max(1, int(round(len(tokens)*mask_prob)))) 

print(n_pred)

5


In [25]:
# [CLS], [SEP] 을 제외한 나머지 토큰 중 n_pred 개 만큼을 선택
candidate_position = [i for i, token in enumerate(tokens) if token != '[CLS]' and token != '[SEP]']
np.random.shuffle(candidate_position)
candidate_position = candidate_position[:n_pred]

print(candidate_position)

[23, 30, 9, 28, 29]


In [26]:
vocab_words = list(tokenizer.get_vocab().keys())

In [27]:
# 토큰 후보들을 80% 확률로 [MASK]로 바꾸고, 나머지를 10% 확률로 다른 단어로 바꿈
masked_tokens, masked_pos = [], []
for pos in candidate_position:
    masked_tokens.append(tokens[pos])
    masked_pos.append(pos)
    if np.random.random() < 0.8: # 80% random 0 ~ 1 사이의 uniform sampling
        tokens[pos] = '[MASK]'
    elif np.random.random() < 0.5: # 10%
        random_word = vocab_words[np.random.randint(0, len(vocab_words)-1)]
        tokens[pos] = random_word

In [28]:
# mask를 통해 바뀐 문장
print(' '.join(tokens))

[CLS] John  <unk>   Wal k er , a  buil der  for the  Federal  Government  ,  supervis ed the  construction  . [SEP] Originally  $  14 @,@ 000  was  allocated  for the  construction of the  arsen al , but  proved  [MASK] adequate  [MASK] [SEP]


In [29]:
# mask 된 단어들
print(masked_tokens)

['for the ', '.', 'for the ', 'in', 'adequate ']


In [30]:
# n_pred 갯수만큼만 loss를 흘려주기 위해 mask 갯수만큼 1을 표시한 변수 정의
masked_weights = [1]*len(masked_tokens)

In [31]:
masked_weights

[1, 1, 1, 1, 1]

In [32]:
# token to index
# 왜냐하면 word embedding 등 다른 걸 할때도 다 index를 사용하니까~

class TokenIndexer():
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    
    def __call__(self, tokens):
        return [self.tokenizer.token_to_id(token) for token in tokens]
# token_to_id는 token을 특정 id로 바꿔준다.
    def convert_ids_to_tokens(self, ids):
        return [self.tokenizer.id_to_token(id) for id in ids]
# id_to_token은 id를 token으로 바꿔준다.
token_indexer = TokenIndexer(tokenizer)

In [36]:
# Token Indexing
input_ids = token_indexer(tokens) # BERT input
masked_ids = token_indexer(masked_tokens) # MLM label

# Zero Padding
n_pad = max_len - len(input_ids)
input_ids.extend([0]*n_pad)
segment_ids.extend([0]*n_pad)
input_mask.extend([0]*n_pad)

# Zero Padding for masked target
if max_pred > n_pred:
    n_pad = max_pred - n_pred
    masked_ids.extend([0]*n_pad)
    masked_pos.extend([0]*n_pad)
    masked_weights.extend([0]*n_pad)

In [None]:
# token indexing , padding 예시
print(' '.join(tokens),'\n') # 처리 전
print(input_ids) # 처리 후

[CLS] The  item  was  intended  simply  as a  piece of  new [MASK] tel eg raph  lines  quickly  spread  [MASK] news  Monten state ,  <unk>   proc ession  sentim [MASK] . [SEP] The  rum or was  interpre ted by  some  <unk>   ess  call  from the  governor  to  assemble  to help  exp [MASK] federal  troops  [MASK] arsen al  . [SEP] 

[1, 322, 13802, 312, 7224, 4852, 622, 12697, 1891, 4, 1435, 729, 14794, 4875, 3432, 5509, 4, 12670, 12648, 14718, 0, 6, 1855, 5234, 10647, 4, 20, 2, 322, 9146, 21420, 5457, 1766, 885, 0, 6, 1021, 4636, 584, 10842, 286, 24593, 11987, 1365, 4, 5816, 4886, 4, 17121, 307, 20, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [37]:
# 위의 전처리 과정을 모두 포함한 pytorch data iterator 생성
class SentencePairDataset(Dataset):
    def __init__(self):
        # hyper parameters
        self.max_len = 128
        self.mask_prob = 0.15
        self.max_pred = 20 

        labels = ("0", "1")

        # data 다운로드 및 전처리
        dataset = torchtext.datasets.WikiText2(root='.data', split='train')
        dataset = [d.strip() for d in dataset if len(d)>10] # 길이가 짧아 무의미한 데이터는 제외하고, strip을 통해 데이터를 정리해주겠습니다.
        
        
        # data 처리
        nlp = English()
        nlp.add_pipe(nlp.create_pipe('sentencizer'))
        doc = nlp(dataset[0])
        self.data=[]
        self.data_length = 0
        self.data_index = {}
        self.sentence_lengths = []
        for i,context in enumerate(tqdm(dataset)):
            doc = nlp(context)
            sentences = [sent.string.strip() for sent in doc.sents]
            if len(sentences)<=3: # can not obtain positive, negative pair in dataset
                continue
            self.data.append(sentences)
            for j in range(len(sentences)-1): # drop last sentence (no positive pair)
                self.data_index[j+self.data_length] = (len(self.data)-1,j)
            self.data_length += len(sentences)-1
            self.sentence_lengths.append(len(sentences))

        
        
        
        # tokenizer
        save_data = '\n'.join(['\n'.join(d) for d in self.data])
        with open('data.txt','w') as f:
          f.write(save_data)
        self.tokenizer = Tokenizer(BPE(unk_token="<unk>"))
        trainer = BpeTrainer(special_tokens=["[PAD]", "<unk>", "[CLS]", "[SEP]", "[MASK]"])
        files = ['data.txt']
        self.tokenizer.train(files, trainer)
        self.vocab_words = list(tokenizer.get_vocab().keys())
        self.token_indexer = TokenIndexer(self.tokenizer)


    def get_negative_sample(self, index):
        i, j = self.data_index[index]
        max_j = self.sentence_lengths[i]
        while True:
            random_j = np.random.randint(0,max_j)
            if random_j != (j+1) and random_j != j :
                break
        i, new_j = self.data_index[index+(random_j-j)]
        return self.data[i][new_j]

    def get_positive_sample(self, index):
        i, j = self.data_index[index]
        return self.data[i][j+1]
    
    
    def process_instance(self, instance):
        # -3  for special tokens [CLS], [SEP], [SEP]
        is_next, tokens_a, tokens_b = instance
        while len(tokens_a)+len(tokens_b) > self.max_len-3:
            if len(tokens_a)>len(tokens_b) : 
                tokens_a = tokens_a[:-1]
            else:
                tokens_b = tokens_b[:-1]
        
        tokens = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
        segment_ids = [0]*(len(tokens_a)+2) + [1]*(len(tokens_b)+1)
        input_mask = [1]*len(tokens)
        
        #masked LM
        masked_tokens, masked_pos = [], []
        n_pred = min(self.max_pred, max(1, int(round(len(tokens)*self.mask_prob))))


        candidate_position = [i for i, token in enumerate(tokens) if token != '[CLS]' and token != '[SEP]']
        np.random.shuffle(candidate_position)

        for pos in candidate_position[:n_pred]:
            masked_tokens.append(tokens[pos])
            masked_pos.append(pos)
            if np.random.random() < 0.8: # 80%
                tokens[pos] = '[MASK]'
            elif np.random.random() < 0.5: # 10%
                random_word = self.vocab_words[np.random.randint(0, len(self.vocab_words)-1)]
                tokens[pos] = random_word

        # when n_pred < max_pred, we only calculate loss within n_pred
        masked_weights = [1]*len(masked_tokens)

        # Token Indexing
        input_ids = self.token_indexer(tokens)
        masked_ids = self.token_indexer(masked_tokens)

        # Zero Padding
        n_pad = self.max_len - len(input_ids)
        input_ids.extend([0]*n_pad)
        segment_ids.extend([0]*n_pad)
        input_mask.extend([0]*n_pad)

        # Zero Padding for masked target
        if self.max_pred > n_pred:
            n_pad = self.max_pred - n_pred
            masked_ids.extend([0]*n_pad)
            masked_pos.extend([0]*n_pad)
            masked_weights.extend([0]*n_pad)
        
        
        return (input_ids, segment_ids, input_mask, masked_ids, masked_pos, masked_weights, is_next)
    
    

    def __getitem__(self, index):
        i,j = self.data_index[index]
        sentence_a = self.data[i][j]
        token_a = self.tokenizer.encode(sentence_a).tokens
        
        is_next = np.random.random()
        if is_next < 0.5:
            is_next = 0
            sentence_b = self.get_negative_sample(index)
        else:
            is_next = 1
            sentence_b = self.get_positive_sample(index)
        token_b = self.tokenizer.encode(sentence_b).tokens
            
        instance = (is_next, token_a, token_b)
        input_ids, segment_ids, input_mask, masked_ids, masked_pos, masked_weights, is_next = self.process_instance(instance)
        
        input_ids = torch.tensor(input_ids).long()
        segment_ids = torch.tensor(segment_ids).long()
        input_mask = torch.tensor(input_mask).long()
        masked_ids = torch.tensor(masked_ids).long()
        masked_pos = torch.tensor(masked_pos).long()
        masked_weights = torch.tensor(masked_weights).long()
        is_next = torch.tensor(is_next).long()
        
        
        return input_ids, segment_ids, input_mask, masked_ids, masked_pos, masked_weights, is_next
    
    def __len__(self):
        return self.data_length

In [38]:
dataset = SentencePairDataset()

  0%|          | 0/23627 [00:00<?, ?it/s]

In [39]:
item = dataset[0]
item

(tensor([    2,    57,  3494,   155,     6, 19249,  9126,     4,     1, 26917,
         10707,   362,  2953,   587,   245,   243,   230,   241,   231,   240,
           233,   238,   239,   232,  1261, 22487,    20,     3,  9126,   325,
         27451,  1419,     4,   738, 29342, 15740, 28244,  6283, 11573,   914,
             4, 22837,  7152, 11532, 26626,  1210,  7636,    70,    20,     3,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [40]:
dataloader = DataLoader(dataset, batch_size = 5)

In [41]:
for item in dataloader:
    break

In [42]:
item

[tensor([[    2,    57,  3494,   155,     4,   942,  9126, 20433,     1,     6,
          10707,     4,  2953,     4,   245,   243,   230,   241,   231,   240,
            233,     4,   239,   232,  1261, 22487,    20,     3,  5018,  1007,
           2079,  2895,  1187, 10833,   658,  5449,  8671,  1132,  4159,   283,
              4,  1520,    20,     3,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,   

In [None]:
token_indexer.convert_ids_to_tokens([    1,    57,  3518,   155,     6,   943, 10716, 20433,     0,     6,
         12181,   362,     4,   588,   245,   243,   230,   241,   231,   240,
           233,   238,   239,   232,     4,  1761,    20,     2,  5063,  1007,
          2085,  2907,     4, 19454,   658,  5499,  8778,  1132,  4189,   283,
             4,     4,    20,     2,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0])

['[CLS]',
 'S',
 'enj',
 'ō',
 ' ',
 'no ',
 'Valkyria ',
 '3 : ',
 '<unk>',
 ' ',
 'Chronicles ',
 '( ',
 '[MASK]',
 ': ',
 '戦',
 '場',
 'の',
 'ヴ',
 'ァ',
 'ル',
 'キ',
 'ュ',
 'リ',
 'ア',
 '[MASK]',
 'defe',
 '.',
 '[SEP]',
 'The game ',
 'began ',
 'development ',
 'in 201',
 '[MASK]',
 'ranks ',
 'over ',
 'a large ',
 'portion of the ',
 'work ',
 'done ',
 'on ',
 '[MASK]',
 '[MASK]',
 '.',
 '[SEP]',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk

TypeError: ignored