In [45]:
# https://github.com/GyuminJack/torchstudy/blob/main/06Jun/NER/src/data.py

import linecache
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import BertTokenizerFast
from torch.nn.utils.rnn import pad_sequence

def load_tokenizer(tokenizer_path):
    """Load a cased ``BertTokenizerFast`` from ``tokenizer_path``.

    Args:
        tokenizer_path: Value forwarded unchanged to
            ``BertTokenizerFast.from_pretrained``.

    Returns:
        The tokenizer instance returned by ``from_pretrained``, requested
        with lower-casing and accent stripping disabled.
    """
    # A cased model needs both normalisation steps turned off. The keyword
    # BertTokenizerFast understands is ``do_lower_case``; ``lowercase`` is the
    # spelling used by the standalone ``tokenizers`` package and is not a
    # BertTokenizerFast parameter, so it did not disable lower-casing here.
    return BertTokenizerFast.from_pretrained(
        tokenizer_path,
        strip_accents=False,
        do_lower_case=False,
    )

class KlueDataset_NER(Dataset):
    """Line-oriented NER dataset that yields fixed-length BERT-style samples.

    Every line of ``txt_path`` must look like ``<text>\\t<tag>,<tag>,...``
    with exactly one BIO tag per token the tokenizer produces for ``<text>``
    (special tokens excluded). Lines are read lazily through ``linecache``.

    Args:
        vocab_txt_path: Tokenizer location, forwarded to ``load_tokenizer``.
        txt_path: Path of the UTF-8 encoded, tab-separated data file.
        *args: Accepted for backward compatibility and ignored.
        **kwargs: ``max_seq_len`` (int, default 256) sets the length of every
            returned tensor; other keys are ignored.

    Raises:
        ValueError: If ``max_seq_len`` is smaller than 2 (no room for the
            CLS and SEP positions).
    """

    def __init__(self, vocab_txt_path, txt_path, *args, **kwargs):
        super().__init__()
        self.tokenizer = load_tokenizer(vocab_txt_path)
        self.max_seq_len = kwargs.get("max_seq_len", 256)
        if self.max_seq_len < 2:
            raise ValueError("max_seq_len must be at least 2 (CLS + SEP)")
        self.txt_path = txt_path

        self.cls_token_id = self.tokenizer.cls_token_id
        self.sep_token_id = self.tokenizer.sep_token_id
        self.pad_token_id = self.tokenizer.pad_token_id

        # Label vocabulary; id 0 is reserved for positions without a real tag.
        self.bio_dict = {
            '[PAD]': 0,
            'B-DT': 1,
            'B-LC': 2,
            'B-OG': 3,
            'B-PS': 4,
            'B-QT': 5,
            'B-TI': 6,
            'I-DT': 7,
            'I-LC': 8,
            'I-OG': 9,
            'I-PS': 10,
            'I-QT': 11,
            'I-TI': 12,
            'O': 13,
        }
        self.reverse_bio_dict = {v: k for k, v in self.bio_dict.items()}

        # Count lines by streaming instead of materialising the whole file.
        # The encoding is pinned to UTF-8 so the count does not depend on the
        # platform default; linecache also falls back to UTF-8 when reading.
        with open(self.txt_path, "r", encoding="utf-8") as f:
            self._total_data = sum(1 for _ in f)

    def __len__(self):
        """Return the number of lines in the data file."""
        return self._total_data

    def __getitem__(self, idx):
        """Return ``(train, target, segment_embedding)`` for line ``idx``.

        All three are 1-D ``torch.long`` tensors of length ``max_seq_len``:

        * ``train``: ``[CLS] tokens [SEP]`` followed by tokenizer padding ids.
        * ``target``: label ids aligned with ``train``. CLS, SEP and padding
          positions carry the ``'[PAD]'`` label id, so tokenizer ids are
          never mixed into the label space.
        * ``segment_embedding``: all zeros (single-segment input).

        Sequences longer than ``max_seq_len - 2`` tokens are truncated.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
            ValueError: If the line is not ``<text>\\t<tags>`` or the number
                of tags differs from the number of tokens.
            KeyError: If a tag is not present in ``bio_dict``.
        """
        if idx < 0:
            idx += self._total_data
        if not 0 <= idx < self._total_data:
            raise IndexError(f"index {idx} out of range for {self._total_data} samples")

        line_no = idx + 1  # linecache line numbers are 1-based
        raw_ko = linecache.getline(self.txt_path, line_no).strip()

        # Split on the LAST tab: the tag field never contains tabs, while the
        # sentence itself might.
        text, tab, bio_string = raw_ko.rpartition("\t")
        if not tab:
            raise ValueError(
                f"line {line_no} of {self.txt_path!r} is not '<text>\\t<tags>'"
            )

        labels = [self.bio_dict[tag] for tag in bio_string.split(",")]
        # encode() adds CLS/SEP itself; drop them and re-add after truncation.
        token_ids = self.tokenizer.encode(text)[1:-1]

        if len(token_ids) != len(labels):
            raise ValueError(
                f"line {line_no} of {self.txt_path!r}: {len(token_ids)} tokens "
                f"but {len(labels)} tags"
            )

        # Reserve two slots for CLS/SEP and clip over-long sentences so every
        # sample has exactly max_seq_len positions and can be batched.
        budget = self.max_seq_len - 2
        token_ids = token_ids[:budget]
        labels = labels[:budget]
        pad_length = budget - len(token_ids)

        label_pad_id = self.bio_dict['[PAD]']

        train = torch.tensor(
            [self.cls_token_id] + token_ids + [self.sep_token_id]
            + [self.pad_token_id] * pad_length,
            dtype=torch.long,
        )
        target = torch.tensor(
            [label_pad_id] + labels + [label_pad_id] * (pad_length + 1),
            dtype=torch.long,
        )

        segment_embedding = torch.zeros_like(train)

        return train, target, segment_embedding

In [46]:
# Smoke test: build the dataset and show one randomly drawn sample.
vocab_txt_path = "./data/tokenizer_model"
file_path = "./data/klue_ner_processed.train"

dataset = KlueDataset_NER(vocab_txt_path, file_path)
train_data_loader = DataLoader(dataset, shuffle=True, batch_size=1)

for batch in train_data_loader:
    # Each batch unpacks into (token ids, label ids, segment ids).
    train, target, segment_embedding = batch
    for tensor in (train, target, segment_embedding):
        print(tensor)

    # Only the first batch is needed for inspection.
    break

1회 폭격에 수십기가 격추되어 과거 지구연방의 격추왕 아무로레이를 무색하게한다.	B-QT,I-QT,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-PS,I-PS,I-PS,O,O,O,O,O
tensor([[    2,    21,  3435, 22008,  3541,  5978,  3395,  3595, 14382,  4186,
          3431,  5555,  6000,  3652,  3360,  3548, 14382,  3807,  4936,  3503,
          6963,  3794, 16831, 20629,  9451,    18,     3,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0