In [46]:
import os
from importlib import import_module
import pandas as pd
import numpy as np
import torch
from transformers import ElectraForSequenceClassification, ElectraConfig, AutoTokenizer, ElectraModel

# Electra Tokenizer, Config, Model 불러오기

In [57]:
MODEL_TYPE = "Electra"
MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model_config = getattr(import_module("transformers"), MODEL_TYPE + "Config").from_pretrained(MODEL_NAME)
model = getattr(import_module("transformers"), MODEL_TYPE + "Model").from_pretrained(MODEL_NAME)

In [58]:
tokenizer.sep_token

'[SEP]'

In [55]:
print(model)

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
     

# 학습 데이터 데이터 프레임 불러오기

In [8]:
data_path = os.path.join("..", "data", "train")
data_df = pd.read_csv(os.path.join(data_path, "train.tsv"), sep="\t", header=None)
data_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,wikipedia-24896-25-30-33-19-21,영국에서 사용되는 스포츠 유틸리티 자동차의 브랜드로는 랜드로버(Land Rover)...,랜드로버,30,33,자동차,19,21,단체:제작
1,wikipedia-12728-224-5-7-42-44,"선거에서 민주당은 해산 전 의석인 230석에 한참 못 미치는 57석(지역구 27석,...",민주당,5,7,27석,42,44,관계_없음
2,wikipedia-28460-3-0-7-9-12,유럽 축구 연맹(UEFA) 집행위원회는 2014년 1월 24일에 열린 회의를 통해 ...,유럽 축구 연맹,0,7,UEFA,9,12,단체:별칭
3,wikipedia-11479-37-24-26-3-5,"용병 공격수 챠디의 부진과 시즌 초 활약한 강수일의 침체, 시즌 중반에 영입한 세르...",강수일,24,26,공격수,3,5,인물:직업/직함
4,wikipedia-15581-6-0-2-32-40,람캄행 왕은 1237년에서 1247년 사이 수코타이의 왕 퍼쿤 씨 인트라팃과 쓰엉 ...,람캄행,0,2,퍼쿤 씨 인트라팃,32,40,인물:부모님
...,...,...,...,...,...,...,...,...,...
8995,wikipedia-5414-12-15-21-0-4,2002년 FIFA 월드컵 사우디아라비아와의 1차전에서 독일은 8-0으로 승리하였는...,사우디아라비아,15,21,2002년,0,4,관계_없음
8996,wikipedia-10384-4-12-14-0-1,일본의 2대 메이커인 토요타와 닛산은 시장 점유율을 높이기 위한 신차 개발을 계속하...,토요타,12,14,일본,0,1,단체:본사_국가
8997,wikipedia-25913-6-8-10-93-106,방호의의 손자 방덕룡(方德龍)은 1588년(선조 21년) 무과에 급제하고 낙안군수로...,방덕룡,8,10,선무원종공신(宣武原從功臣),93,106,인물:직업/직함
8998,wikitree-12062-15-0-3-46-47,LG전자는 올해 초 국내시장에 출시한 2020년형 ‘LG 그램’ 시리즈를 이달부터 ...,LG전자,0,3,북미,46,47,관계_없음


# Tokenize 테스트

In [9]:
tokenize_sent = data_df.iloc[37, 1]

print(f"Original : {tokenize_sent}")
print(f"Tokenized : {'/'.join(tokenizer.tokenize(tokenize_sent))}")
print(f"Encoded : {tokenizer.encode(tokenize_sent)}")

Original : 이부는 원상과 헤어지게 되었으므로 어쩔 수 없이 원담(袁譚)에게 항복했고 평원(平原)에 부임하였다.
Tokenized : 이부/##는/원상/##과/헤어지/##게/되/##었/##으므로/어쩔/수/없이/원/##담/(/袁/[UNK]/)/에게/항복/##했/##고/평원/(/平/原/)/에/부임/##하/##였/##다/.
Encoded : [2, 29038, 4034, 21843, 4047, 17957, 4325, 2411, 4480, 15542, 9461, 2967, 6419, 3201, 4274, 12, 1652, 1, 13, 6220, 15158, 4398, 4219, 24046, 12, 845, 574, 13, 3130, 14372, 4279, 4737, 4176, 18, 3]


In [10]:
print(tokenizer.encode("[SEP]"))

[2, 3, 3]


In [11]:
print(tokenizer.encode("[UNK]"))

[2, 1, 3]


In [12]:
print(tokenizer.encode("[CLS]"))

[2, 2, 3]


In [13]:
for i in range(4):
    print(tokenizer.decode(i))

[PAD]
[UNK]
[CLS]
[SEP]


Electra Tokenizer로 인코딩을 하면 자동으로 문장 처음에는 [CLS] 토큰이 끝에는 [SEP] 토큰이 생성되는 것을 확인할 수 있음.  
그리고 인코딩된 정수형 인덱스에서 0은 패딩 토큰([PAD]), 1은 언노운 토큰([UNK]), 2는 클래스 토큰([CLS]), 3은 분리 토큰([SEP])임을 알 수 있음!

In [25]:
tokenizer.tokenize(["hello", "Im hangjoo"])

['he', '##ll', '##o', 'Im', 'ha', '##ng', '##j', '##oo']

In [27]:
tokenized_sent = tokenizer(
    text=data_df.iloc[:, 1].to_list(),
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=100,
    add_special_tokens=True,
)

In [26]:
for k, v in tokenized_sent.items():
    print(k, v)
    print(v.shape)

input_ids tensor([[    2, 19604, 18268,  4008,     3, 31106,     5,     3,     0,     0],
        [    2, 21866, 12904, 12557,     3, 21196, 12615,  4220, 16545,     3]])
torch.Size([2, 10])
token_type_ids tensor([[0, 0, 0, 0, 0, 1, 1, 1, 0, 0],
        [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]])
torch.Size([2, 10])
attention_mask tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
torch.Size([2, 10])


In [23]:
model = ElectraModel.from_pretrained(MODEL_NAME)
output = model(**tokenized_sent)
print(output.last_hidden_state)
print(output.last_hidden_state.shape)

tensor([[[ 0.3783,  0.1481, -0.2795,  ...,  0.2298, -0.3127,  0.7684],
         [ 0.1961,  0.4683, -0.5439,  ...,  0.1308, -0.1579, -0.8483],
         [ 0.2506, -0.0540, -0.5331,  ...,  0.1553, -0.6999,  0.2225],
         ...,
         [ 0.3780,  0.1540, -0.2798,  ...,  0.2266, -0.3079,  0.7678],
         [ 0.0734,  0.1770,  0.0052,  ...,  0.2063,  0.4673, -0.7800],
         [ 0.3186,  0.1875, -0.1589,  ...,  0.1055,  0.2299, -0.5941]],

        [[ 0.1948,  0.0405, -0.2541,  ...,  0.2638, -0.3665,  0.7497],
         [ 0.0683,  0.4253,  0.0120,  ..., -0.2512, -0.0936, -0.2599],
         [-0.0121, -0.0328, -0.1212,  ...,  0.0304, -0.0629, -0.0347],
         ...,
         [-0.5421, -0.1155,  0.0861,  ..., -0.0772,  0.1519, -0.1864],
         [-0.3997, -0.0418,  0.1718,  ..., -0.1247,  0.1347,  0.0721],
         [ 0.1952,  0.0462, -0.2543,  ...,  0.2609, -0.3606,  0.7491]]],
       grad_fn=<NativeLayerNormBackward>)
torch.Size([2, 10, 256])


In [24]:
model = ElectraModel(electra_config)
output = model(**tokenized_sent)
print(output.last_hidden_state)
print(output.last_hidden_state.shape)

tensor([[[ 0.0729,  1.5832, -0.1638,  ..., -0.4335,  1.1890,  1.0299],
         [-0.4465, -0.1209, -0.7055,  ...,  0.5445,  0.6765,  1.4359],
         [-0.2695, -0.1258,  0.4331,  ...,  0.2604,  0.6639,  0.7231],
         ...,
         [-0.7475,  0.5739, -0.1490,  ...,  0.2755,  0.5541,  0.4332],
         [ 0.1801,  0.6906, -0.4432,  ..., -0.6756, -0.6663,  0.8265],
         [-0.5208,  1.1501, -1.2500,  ..., -0.7293,  0.4956,  1.8243]],

        [[ 0.0381,  0.7897, -0.7981,  ...,  0.5513,  1.1438,  1.3262],
         [-1.4069, -0.1161, -0.2860,  ...,  1.7845,  0.3445,  1.0335],
         [-1.6004,  1.4282, -0.2942,  ...,  0.6189,  1.7044,  1.5438],
         ...,
         [ 0.0999,  0.5018,  0.1397,  ...,  0.0052,  0.1692, -0.7591],
         [-0.2231,  0.7950,  1.5741,  ...,  0.8118, -1.3511, -0.4199],
         [-1.0749, -0.0117,  0.2787,  ..., -1.0062,  0.3903,  1.9678]]],
       grad_fn=<NativeLayerNormBackward>)
torch.Size([2, 10, 256])


In [41]:
import torch
from torch.utils.data import Dataset
import pickle as pkl

class MyDataset(Dataset):
    def __init__(self, data_df, tokenizer, token_max_len=256, max_label_count=300):
        self.tokenizer = tokenizer
        self.token_max_len = 256

        self.preprocessed_ = self.preprocessing(data_df, max_label_count)
        self.tokenized_ = self.tokenizing()

    def __getitem__(self, idx):
        # Tokenizer's input sentence would have a form like "[CLS], entity_a's name, [SEP], entity_b's name, [SEP] sentence [SEP]".
        encoded = {k: v[idx] for k, v in self.tokenized_.items()}
        label = torch.tensor(self.preprocessed_["labels"][idx])

        return encoded, label

    def __len__(self):
        return self.preprocessed_["sentence"].size

    def preprocessing(self, data_df, max_label_count):
        preprocessed_ = pd.DataFrame(columns=data_df.columns)

        label_num = len(data_df[8].value_counts())
        label_keys = [k for k in data_df[8].value_counts().keys()]

        for i in range(label_num):
            label_data_df = data_df.loc[data_df[8] == label_keys[i], :]
            if len(label_data_df) >= max_label_count:
                sample_df = label_data_df.sample(n=max_label_count)
            else:
                sample_df = label_data_df.sample(n=max_label_count, replace=True)
            preprocessed_ = pd.concat([preprocessed_, sample_df], ignore_index=True)

        with open("../data/label_type.pkl", "rb") as f:
            label_encoder = pkl.load(f)

        label = []
        for v in preprocessed_[8]:
            if v == "blind":
                label.append(100)
            else:
                label.append(label_encoder[v])

        preprocessed_ = {
            "sentence": preprocessed_[1].to_numpy(),
            "e1_name": preprocessed_[2].to_numpy(),
            "e1_idx": preprocessed_[[3, 4]].to_numpy(),
            "e2_name": preprocessed_[5].to_numpy(),
            "e2_idx": preprocessed_[[6, 7]].to_numpy(),
            "labels": label
        }

        return preprocessed_

    def tokenizing(self):
        concat_entity = [e_a + "[SEP]" + e_b for e_a, e_b in zip(self.preprocessed_["e1_name"], self.preprocessed_["e2_name"])]
        tokenized_sentences = self.tokenizer(
            text=concat_entity,
            text_pair=list(self.preprocessed_["sentence"]),
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.token_max_len,
            add_special_tokens=True,
        )

        return tokenized_sentences

In [42]:
MyDataset(data_df, tokenizer).tokenizing()

{'input_ids': tensor([[    2, 24004,    21,  ...,     0,     0,     0],
        [    2,  2720,     3,  ...,     0,     0,     0],
        [    2,  3323,     3,  ...,     0,     0,     0],
        ...,
        [    2,  7694,  7303,  ...,     0,     0,     0],
        [    2,  7694,  7303,  ...,     0,     0,     0],
        [    2,  7694,  7303,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}