In [18]:
import os
from glob import glob
import sys
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

sys.path.insert(0, '../')
from dataset import load_data, LabelEncoder
from models import load_model
from tokenization import load_tokenizer
from config import Config, ModelType, PreTrainedType, TokenizationType

In [3]:
# Run on the first CUDA GPU when one is visible to torch; otherwise stay on the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

In [2]:
# Build the model through the project's `load_model` helper using its default arguments.
# NOTE(review): the saved log below mentions BertForSequenceClassification initialised
# from bert-base-multilingual-cased; confirm that is still the default in `models`.
model = load_model()

Load Model...	Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized fro

In [4]:
# Load the tokenizer selected by TokenizationType.Base via the project's helper.
# NOTE(review): an identical call appears again in a later cell of this notebook.
tokenizer = load_tokenizer(TokenizationType.Base)

Load Tokenizer...	done!


In [13]:
# Column names applied, in file order, to the header-less training TSV.
# The e*_start / e*_end fields look like entity span offsets inside
# `relation_state` -- TODO confirm against the data description.
COLUMNS = (
    ["id", "relation_state"]
    + ["e1", "e1_start", "e1_end"]
    + ["e2", "e2_start", "e2_end"]
    + ["label"]
)

In [38]:
class REDataset(Dataset):
    """Map-style dataset over a tab-separated relation-extraction file.

    On construction the file at ``root`` is read, the ``id`` column is dropped,
    labels are encoded with a ``LabelEncoder`` and every ``relation_state``
    sentence is tokenized once up front. Indexing then only slices the
    pre-computed tensors.

    Args:
        root: Path of the header-less TSV to load. Defaults to ``Config.Train``.
        tokenization_type: Forwarded to ``load_tokenizer`` as its ``type``
            argument. Defaults to ``TokenizationType.Base``.

    Attributes (set in ``__init__``):
        tokenizer: Object returned by ``load_tokenizer``.
        enc: ``LabelEncoder`` used to encode the ``label`` column.
        raw: DataFrame with the ``id`` column removed and labels encoded.
        sentences: Tokenizer output for all sentences (a mapping from field
            name to a batch-first tensor).
        labels: Python list of encoded labels, one per row of ``raw``.
    """

    # Names assigned, in order, to the columns of the header-less TSV.
    COLUMNS = [
        "id",
        "relation_state",
        "e1",
        "e1_start",
        "e1_end",
        "e2",
        "e2_start",
        "e2_end",
        "label",
    ]

    # Truncation limit (in tokens) passed to the tokenizer.
    MAX_LENGTH = 128

    def __init__(self, root: str=Config.Train, tokenization_type: str=TokenizationType.Base):
        self.tokenizer = load_tokenizer(type=tokenization_type)
        self.enc = LabelEncoder()
        self.raw = self._load_raw(root)
        # Use the instance's own frame; relying on a notebook-global `raw`
        # breaks on a fresh kernel and can silently mix in a different table.
        self.sentences = self._tokenize(self.raw)
        self.labels = self.raw['label'].tolist()

    def __getitem__(self, idx):
        """Return ``(sentence, label)`` for row ``idx``.

        ``sentence`` is a dict with the same keys as the tokenizer output, each
        holding row ``idx`` as a tensor; ``label`` is the encoded label as a
        tensor.
        """
        sentence = {
            key: torch.as_tensor(val[idx])
            for key, val in self.sentences.items()
        }
        label = torch.as_tensor(self.labels[idx])
        return sentence, label

    def __len__(self):
        """Number of examples (one per encoded label)."""
        return len(self.labels)

    def _load_raw(self, root):
        """Read the TSV at ``root`` and return it with encoded labels.

        The file is read without a header row, the columns are named from
        ``COLUMNS``, ``id`` is dropped, and each ``label`` value is passed
        through ``self.enc.transform``.
        """
        # Honour the `root` argument instead of always reading Config.Train.
        raw = pd.read_csv(root, sep='\t', header=None)
        raw.columns = self.COLUMNS
        raw = raw.drop('id', axis=1)
        # Encode with this instance's encoder rather than a notebook-global one.
        raw['label'] = raw['label'].apply(self.enc.transform)
        return raw

    def _tokenize(self, data):
        """Tokenize every ``relation_state`` sentence in ``data`` in one call.

        Returns the tokenizer's output with PyTorch tensors, padded and
        truncated to at most ``MAX_LENGTH`` tokens.
        """
        print('Apply Tokenization...', end='\t')
        data_tokenized = self.tokenizer(
            data["relation_state"].tolist(),
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.MAX_LENGTH,
            add_special_tokens=True,
        )
        print('done!')
        return data_tokenized


In [47]:
# Wrap the dataset in a shuffling loader that yields batches of 64 and keeps the
# final, possibly smaller, batch.
# NOTE(review): `data` is created by a cell that appears further down in this
# notebook, so running the cells top-to-bottom on a fresh kernel fails here.
dataloader = DataLoader(data, batch_size=64, shuffle=True, drop_last=False)

In [49]:
# Grab a single batch for inspection. Using next(iter(...)) states the intent
# directly and raises StopIteration right here if the loader is empty, instead
# of leaving `sample` undefined for the following cell.
sample = next(iter(dataloader))

In [50]:
# Display the batch captured from the DataLoader.
# NOTE(review): the saved output below is a single dict that includes a 'label'
# key, whereas REDataset.__getitem__ as written in this notebook returns a
# (sentence, label) tuple -- the output looks stale; re-run to refresh.
sample

{'input_ids': tensor([[   101, 103988,  14040,  ...,      0,      0,      0],
         [   101,  26565,   9913,  ...,      0,      0,      0],
         [   101,   9272,  12692,  ...,      0,      0,      0],
         ...,
         [   101,  18347,  36802,  ...,      0,      0,      0],
         [   101,   9901, 118920,  ...,      0,      0,      0],
         [   101,   9651,  42815,  ...,      0,      0,      0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'label': tensor([ 0, 15,  0,  0, 10,  0, 28,  0, 10,  0,  0,  0, 15,  0,  4,  8,  4,  0,
   

In [39]:
# Build the dataset with its default arguments (Config.Train, TokenizationType.Base).
# NOTE(review): the DataLoader cell that consumes `data` appears above this one.
data = REDataset()

Load Tokenizer...	done!
Apply Tokenization...	done!


In [45]:
# Spot-check a single example from the dataset.
# NOTE(review): the saved output starts as a bare dict, but __getitem__ as
# written returns a (sentence, label) tuple -- likely a stale output.
data[5]

{'input_ids': tensor([   101, 104342,  86080,  14523,  26523,  11490,  11018,  67778,  96770,
           9665,  25934,  79611,  32613,  18778,  29364,  30005,  71439,  12310,
          17196,    113,    152,  71655,    114,   8843,   9405,  61250,  11882,
           9095,  29364,  64932,  12424,   9812,  11261,  16439,  54055,  11287,
           9665,  46150,  59330,   8843,  74986,  53371,   9834,  85634,  99896,
          11664,   9665,  12490,    119,    102,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0

In [26]:
# Tokenizer used by the exploratory tokenization cell below.
# NOTE(review): this repeats the load_tokenizer(TokenizationType.Base) call made
# in an earlier cell; one of the two can be removed.
tokenizer = load_tokenizer(TokenizationType.Base)

Load Tokenizer...	done!


In [19]:
# Project label encoder; the next cell uses enc.transform to encode the 'label' column.
enc = LabelEncoder()

In [22]:
# Load the header-less training TSV and name its columns.
raw = pd.read_csv(Config.Train, sep='\t', header=None)
raw.columns = COLUMNS

# Drop the identifier column, then encode each label value with the encoder.
raw = raw.drop('id', axis=1)
raw['label'] = raw['label'].apply(enc.transform)

raw.head(3)

Unnamed: 0,relation_state,e1,e1_start,e1_end,e2,e2_start,e2_end,label
0,영국에서 사용되는 스포츠 유틸리티 자동차의 브랜드로는 랜드로버(Land Rover)...,랜드로버,30,33,자동차,19,21,17
1,"선거에서 민주당은 해산 전 의석인 230석에 한참 못 미치는 57석(지역구 27석,...",민주당,5,7,27석,42,44,0
2,유럽 축구 연맹(UEFA) 집행위원회는 2014년 1월 24일에 열린 회의를 통해 ...,유럽 축구 연맹,0,7,UEFA,9,12,6


In [23]:
# Tokenize every sentence in one call, returning PyTorch tensors that are
# padded and truncated to at most 100 tokens.
tokenized_data = tokenizer(
    raw["relation_state"].tolist(),
    add_special_tokens=True,
    padding=True,
    truncation=True,
    max_length=100,
    return_tensors="pt",
)

In [24]:
# Display the tokenizer output (input_ids, token_type_ids, attention_mask).
tokenized_data

{'input_ids': tensor([[  101, 50266, 11489,  ...,     0,     0,     0],
        [  101,  9428, 41521,  ...,     0,     0,     0],
        [  101, 68495, 37905,  ...,     0,     0,     0],
        ...,
        [  101,  9328, 20309,  ...,     0,     0,     0],
        [  101, 49780, 16617,  ...,     0,     0,     0],
        [  101,  9665, 43852,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}