In [1]:
import os
from glob import glob
import sys
import pandas as pd
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

sys.path.insert(0, '../')
from dataset import REDataset
from models import load_model
from tokenization import load_tokenizer
from config import Config, ModelType, PreTrainedType, TokenizationType

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
model = load_model()

Load Model...	Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized fro

In [4]:
dataset = REDataset()

Load Tokenizer...	done!
Load raw data...	done!
Apply Tokenization...	done!


In [22]:
from torch.utils.data.sampler import SubsetRandomSampler

In [29]:
import random

def get_train_test_loader(dataset: Dataset, batch_size: int=64, drop_last: bool=True, test_size: float=0.2, shuffle: bool=True):
    num_samples = len(dataset)

    indices = [i for i in range(num_samples)]
    if shuffle:
        random.shuffle(indices)

    num_test = int(test_size * num_samples)
    train_indices = dataset[num_test:]
    test_indices = dataset[:num_test]

    train_sampler = SubsetRandomSampler(train_indices)
    test_indices = SubsetRandomSampler(test_indices)


    train_loader = DataLoader(dataset, batch_size=batch_size)
    test_loader = DataLoader(dataset, batch_size=batch_size)

    return train_loader, test_loader

In [30]:
train_loader, valid_loader = get_train_test_loader(dataset)

In [35]:
for sentence, label in valid_loader:
    break

In [37]:
label

tensor([17,  0,  6,  2,  8,  0, 17,  3, 10,  0,  4,  0, 16,  4,  0,  0,  0,  0,
         4,  0,  0,  7, 17,  2,  6,  0, 10,  0, 10, 17,  8,  0, 15, 15,  5, 25,
         0,  0,  5,  0,  0,  8,  0,  6,  0, 15,  0,  0,  0,  4, 10,  4,  0, 10,
         0, 10, 15,  0,  0,  0,  4,  0, 10,  0])

In [12]:
math.floor(3.9)

3

In [None]:
DataLoader(dataset, )

In [6]:
dataset[0]

({'input_ids': tensor([   101,  50266,  11489,   9405,  24974,  24683,   9477,  90578,   9625,
          119376,  12692,  45725,   9651,  99183,  10459,   9376,  42771,  70186,
            9167,  15001,  11261,  41605,    113,  12001,  57836,    114,   9590,
            9706,  28396,    113,  13796,  19986,    114,   8843,  22634,    117,
            9638,   9376,  42771,  22879,   9651,  99183,  10459,   9684,  46520,
           11513,   9641, 119298,  11018,   9251,  11261,   9405,  24974, 118800,
           27792,  16139,    119,    102,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
               0,      0,      0,      0,      0,      0,      0,      0,      0,
   

In [4]:
tokenizer = load_tokenizer(TokenizationType.Base)

Load Tokenizer...	done!


In [22]:
raw = pd.read_csv(Config.Train, sep='\t', header=None)
raw.columns = COLUMNS
raw.drop('id', axis=1, inplace=True)
raw['label'] = raw['label'].apply(lambda x: enc.transform(x))
raw.head(3)

Unnamed: 0,relation_state,e1,e1_start,e1_end,e2,e2_start,e2_end,label
0,영국에서 사용되는 스포츠 유틸리티 자동차의 브랜드로는 랜드로버(Land Rover)...,랜드로버,30,33,자동차,19,21,17
1,"선거에서 민주당은 해산 전 의석인 230석에 한참 못 미치는 57석(지역구 27석,...",민주당,5,7,27석,42,44,0
2,유럽 축구 연맹(UEFA) 집행위원회는 2014년 1월 24일에 열린 회의를 통해 ...,유럽 축구 연맹,0,7,UEFA,9,12,6


In [23]:
tokenized_data = tokenizer(
            raw["relation_state"].tolist(),
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=100,
            add_special_tokens=True,
        )

In [24]:
tokenized_data

{'input_ids': tensor([[  101, 50266, 11489,  ...,     0,     0,     0],
        [  101,  9428, 41521,  ...,     0,     0,     0],
        [  101, 68495, 37905,  ...,     0,     0,     0],
        ...,
        [  101,  9328, 20309,  ...,     0,     0,     0],
        [  101, 49780, 16617,  ...,     0,     0,     0],
        [  101,  9665, 43852,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}