In [30]:
import sys
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertTokenizerFast, BertModel, BertConfig, DataCollatorForLanguageModeling, TrainingArguments, Trainer, DataCollatorForWholeWordMask
from tokenizers import BertWordPieceTokenizer


sys.path.insert(0, '../')
from models import load_model
from dataset import REDataset
from config import Config, ModelType, PreTrainedType

In [15]:
# Build the project's REDataset with its default arguments. Per the recorded
# output, construction loads a tokenizer, reads the raw data and tokenizes it.
data = REDataset()

Load Tokenizer...	done!
Load raw data...	done!
Apply Tokenization...	done!


In [22]:
# Load the pretrained multilingual BERT encoder and move it to the GPU in a
# single chained expression (Module.cuda() hands back the module itself).
model = BertModel.from_pretrained(PreTrainedType.BertMultiLingual).cuda()

# Marker printed once the transfer call has returned.
print('CUDA')

CUDA


In [23]:
# Fetch the first example; the item unpacks into exactly two values, bound
# here as `sent` and `label`.
sent, label = data[0]

In [32]:
# Wrap the dataset in a loader that yields mini-batches of four examples.
temp = DataLoader(dataset=data, batch_size=4)

In [33]:
# Grab only the first mini-batch from the loader.
# `next(iter(...))` replaces the loop-and-break idiom: it states the intent
# directly and raises StopIteration if the loader yields nothing, instead of
# silently leaving `sents` / `labels` unbound (or holding stale values from an
# earlier run of the kernel).
sents, labels = next(iter(temp))

In [43]:
# Collect the qualified name of every parameter the model registers, then
# print them one per line (the tensors themselves are not needed here).
parameter_names = [name for name, _ in model.named_parameters()]
for name in parameter_names:
    print(name)

embeddings.word_embeddings.weight
embeddings.position_embeddings.weight
embeddings.token_type_embeddings.weight
embeddings.LayerNorm.weight
embeddings.LayerNorm.bias
encoder.layer.0.attention.self.query.weight
encoder.layer.0.attention.self.query.bias
encoder.layer.0.attention.self.key.weight
encoder.layer.0.attention.self.key.bias
encoder.layer.0.attention.self.value.weight
encoder.layer.0.attention.self.value.bias
encoder.layer.0.attention.output.dense.weight
encoder.layer.0.attention.output.dense.bias
encoder.layer.0.attention.output.LayerNorm.weight
encoder.layer.0.attention.output.LayerNorm.bias
encoder.layer.0.intermediate.dense.weight
encoder.layer.0.intermediate.dense.bias
encoder.layer.0.output.dense.weight
encoder.layer.0.output.dense.bias
encoder.layer.0.output.LayerNorm.weight
encoder.layer.0.output.LayerNorm.bias
encoder.layer.1.attention.self.query.weight
encoder.layer.1.attention.self.query.bias
encoder.layer.1.attention.self.key.weight
encoder.layer.1.attention.self.key

In [44]:
# Forward the first mini-batch through the encoder. `sents` is unpacked as
# keyword arguments, so it has to be a mapping keyed by the model's input names.
# NOTE(review): the recorded result lives on cuda:0, so the batch tensors are
# presumably already on the GPU when they leave the dataset — confirm.
outputs = model(**sents)

In [47]:
# Display the `pooler_output` field of the model output; the recorded tensor
# has one row per example in the batch of four.
outputs.pooler_output

tensor([[ 0.1927, -0.1140,  0.1545,  ..., -0.1682,  0.0765,  0.1716],
        [ 0.2937, -0.0188,  0.3508,  ..., -0.1271,  0.1640,  0.1288],
        [ 0.2748, -0.0108,  0.1434,  ..., -0.2278,  0.1751,  0.1524],
        [ 0.3211, -0.1236,  0.1505,  ..., -0.3515,  0.0855,  0.1090]],
       device='cuda:0', grad_fn=<TanhBackward>)

In [26]:
# Sanity-check the checkpoint on a single English sentence.
# First the text side: tokenizer plus the encoded sentence as PyTorch tensors.
tokenizer = BertTokenizer.from_pretrained(PreTrainedType.BertMultiLingual)
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

# Then the model side. NOTE: this rebinds `model` to a freshly loaded copy
# with no `.cuda()` call, replacing whatever `model` referred to before.
model = BertModel.from_pretrained(PreTrainedType.BertMultiLingual)
outputs = model(**inputs)

In [28]:
# Inspect the encoded sentence; the recorded output holds `input_ids`,
# `token_type_ids` and `attention_mask`, each with a leading batch axis of 1.
inputs

{'input_ids': tensor([[  101, 31178,   117, 15127, 17835, 10124, 21610, 10112,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [27]:
# Inspect the full model output; the recorded repr contains
# `last_hidden_state` and `pooler_output`.
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0569,  0.0820,  0.0835,  ...,  0.2801, -0.1775,  0.2176],
         [-0.1544,  0.0196,  0.2836,  ...,  0.1256, -0.6682, -0.3504],
         [-0.3150, -0.3662,  0.1486,  ..., -0.4080, -0.1579,  0.5193],
         ...,
         [ 0.2656, -0.3016, -0.5070,  ...,  0.3957, -0.2573, -0.0307],
         [ 0.0419, -0.2688, -0.0519,  ...,  0.0905, -0.2808,  0.4754],
         [-0.0482, -0.0653,  0.5319,  ...,  0.2482, -0.2556,  0.2320]]],
       grad_fn=<NativeLayerNormBackward>), pooler_output=tensor([[ 0.3043,  0.0656,  0.2868, -0.1854, -0.1371,  0.5640,  0.2290,  0.2035,
         -0.4755,  0.4321, -0.1011, -0.2794, -0.2156, -0.1278,  0.2104, -0.2177,
          0.7091, -0.0092,  0.1960, -0.4344, -1.0000, -0.1208, -0.3457, -0.2106,
         -0.3447,  0.1666, -0.2626,  0.0763,  0.1979, -0.1930,  0.1113, -1.0000,
          0.5719,  0.7092,  0.2417, -0.0967,  0.2205,  0.2705,  0.2206, -0.3866,
         -0.2654, -0.0908, -0.178

In [3]:
# Load the tokenizer through the project helper using the base preprocessing
# type; the helper reports "Load Tokenizer... done!" in the recorded output.
# NOTE(review): `load_tokenizer` and `PreProcessType` are not brought in by the
# visible import cell — they must come from earlier kernel state; confirm and
# add the import so the notebook survives Restart & Run All.
tokenizer = load_tokenizer(type=PreProcessType.Base)

Load Tokenizer...	done!


In [5]:
# Stage 1: read the raw training split together with its labels.
dataset_raw, labels = load_data(path=Config.Train)

# Stage 2: tokenize the raw examples with the base preprocessing method.
dataset_tokenized = apply_tokenization(
    dataset=dataset_raw,
    tokenizer=tokenizer,
    method=PreProcessType.Base,
)

# Stage 3: wrap the tokenized examples and their labels in the dataset class.
# NOTE(review): `load_data` and `apply_tokenization` are not imported in the
# visible import cell — confirm where they come from.
dataset = REDataset(
    tokenized_dataset=dataset_tokenized,
    labels=labels,
)

Apply Tokenization...	done!


In [29]:
# Collator that applies masked-language-model masking to whole words, with a
# masking probability of 15%.
data_collator = DataCollatorForWholeWordMask(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [30]:
# Show the collator's repr to verify its tokenizer and masking settings.
data_collator

DataCollatorForWholeWordMask(tokenizer=PreTrainedTokenizer(name_or_path='bert-base-multilingual-cased', vocab_size=119547, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), mlm=True, mlm_probability=0.15)

In [15]:
# Build the Trainer configuration by unpacking the project's base argument
# mapping as keyword arguments.
# NOTE(review): `TrainArgs` is not imported in the visible import cell — confirm.
training_args = TrainingArguments(**TrainArgs.Base)

In [20]:
# Assemble the Trainer from the current `model`, the training arguments, the
# masking collator and the tokenized training set.
# NOTE(review): the recorded `trainer.train()` run fails with a batch-size
# mismatch (32 vs 3200); presumably `model` here does not carry a token-level
# masked-LM head matching the collator's per-token labels — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [24]:
# Inspect the first tokenized example; the recorded output is a dict of
# tensors whose `input_ids` are zero-padded to a fixed length.
dataset[0]

{'input_ids': tensor([   101,  50266,  11489,   9405,  24974,  24683,   9477,  90578,   9625,
         119376,  12692,  45725,   9651,  99183,  10459,   9376,  42771,  70186,
           9167,  15001,  11261,  41605,    113,  12001,  57836,    114,   9590,
           9706,  28396,    113,  13796,  19986,    114,   8843,  22634,    117,
           9638,   9376,  42771,  22879,   9651,  99183,  10459,   9684,  46520,
          11513,   9641, 119298,  11018,   9251,  11261,   9405,  24974, 118800,
          27792,  16139,    119,    102,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [35]:
# NOTE(review): this builds a 2-tuple from two lookups of the same key
# `dataset['labels']` — likely a typo; confirm what was intended.
# Elsewhere in this notebook the dataset is indexed by integer (`dataset[0]`);
# verify that a string key is supported at all.
label = dataset['labels'], dataset['labels']

{'input_ids': tensor([   101,  50266,  11489,   9405,  24974,  24683,   9477,  90578,   9625,
         119376,  12692,  45725,   9651,  99183,  10459,   9376,  42771,  70186,
           9167,  15001,  11261,  41605,    113,  12001,  57836,    114,   9590,
           9706,  28396,    113,  13796,  19986,    114,   8843,  22634,    117,
           9638,   9376,  42771,  22879,   9651,  99183,  10459,   9684,  46520,
          11513,   9641, 119298,  11018,   9251,  11261,   9405,  24974, 118800,
          27792,  16139,    119,    102,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [39]:
# Run the collator over the whole dataset at once to inspect what it produces.
# NOTE: this rebinds `temp`, which earlier in the notebook held the DataLoader.
temp = data_collator(dataset)

In [47]:
# Decode the fourth collated example back to text; the recorded output shows
# the [MASK] tokens the collator inserted.
tokenizer.decode(temp['input_ids'][3])

'[CLS] 용병 공격수 [MASK] [MASK] [MASK] [MASK] [MASK] 초 활약한 강수일의 침체, 시즌 중반에 영입한 세르비아 출신 [MASK] [MASK] 미드필더 오그넨 코로만의 부상 [MASK] 부진의 원인으로 지적되던 [MASK] 인천은 시즌 [MASK] 4경기에서 3승 1패를 거두며 막판 승점 [MASK] [MASK] [MASK] [MASK] [MASK] 정규리그 순위 5위로 플레이오프 [SEP]'

In [50]:
# Re-inspect the first example after collation; the recorded `input_ids` match
# the earlier display of this item.
dataset[0]

{'input_ids': tensor([   101,  50266,  11489,   9405,  24974,  24683,   9477,  90578,   9625,
         119376,  12692,  45725,   9651,  99183,  10459,   9376,  42771,  70186,
           9167,  15001,  11261,  41605,    113,  12001,  57836,    114,   9590,
           9706,  28396,    113,  13796,  19986,    114,   8843,  22634,    117,
           9638,   9376,  42771,  22879,   9651,  99183,  10459,   9684,  46520,
          11513,   9641, 119298,  11018,   9251,  11261,   9405,  24974, 118800,
          27792,  16139,    119,    102,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [48]:
# Decode the collator's label row for the first example.
# NOTE(review): the flood of [UNK] in the recorded output is presumably the
# ignore index (-100) that the MLM collators assign to unmasked positions,
# which is not a vocabulary id and so decodes as the unknown token — confirm.
# Only the few real words in the output are the actual prediction targets.
tokenizer.decode(temp['labels'][0])

'[UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] 브랜드들은 [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] 일컫는 말로 [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]'

In [None]:
# Collate a small, explicit list of examples to inspect one masked batch.
# Fixed: the collator was being called with no arguments, which raises a
# TypeError because it requires the list of examples to collate.
data_collator([dataset[i] for i in range(4)])

In [21]:
# Launch training.
# NOTE(review): the recorded run stops with
# "Expected input batch_size (32) to match target batch_size (3200)".
# 3200 = 32 examples x 100 token positions, so the loss is presumably comparing
# one prediction per example against one label per token — i.e. the model
# passed to the Trainer likely lacks a token-level (masked-LM) head. Confirm.
trainer.train()

ValueError: Expected input batch_size (32) to match target batch_size (3200).

In [None]:
# Rebuild the masked-LM training setup, this time with the token-level
# (non-whole-word) masking collator.
training_args = TrainingArguments(**TrainArgs.Base)

# Fixed: this object used to be bound to the misspelled name `data_collaor`,
# so the Trainer below silently kept using the previously defined
# `data_collator` (the whole-word-mask collator) instead of this one.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# NOTE(review): the earlier recorded training run failed with a 32-vs-3200
# batch-size mismatch; swapping the collator alone presumably does not address
# that — `model` most likely needs a masked-LM head. Confirm before training.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)