In [28]:
import sys
import torch
from transformers import BertTokenizer, BertTokenizerFast, BertModel, BertConfig, DataCollatorForLanguageModeling, TrainingArguments, Trainer, DataCollatorForWholeWordMask
from tokenizers import BertWordPieceTokenizer


sys.path.insert(0, '../')
from models import load_model
from config import Config, TrainArgs, ModelType, PreTrainedType, TokenizationType, TrainArgs
from dataset import load_data, apply_tokenization, REDataset
from load_tokenizer import load_tokenizer

In [2]:
# Load the sequence-classification variant of the multilingual BERT checkpoint.
# `Config.NumClasses` is handed over as the third positional argument
# (presumably the size of the classification head -- confirm in `models.load_model`).
model = load_model(ModelType.SequenceClf, PreTrainedType.BertMultiLingual, Config.NumClasses)

Load Model...	Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized fro

In [3]:
# Fetch the tokenizer configured for the `Base` tokenization scheme.
tokenizer = load_tokenizer(type=TokenizationType.Base)

Load Tokenizer...	done!


In [5]:
# Step 1: read the raw training examples and their labels from `Config.Train`.
dataset_raw, labels = load_data(path=Config.Train)

# Step 2: tokenize the raw examples with the `Base` tokenization method.
dataset_tokenized = apply_tokenization(
    dataset=dataset_raw,
    tokenizer=tokenizer,
    method=TokenizationType.Base,
)

# Step 3: pair the tokenized inputs with their labels in an `REDataset`.
dataset = REDataset(
    tokenized_dataset=dataset_tokenized,
    labels=labels,
)

Apply Tokenization...	done!


In [29]:
# Whole-word-mask collator with MLM enabled and a masking probability of 0.15.
data_collator = DataCollatorForWholeWordMask(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [30]:
# Show the collator's repr to inspect the tokenizer and masking settings it holds.
data_collator

DataCollatorForWholeWordMask(tokenizer=PreTrainedTokenizer(name_or_path='bert-base-multilingual-cased', vocab_size=119547, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), mlm=True, mlm_probability=0.15)

In [15]:
# Build the training arguments by unpacking the project's `TrainArgs.Base` mapping as keywords.
training_args = TrainingArguments(**TrainArgs.Base)

In [20]:
# Wire the model, training arguments, dataset and collator into a Trainer.
# NOTE(review): `model` was loaded with `ModelType.SequenceClf`, while
# `data_collator` is a masked-language-modeling collator -- confirm that this
# model/collator pairing is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [24]:
# Inspect the first example of the dataset (a dict including `input_ids` and `token_type_ids`).
dataset[0]

{'input_ids': tensor([   101,  50266,  11489,   9405,  24974,  24683,   9477,  90578,   9625,
         119376,  12692,  45725,   9651,  99183,  10459,   9376,  42771,  70186,
           9167,  15001,  11261,  41605,    113,  12001,  57836,    114,   9590,
           9706,  28396,    113,  13796,  19986,    114,   8843,  22634,    117,
           9638,   9376,  42771,  22879,   9651,  99183,  10459,   9684,  46520,
          11513,   9641, 119298,  11018,   9251,  11261,   9405,  24974, 118800,
          27792,  16139,    119,    102,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [35]:
# NOTE(review): the right-hand side is a 2-tuple holding the same
# `dataset['labels']` lookup twice; confirm whether a single lookup (or a
# different second key) was intended.
label = dataset['labels'], dataset['labels']

{'input_ids': tensor([   101,  50266,  11489,   9405,  24974,  24683,   9477,  90578,   9625,
         119376,  12692,  45725,   9651,  99183,  10459,   9376,  42771,  70186,
           9167,  15001,  11261,  41605,    113,  12001,  57836,    114,   9590,
           9706,  28396,    113,  13796,  19986,    114,   8843,  22634,    117,
           9638,   9376,  42771,  22879,   9651,  99183,  10459,   9684,  46520,
          11513,   9641, 119298,  11018,   9251,  11261,   9405,  24974, 118800,
          27792,  16139,    119,    102,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [39]:
# Run the collator over `dataset` and keep the result; later cells read
# `temp['input_ids']` and `temp['labels']` from it.
temp = data_collator(dataset)

In [47]:
# Decode the collated input ids at index 3 back to text to see which tokens were replaced by [MASK].
tokenizer.decode(temp['input_ids'][3])

'[CLS] 용병 공격수 [MASK] [MASK] [MASK] [MASK] [MASK] 초 활약한 강수일의 침체, 시즌 중반에 영입한 세르비아 출신 [MASK] [MASK] 미드필더 오그넨 코로만의 부상 [MASK] 부진의 원인으로 지적되던 [MASK] 인천은 시즌 [MASK] 4경기에서 3승 1패를 거두며 막판 승점 [MASK] [MASK] [MASK] [MASK] [MASK] 정규리그 순위 5위로 플레이오프 [SEP]'

In [50]:
# Look at the first dataset example again after the collator has been run over `dataset`.
dataset[0]

{'input_ids': tensor([   101,  50266,  11489,   9405,  24974,  24683,   9477,  90578,   9625,
         119376,  12692,  45725,   9651,  99183,  10459,   9376,  42771,  70186,
           9167,  15001,  11261,  41605,    113,  12001,  57836,    114,   9590,
           9706,  28396,    113,  13796,  19986,    114,   8843,  22634,    117,
           9638,   9376,  42771,  22879,   9651,  99183,  10459,   9684,  46520,
          11513,   9641, 119298,  11018,   9251,  11261,   9405,  24974, 118800,
          27792,  16139,    119,    102,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [48]:
# Decode the collated labels at index 0 back to text.
# NOTE(review): most positions decode to [UNK]; presumably the collator fills
# non-masked positions with an ignore index that is not a vocabulary id -- confirm.
tokenizer.decode(temp['labels'][0])

'[UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] 브랜드들은 [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] 일컫는 말로 [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]'

In [None]:
# NOTE(review): this calls the collator with no arguments, unlike the earlier
# `data_collator(dataset)` call, and the cell has no execution count --
# presumably leftover scratch; confirm before running.
data_collator()

In [21]:
# Start training with the configured `trainer`.
# NOTE(review): the recorded output of this cell is a ValueError (input
# batch_size 32 vs. target batch_size 3200); presumably the per-token labels
# produced by the MLM collator do not match the sequence-classification model
# loaded via `ModelType.SequenceClf` -- confirm and pair the collator with a
# masked-LM model if MLM training is the goal.
trainer.train()

ValueError: Expected input batch_size (32) to match target batch_size (3200).

In [None]:
# Rebuild the training setup with token-level masked-language-modeling collation.
training_args = TrainingArguments(**TrainArgs.Base)

# Bind the new collator to `data_collator` (not a misspelled name) so that the
# Trainer below actually receives this DataCollatorForLanguageModeling instance
# instead of whatever `data_collator` was left over from an earlier cell.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

# NOTE(review): `model` still comes from the earlier
# `load_model(ModelType.SequenceClf, ...)` cell; an MLM collator emits per-token
# labels, so confirm the model has a masked-LM head before calling `train()`.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)