# Training

## module test

In [1]:
from char_mlm import CharMLMDataset

# Smoke test: build a two-example dataset by hand and eyeball the encoding.
masked_examples = ['t[MASK]st', 'hel[MASK]o']
label_examples = ['test', 'hello']
test = CharMLMDataset(masked_texts=masked_examples, label_texts=label_examples)

print(test.batch_encoding)
# Decode the first example's inputs and labels back to text so the masked
# position can be compared against the expected character.
for field in ('input_ids', 'labels'):
    print(test.tokenizer.decode(test[0][field]))


Inputs: Encoding texts...: 100%|██████████| 2/2 [00:00<00:00, 6781.41it/s]
Labels: Encoding texts...: 100%|██████████| 2/2 [00:00<00:00, 33288.13it/s]

{'input_ids': tensor([[101, 316, 103, 315, 316, 102,   0],
        [101, 304, 301, 308, 103, 311, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[101, 316, 301, 315, 316, 102,   0],
        [101, 304, 301, 308, 308, 311, 102]])}
[CLS]t[MASK]st[SEP][PAD]
[CLS]test[SEP][PAD]





## loading dataset

In [2]:
from typing import Any, List, Union

import pandas as pd

from char_mlm import CharMLMDataset


def split_by_ratio(
    dataset: List[Any], ratio: List[int],
) -> List[List[Any]]:
    """Cut ``dataset`` into consecutive chunks sized proportionally to ``ratio``.

    The unit size is ``len(dataset) // sum(ratio)``; the i-th chunk holds
    ``ratio[i] * unit`` consecutive items. Items left over after the last
    chunk (at most ``sum(ratio) - 1`` of them) are appended to the FIRST
    chunk, so no item is dropped. When the dataset has fewer items than
    ``sum(ratio)``, the unit is 0 and every item ends up in the first chunk.

    Args:
        dataset: Sliceable sequence to split. It is not modified.
        ratio: Non-negative integer weights, one per output chunk, with a
            positive total (e.g. ``[5, 1, 1]`` for train/dev/test).

    Returns:
        A list with ``len(ratio)`` chunks, in the same order as ``ratio``.

    Raises:
        ValueError: If ``ratio`` is empty, contains a negative weight, or
            sums to zero.
    """
    if not ratio or any(weight < 0 for weight in ratio) or sum(ratio) == 0:
        raise ValueError(
            f'ratio must be non-negative weights with a positive sum, got {ratio!r}'
        )

    unit = len(dataset) // sum(ratio)

    # Running chunk boundaries: [0, r0*unit, (r0+r1)*unit, ...].
    boundaries = [0]
    for weight in ratio:
        boundaries.append(boundaries[-1] + weight * unit)

    splits = [
        dataset[start:stop]
        for start, stop in zip(boundaries, boundaries[1:])
    ]
    # Integer division leaves a tail behind; give it to the first chunk.
    splits[0] += dataset[boundaries[-1]:]
    return splits


def mask_idx(text: str, idx: Union[int, List[int]]) -> str:
    """Replace the character(s) of ``text`` at ``idx`` with the ``[MASK]`` token.

    Args:
        text: Source string; each character is treated as one position.
        idx: A single position or an iterable of positions. Negative
            positions count from the end, as in normal list indexing.

    Returns:
        ``text`` with every selected character replaced by ``'[MASK]'``.

    Raises:
        IndexError: If a position falls outside ``text``.
    """
    # isinstance (rather than an exact type comparison) so int subclasses
    # are treated as a single position instead of being iterated.
    positions = [idx] if isinstance(idx, int) else idx

    chars = list(text)
    for pos in positions:
        chars[pos] = '[MASK]'

    return ''.join(chars)


def mask_sents(sents_origin: List[str]):
    """Expand each sentence into one masked variant per character position.

    For a sentence of length n this yields n (masked, original) pairs, the
    i-th of which has character i replaced via ``mask_idx``.

    Returns:
        A tuple ``(masked_sentences, original_sentences)`` of two parallel
        lists; the original sentence is repeated once per masked variant.
    """
    pairs = [
        (mask_idx(sentence, position), sentence)
        for sentence in sents_origin
        for position in range(len(sentence))
    ]
    masked_sentences = [masked for masked, _ in pairs]
    original_sentences = [original for _, original in pairs]
    return masked_sentences, original_sentences

# Load the sentence column and keep only the first 1000 rows.
sentence_frame = pd.read_csv('./Data/en_setence.csv')
sents_origin = sentence_frame.setence.to_list()[:1000]

# 5:1:1 split of the raw sentences, done BEFORE masking so that all masked
# variants of one sentence land in the same split.
train_sents_origin, dev_sents_origin, test_sents_origin = split_by_ratio(sents_origin, [5, 1, 1])

# Build the three datasets in train/dev/test order.
train, dev, test = (
    CharMLMDataset(*mask_sents(split_sents))
    for split_sents in (train_sents_origin, dev_sents_origin, test_sents_origin)
)

print(f'train: {len(train)}, dev: {len(dev)}, test: {len(test)}')


Inputs: Encoding texts...: 100%|██████████| 84176/84176 [00:02<00:00, 31167.12it/s]
Labels: Encoding texts...: 100%|██████████| 84176/84176 [00:02<00:00, 34976.69it/s]
Inputs: Encoding texts...: 100%|██████████| 18235/18235 [00:00<00:00, 30546.79it/s]
Labels: Encoding texts...: 100%|██████████| 18235/18235 [00:00<00:00, 34067.54it/s]
Inputs: Encoding texts...: 100%|██████████| 17500/17500 [00:00<00:00, 34876.83it/s]
Labels: Encoding texts...: 100%|██████████| 17500/17500 [00:00<00:00, 39836.51it/s]

train: 84176, dev: 18235, test: 17500





## Trainer & Model definition

In [3]:
from transformers import Trainer, BertForMaskedLM, BertConfig, TrainingArguments
import os
from datetime import datetime
import torch
import torch_ort

# Expose only GPU 0 to this process and address it as cuda:0.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
DEVICE = torch.device('cuda:0')

# Each execution of this cell gets its own timestamped output directory.
run_stamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
MODEL_DIR = os.path.join('./models', run_stamp)

# The model is constructed from a config (not loaded from a checkpoint); the
# only non-default setting is the size of the position-embedding table.
model_config = BertConfig(max_position_embeddings=1024)
# NOTE(review): ORTModule presumably routes training through ONNX Runtime -
# confirm that Trainer checkpointing works as expected with the wrapper.
model = torch_ort.ORTModule(BertForMaskedLM(model_config))
model.to(DEVICE)

# Evaluate, log and checkpoint once per epoch; TensorBoard logs live next to
# the checkpoints under MODEL_DIR.
training_args = TrainingArguments(
    output_dir=MODEL_DIR,
    logging_dir=os.path.join(MODEL_DIR, 'tensorboard'),
    num_train_epochs=8,
    log_level='warning',
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    save_strategy='epoch',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=dev,
)


## train

In [5]:
from transformers.trainer_utils import get_last_checkpoint

# MODEL_DIR is timestamped when the setup cell runs, so on a fresh kernel the
# output directory holds no checkpoint and `resume_from_checkpoint=True`
# raises ValueError. Resume only when a checkpoint is actually present;
# otherwise start training from scratch.
output_dir = training_args.output_dir
last_checkpoint = (
    get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
)

trainer.train(resume_from_checkpoint=last_checkpoint)
trainer.save_model()

# Final held-out evaluation on the test split.
test_result = trainer.evaluate(test)
print(test_result)

Epoch,Training Loss,Validation Loss
1,0.0055,0.004031
2,0.0049,0.003887
3,0.0041,0.005339


## train result on tensorboard

In [None]:
# Opens TensorBoard on the logs of one specific, hard-coded run directory;
# update the timestamp to match the MODEL_DIR of the run you want to inspect.
# NOTE(review): assumes the tensorboard extension is already loaded
# (`%load_ext tensorboard`) - no such line is visible in this section.
%tensorboard --logdir models/2021-11-22-20-30-31/tensorboard/

# prediction