# Training

## Module smoke test

In [2]:
from char_mlm import CharMLMDataset

# Smoke-test CharMLMDataset on two hand-written (masked, label) pairs and
# print the encoded batch plus the decoded first example.
# The dataset is bound to `smoke_dataset` rather than `test`: the dataset
# loading cell assigns `test` to the held-out split, and sharing the name
# would make a re-run of this cell silently replace that split.
smoke_dataset = CharMLMDataset(
    masked_texts=['t[MASK]st', 'hel[MASK]o'],
    label_texts=['test', 'hello'],
)

print(smoke_dataset.batch_encoding)
print(smoke_dataset.tokenizer.decode(smoke_dataset[0]['input_ids']))
print(smoke_dataset.tokenizer.decode(smoke_dataset[0]['labels']))


Inputs: Encoding texts...: 100%|██████████| 2/2 [00:00<00:00, 17189.77it/s]
Labels: Encoding texts...: 100%|██████████| 2/2 [00:00<00:00, 12690.78it/s]

{'input_ids': tensor([[101, 316, 103, 315, 316, 102,   0],
        [101, 304, 301, 308, 103, 311, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[101, 316, 301, 315, 316, 102,   0],
        [101, 304, 301, 308, 308, 311, 102]])}
[CLS]t[MASK]st[SEP][PAD]
[CLS]test[SEP][PAD]





## Loading the dataset

In [3]:
from typing import List

import pandas as pd
import torch
from torch.utils.data.dataset import random_split, Dataset, Subset

from char_mlm import CharMLMDataset


def random_split_by_ratio(
    dataset: Dataset, ratio: List[int], *args, **kargs
) -> List[Subset]:
    """Randomly split ``dataset`` into subsets sized proportionally to ``ratio``.

    Subset sizes are computed with the largest-remainder method: each subset
    first receives ``floor(len(dataset) * r / sum(ratio))`` items, and the few
    items left over are handed out one each to the subsets whose exact share
    had the largest fractional part (ties go to the earlier subset). The sizes
    always add up to ``len(dataset)``.

    Args:
        dataset: Any sized dataset accepted by ``random_split``.
        ratio: Non-negative integer weights, one per output subset,
            e.g. ``[4, 1, 1]``.
        *args, **kargs: Forwarded unchanged to ``random_split``
            (for example ``generator=...`` for a reproducible split).

    Returns:
        One ``Subset`` per entry of ``ratio``, in the same order.

    Raises:
        ValueError: If ``ratio`` is empty, contains a negative weight, or
            sums to zero.
    """
    ratio = list(ratio)
    ratio_sum = sum(ratio)
    if not ratio or ratio_sum <= 0 or any(r < 0 for r in ratio):
        raise ValueError(
            f'ratio must be a non-empty list of non-negative weights '
            f'with a positive sum, got {ratio!r}'
        )

    dataset_length = len(dataset)
    # Multiply before dividing so that rounding error stays below one item
    # per subset instead of piling up in the first subset.
    scaled = [dataset_length * r for r in ratio]
    lengths = [s // ratio_sum for s in scaled]

    # Flooring leaves fewer than len(ratio) items unassigned; give them to the
    # subsets that lost the most to rounding. sorted() is stable even with
    # reverse=True, so ties keep the original subset order.
    leftover = dataset_length - sum(lengths)
    by_remainder = sorted(
        range(len(ratio)), key=lambda i: scaled[i] % ratio_sum, reverse=True
    )
    for i in by_remainder[:leftover]:
        lengths[i] += 1

    return random_split(dataset, lengths, *args, **kargs)


def mask_idx(text: str, idx: int) -> str:
    """Return ``text`` with the character at position ``idx`` replaced by ``[MASK]``.

    ``idx`` follows normal list indexing: negative values count from the end,
    and an out-of-range value raises ``IndexError``.
    """
    chars = [ch for ch in text]
    chars[idx] = '[MASK]'
    return ''.join(chars)


# Number of sentences taken from the head of the CSV.
NUM_SENTENCES = 10
# Fixed seed so the train/dev/test partition is the same on every run.
SPLIT_SEED = 42

# Read the `setence` column of the CSV (spelling as in the path and column
# name used here) and keep only the first NUM_SENTENCES rows.
sents_orgin = pd.read_csv('./Data/en_setence.csv').setence.to_list()[:NUM_SENTENCES]
sents, sents_masked = [], []

# Expand every sentence into one example per character position:
# the input has that single character replaced by [MASK], and the label
# is the unmodified sentence.
for sent in sents_orgin:
    for i in range(len(sent)):
        sents.append(sent)
        sents_masked.append(mask_idx(sent, i))

sents_num = len(sents)

data = CharMLMDataset(sents_masked, sents)

# NOTE(review): the split is made over masked variants, not over sentences,
# so variants of the same sentence can end up in train, dev and test alike.
# Dev/test losses therefore do not measure generalisation to unseen
# sentences.
train, dev, test = random_split_by_ratio(
    data, [4, 1, 1], generator=torch.Generator().manual_seed(SPLIT_SEED)
)
print(f'total: {len(data)}, train: {len(train)}, dev: {len(dev)}, test: {len(test)}')


Inputs: Encoding texts...: 100%|██████████| 1249/1249 [00:00<00:00, 26260.26it/s]
Labels: Encoding texts...: 100%|██████████| 1249/1249 [00:00<00:00, 27217.49it/s]

total: 1249, train: 833, dev: 208, test: 208





## Trainer & Model definition

In [4]:
from transformers import Trainer, BertForMaskedLM, BertConfig, TrainingArguments
import os
from datetime import datetime

# Expose only CUDA device 0 to this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Every run writes to its own timestamped directory under ./models.
MODEL_DIR = os.path.join('./models', datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))

# BERT masked-LM built directly from a config (no pretrained checkpoint is
# loaded here). Only the maximum sequence length is overridden; every other
# BertConfig field keeps its library default.
model_config = BertConfig(max_position_embeddings=1024)
model = BertForMaskedLM(config=model_config)

training_args = TrainingArguments(
    # Checkpoints go to MODEL_DIR; TensorBoard logs go to a sub-folder of it.
    output_dir=MODEL_DIR,
    logging_dir=os.path.join(MODEL_DIR, 'tensorboard'),
    num_train_epochs=30,
    # Evaluate on the dev split and log once per epoch.
    evaluation_strategy='epoch',
    logging_strategy='epoch',
    log_level='warning',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=dev,
)


## Train and evaluate

In [5]:
# Run the full training loop, then save the final model through the trainer.
trainer.train()
trainer.save_model()

# Evaluate once on the held-out test split and show the resulting metrics.
test_result = trainer.evaluate(eval_dataset=test)
print(test_result)

Epoch,Training Loss,Validation Loss
1,4.3096,2.108867
2,0.6697,0.042432
3,0.0206,0.014642
4,0.0108,0.012362
5,0.0078,0.011815
6,0.006,0.010504
7,0.0051,0.010579
8,0.0044,0.01072
9,0.0037,0.010809
10,0.0032,0.011652


{'eval_loss': 0.014059482142329216, 'eval_runtime': 2.6785, 'eval_samples_per_second': 77.657, 'eval_steps_per_second': 9.707, 'epoch': 30.0}


# Prediction