# Pretraining

## Module test

In [None]:
from char_mlm import CharMLMDataset

# Smoke test for CharMLMDataset: build a tiny two-example dataset, then show
# the raw encoding and the decoded inputs/labels of the first example.
masked_examples = ['t[MASK]st', 'hel[MASK]o']
label_examples = ['test', 'hello']
test = CharMLMDataset(masked_texts=masked_examples, label_texts=label_examples)

print(test.batch_encoding)
# Decode both fields of item 0 so the masked input can be compared to its label.
for field in ('input_ids', 'labels'):
    print(test.tokenizer.decode(test[0][field]))


## Loading dataset

In [None]:
from char_mlm import CharMLMDataset, mask_sents
import pandas as pd

MAX_SEQUENCE = 830  # each sentence is cut to at most this many characters
SAMPLE_SIZE = 2000  # only the first SAMPLE_SIZE rows of the CSV are used

# Read the `clean` column, keep the leading sample, and truncate each sentence.
raw_sentences = pd.read_csv('./Data/en_setence.csv').clean.to_list()[:SAMPLE_SIZE]
sents_origin = [sentence[:MAX_SEQUENCE] for sentence in raw_sentences]

# The first tenth of the sample is held out for testing; the rest is for training.
split_at = len(sents_origin) // 10
test_sents_origin = sents_origin[:split_at]
train_sents_origin = sents_origin[split_at:]

# mask_sents returns the positional arguments expected by CharMLMDataset.
train = CharMLMDataset(*mask_sents(train_sents_origin))
test = CharMLMDataset(*mask_sents(test_sents_origin))

print(f'train: {len(train)}, test: {len(test)}')


## Trainer & Model definition

In [None]:
from transformers import Trainer, BertForMaskedLM, BertConfig, TrainingArguments
import os
from datetime import datetime
import torch
import torch_ort
import gc

# Optional GPU memory cleanup, left disabled.
# gc.collect()
# torch.cuda.empty_cache()

# Optionally restrict the process to the first GPU.
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# The run directory is pinned to an existing timestamped folder. For a brand
# new run, use datetime.now().strftime("%Y-%m-%d-%H-%M-%S") as the folder name.
MODEL_DIR = os.path.join('./models', '2021-11-24-19-33-20')

# Freshly initialised BERT masked-LM; only the position-embedding table size
# is overridden (1024 positions, above the MAX_SEQUENCE cap of 830).
model_config = BertConfig(max_position_embeddings=1024)
model = BertForMaskedLM(model_config)
# model = torch_ort.ORTModule(model)
# model.to(torch.device('cuda:0')) # model default is cuda:0

# Log and checkpoint once per epoch; TensorBoard logs live under the run folder.
tensorboard_dir = os.path.join(MODEL_DIR, 'tensorboard')
training_args = TrainingArguments(
    output_dir=MODEL_DIR,
    logging_dir=tensorboard_dir,
    num_train_epochs=10,
    # evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    log_level='warning',
)
# NOTE(review): this overrides a private TrainingArguments attribute,
# presumably to force single-GPU training — confirm it is still honoured by
# the installed transformers version.
training_args._n_gpu = 1

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
)

# Report where the model and the trainer ended up.
print('model:', model.device)
print('trainer:', training_args.device)


## Pretraining

In [None]:
# Resume pretraining from the newest checkpoint in the run directory when one
# exists; otherwise start from scratch. Passing resume_from_checkpoint=True
# unconditionally raises when the directory is missing or contains no
# checkpoint, which breaks the first run of this notebook.
import os
from transformers.trainer_utils import get_last_checkpoint

output_dir = training_args.output_dir
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
if last_checkpoint is None:
    print(f'No checkpoint found in {output_dir}; training from scratch.')
else:
    print(f'Resuming from {last_checkpoint}')

# None starts a fresh run; a checkpoint path resumes from that checkpoint.
trainer.train(resume_from_checkpoint=last_checkpoint)
# Persist the final model to the trainer's output directory.
trainer.save_model()

# Evaluate on the held-out split built in the dataset-loading cell.
test_result = trainer.evaluate(test)
print(test_result)

## Training results in TensorBoard

In [None]:
# Load the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard
# Show the logs written by the pretraining run (same folder as its logging_dir).
# NOTE(review): --host 0.0.0.0 makes TensorBoard listen on all network
# interfaces; confirm this exposure is intended on the machine running it.
%tensorboard --logdir models/2021-11-24-19-33-20/tensorboard/ --host 0.0.0.0

# Prediction

In [None]:
from char_mlm import CharMLMDataset, CharTokenizer
from transformers import AutoModelForMaskedLM
# Load a saved masked-LM checkpoint together with the character tokenizer.
# NOTE(review): this checkpoint folder (2021-11-22-20-30-31) differs from the
# run directory used in the pretraining section — confirm it is the intended one.
checkpoint_dir = './models/2021-11-22-20-30-31/checkpoint-31566'
model = AutoModelForMaskedLM.from_pretrained(checkpoint_dir)
tokenizer = CharTokenizer()


In [None]:
import torch
# Fill in the masked characters of each input string and print the result.
inputs = ['[MASK]ello there!']

# Inference only: make sure dropout is disabled (the model may have been left
# in training mode by another cell) and skip autograd bookkeeping, which
# otherwise keeps activations alive for a backward pass that never happens.
model.eval()
with torch.no_grad():
    logits = model(**tokenizer(inputs))['logits']

# Take the highest-scoring token at every position and decode back to text.
outputs = tokenizer.batch_decode(torch.argmax(logits, -1))

print('inputs:')
print('\n'.join(['   ' + i for i in inputs]))
print('outputs:')
# Strip padding tokens so only the predicted text is shown.
print('\n'.join(['   ' + o.replace("[PAD]", "") for o in outputs]))


# Finetuning

## Loading dataset

In [1]:
from char_mlm import CharMLMDataset, mask_sents
import pandas as pd

MAX_SEQUENCE = 830  # each entry is cut to at most this many characters
SAMPLE_SIZE = 2000  # only the first SAMPLE_SIZE rows of the CSV are used

# Read the `dic` column, keep the leading sample, and truncate each entry.
raw_entries = pd.read_csv('./Data/homo_dic_OCR.csv').dic.to_list()[:SAMPLE_SIZE]
sents_origin = [entry[:MAX_SEQUENCE] for entry in raw_entries]

# All sampled entries are used for finetuning; no held-out split is made here.
train = CharMLMDataset(*mask_sents(sents_origin))

print(f'train: {len(train)}')


Inputs: Encoding texts...:   0%|          | 0/263129 [00:00<?, ?it/s]

Labels: Encoding texts...:   0%|          | 0/263129 [00:00<?, ?it/s]

train: 263129


## Defining model & trainer and finetuning

In [2]:
from transformers import Trainer, TrainingArguments, AutoModelForMaskedLM
import os
from datetime import datetime
import torch
import gc

# Optional GPU memory cleanup, left disabled.
# gc.collect()
# torch.cuda.empty_cache()

# Start from a checkpoint of the pretraining run and write the finetuned
# model into a new folder named after the current timestamp.
PRETRAINED_MODEL = 'models/2021-11-24-19-33-20/checkpoint-111010'
run_stamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
OUT_DIR = os.path.join('./models', run_stamp)

model = AutoModelForMaskedLM.from_pretrained(PRETRAINED_MODEL)

# Log and checkpoint once per epoch; TensorBoard logs live under the run folder.
tensorboard_dir = os.path.join(OUT_DIR, 'tensorboard')
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    logging_dir=tensorboard_dir,
    num_train_epochs=10,
    save_strategy='epoch',
    logging_strategy='epoch',
    log_level='warning',
)
# NOTE(review): this overrides a private TrainingArguments attribute,
# presumably to force single-GPU training — confirm it is still honoured by
# the installed transformers version.
training_args._n_gpu = 1

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
)

# Report where the model and the trainer ended up before training starts.
print('model device:', model.device)
print('trainer device:', training_args.device)

# Finetune, then persist the final model to OUT_DIR.
trainer.train()
trainer.save_model()


model device: cuda:0
trainer device: cuda:0


Step,Training Loss
32892,0.0022
65784,0.0012
98676,0.0009
131568,0.0007
164460,0.0005
197352,0.0004
230244,0.0003
263136,0.0003
296028,0.0002
328920,0.0001
