# Loading model & tokenizer

In [None]:
from transformers import AutoTokenizer, BertForMaskedLM, BertConfig

# Name of the pretrained tokenizer to load.
MODEL_NAME = 'char-bert-base-uncased'

# Only the tokenizer is loaded from MODEL_NAME; the fast implementation is requested.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# The model is built from a config (not from pretrained weights), with its
# vocabulary sized to the tokenizer and room for up to 1024 positions.
config = BertConfig(
    vocab_size=len(tokenizer),
    max_position_embeddings=1024,
)
model = BertForMaskedLM(config)

# Loading dataset

In [None]:
import pandas as pd
from tqdm.auto import tqdm
from datasets import Dataset
from typing import List

MAX_SEQUENCE = 830    # maximum number of characters kept from each sentence
SAMPLE_SIZE = 100000  # maximum number of CSV rows used

# Read only the `clean` column and only the first SAMPLE_SIZE rows instead of
# parsing the whole file and slicing afterwards.
# Missing cells (pandas parses empty fields and strings such as "NA"/"null" as
# NaN by default) are dropped, because slicing a float NaN would raise TypeError.
sents_origin = [
    s[:MAX_SEQUENCE].lower()
    for s in pd.read_csv(
        '../en_setence.csv',
        usecols=['clean'],
        nrows=SAMPLE_SIZE,
    )['clean'].dropna().astype(str).tolist()
]

def make_mlm_dataset(sentences: List[str], num_proc: int = 4):
    """Tokenize `sentences` into a `datasets.Dataset` for masked-LM use.

    Args:
        sentences: Raw text strings; stored in the `text` column.
        num_proc: Requested number of worker processes for tokenization.
            Clamped to the number of sentences (and to at least 1); `None`
            is passed through unchanged.

    Returns:
        A Dataset holding the `text` column plus whatever the module-level
        `tokenizer` returns when called with `return_special_tokens_mask=True`.
    """
    def _tokenize(batch):
        # With batched=True, `batch['text']` is a list of strings, so the
        # tokenizer encodes many rows per call instead of one row at a time.
        return tokenizer(
            batch['text'],
            return_special_tokens_mask=True,
        )

    # Never request more workers than there are rows to process.
    if num_proc is None:
        workers = None
    else:
        workers = max(1, min(num_proc, len(sentences)))

    return Dataset.from_dict(
        {'text': sentences}
    ).map(
        _tokenize,
        batched=True,
        num_proc=workers,
    )


dataset = make_mlm_dataset(sents_origin)

# Defining Trainer

In [None]:
from transformers import Trainer, TrainingArguments
import os
from datetime import datetime
import torch
import torch_ort
import gc
from transformers import DataCollatorForLanguageModeling

## Free memory held over from earlier runs
gc.collect()
torch.cuda.empty_cache()

## Select visible gpus
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'

## Output directory, named after the launch timestamp
MODEL_DIR = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

## Training configuration: log and checkpoint once per epoch
training_args = TrainingArguments(
    output_dir=MODEL_DIR,
    logging_dir=os.path.join(MODEL_DIR, 'tensorboard'),
    num_train_epochs=30,
    per_device_train_batch_size=8,
    # evaluation_strategy='epoch',
    log_level='warning',
    logging_strategy='epoch',
    save_strategy='epoch',
)
# NOTE(review): this overwrites a private TrainingArguments field, presumably
# to keep training on a single GPU -- confirm it still has that effect on the
# installed transformers version.
training_args._n_gpu = 1

## Collator and trainer
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

## Show where the run writes to and which devices are in use
print('MODEL_DIR:', MODEL_DIR)
print('model:', model.device)
print('trainer:', training_args.device)

## Train, then save the final model
trainer.train()  # resume_from_checkpoint=True)
trainer.save_model()

# Prediction

In [None]:
## Sample sentences to run through the trained model
test_sents_origin = [
    'i don t think playing with yourself is awful but if friends relatives colleagues receive video of it bad for u',
    'if u want me to destroy this compromising evidence use my bitcoin wallet address you have day after',
    'received email do not mind on my illiteracy i am from china i uploaded the malicious program on your system',
    'the most interesting evidence that i stole its a videotape with your masturbation',
    'my deleterious soft at once set up on your system in addition it saved precisely the porn video you masturbated on',
]

## Tokenize with the same helper used for the training data, then predict
test = make_mlm_dataset(test_sents_origin)
outputs = trainer.predict(test)

## Decoded token ids of the test set as tokenized (before the data collator runs)
input_texts = tokenizer.batch_decode(test['input_ids'])
print('input:')
print('\n'.join(input_texts))

## Highest-scoring id along the last axis of the predictions, decoded to text
predicted_ids = torch.argmax(torch.Tensor(outputs.predictions), -1)
output_texts = tokenizer.batch_decode(predicted_ids)
print('output:')
print('\n'.join(output_texts))