# Loading model & tokenizer

In [10]:
from transformers import AutoTokenizer, BertForMaskedLM, BertConfig

# Identifier handed to `from_pretrained` for the tokenizer, and for the model
# when not resuming from a checkpoint.
MODEL_NAME = 'char-bert-base-uncased'
# When True, the model weights are read from the checkpoint folder named below.
RESUME_FROM_CHECKPOINT = True
CHECKPOINT_DIR = '2021-12-07-23-20-11'
CHECKPOINT_STEP = '75810'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# NOTE(review): this config is constructed but never passed to the model
# below, so it has no effect on the loaded weights -- confirm whether it was
# meant for a from-scratch `BertForMaskedLM(config)`.
config = BertConfig(
    vocab_size=len(tokenizer),
    max_position_embeddings=1024,
)

# Pick the weight source explicitly: a saved checkpoint when resuming,
# otherwise whatever is stored under MODEL_NAME.
if RESUME_FROM_CHECKPOINT:
    model = BertForMaskedLM.from_pretrained(
        f'{CHECKPOINT_DIR}/checkpoint-{CHECKPOINT_STEP}'
    )
else:
    model = BertForMaskedLM.from_pretrained(MODEL_NAME)

# Loading dataset

In [11]:
import pandas as pd
from tqdm.auto import tqdm
from datasets import Dataset
from typing import List

# Upper bound on the number of characters kept from each sentence.
MAX_SEQUENCE = 830

# Read the `dic` column of the CSV, cut every entry to MAX_SEQUENCE characters
# and lowercase the truncated text.
sents_origin = list(map(
    lambda sentence: sentence[:MAX_SEQUENCE].lower(),
    pd.read_csv('../homo_dic_OCR.csv')['dic'].to_list(),
))

def make_mlm_dataset(sentences: List[str], num_proc: int = 4):
    """Build a tokenized `Dataset` from raw sentences.

    Each sentence is stored under the ``'text'`` column and then passed, one
    row at a time, through the module-level ``tokenizer`` with
    ``return_special_tokens_mask=True``; the tokenizer's outputs are added as
    extra columns.

    Args:
        sentences: Raw input strings, one per dataset row.
        num_proc: Number of worker processes handed to ``Dataset.map``.

    Returns:
        The mapped ``Dataset`` containing ``'text'`` plus the tokenizer outputs.
    """
    def _encode(row):
        # Tokenize a single row's text, also requesting the special-tokens mask.
        return tokenizer(
            row['text'],
            return_special_tokens_mask=True,
        )

    raw_dataset = Dataset.from_dict({'text': sentences})
    return raw_dataset.map(_encode, num_proc=num_proc)
    
# Tokenize the whole corpus with the default number of worker processes, then
# print the Dataset summary (its column names and row count).
dataset = make_mlm_dataset(sents_origin)
print(dataset)

Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 20213
})


# Defining Trainer & Training

In [14]:
from transformers import Trainer, TrainingArguments
import os
from datetime import datetime
import torch
import gc
from transformers import DataCollatorForLanguageModeling

## Wipe memory
# Run Python's garbage collector, then ask PyTorch to empty its CUDA cache.
gc.collect()
torch.cuda.empty_cache()

## Select visible gpus
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# A resumed run keeps writing into the checkpoint's directory; a fresh run
# gets a new directory named after the current timestamp.
if RESUME_FROM_CHECKPOINT:
    TRAINER_DIR = CHECKPOINT_DIR
else:
    TRAINER_DIR = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

# Training configuration: checkpoints and TensorBoard logs both live under
# TRAINER_DIR, and logging/saving each happen once per epoch.
training_args = TrainingArguments(
    output_dir=TRAINER_DIR,
    num_train_epochs=30,
    per_device_train_batch_size=8,
    # Evaluation is disabled; no evaluation dataset is passed to the Trainer in this cell.
    # evaluation_strategy='epoch',
    logging_dir=os.path.join(TRAINER_DIR, 'tensorboard'),
    logging_strategy='epoch',
    log_level='warning',
    save_strategy='epoch',
    # Limit on how many checkpoints are kept under output_dir.
    save_total_limit=5,
)
# NOTE(review): overrides a private attribute of TrainingArguments --
# presumably to force single-GPU training on a multi-GPU machine; confirm it
# still has the intended effect with the installed transformers version.
training_args._n_gpu = 1

# Only the tokenizer is supplied, so every other collator option keeps its
# library default.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer)

# Training-only setup: the tokenized corpus is the train set and no
# evaluation dataset is provided.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

print('TRAINER_DIR:', TRAINER_DIR)
print('model:', model.device)
print('trainer:', training_args.device)

# Resume from the exact checkpoint selected by CHECKPOINT_DIR/CHECKPOINT_STEP.
# Passing a bare `True` would make the Trainer pick the *last* checkpoint in
# its output directory, which can silently differ from the configured step.
resume_target = (
    os.path.join(CHECKPOINT_DIR, f'checkpoint-{CHECKPOINT_STEP}')
    if RESUME_FROM_CHECKPOINT
    else False
)
trainer.train(resume_from_checkpoint=resume_target)

# Write the final model to the MODEL_NAME directory and report where it went.
trainer.save_model(MODEL_NAME)
print('model saved to:', MODEL_NAME)

TRAINER_DIR: 2021-12-07-23-20-11
model: cuda:0
trainer: cuda:0


0it [00:00, ?it/s]

Step,Training Loss


model saved to: char-bert-base-uncased


# Prediction

In [19]:
import torch
import numpy as np
from data_processing import mask_homo

# Sample sentences used as prediction inputs. Most of them contain words in
# which a letter has been swapped for an accented look-alike
# (e.g. "ánd", "gétting", "Wèll").
test_sents_origin = [
    "You do not know me ánd you are most likely wondering why you're gétting this mail, correct? Wèll, I actuàlly placed a màlware on thé adult vids website ãnd guess what, you visited this sitè to experience fun",
    "While you were watching vidèos, your browser initiatèd working as á RDP thàt has a keylogger which gave me âccess to your display ãs wéll as web camera... I créatéd a double-screèn vidéo",
    "You will make thè pãyment through Bitcoin. BTC Address: 1L5XWDz7d2NjvuSspQr7sNXtUP5JrGpj52",
    "It sèèms thât, don'tknow, is your password",
    "Lét me tèll you, I plãced a malwãre on the adult video clips (porn) web site and do you know what, you visitèd this site to expériènce fun (you know whät I mean)",
    "Well, in my opinion, $1900 is a fair price for our little sécret",
]

# `mask_homo` comes from the local `data_processing` module; judging by the
# recorded output it replaces the accented characters with [MASK] -- see that
# module for the exact rules.
test_sents_masked = mask_homo(test_sents_origin)

# Show the untouched inputs, one tab-indented sentence per line.
print('\norigin inputs: ============================================')
print('\n'.join(['\t' + sent for sent in test_sents_origin]))


# Show the same sentences after masking.
print('\nmasked inputs: ============================================')
print('\n'.join(['\t' + sent for sent in test_sents_masked]))


# Tokenize all masked sentences as one padded batch of PyTorch tensors.
test_inputs = tokenizer(
    test_sents_masked,
    padding=True,
    return_special_tokens_mask=True,
    return_tensors='pt',
)
# The special-tokens mask is not a model input. Pop it *before* moving the
# batch to the model's device so it stays on the CPU, which is also where the
# predicted ids end up -- this keeps `masked_select` from mixing devices.
test_special_tokens_mask = test_inputs.pop('special_tokens_mask')
test_inputs = test_inputs.to(model.device)
# print('\nraw inputs: ============================================')
# print(*['\t'+t for t in tokenizer.batch_decode(test_inputs['input_ids'])], sep='\n')

# Inference only: switch off dropout and skip building the autograd graph.
model.eval()
with torch.no_grad():
    test_output = model(**test_inputs)

# Most likely token id at every position, brought back to the CPU for decoding.
test_output_ids = test_output.logits.argmax(dim=-1).cpu()
test_output_texts = tokenizer.batch_decode(test_output_ids)
# print('\nraw outputs: ============================================')
# print(*['\t'+t for t in test_output_texts], sep='\n')

# Keep only the positions the tokenizer did not flag as special tokens
# (mask value 0), then decode what remains of each sentence.
keep_positions = test_special_tokens_mask == 0
test_output_texts_cleaned = tokenizer.batch_decode([
    torch.masked_select(ids, keep)
    for ids, keep in zip(test_output_ids, keep_positions)
])
print('\noutputs: ============================================')
print(*['\t'+t for t in test_output_texts_cleaned], sep='\n')

Masking homoglyph chararcters:   0%|          | 0/6 [00:00<?, ?it/s]


	You do not know me ánd you are most likely wondering why you're gétting this mail, correct? Wèll, I actuàlly placed a màlware on thé adult vids website ãnd guess what, you visited this sitè to experience fun
	While you were watching vidèos, your browser initiatèd working as á RDP thàt has a keylogger which gave me âccess to your display ãs wéll as web camera... I créatéd a double-screèn vidéo
	You will make thè pãyment through Bitcoin. BTC Address: 1L5XWDz7d2NjvuSspQr7sNXtUP5JrGpj52
	It sèèms thât, don'tknow, is your password
	Lét me tèll you, I plãced a malwãre on the adult video clips (porn) web site and do you know what, you visitèd this site to expériènce fun (you know whät I mean)
	Well, in my opinion, $1900 is a fair price for our little sécret

	you do not know me [MASK]nd you are most likely wondering why you re g[MASK]tting this mail correct w[MASK]ll i actu[MASK]lly placed a m[MASK]lware on th[MASK] adult vids website [MASK]nd guess what you visited this sit[MASK] to experi