In [None]:
from transformers import BertTokenizerFast

# Load the previously trained fast tokenizer from local disk.
tokenizer = BertTokenizerFast.from_pretrained('/media/Data/HexTokenizer')

# Smoke test: encode one short sample line and show the resulting encoding.
sample_text = '88a20 8a204 a2043 20439'
tokens = tokenizer(sample_text)

print(tokens)

In [None]:
from pathlib import Path
from datasets import *

# Collect every CSV file under the training directory (searched recursively).
train_root = Path('/media/Data/onlytext')
paths = list(map(str, train_root.rglob('*.csv')))

# Read the files with the plain-"text" loader, i.e. one example per line.
dataset = load_dataset(
    "text",
    data_files=paths,
    cache_dir='/media/Data/images',
    split="train",
)

In [None]:
def encode(examples):
    """Tokenize a batch of raw lines.

    No truncation argument is passed to the tokenizer, and the special-tokens
    mask is requested alongside the regular encoding.
    """
    texts = examples["text"]
    return tokenizer(texts, return_special_tokens_mask=True)

# Tokenize the training split in batches of 20000 lines using 16 worker processes.
train_dataset = dataset.map(
    encode,
    batched=True,
    batch_size=20000,
    num_proc=16,
)

In [None]:
from pathlib import Path
from datasets import *

# Single held-out capture file used as the evaluation split.
paths = [str(Path('/media/Data/onlytexttest/2019-03-08-13-24-30-192.168.1.197-8.final.csv'))]

# Bind the raw test text to `dataset`, NOT `train_dataset`: assigning it to
# `train_dataset` would overwrite the already-tokenized training split, and the
# tokenization cell that follows maps over `dataset`, so it would otherwise
# re-tokenize the training text and call it the test set.
dataset = load_dataset("text", cache_dir='/media/Data/images', data_files=paths, split="train")

In [None]:
def encode(examples):
    """Mapping function: tokenize the passed sentences without truncation.

    The special-tokens mask is returned together with the encoding.
    """
    return tokenizer(
        examples["text"],
        return_special_tokens_mask=True,
    )

# Tokenize whatever `dataset` currently holds, in batches of 20000 lines
# spread over 16 worker processes.
test_dataset = dataset.map(
    encode,
    batched=True,
    batch_size=20000,
    num_proc=16,
)

In [None]:
# Restrict both splits to the columns produced by the tokenizer.
model_columns = ["input_ids", "attention_mask", "special_tokens_mask"]
for split in (train_dataset, test_dataset):
    split.set_format(columns=list(model_columns))

In [None]:
# Maximum sequence length; lowering it results in faster training
# (and leaves room to increase the batch size).
max_length = 512

def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Args:
        examples: mapping from column name to a list of per-example sequences.

    Returns:
        A dict with the same keys, where each value is a list of chunks of
        `max_length` items. When the concatenated length is at least
        `max_length`, the trailing remainder is dropped; a shorter batch is
        returned as a single (short) chunk.
    """
    # Flatten each column into one long sequence.
    merged = {}
    for column in examples.keys():
        merged[column] = list(chain.from_iterable(examples[column]))

    # The length of the first column decides the chunk boundaries for every column.
    usable_length = len(merged[list(merged)[0]])
    if usable_length >= max_length:
        # Drop the small remainder instead of padding it.
        usable_length -= usable_length % max_length

    chunk_starts = range(0, usable_length, max_length)
    return {
        column: [sequence[start : start + max_length] for start in chunk_starts]
        for column, sequence in merged.items()
    }

In [None]:
from itertools import chain
max_length = 512

# Progress-bar label shared by both splits.
grouping_desc = f"Grouping texts in chunks of {max_length}"

# Re-chunk each tokenized split with group_texts, one batch at a time.
train_dataset = train_dataset.map(group_texts, batched=True, desc=grouping_desc)
test_dataset = test_dataset.map(group_texts, batched=True, desc=grouping_desc)

In [None]:
# Switch both splits to the "torch" output format.
for split in (train_dataset, test_dataset):
    split.set_format("torch")

# Display the number of examples in each split.
(len(train_dataset), len(test_dataset))

In [None]:
import torch

# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")

if cuda_available:
    # Report how many GPUs are visible and the name of GPU 0.
    gpu_count = torch.cuda.device_count()
    print('There are %d GPU(s) available.' % gpu_count)

    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

In [None]:

# Same device selection as the preceding cell, re-run here.
if not torch.cuda.is_available():
    # No GPU visible: fall back to the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU, then report what is available.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
# Imported here because no earlier cell brings this class into scope; without
# it the cell raises NameError on a fresh kernel.
from transformers import DataCollatorForLanguageModeling

# Initialize the data collator for the Masked Language Modeling (MLM) task,
# randomly masking 20% of the tokens (the library default is 15%).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.2
)

In [None]:
# Imported here because no earlier cell brings these names into scope; without
# them the cell raises NameError on a fresh kernel.
import os

from transformers import Trainer, TrainingArguments

model_path = "/media/Data/pretrained-bert"
# Create the checkpoint directory (and any missing parents); a no-op when it
# already exists, so the cell can be re-run safely.
os.makedirs(model_path, exist_ok=True)

training_args = TrainingArguments(
    output_dir=model_path,          # output directory to where save model checkpoint
    evaluation_strategy="steps",    # evaluate each `logging_steps` steps
    overwrite_output_dir=True,
    num_train_epochs=10,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=6,  # the training batch size, put it as high as your GPU memory fits
    gradient_accumulation_steps=8,  # accumulating the gradients before updating the weights
    per_device_eval_batch_size=32,  # evaluation batch size
    logging_steps=1000,             # evaluate, log and save model checkpoints every 1000 steps
    save_steps=1000,
    # load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    # save_total_limit=3,           # whether you don't have much space so you let only 3 model weights saved in the disk
)

# NOTE(review): `model` is not created in any cell visible here — it must be
# defined before this cell runs (presumably a BERT masked-LM model matching the
# tokenizer's vocabulary; confirm).
# Initialize the trainer and pass everything to it.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model.
trainer.train()