In [1]:
from transformers import BertTokenizerFast

# Load the fast BERT tokenizer from the local HexTokenizer directory.
tokenizer = BertTokenizerFast.from_pretrained('/media/Data/HexTokenizer')
# Smoke test: encode one sample string of space-separated 5-character words.
tokens = tokenizer('88a20 8a204 a2043 20439')

# Show the encoding (input_ids, token_type_ids and attention_mask).
print(tokens)

{'input_ids': [2, 141, 146, 150, 157, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1]}


In [2]:
from pathlib import Path
from datasets import *

# Collect every CSV file below the corpus directory. Path.glob yields paths in
# an arbitrary, filesystem-dependent order, so they are sorted to keep the row
# order of the loaded dataset identical from one run to the next.
train_paths = sorted(str(x) for x in Path('/media/Data/onlytext').glob('**/*.csv'))
# The files are read with the generic "text" builder, which produces a single
# 'text' column; prepared data is cached under cache_dir.
dataset = load_dataset("text", cache_dir='/media/Data/images', data_files=train_paths, split="train")



Using custom data configuration default-c142b1e1bd4c63f4


Downloading and preparing dataset text/default to /media/Data/images/text/default-c142b1e1bd4c63f4/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset text downloaded and prepared to /media/Data/images/text/default-c142b1e1bd4c63f4/0.0.0/21a506d1b2b34316b1e82d0bd79066905d846e5d7e619823c0dd338d6f1fa6ad. Subsequent calls will reuse this data.


In [3]:
# Hold out 1% of the rows for evaluation. The split shuffles the rows, so a
# fixed seed is passed to make the train/test membership reproducible.
d = dataset.train_test_split(test_size=0.01, seed=42)

# Display both splits to check their sizes.
d["train"], d["test"]

(Dataset({
     features: ['text'],
     num_rows: 6049254
 }),
 Dataset({
     features: ['text'],
     num_rows: 61104
 }))

In [4]:
def encode(examples):
    """Tokenize the ``text`` column of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        The tokenizer output for those strings, including the special-tokens
        mask that is requested explicitly.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, return_special_tokens_mask=True)
    return encoded

# Tokenize the training split batch by batch; the tokenizer outputs become new columns.
train_dataset = d["train"].map(encode, batched=True)


  0%|          | 0/6050 [00:00<?, ?ba/s]

In [5]:
# Restrict the columns returned on access to these three. The summary shown
# for the dataset still lists every stored feature, including 'text' and
# 'token_type_ids'.
train_dataset.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])
train_dataset

Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 6049254
})

In [6]:
# Write the tokenized training split to disk.
train_dataset.save_to_disk("/media/Data/tmp/train.hf")

In [7]:
# Tokenize the held-out split with the same encode function.
test_dataset = d["test"].map(encode, batched=True)

  0%|          | 0/62 [00:00<?, ?ba/s]

In [8]:
# Same column restriction as for the training split; the stored features are unchanged.
test_dataset.set_format(columns=["input_ids", "attention_mask", "special_tokens_mask"])
test_dataset

Dataset({
    features: ['text', 'input_ids', 'token_type_ids', 'attention_mask', 'special_tokens_mask'],
    num_rows: 61104
})

In [9]:
# Write the tokenized held-out split to disk.
test_dataset.save_to_disk("/media/Data/tmp/test.hf")

In [1]:
from datasets import *

# Read the tokenized splits back from disk.
# NOTE(review): the execution count restarts at 1 here, which suggests this cell
# was run in a fresh kernel - confirm that `tokenizer` is re-created before it is needed.
train_dataset = load_from_disk("/media/Data/tmp/train.hf")
test_dataset = load_from_disk("/media/Data/tmp/test.hf")

In [10]:
# Maximum sequence length; lowering it speeds up training and leaves room for a
# larger batch size.
max_length = 512

def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Every column of the batch is flattened into one long list, which is then cut
    into consecutive pieces of ``block_size`` items. The length is taken from the
    first column and applied to all columns.

    Args:
        examples: Mapping from column name to a list of per-example sequences.
        block_size: Chunk length. Defaults to the module-level ``max_length``.

    Returns:
        Mapping from column name to the list of chunks. When the batch holds at
        least ``block_size`` items the trailing remainder is dropped; a batch
        shorter than ``block_size`` is kept as one short chunk. An empty
        mapping yields an empty mapping.

    Raises:
        ValueError: If ``block_size`` is not a positive integer.
    """
    # Imported here so the function works even when the cell that imports
    # `chain` at notebook level has not been executed yet.
    from itertools import chain

    if block_size is None:
        block_size = max_length
    if block_size <= 0:
        raise ValueError(f"block_size must be positive, got {block_size}")

    # Concatenate all texts, column by column.
    concatenated = {
        key: list(chain.from_iterable(examples[key])) for key in examples.keys()
    }
    if not concatenated:
        # Nothing to group; avoids indexing into an empty key list.
        return {}

    total_length = len(next(iter(concatenated.values())))
    # Drop the small remainder so every chunk is full. Padding could be used
    # instead if the model supported it.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size

    # Split into chunks of block_size.
    return {
        key: [
            tokens[start : start + block_size]
            for start in range(0, total_length, block_size)
        ]
        for key, tokens in concatenated.items()
    }

In [11]:
from itertools import chain

# Re-chunk the tokenized training split with group_texts, one batch at a time.
train_dataset = train_dataset.map(group_texts, batched=True,
                                    desc=f"Grouping texts in chunks of {max_length}")


Grouping texts in chunks of 512:   0%|          | 0/6050 [00:00<?, ?ba/s]

In [12]:
# Write the grouped training split to disk.
train_dataset.save_to_disk("/media/Data/tmp/trainnew.hf")

In [13]:
# Re-chunk the held-out split the same way as the training split.
test_dataset = test_dataset.map(group_texts, batched=True,
                                    desc=f"Grouping texts in chunks of {max_length}")

Grouping texts in chunks of 512:   0%|          | 0/62 [00:00<?, ?ba/s]

In [14]:

# Write the grouped held-out split to disk.
test_dataset.save_to_disk("/media/Data/tmp/testnew.hf")

In [1]:
from datasets import *

# Read the grouped splits back from disk.
train_dataset = load_from_disk("/media/Data/tmp/trainnew.hf")
test_dataset = load_from_disk("/media/Data/tmp/testnew.hf")

In [15]:
# Return PyTorch tensors when rows of either split are accessed.
train_dataset.set_format("torch")
test_dataset.set_format("torch")

# Number of rows in each split after grouping.
len(train_dataset), len(test_dataset)

(2029411, 20480)

In [16]:
import torch
# Choose the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type != "cuda":
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs are visible and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3060


In [17]:
from transformers import *

max_length = 512
# 30,522 is BERT's default vocabulary size, feel free to tweak. It must be at
# least as large as the tokenizer's vocabulary, because every token id is used
# as an index into the embedding table.
vocab_size = 30_522

# The tokenizer is a custom one, so check that it fits before building the
# model. A token id >= vocab_size would otherwise only show up as an index
# error inside the embedding layer once training has started.
if len(tokenizer) > vocab_size:
    raise ValueError(
        f"Tokenizer defines {len(tokenizer)} tokens but the model is configured "
        f"for vocab_size={vocab_size}; raise vocab_size to at least {len(tokenizer)}."
    )

model_config = BertConfig(vocab_size=vocab_size, max_position_embeddings=max_length)
# Freshly initialised (not pretrained) BERT with a masked-language-modeling head.
model = BertForMaskedLM(config=model_config)
# The explicit move to `device` stays disabled; presumably device placement is
# left to the Trainer that later receives this model - confirm before removing.
#model.to(device)

In [18]:
# Collator for the Masked Language Modeling (MLM) task: it randomly masks 20%
# of the tokens (the default is 15%).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.2,
)

In [None]:
import os
model_path = "/media/Data/pretrained-bert"
# Create the output directory, including any missing parents. exist_ok=True
# replaces the separate isdir check and removes the window between checking
# and creating.
os.makedirs(model_path, exist_ok=True)

training_args = TrainingArguments(
    output_dir=model_path,          # directory where model checkpoints are saved
    evaluation_strategy="steps",    # evaluate every `logging_steps` steps
    overwrite_output_dir=True,
    num_train_epochs=10,            # number of training epochs, feel free to tweak
    per_device_train_batch_size=6,  # training batch size, as high as GPU memory allows
    gradient_accumulation_steps=8,  # accumulate gradients over 8 batches per weight update (6 * 8 = 48)
    per_device_eval_batch_size=32,  # evaluation batch size
    logging_steps=1000,             # log and evaluate every 1000 steps
    save_steps=1000,                # save a model checkpoint every 1000 steps
    # load_best_model_at_end=True,  # whether to load the best model (in terms of loss) at the end of training
    # save_total_limit=3,           # keep only 3 checkpoints on disk when space is limited
)

# Initialize the trainer with the model, arguments, collator and both splits.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask.
***** Running training *****
  Num examples = 2029411
  Num Epochs = 10
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 48
  Gradient Accumulation steps = 8
  Total optimization steps = 422790


Step,Training Loss,Validation Loss
1000,6.8447,6.572571
2000,6.5314,6.442201
3000,6.2434,5.386679
4000,5.0724,4.488007
5000,3.3035,2.295897
6000,1.9506,1.38592
7000,1.1985,0.777766
8000,0.7051,0.443978
9000,0.4259,0.254057
10000,0.2609,0.158311


The following columns in the evaluation set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 20480
  Batch size = 32
Saving model checkpoint to /media/Data/pretrained-bert/checkpoint-1000
Configuration saved in /media/Data/pretrained-bert/checkpoint-1000/config.json
Model weights saved in /media/Data/pretrained-bert/checkpoint-1000/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 20480
  Batch size = 32
Saving model checkpoint to /media/Data/pretrained-bert/checkpoint-2000
Configuration saved in /media/Data/pretrained-bert/checkpoint-2000/config.json
Model weights saved in /media/Data/pretrained-bert/checkpoint-2000/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argume

Model weights saved in /media/Data/pretrained-bert/checkpoint-18000/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 20480
  Batch size = 32
Saving model checkpoint to /media/Data/pretrained-bert/checkpoint-19000
Configuration saved in /media/Data/pretrained-bert/checkpoint-19000/config.json
Model weights saved in /media/Data/pretrained-bert/checkpoint-19000/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForMaskedLM.forward` and have been ignored: special_tokens_mask.
***** Running Evaluation *****
  Num examples = 20480
  Batch size = 32
Saving model checkpoint to /media/Data/pretrained-bert/checkpoint-20000
Configuration saved in /media/Data/pretrained-bert/checkpoint-20000/config.json
Model weights saved in /media/Data/pretrained-bert/checkpoint-20000/pyto