In [6]:
import os
import json

from datasets import concatenate_datasets, load_dataset
from tokenizers import BertWordPieceTokenizer
from transformers import BertTokenizerFast

In [2]:
# Load the "20231101.simple" configuration of wikimedia/wikipedia (train split),
# using ./data as the local cache directory.
wiki = load_dataset("wikimedia/wikipedia", "20231101.simple", split="train", cache_dir="data")

# Keep only the "text" column; every other column is dropped.
wiki = wiki.remove_columns([col for col in wiki.column_names if col != "text"])

# Hold out 10% of the rows as a test split. The fixed seed makes the shuffle
# deterministic, so Restart & Run All reproduces the same train/test partition
# (and therefore the same train.txt / test.txt downstream).
d = wiki.train_test_split(test_size=0.1, seed=42)

Using the latest cached version of the dataset since wikimedia/wikipedia couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration '20231101.simple' at data/wikimedia___wikipedia/20231101.simple/0.0.0/b04c8d1ceb2f5cd4588862100d08de323dccfbaa (last modified on Mon Apr 22 15:35:27 2024).


In [3]:
def dataset_to_text(dataset, output_filename="data.txt"):
    """Dump the ``"text"`` entries of ``dataset`` into a plain-text file.

    Each entry is written with ``print``, so it is followed by a newline;
    an entry that itself contains newlines will span several lines in the file.

    Args:
        dataset: Any object for which ``dataset["text"]`` yields an iterable
            of values to write.
        output_filename: Path of the file to create (overwritten if it exists).
    """
    # Explicit UTF-8: without it, open() falls back to the platform locale
    # encoding, which can fail or produce a differently-encoded file when the
    # text contains non-ASCII characters.
    with open(output_filename, "w", encoding="utf-8") as f:
        for t in dataset["text"]:
            print(t, file=f)

# Write each split of `d` to its own plain-text file.
for split_name, out_path in (("train", "train.txt"), ("test", "test.txt")):
    dataset_to_text(d[split_name], out_path)

In [None]:
# Special tokens handed to the WordPiece trainer.
special_tokens = [
    "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]", "<S>", "<T>"
]

file = ["train.txt"]  # corpus file(s) the vocabulary is trained on
vocab_size = 30_522
max_length = 512
# NOTE(review): the name is misspelled ("truncatr") and unused in this cell;
# left unchanged in case later cells reference it — confirm and rename.
truncatr_longer_samples = True
model_path = "pretrained-bert"

# Train a WordPiece vocabulary on the training text only.
wordpiece_tokenizer = BertWordPieceTokenizer()
wordpiece_tokenizer.train(files=file, vocab_size=vocab_size, special_tokens=special_tokens)
wordpiece_tokenizer.enable_truncation(max_length=max_length)

# exist_ok=True replaces the separate exists() check and is safe on re-runs.
os.makedirs(model_path, exist_ok=True)
wordpiece_tokenizer.save_model(model_path)

tokenizer_config = {
    "do_lower_case": True,
    "unk_token": "[UNK]",
    "sep_token": "[SEP]",
    "pad_token": "[PAD]",
    "cls_token": "[CLS]",
    "mask_token": "[MASK]",
    "model_max_length": max_length,
    "vocab_size": vocab_size,
    "max_len": max_length,
}

# config.json is still written with the same content as before so anything
# that already expects this file keeps working.
with open(os.path.join(model_path, "config.json"), "w", encoding="utf-8") as f:
    json.dump(tokenizer_config, f)

# Tokenizer settings are read from tokenizer_config.json by
# BertTokenizerFast.from_pretrained; without this file the model_max_length
# above is not applied to the loaded tokenizer. Only tokenizer-level keys are
# written here (vocab_size and the legacy max_len alias are left out).
tokenizer_settings = {
    key: value
    for key, value in tokenizer_config.items()
    if key not in ("vocab_size", "max_len")
}
with open(os.path.join(model_path, "tokenizer_config.json"), "w", encoding="utf-8") as f:
    json.dump(tokenizer_settings, f)

# From here on `tokenizer` is the transformers-compatible fast tokenizer.
tokenizer = BertTokenizerFast.from_pretrained(model_path)