- Load a dataset

- Make it as a iterator

- Load a tokeniser

- Write a processing function 

- Map it to the dataset 

- Create a new tokeniser

- Train it with the dataset

- Write the post processing function

- Run the evaluation 

- push to hub

In [3]:
from datasets import load_dataset
from huggingface_hub import notebook_login

from transformers import (
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    AutoModelForSequenceClassification
)

import torch

In [8]:
from tokenizers import (
    normalizers,
    decoders,
    pre_tokenizers,
    processors,
    Tokenizer,
    trainers,
    models
)

In [4]:
from torch.utils.data import DataLoader

wiki_data = load_dataset("wikitext",
                         name="wikitext-2-raw-v1",
                         split='train')

wiki_loader = DataLoader(wiki_data,
                         batch_size=3,
                         collate_fn=lambda bat: [x['text'] for x in bat])

wiki_iter = iter(wiki_loader)
next(wiki_iter)

['', ' = Valkyria Chronicles III = \n', '']

In [5]:
model_cp = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_cp)
tokenizer.is_fast

True

In [6]:
gpt_updt_tokenizer = tokenizer.train_new_from_iterator(text_iterator=wiki_iter,
                                                       vocab_size=25000)
gpt_updt_tokenizer.vocab

{'itan': 5670,
 'ainen': 22468,
 'iner': 10881,
 'ĠHMS': 5147,
 'Ġtells': 6026,
 'Ġflesh': 10078,
 'Ġprisoners': 10200,
 'ĠTed': 15403,
 'ĠEg': 10420,
 'ĠFilming': 11816,
 'ĠRapids': 13179,
 'ĠAber': 16880,
 'Ġgreenish': 24252,
 'elf': 1001,
 'ĠHotel': 7505,
 'Ġhors': 4662,
 'archy': 15168,
 'ĠUyghurs': 16244,
 'ĠTw': 3573,
 'ĠWag': 8282,
 'Ġrejected': 4993,
 'Ġautomatic': 18441,
 'vius': 23433,
 'ĠPhoenix': 8332,
 'Ġleft': 1407,
 'Ġ1911': 7560,
 'ilty': 8670,
 'lef': 23517,
 'gmont': 24753,
 'Ġfollows': 4750,
 'ayana': 12702,
 'Ġbrief': 3411,
 'ĠCor': 4150,
 'Ġarguing': 10812,
 'estock': 12718,
 'ĠGaza': 20336,
 'Ġgen': 1914,
 'ĠShakespe': 7516,
 'Ġgalax': 14821,
 'ification': 3039,
 'slides': 23427,
 'ĠFine': 14592,
 'Ġthroat': 15760,
 'Ġyearly': 22562,
 'Ġqualifying': 12058,
 'glades': 6221,
 'Äģd': 4202,
 'Ġhosts': 12265,
 'oons': 12644,
 'Ġwithout': 1645,
 'ĠIsab': 5721,
 'Ġteen': 8574,
 'Ġdissipating': 12062,
 'ĠFar': 5742,
 'owl': 3090,
 'ĠMilan': 18323,
 'ĠJoey': 18872,
 'eu': 

In [22]:
len(wiki_data)

36718

In [23]:
wordpiece = Tokenizer(models.WordPiece(unk_token="[UNK]"))

In [24]:
wordpiece.normalizer = normalizers.BertNormalizer(lowercase=True)
wordpiece.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

In [26]:
piece_trainer = trainers.WordPieceTrainer(vocab_size=25000,
                                          special_tokens=["[UNK]","[PAD]",
                                                          "[CLS]", "[SEP]",
                                                          "[MASK]"])
wordpiece.train_from_iterator(wiki_iter, piece_trainer)

In [27]:
cls_id = wordpiece.token_to_id(f"[CLS]")
sep_id = wordpiece.token_to_id(f"[SEP]")
print(cls_id, sep_id)

2 3


In [29]:
wordpiece.encode("Test encoding the sentence").tokens

['[UNK]', '[UNK]', '[UNK]', '[UNK]']

In [31]:
wordpiece.post_processor = processors.TemplateProcessing(
    single=f"[CLS]:0 $A:0 [SEP]:0",
    pair=f"[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", cls_id),
        ("[SEP]", sep_id)
    ],
)

wordpiece.decoder = decoders.WordPiece(prefix="##")

In [32]:
wordpiece.encode("Test encoding the sentence").tokens

['[CLS]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[SEP]']