## Creating a Tokenizer

In [None]:
from datasets import load_dataset

## Load the dataset identified by "shayharding/reuters-articles"
dataset = load_dataset("shayharding/reuters-articles")
## Display the loaded object as the cell output
dataset

In [None]:
def create_full_article_column(article):
    """Merge an article's title and body into one ``full_article`` string.

    Args:
        article: A mapping that provides ``'title'`` and ``'body'`` entries.

    Returns:
        A single-key dict ``{'full_article': text}`` where ``text`` is
        ``TITLE:<title>``, a blank line, then ``BODY:<body>``.
    """
    title, body = article['title'], article['body']
    # str.format with bare {} applies the same default formatting as an f-string.
    combined_text = "TITLE:{}\n\nBODY:{}".format(title, body)
    return {'full_article': combined_text}

## Apply the helper through dataset.map and rebind `dataset` to the result,
## which is where the 'full_article' column read by later cells comes from
dataset = dataset.map(create_full_article_column)
## Display the updated object as the cell output
dataset

In [None]:
## Inspect the combined 'full_article' text of the first training example
dataset['train'][0]['full_article']

### Training our Tokenizer

In [None]:
## Create batched dataset for training as a generator
## Number of articles handed to the tokenizer trainer per batch
BATCH_SIZE = 1000


def get_training_corpus(batch_size=BATCH_SIZE):
    """Yield the training articles' 'full_article' values in batches.

    Args:
        batch_size: Maximum number of articles per yielded batch.

    Yields:
        The 'full_article' column of each consecutive slice of
        ``dataset['train']``; the last batch may be shorter.
    """
    train_split = dataset['train']
    for start in range(0, len(train_split), batch_size):
        # Slice the rows first, then pick the text column out of the slice.
        yield train_split[start : start + batch_size]['full_article']


## A generator can only be consumed once: call get_training_corpus() again
## to obtain a fresh iterator before re-running tokenizer training.
training_corpus = get_training_corpus()

In [None]:
from transformers import AutoTokenizer

## Create the original and new tokenizer so we can see the differences
## `old_tokenizer` is the pretrained "gpt2" tokenizer, kept as the baseline
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")
## `tokenizer` is trained from the batches supplied by `training_corpus`.
## NOTE(review): 52000 is passed positionally and is presumably the target
## vocabulary size -- confirm against the transformers documentation.
## NOTE(review): if `training_corpus` is a one-shot generator, recreate it
## before re-running this cell, otherwise training sees an exhausted iterator.
tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 52000)

## Use the same example for both
## (the test-split article at index 2)
example = dataset['test'][2]['full_article']
example

In [None]:
## Let's see how the original "gpt2" tokenizer splits the example
old_tokenizer.tokenize(example)

In [None]:
## Let's see how the newly trained tokenizer splits the same example
tokenizer.tokenize(example)

In [None]:
## Save to Hugging Face
## Log in from inside the notebook first, using huggingface_hub's
## notebook_login helper, so the upload in the next cell is authenticated
from huggingface_hub import notebook_login
notebook_login()

In [None]:
## Push the trained tokenizer to the "gpt2-reuters-tokenizer" repo,
## passing private=True to request a private repository
tokenizer.push_to_hub("gpt2-reuters-tokenizer", private=True)