This notebook applies `newsgroups.calculate_duplicates` from `captoken` to count the duplicate tokens in the GPT-2 tokenizer's vocabulary.

In [None]:
import sys

# Add the parent directory (relative to the working directory) to the import
# path; presumably that is where the local `captoken` package lives.
sys.path.append("..")

from captoken import newsgroups

In [None]:
from transformers import GPT2TokenizerFast

# Load the pretrained "gpt2" tokenizer. On first use this downloads the
# vocabulary, merges and config files (see the progress bars below).
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

### Analysis

In [None]:
# Collect every token string in the vocabulary (iterating the mapping yields its keys).
vocab = set(tokenizer.vocab)
# GPT-2 marks whitespace with 'Ġ', whereas SentencePiece uses `▁`.
spaced, spaced_cap, unspaced_cap = newsgroups.calculate_duplicates(
    vocab,
    space_tok="Ġ",
)

In [None]:
# Size of each duplicate group, followed by the combined total.
(
    len(spaced),
    len(spaced_cap),
    len(unspaced_cap),
    sum(map(len, (spaced, spaced_cap, unspaced_cap))),
)

(4710, 4385, 2355, 11450)

### Baseline examples

These are the ones used in the chapter as illustrative examples.

In [None]:
# Tokenize and encode the *same* string so the token strings and IDs line up.
tokenizer.tokenize("hello world"), tokenizer.encode("hello world")

(['hello', 'Ġworld'], [31373, 995])

In [None]:
# A single word with no leading space; shows how it is split into subword tokens.
tokenizer.tokenize("incredible"), tokenizer.encode("incredible")

(['inc', 'redible'], [1939, 26260])

### Pathologies

In [None]:
# Leading-space form: the space becomes part of the token (rendered as 'Ġ').
tokenizer.tokenize(" hello"), tokenizer.encode(" hello")

(['Ġhello'], [23748])

In [None]:
# Same word without the leading space, for comparison with " hello".
tokenizer.tokenize("hello"), tokenizer.encode("hello")

(['hello'], [31373])

In [None]:
# Leading-space form of "world".
tokenizer.tokenize(" world"), tokenizer.encode(" world")

(['Ġworld'], [995])

In [None]:
# Same word without the leading space, for comparison with " world".
tokenizer.tokenize("world"), tokenizer.encode("world")

(['world'], [6894])

In [None]:
# Leading-space form of a word that splits into several subword tokens.
tokenizer.tokenize(" cucumber"), tokenizer.encode(" cucumber")

(['Ġcuc', 'umber'], [38421, 4494])

In [None]:
# Same word without the leading space, to compare the split against " cucumber".
tokenizer.tokenize("cucumber"), tokenizer.encode("cucumber")

(['c', 'uc', 'umber'], [66, 1229, 4494])

#### Capitalization

In [None]:
# Capitalized, no leading space.
tokenizer.tokenize("Hello"), tokenizer.encode("Hello")

(['Hello'], [15496])

In [None]:
# Capitalized, with a leading space.
tokenizer.tokenize(" Hello"), tokenizer.encode(" Hello")

(['ĠHello'], [18435])

In [None]:
# Capitalized multi-token word, no leading space.
tokenizer.tokenize("Cucumber"), tokenizer.encode("Cucumber")

(['C', 'uc', 'umber'], [34, 1229, 4494])

In [None]:
# Capitalized multi-token word, with a leading space.
tokenizer.tokenize(" Cucumber"), tokenizer.encode(" Cucumber")

(['ĠC', 'uc', 'umber'], [327, 1229, 4494])

In [None]:
# All caps, no leading space.
tokenizer.tokenize("HELLO"), tokenizer.encode("HELLO")

(['HE', 'LL', 'O'], [13909, 3069, 46])

In [None]:
# All caps, with a leading space.
tokenizer.tokenize(" HELLO"), tokenizer.encode(" HELLO")

(['ĠHELL', 'O'], [47899, 46])