diff --git a/README.md b/README.md
index 71387c518..01bff8cda 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ Start using in a matter of seconds:
 ```python
 # Tokenizers provides ultra-fast implementations of most current tokenizers:
 >>> from tokenizers import (ByteLevelBPETokenizer,
-                            BPETokenizer,
+                            CharBPETokenizer,
                             SentencePieceBPETokenizer,
                             BertWordPieceTokenizer)
 # Ultra-fast => they can encode 1GB of text in ~20sec on a standard server's CPU
diff --git a/bindings/node/README.md b/bindings/node/README.md
index de2eef36f..fed1cce2b 100644
--- a/bindings/node/README.md
+++ b/bindings/node/README.md
@@ -55,7 +55,7 @@ console.log(wpEncoded.getTypeIds());
 
 ## Provided Tokenizers
 
- - `BPETokenizer`: The original BPE
+ - `CharBPETokenizer`: The original BPE
  - `ByteLevelBPETokenizer`: The byte level version of the BPE
  - `SentencePieceBPETokenizer`: A BPE implementation compatible with the one used by SentencePiece
  - `BertWordPieceTokenizer`: The famous Bert tokenizer, using WordPiece
diff --git a/bindings/python/README.md b/bindings/python/README.md
index b3a3d84e2..3fdcd0c31 100644
--- a/bindings/python/README.md
+++ b/bindings/python/README.md
@@ -73,12 +73,12 @@ python setup.py install
 Using a pre-trained tokenizer is really simple:
 
 ```python
-from tokenizers import BPETokenizer
+from tokenizers import CharBPETokenizer
 
 # Initialize a tokenizer
 vocab = "./path/to/vocab.json"
 merges = "./path/to/merges.txt"
-tokenizer = BPETokenizer(vocab, merges)
+tokenizer = CharBPETokenizer(vocab, merges)
 
 # And then encode:
 encoded = tokenizer.encode("I can feel the magic, can you?")
@@ -89,10 +89,10 @@ print(encoded.tokens)
 And you can train yours just as simply:
 
 ```python
-from tokenizers import BPETokenizer
+from tokenizers import CharBPETokenizer
 
 # Initialize a tokenizer
-tokenizer = BPETokenizer()
+tokenizer = CharBPETokenizer()
 
 # Then train it!
 tokenizer.train([ "./path/to/files/1.txt", "./path/to/files/2.txt" ])
@@ -106,7 +106,7 @@ tokenizer.save("./path/to/directory", "my-bpe")
 
 ### Provided Tokenizers
 
- - `BPETokenizer`: The original BPE
+ - `CharBPETokenizer`: The original BPE
  - `ByteLevelBPETokenizer`: The byte level version of the BPE
  - `SentencePieceBPETokenizer`: A BPE implementation compatible with the one used by SentencePiece
  - `BertWordPieceTokenizer`: The famous Bert tokenizer, using WordPiece
diff --git a/bindings/python/tokenizers/__init__.pyi b/bindings/python/tokenizers/__init__.pyi
index 19340cdbf..982c3007b 100644
--- a/bindings/python/tokenizers/__init__.pyi
+++ b/bindings/python/tokenizers/__init__.pyi
@@ -7,7 +7,7 @@ from .trainers import *
 
 from .implementations import (
     ByteLevelBPETokenizer as ByteLevelBPETokenizer,
-    BPETokenizer as BPETokenizer,
+    CharBPETokenizer as CharBPETokenizer,
     SentencePieceBPETokenizer as SentencePieceBPETokenizer,
     BertWordPieceTokenizer as BertWordPieceTokenizer,
 )
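
For downstream code that still imports the old name, a minimal end-to-end sketch of the renamed class, pieced together from the README snippets in this diff and assuming a `tokenizers` release contemporary with it (paths are placeholders, and the exact filenames written by `save()` are an assumption that may vary by version):

```python
from tokenizers import CharBPETokenizer  # previously imported as BPETokenizer

# Train a character-level BPE tokenizer on local text files (placeholder paths).
tokenizer = CharBPETokenizer()
tokenizer.train(["./path/to/files/1.txt", "./path/to/files/2.txt"])

# Save the trained model; this is expected to write my-bpe-vocab.json and
# my-bpe-merges.txt into the directory (assumed naming, version-dependent).
tokenizer.save("./path/to/directory", "my-bpe")

# Reload the saved vocab/merges and encode, mirroring the README example.
tokenizer = CharBPETokenizer("./path/to/directory/my-bpe-vocab.json",
                             "./path/to/directory/my-bpe-merges.txt")
encoded = tokenizer.encode("I can feel the magic, can you?")
print(encoded.tokens)
```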