In [8]:
from transformers import AutoTokenizer
import pandas as pd

In [10]:
# Load the pretrained tokenizer for the "t5-small" checkpoint; it is used below
# as the subword-tokenisation side of the comparison.
t5_tokenizer = AutoTokenizer.from_pretrained("t5-small")

In [11]:
# Load the pretrained tokenizer for the "google/byt5-small" checkpoint; it is
# used below as the byte-level side of the comparison.
byt5_tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")

In [28]:
# Probe strings for the tokeniser comparison: a correctly spelled baseline,
# two misspellings, a digit-for-letters abbreviation, accented Latin text and
# a non-Latin script.
test_cases = [
    "hello, world ", # Correct spelling (NOTE(review): the string ends with a space - confirm this is intentional)
    "helo, world", # Missing letter
    "hello, worlld", # Extra letter
    "g8 job", # Leetspeak
    "café Saarbrücken", # Accented characters
    "你好, 世界" # Non-Latin script
]

In [29]:
def compare_tokenisation(text, verbose=True):
    """
    Compare T5 and ByT5 tokenisation strategies for a single string.

    Relies on the notebook-level ``t5_tokenizer`` and ``byt5_tokenizer``
    objects being defined before the function is called.

    Args:
        text (str): The string to tokenise.
        verbose (bool): When True (the default), also print a report of the
            tokens, ids, token counts and the byte mapping.

    Returns:
        dict: The detailed tokenisation info, with the keys
            "text": the input string, unchanged.
            "t5" / "byt5": dicts holding "tokens" (result of ``tokenize``),
                "ids" (result of ``encode``) and "count" (``len(tokens)``,
                so it is the length of "tokens", not of "ids").
            "byte_mapping": a list with one dict per UTF-8 byte of ``text``,
                each holding "char" (the character the byte belongs to),
                "byte" (the byte value, 0-255) and "token_id" (byte + 3).
                A character that needs several UTF-8 bytes therefore
                contributes several consecutive entries.
    """
    # T5 subword tokenization
    t5_tokens = t5_tokenizer.tokenize(text)
    t5_ids = t5_tokenizer.encode(text)

    # ByT5 byte-level tokenization
    byt5_tokens = byt5_tokenizer.tokenize(text)
    byt5_ids = byt5_tokenizer.encode(text)

    # Mapping from characters to bytes to ByT5 token ids.
    # ByT5 works on the UTF-8 bytes of the text, not on Unicode code points,
    # so each character is expanded into its encoded bytes. Using ord(char)
    # here would only be correct for ASCII: e.g. 'é' is code point 233 but is
    # encoded as the two bytes 195 and 169.
    byt5_offset = 3  # ByT5 adds an offset of 3
    byte_mapping = [
        {
            "char": char,
            "byte": byte_val,
            "token_id": byte_val + byt5_offset,
        }
        for char in text
        for byte_val in char.encode("utf-8")
    ]

    # Organise the return data
    results = {
        "text": text,
        "t5": {
            "tokens": t5_tokens,
            "ids": t5_ids,
            "count": len(t5_tokens)
        },
        "byt5": {
            "tokens": byt5_tokens,
            "ids": byt5_ids,
            "count": len(byt5_tokens)
        },
        "byte_mapping": byte_mapping
    }

    if verbose:
        print(f"\n{'='*60}")
        print(f"原文: {text}")
        print(f"{'='*60}")

        print(f"\nT5 Tokenization:")
        print(f"  Tokens: {t5_tokens}")
        print(f"  IDs: {t5_ids}")
        print(f"  Token count: {len(t5_tokens)}")

        print(f"\nbyT5 Tokenization:")
        print(f"  Tokens: {byt5_tokens}")
        print(f"  IDs: {byt5_ids}")
        print(f"  Token count: {len(byt5_tokens)}")

        print(f"\n字节映射:")
        # One line per byte; multi-byte characters repeat the same char.
        for item in byte_mapping:
            print(f"  '{item['char']}' -> byte {item['byte']} -> token_id {item['token_id']}")
    return results

In [31]:
# Run the comparison on every test case with the default verbose=True, so each
# one prints its report; the returned dicts are not kept.
for text in test_cases:
    compare_tokenisation(text)


原文: hello, world 

T5 Tokenization:
  Tokens: ['▁hello', ',', '▁world']
  IDs: [21820, 6, 296, 1]
  Token count: 3

byT5 Tokenization:
  Tokens: ['h', 'e', 'l', 'l', 'o', ',', ' ', 'w', 'o', 'r', 'l', 'd', ' ']
  IDs: [107, 104, 111, 111, 114, 47, 35, 122, 114, 117, 111, 103, 35, 1]
  Token count: 13

字节映射:
  'h' -> byte 104 -> token_id 107
  'e' -> byte 101 -> token_id 104
  'l' -> byte 108 -> token_id 111
  'l' -> byte 108 -> token_id 111
  'o' -> byte 111 -> token_id 114
  ',' -> byte 44 -> token_id 47
  ' ' -> byte 32 -> token_id 35
  'w' -> byte 119 -> token_id 122
  'o' -> byte 111 -> token_id 114
  'r' -> byte 114 -> token_id 117
  'l' -> byte 108 -> token_id 111
  'd' -> byte 100 -> token_id 103
  ' ' -> byte 32 -> token_id 35

原文: helo, world

T5 Tokenization:
  Tokens: ['▁', 'he', 'l', 'o', ',', '▁world']
  IDs: [3, 88, 40, 32, 6, 296, 1]
  Token count: 6

byT5 Tokenization:
  Tokens: ['h', 'e', 'l', 'o', ',', ' ', 'w', 'o', 'r', 'l', 'd']
  IDs: [107, 104, 111, 114, 47, 35,

In [None]:
# NOTE(review): this import sits in a late cell; consider moving it to the
# notebook's first import cell so all dependencies are visible up front.
from nltk.corpus import twitter_samples
# NOTE(review): presumably the "twitter_samples" corpus has to be available
# locally (e.g. fetched once with nltk.download) before this runs - confirm.
tweets = twitter_samples.strings()
# Show the first 20 tweets as a quick sanity check
print(tweets[:20])