In [1]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.2.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (10 kB)
Downloading sentencepiece-0.2.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m11.4 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.2.1


In [24]:
import json
from transformers import AutoTokenizer

# ------------------------------
# Choose tokenizers to test
# ------------------------------
TOKENIZERS = [
    "meta-llama/Llama-2-7b-hf",
    "mistralai/Mistral-7B-v0.1",
]

# ------------------------------
# Character categories to test
# ------------------------------
special_chars = {
    # Variation selectors (U+FE00–U+FE0F, U+E0100–U+E01EF)
    "variation_selectors": [chr(cp) for cp in list(range(0xFE00, 0xFE10)) + list(range(0xE0100, 0xE01F0))],

    # Unicode “tag” characters (U+E0000–U+E001F)
    "tag_characters": [chr(0xE0000 + i) for i in range(32)],

    # Typical invisible characters
    "invisible_characters": [
        "\u200B",  # zero-width space
        "\u200C",  # zero-width non-joiner
        "\u200D",  # zero-width joiner
        "\u061C",  # Arabic letter mark
        "\u180E",  # Mongolian vowel separator
    ],

    # Bidi control characters
    "bidi_characters": [
        "\u202A", "\u202B", "\u202C", "\u202D", "\u202E"
    ],

}

# ------------------------------
# Analyze tokenization
# ------------------------------
all_results = {}

for model_name in TOKENIZERS:
    print(f"\nLoading tokenizer: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=True)

    model_results = {}

    for category, chars in special_chars.items():
        print(f"  Analyzing: {category}")
        model_results[category] = {}

        for char in chars:
            try:
                token_ids = tokenizer.encode(char, add_special_tokens=False)
                model_results[category][repr(char)] = token_ids
            except Exception as e:
                model_results[category][repr(char)] = f"Error: {str(e)}"

    all_results[model_name] = model_results


# ------------------------------
# Output JSON
# ------------------------------
print("\n=== TOKENIZATION RESULTS ===\n")
print(json.dumps(all_results, indent=2, ensure_ascii=False))



Loading tokenizer: meta-llama/Llama-2-7b-hf
  Analyzing: variation_selectors
  Analyzing: tag_characters
  Analyzing: invisible_characters
  Analyzing: bidi_characters

Loading tokenizer: mistralai/Mistral-7B-v0.1
  Analyzing: variation_selectors
  Analyzing: tag_characters
  Analyzing: invisible_characters
  Analyzing: bidi_characters

=== TOKENIZATION RESULTS ===

{
  "meta-llama/Llama-2-7b-hf": {
    "variation_selectors": {
      "'︀'": [
        29871,
        242,
        187,
        131
      ],
      "'︁'": [
        29871,
        242,
        187,
        132
      ],
      "'︂'": [
        29871,
        242,
        187,
        133
      ],
      "'︃'": [
        29871,
        242,
        187,
        134
      ],
      "'︄'": [
        29871,
        242,
        187,
        135
      ],
      "'︅'": [
        29871,
        242,
        187,
        136
      ],
      "'︆'": [
        29871,
        242,
        187,
        137
      ],
      "'︇'": [
        29871

In [25]:
all_results

{'meta-llama/Llama-2-7b-hf': {'variation_selectors': {"'︀'": [29871,
    242,
    187,
    131],
   "'︁'": [29871, 242, 187, 132],
   "'︂'": [29871, 242, 187, 133],
   "'︃'": [29871, 242, 187, 134],
   "'︄'": [29871, 242, 187, 135],
   "'︅'": [29871, 242, 187, 136],
   "'︆'": [29871, 242, 187, 137],
   "'︇'": [29871, 242, 187, 138],
   "'︈'": [29871, 242, 187, 139],
   "'︉'": [29871, 242, 187, 140],
   "'︊'": [29871, 242, 187, 141],
   "'︋'": [29871, 242, 187, 142],
   "'︌'": [29871, 242, 187, 143],
   "'︍'": [29871, 242, 187, 144],
   "'︎'": [29871, 242, 187, 145],
   "'️'": [29871, 30598],
   "'󠄀'": [29871, 246, 163, 135, 131],
   "'󠄁'": [29871, 246, 163, 135, 132],
   "'󠄂'": [29871, 246, 163, 135, 133],
   "'󠄃'": [29871, 246, 163, 135, 134],
   "'󠄄'": [29871, 246, 163, 135, 135],
   "'󠄅'": [29871, 246, 163, 135, 136],
   "'󠄆'": [29871, 246, 163, 135, 137],
   "'󠄇'": [29871, 246, 163, 135, 138],
   "'󠄈'": [29871, 246, 163, 135, 139],
   "'󠄉'": [29871, 246, 163, 135, 140],
   "'󠄊'": [

# Token Sequence Examples

## Variation Selectors

| Character | Tokens | Count |
|-----------|--------|-------|
| '︀' | `29871 242 187 131` | 4 |
| '︁' | `29871 242 187 132` | 4 |
| '︂' | `29871 242 187 133` | 4 |
| '︃' | `29871 242 187 134` | 4 |
| '︄' | `29871 242 187 135` | 4 |

**Average tokens per character: 3.88**

## Tag Characters

| Character | Tokens | Count |
|-----------|--------|-------|
| '\U000e0000' | `29871 246 163 131 131` | 5 |
| '\U000e0001' | `29871 246 163 131 132` | 5 |
| '\U000e0002' | `29871 246 163 131 133` | 5 |
| '\U000e0003' | `29871 246 163 131 134` | 5 |
| '\U000e0004' | `29871 246 163 131 135` | 5 |

**Average tokens per character: 5.00**

## Invisible Characters

| Character | Tokens | Count |
|-----------|--------|-------|
| '\u200b' | `29871 30166` | 2 |
| '\u200c' | `29871 30430` | 2 |
| '\u200d' | `29871 30722` | 2 |
| '\u061c' | `29871 219 159` | 3 |
| '\u180e' | `29871 228 163 145` | 4 |

**Average tokens per character: 2.60**

## Bidi Characters

| Character | Tokens | Count |
|-----------|--------|-------|
| '\u202a' | `29871 229 131 173` | 4 |
| '\u202b' | `29871 229 131 174` | 4 |
| '\u202c' | `29871 31379` | 2 |
| '\u202d' | `29871 31881` | 2 |
| '\u202e' | `29871 229 131 177` | 4 |

**Average tokens per character: 3.20**

## Deletion Characters

| Character | Tokens | Count |
|-----------|--------|-------|
| '̀' | `29871 30712` | 2 |
| '́' | `29871 30103` | 2 |
| '̂' | `29871 31500` | 2 |

**Average tokens per character: 2.00**

In [22]:
l = [seq for cat in token_ids.keys() for seq in token_ids[cat].items() ]
l

[("'︀'", [29871, 242, 187, 131]),
 ("'︁'", [29871, 242, 187, 132]),
 ("'︂'", [29871, 242, 187, 133]),
 ("'︃'", [29871, 242, 187, 134]),
 ("'︄'", [29871, 242, 187, 135]),
 ("'︅'", [29871, 242, 187, 136]),
 ("'︆'", [29871, 242, 187, 137]),
 ("'︇'", [29871, 242, 187, 138]),
 ("'︈'", [29871, 242, 187, 139]),
 ("'︉'", [29871, 242, 187, 140]),
 ("'︊'", [29871, 242, 187, 141]),
 ("'︋'", [29871, 242, 187, 142]),
 ("'︌'", [29871, 242, 187, 143]),
 ("'︍'", [29871, 242, 187, 144]),
 ("'︎'", [29871, 242, 187, 145]),
 ("'️'", [29871, 30598]),
 ("'\\U000e0000'", [29871, 246, 163, 131, 131]),
 ("'\\U000e0001'", [29871, 246, 163, 131, 132]),
 ("'\\U000e0002'", [29871, 246, 163, 131, 133]),
 ("'\\U000e0003'", [29871, 246, 163, 131, 134]),
 ("'\\U000e0004'", [29871, 246, 163, 131, 135]),
 ("'\\U000e0005'", [29871, 246, 163, 131, 136]),
 ("'\\U000e0006'", [29871, 246, 163, 131, 137]),
 ("'\\U000e0007'", [29871, 246, 163, 131, 138]),
 ("'\\U000e0008'", [29871, 246, 163, 131, 139]),
 ("'\\U000e0009'", [298