In [2]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.12.0-cp313-cp313-manylinux_2_28_aarch64.whl.metadata (6.7 kB)
Downloading tiktoken-0.12.0-cp313-cp313-manylinux_2_28_aarch64.whl (1.1 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.12.0


In [1]:
#!/usr/bin/env python3
"""
TOON Demo: Encode/Decode JSON to Token-Oriented Object Notation
- Encodes uniform arrays to tabular format
- Basic support for objects/arrays/primitives
- Token comparison with tiktoken
"""

import json
import re
import tiktoken  # For token counting

# ----------------------------------------------------------------------
# Simple TOON Encoder/Decoder (based on spec from GitHub/toon-format)
# ----------------------------------------------------------------------
def encode_toon(data, path="root"):
    """Encode JSON-like data to TOON string."""
    if isinstance(data, dict):
        toon = ""
        for k, v in data.items():
            sub = encode_toon(v, f"{path}.{k}")
            if sub.strip():
                toon += f"{k}: {sub}\n"
        return toon.rstrip()
    elif isinstance(data, list):
        if not data:
            return "[]"
        # Check if uniform array of dicts for tabular
        is_uniform = all(isinstance(item, dict) and set(item.keys()) == set(data[0].keys()) for item in data)
        if is_uniform and len(data) > 1:
            fields = list(data[0].keys())
            toon = f"{path}[{len(data)}]{{{','.join(fields)}}}:"
            for item in data:
                row = ','.join(str(item.get(f, '')) for f in fields)
                toon += f"\n{row}"
            return toon
        else:
            # Non-uniform: YAML-like list
            toon = f"{path}[{len(data)}]:"
            for i, item in enumerate(data, 1):
                sub = encode_toon(item, f"{path}[{i}]")
                toon += f"\n- {sub}".replace("\n", "\n  ")
            return toon
    else:
        # Primitive
        return str(data)

def decode_toon(toon_str):
    """Basic TOON to JSON decoder (simplified; parse headers & rows)."""
    data = {}
    lines = toon_str.strip().split('\n')
    stack = [data]
    current_key = None

    for line in lines:
        line = line.strip()
        if not line or line.startswith('#'):  # Skip comments/empty
            continue
        if ':' in line and not line.endswith(':'):
            # Key: value
            key, val = line.split(':', 1)
            key = key.strip()
            val = val.strip()
            if '[' in key:  # Array header
                # Parse array, e.g., users[2]{id,name}: \n1,Alice\n2,Bob
                m = re.match(r'(\w+)$$ (\d+) $$\{([^}]+)\}:', key)
                if m:
                    arr_key, size, fields = m.groups()
                    arr = []
                    for i in range(int(size)):
                        row_line = lines[lines.index(line) + 1 + i].strip()
                        row_vals = [v.strip() for v in row_line.split(',')]
                        item = {f: v for f, v in zip(fields.split(','), row_vals)}
                        arr.append(item)
                    stack[-1][arr_key] = arr
                    continue
                # Fallback to list
                arr_key, size = re.match(r'(\w+)$$ (\d+) $$:', key).groups()
                stack[-1][arr_key] = []  # Placeholder; extend for full parse
            else:
                stack[-1][key] = val
        # Indentation for nesting (simplified; assumes 2 spaces)
        elif line.startswith('  '):
            # Nested under current key
            pass  # Extend for full nesting support

    return data

def count_tokens(text, model="gpt-4"):
    """Estimate tokens with tiktoken."""
    enc = tiktoken.encoding_for_model(model)
    return len(enc.encode(text))

# ----------------------------------------------------------------------
# Demo Data & Run
# ----------------------------------------------------------------------
if __name__ == "__main__":
    # Sample JSON data (uniform array)
    sample_json = {
        "users": [
            {"id": 1, "name": "Alice", "role": "admin", "salary": 75000},
            {"id": 2, "name": "Bob", "role": "user", "salary": 65000},
            {"id": 3, "name": "Charlie", "role": "user", "salary": 70000}
        ]
    }

    json_str = json.dumps(sample_json, indent=2)
    print("=== JSON Input ===")
    print(json_str)
    json_tokens = count_tokens(json_str)
    print(f"JSON Tokens: {json_tokens}")

    # Encode to TOON
    toon_str = encode_toon(sample_json["users"], "users")
    print("\n=== TOON Output ===")
    print(toon_str)
    toon_tokens = count_tokens(toon_str)
    print(f"TOON Tokens: {toon_tokens}")
    savings = ((json_tokens - toon_tokens) / json_tokens) * 100
    print(f"Savings: {savings:.1f}%")

    # Decode back to JSON
    restored = decode_toon(toon_str)
    print("\n=== Decoded Back to Dict (Simplified) ===")
    print(json.dumps(restored, indent=2))

    # Test non-uniform
    print("\n=== Non-Uniform Array Demo ===")
    non_uniform = [{"id": 1, "tags": ["foo", "bar"]}, {"id": 2, "nested": {"key": "value"}}]
    toon_non = encode_toon(non_uniform, "items")
    print(toon_non)

=== JSON Input ===
{
  "users": [
    {
      "id": 1,
      "name": "Alice",
      "role": "admin",
      "salary": 75000
    },
    {
      "id": 2,
      "name": "Bob",
      "role": "user",
      "salary": 65000
    },
    {
      "id": 3,
      "name": "Charlie",
      "role": "user",
      "salary": 70000
    }
  ]
}
JSON Tokens: 108

=== TOON Output ===
users[3]{id,name,role,salary}:
1,Alice,admin,75000
2,Bob,user,65000
3,Charlie,user,70000
TOON Tokens: 36
Savings: 66.7%

=== Decoded Back to Dict (Simplified) ===
{}

=== Non-Uniform Array Demo ===
items[2]:
  - id: 1
  tags: items[1].tags[2]:
    - foo
    - bar
  - id: 2
  nested: key: value
