# Tokenization Playground

Use this notebook to explore how different tokenizers split your documents.

- Uses your `ingestion.py` (via `MinimalProcessor`) to parse the MSMARCO TSV.
- Compares `SimpleTokenizer`, our `SpacyTokenizer`, and a customizable spaCy tokenizer.
- Adjustable spaCy options: model, disabled pipeline components, lowercasing/ASCII folding, infix rules, URL/email handling.

Note: If `ipywidgets` isn't installed, the notebook falls back to non-interactive cells with defaults.


In [5]:
# Display settings to avoid truncation
import pandas as pd
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)


In [6]:
# Imports and setup
from dotenv import load_dotenv
import os
import json
from typing import Any, Optional

import pandas as pd

from ingestion import MinimalProcessor, Columns
from tokenization import SimpleTokenizer, SpacyTokenizer

# Optional: widgets
try:
    import ipywidgets as widgets
    from IPython.display import display
    HAS_WIDGETS = True
except Exception:
    HAS_WIDGETS = False

load_dotenv()

DOCUMENTS_PATH = os.getenv("DOCUMENTS", "msmarco-docs.tsv")

In [7]:
# Load a sample of documents using MinimalProcessor
from pathlib import Path


def parse_tsv_line(line: str) -> Optional[list[str]]:
    parts = line.rstrip("\n").split("\t")
    if len(parts) != 4:
        return None
    return parts


def load_documents(path: str | Path, limit: int = 50, skip: int = 0) -> list[dict[str, Any]]:
    docs: list[dict[str, Any]] = []
    proc = MinimalProcessor()
    added = 0
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"Documents file not found: {path}")

    with path.open("r", encoding="utf-8", errors="ignore") as f:
        # skip
        for _ in range(skip):
            if not f.readline():
                break
        for line in f:
            parsed = parse_tsv_line(line)
            if not parsed:
                continue
            doc_id = parsed[0]
            _, payload = proc.process(doc_id, parsed)
            try:
                obj = json.loads(payload)
                docs.append(obj)
            except json.JSONDecodeError:
                # Should not happen, but skip invalid
                continue
            added += 1
            if added >= limit:
                break
    return docs


try:
    documents: list[dict[str, Any]] = load_documents(DOCUMENTS_PATH, limit=100)
    print(f"Loaded {len(documents)} docs from {DOCUMENTS_PATH}")
except Exception as e:
    documents = []
    print(f"Failed to load documents: {e}")

# Helper to get combined text

def get_document_text(doc: dict[str, Any], include_title: bool = True, include_body: bool = True) -> str:
    title = str(doc.get(Columns.title.value, "")) if include_title else ""
    body = str(doc.get(Columns.body.value, "")) if include_body else ""
    sep = "\n\n" if title and body else ""
    return f"{title}{sep}{body}"


Loaded 100 docs from msmarco-docs.tsv


In [8]:
# Tokenizer builders and comparison utilities
from dataclasses import dataclass

@dataclass
class SpacyOptions:
    model: str = "blank"            # "blank" for spacy.blank("en"), or a model name like "en_core_web_sm"
    disable: list[str] | None = None # pipeline components to disable when loading a model
    lowercase: bool = True
    ascii_fold: bool = True
    add_url_rule: bool = True
    add_email_rule: bool = True
    custom_infixes: list[str] | None = None  # e.g. [r"\.", r"-", r"_", r"\/", r"\:"]


def build_spacy_tokenizer(opts: SpacyOptions) -> SpacyTokenizer:
    """Build our SpacyTokenizer honoring basic normalization and tokenizer rules.
    If opts.model == "blank", we create spacy.blank("en"); otherwise spacy.load.
    We optionally alter tokenizer URL/email matching and infix rules.
    """
    import spacy
    if opts.model in (None, "", "blank"):
        nlp = spacy.blank("en")
    else:
        try:
            nlp = spacy.load(opts.model, disable=opts.disable or [
                "tagger", "parser", "ner", "lemmatizer", "attribute_ruler", "textcat"
            ])
        except Exception:
            nlp = spacy.blank("en")

    # Customize tokenizer rules if requested
    from spacy.lang.char_classes import LIST_ELLIPSES, LIST_ICONS
    from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER
    from spacy.tokenizer import Tokenizer
    from spacy.util import compile_infix_regex

    # Start with default tokenizer
    tokenizer = nlp.tokenizer

    # Adjust URL/email matching via token_match
    token_match = tokenizer.token_match
    if opts.add_url_rule or opts.add_email_rule:
        import re
        url_pattern = r"https?://\S+" if opts.add_url_rule else None
        email_pattern = r"[\w.+-]+@[\w-]+\.[\w.-]+" if opts.add_email_rule else None
        patterns = [p for p in [url_pattern, email_pattern] if p]
        if patterns:
            combined = re.compile("|".join(patterns))
            def _token_match(text: str) -> Optional[re.Match[str]]:  # type: ignore
                m = combined.match(text)
                return m if m else (token_match(text) if token_match else None)
            tokenizer.token_match = _token_match  # type: ignore

    # Adjust infixes if provided
    if opts.custom_infixes is not None:
        infixes = tuple(opts.custom_infixes)
        infix_re = compile_infix_regex(infixes)
        tokenizer.infix_finditer = infix_re.finditer  # type: ignore

    # Wrap into our SpacyTokenizer to leverage normalization and re-fallback
    st = SpacyTokenizer(model="blank", disable=None, lowercase=opts.lowercase, ascii_fold=opts.ascii_fold)
    st.nlp = nlp
    st.nlp.tokenizer = tokenizer
    return st


def compare_tokenizers(text: str, simple_tok: SimpleTokenizer, spacy_tok: SpacyTokenizer, custom_spacy_tok: SpacyTokenizer) -> dict[str, list[str]]:
    return {
        "SimpleTokenizer": simple_tok.tokenize(text),
        "SpacyTokenizer(default)": spacy_tok.tokenize(text),
        "SpacyTokenizer(custom)": custom_spacy_tok.tokenize(text),
    }

# Defaults
default_simple = SimpleTokenizer(lowercase=True, ascii_fold=True)
default_spacy = SpacyTokenizer(model="blank", disable=None, lowercase=True, ascii_fold=True)

# Example customizable options instance
default_opts = SpacyOptions(
    model=os.getenv("SPACY_MODEL", "blank"),
    disable=[c for c in (os.getenv("SPACY_DISABLE", "").split(",") if os.getenv("SPACY_DISABLE") else [])],
    lowercase=True,
    ascii_fold=True,
    add_url_rule=True,
    add_email_rule=True,
    custom_infixes=[r"\.", r"-", r"_", r"\/", r"\:"]
)

custom_spacy = build_spacy_tokenizer(default_opts)

In [9]:
# Interactive UI (if ipywidgets available)

def render_results(text: str, simple_tok: SimpleTokenizer, spacy_tok: SpacyTokenizer, custom_spacy_tok: SpacyTokenizer):
    # Show full document text
    print(f"Document text ({len(text)} chars):\n{text}\n")
    comp = compare_tokenizers(text, simple_tok, spacy_tok, custom_spacy_tok)
    rows = []
    for name, tokens in comp.items():
        rows.append({
            "Tokenizer": name,
            "Num tokens": len(tokens),
            "Tokens": " ".join(tokens),
        })
    df = pd.DataFrame(rows)
    display(df)

if HAS_WIDGETS and documents:
    # Controls
    doc_index = widgets.IntSlider(value=0, min=0, max=max(0, len(documents)-1), step=1, description="doc idx")
    include_title = widgets.Checkbox(value=True, description="include title")
    include_body = widgets.Checkbox(value=True, description="include body")

    spacy_model = widgets.Text(value=os.getenv("SPACY_MODEL", "blank"), description="spacy model")
    spacy_disable = widgets.Text(value=os.getenv("SPACY_DISABLE", ""), description="disable")
    lowercase = widgets.Checkbox(value=True, description="lowercase")
    ascii_fold = widgets.Checkbox(value=True, description="ascii fold")
    add_url = widgets.Checkbox(value=True, description="url rule")
    add_email = widgets.Checkbox(value=True, description="email rule")
    infixes = widgets.Text(value=r"\.|-|_|\/|\:", description="infixes regex |")

    out = widgets.Output()

    def _update(_=None):
        out.clear_output()
        with out:
            # Build current custom spacy tokenizer
            disable_list = [c for c in spacy_disable.value.split(",") if c.strip()]
            opts = SpacyOptions(
                model=spacy_model.value.strip() or "blank",
                disable=disable_list or None,
                lowercase=lowercase.value,
                ascii_fold=ascii_fold.value,
                add_url_rule=add_url.value,
                add_email_rule=add_email.value,
                custom_infixes=[p for p in infixes.value.split("|") if p]
            )
            custom_tok = build_spacy_tokenizer(opts)
            text = get_document_text(documents[doc_index.value], include_title.value, include_body.value)
            render_results(text, default_simple, default_spacy, custom_tok)

    for w in [doc_index, include_title, include_body, spacy_model, spacy_disable, lowercase, ascii_fold, add_url, add_email, infixes]:
        w.observe(_update, names=["value"])  # type: ignore

    control_box = widgets.VBox([
        widgets.HBox([doc_index, include_title, include_body]),
        widgets.HBox([spacy_model, spacy_disable]),
        widgets.HBox([lowercase, ascii_fold, add_url, add_email]),
        infixes,
    ])
    display(control_box, out)
    _update()
else:
    print("Widgets are unavailable or no documents loaded. See the fallback cell below.")


VBox(children=(HBox(children=(IntSlider(value=0, description='doc idx', max=99), Checkbox(value=True, descriptâ€¦

Output()

In [None]:
# Fallback demo (no widgets)
if not HAS_WIDGETS:
    examples: list[str] = []
    if documents:
        # Take first 3 documents' title+body
        for i in range(min(3, len(documents))):
            examples.append(get_document_text(documents[i], True, True))
    else:
        examples = [
            "Hello world! Email me at foo.bar+baz@example.com or visit https://spacy.io/usage.",
            "SpaCy v3.8's tokenizer: test-case with hyphenated-words_and_tokens/segments: 12:30pm.",
        ]

    # Build a couple of custom variants
    opts_a = SpacyOptions(model="blank", custom_infixes=[r"\.", r"-", r"_"], add_url_rule=True, add_email_rule=True)
    opts_b = SpacyOptions(model="blank", custom_infixes=[r"\-"], add_url_rule=False, add_email_rule=False, lowercase=False, ascii_fold=False)

    custom_a = build_spacy_tokenizer(opts_a)
    custom_b = build_spacy_tokenizer(opts_b)

    for idx, text in enumerate(examples):
        print(f"\n=== Example {idx} ===\n{text}\n")
        comp = {
            "SimpleTokenizer": default_simple.tokenize(text),
            "SpacyTokenizer(default)": default_spacy.tokenize(text),
            "SpacyTokenizer(custom A)": custom_a.tokenize(text),
            "SpacyTokenizer(custom B)": custom_b.tokenize(text),
        }
        df = pd.DataFrame({
            "Tokenizer": comp.keys(),
            "Num tokens": [len(v) for v in comp.values()],
            "Tokens": [" ".join(v) for v in comp.values()],
        })
        display(df)
else:
    print("Widgets available; use the interactive controls above.")


In [1]:
import redis
from utils.config import Config
cfg = Config(load=True)


# Configure connection via env vars or defaults
REDIS_HOST = cfg.REDIS_HOST
REDIS_PORT = cfg.REDIS_PORT
REDIS_DB = 0
REDIS_PASSWORD = None

r = redis.Redis(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, password=REDIS_PASSWORD)

n_keys = r.dbsize()

print(f"Connected to Redis at {REDIS_HOST}:{REDIS_PORT} db={REDIS_DB}. Keys present: {n_keys}")

# r.flushdb()
print("Redis DB flushed.")

Loading config from configs/base.yaml.
Connected to Redis at localhost:6379 db=0. Keys present: 46942
Redis DB flushed.


In [11]:
from typing import Any

# Fetch up to 10 keys starting with "token:" and print their raw DB representation.
# Relies on `r` (redis.Redis) already defined in an earlier cell.

match = "token:*"
def _decode_if_bytes(v: Any) -> Any:
    try:
        if isinstance(v, bytes):
            return v  # keep bytes to show raw DB bytes
        if isinstance(v, dict):
            # keep bytes in dict keys/values if present
            return v
    except Exception:
        pass
    return v

keys = []
for k in r.scan_iter(match=match):
    keys.append(k)
    if len(keys) >= 10:
        break

if not keys:
    print(f"No keys matching '{match}' found.")
else:
    for idx, key in enumerate(keys, 1):
        # normalize display of key and type
        key_display = key.decode() if isinstance(key, (bytes, bytearray)) else str(key)
        key_repr = repr(key)
        t = r.type(key)
        t_str = t.decode() if isinstance(t, (bytes, bytearray)) else str(t)

        # fetch raw value depending on type
        if t_str in ("string", "bytes", "basic-string"):
            val = r.get(key)
        elif t_str == "hash":
            val = r.hgetall(key)
        elif t_str == "list":
            val = r.lrange(key, 0, -1)
        elif t_str == "set":
            val = r.smembers(key)
        elif t_str == "zset":
            val = r.zrange(key, 0, -1, withscores=True)
        elif t_str == "stream":
            val = r.xrange(key, count=100)
        else:
            # fallback: try GET, otherwise raw type info
            try:
                val = r.get(key)
            except Exception:
                val = f"<unhandled type: {t_str}>"

        print(f"{idx}. Key: {key_display} (raw: {key_repr})  Type: {t_str}")
        print("Value (raw repr):")
        print(repr(_decode_if_bytes(val)))
        print("-" * 60)

1. Key: token:180rotten (raw: b'token:180rotten')  Type: hash
Value (raw repr):
{b'D1761147': b'{"tf": 1, "pos": [11]}'}
------------------------------------------------------------
2. Key: token:brockhampton (raw: b'token:brockhampton')  Type: hash
Value (raw repr):
{b'D2580791': b'{"tf": 1, "pos": [13710]}'}
------------------------------------------------------------
3. Key: token:taxus (raw: b'token:taxus')  Type: hash
Value (raw repr):
{b'D2226663': b'{"tf": 1, "pos": [149]}'}
------------------------------------------------------------
4. Key: token:424 (raw: b'token:424')  Type: hash
Value (raw repr):
{b'D431186': b'{"tf": 1, "pos": [3961]}'}
------------------------------------------------------------
5. Key: token:berg (raw: b'token:berg')  Type: hash
Value (raw repr):
{b'D2580791': b'{"tf": 1, "pos": [12943]}'}
------------------------------------------------------------
6. Key: token:igl (raw: b'token:igl')  Type: hash
Value (raw repr):
{b'D806663': b'{"tf": 2, "pos": [490, 

In [16]:
result = r.hgetall("token:tree")

result = {rkey.decode(): r.decode() for rkey, r in result.items()}
result

{'D687756': '{"tf": 2, "pos": [6421, 12201]}',
 'D1598311': '{"tf": 1, "pos": [152]}',
 'D3141039': '{"tf": 1, "pos": [22]}',
 'D3395201': '{"tf": 1, "pos": [454]}',
 'D574800': '{"tf": 1, "pos": [235]}',
 'D2632012': '{"tf": 2, "pos": [446, 5067]}',
 'D2146220': '{"tf": 1, "pos": [6051]}',
 'D2226663': '{"tf": 2, "pos": [152, 389]}',
 'D1887456': '{"tf": 1, "pos": [35]}',
 'D398522': '{"tf": 10, "pos": [9, 13, 93, 141, 174, 207, 415, 422, 509, 1050]}',
 'D1745929': '{"tf": 2, "pos": [33, 38]}',
 'D2520478': '{"tf": 1, "pos": [103]}',
 'D339947': '{"tf": 2, "pos": [162, 169]}',
 'D1142478': '{"tf": 1, "pos": [391]}',
 'D1647345': '{"tf": 1, "pos": [125]}',
 'D2805086': '{"tf": 1, "pos": [291]}',
 'D1725154': '{"tf": 1, "pos": [17]}',
 'D2499684': '{"tf": 1, "pos": [867]}',
 'D1533134': '{"tf": 1, "pos": [669]}',
 'D467137': '{"tf": 1, "pos": [51]}',
 'D1268130': '{"tf": 1, "pos": [262]}',
 'D297612': '{"tf": 1, "pos": [644]}',
 'D2761080': '{"tf": 1, "pos": [60]}',
 'D2362419': '{"tf":