# <b>Preprocessed document chunking and embedding pipeline</b>

#### Import required libraries.

In [1]:
import time
nb_start = time.time()

In [2]:
from typing import List, Dict, Optional, Tuple, Any
import pandas as pd
from tqdm.notebook import tqdm
import math, time, os
from pathlib import Path
import json

# If you need sentence splitting fallback:
try:
    from blingfire import text_to_sentences
except Exception:
    text_to_sentences = None

# Hugging Face tokenizer for accurate token counts (choose your embedding model)
from transformers import AutoTokenizer

In [3]:
# ---- Configure your paths here ----

# processed file
OUT_PARQ  = Path("../shared-data-library/out/df_passages.parquet")

# e.g., Path("../shared-data-library/out/df_processed.parquet") if you also want JSONL
OUT_JSONL = None

# 2k–10k works well; adjust to memory/CPU
BATCH_SIZE = 5000

#### 2) Choose a tokenizer (match your embedding model)

Pick the tokenizer that matches the embedding model you’ll use in retrieval (so chunk sizes reflect the true token budget).

In [4]:
# Use the tokenizer that matches your embedding model:
# Examples:
# TOKENIZER_MODEL = "intfloat/multilingual-e5-large"
TOKENIZER_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# TOKENIZER_MODEL = "intfloat/multilingual-e5-large"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_MODEL)

In [5]:
def model_token_budget(tok, headroom=16, cap_default=512):
    """
    Returns a safe per-chunk token budget:
    - Uses tok.model_max_length if valid, else cap_default.
    - Leaves a small headroom for special tokens/prefixes.
    """
    max_len = getattr(tok, "model_max_length", None)
    if max_len is None or max_len > 100_000_000:  # some tokenizers set a huge sentinel
        max_len = cap_default
    return max(32, int(max_len - headroom))

TOKEN_BUDGET = model_token_budget(tokenizer, headroom=16, cap_default=tokenizer.model_max_length)
TOKEN_BUDGET


496

In [6]:
# Assumes you already defined `tokenizer`, TOKEN_BUDGET, and count_tokens() previously

def batch_count_tokens(texts: List[str]) -> List[int]:
    """
    Much faster than looping tokenizer.encode for each string.
    """
    if not texts:
        return []
    enc = tokenizer(texts, add_special_tokens=False, padding=False, truncation=False, return_length=True)
    # HF returns `length` for each item
    return [int(x) for x in enc["length"]]

2) Token helpers (accurate counting + token-level splitting)

In [7]:
def count_tokens(text: str) -> int:
    return len(tokenizer.encode(text, add_special_tokens=False))

In [8]:
def split_by_tokens(text: str, max_tokens: int, overlap_tokens: int = 0):
    """
    Split text strictly by tokenizer tokens so each segment <= max_tokens.
    Overlap is in *tokens*, not characters, and works with any language (CJK too).
    """
    ids = tokenizer.encode(text, add_special_tokens=False)
    out = []
    i, n = 0, len(ids)
    while i < n:
        j = min(i + max_tokens, n)
        seg_ids = ids[i:j]
        out.append(tokenizer.decode(seg_ids, skip_special_tokens=True).strip())
        if j == n:
            break
        # move start forward, keeping overlap tokens
        i = j - overlap_tokens if overlap_tokens and (j - overlap_tokens) > i else j
    return [s for s in out if s]


In [9]:
def pack_sentences_to_passages_fast(
    sentences: List[str],
    sent_token_lens: List[int],
    token_limit: int,
    overlap_tokens: int = 40,
    min_chunk_tokens: int = 120,
) -> List[Tuple[str, int, int, int]]:
    """
    Returns list of (passage_text, start_sent_idx, end_sent_idx_inclusive, chunk_tokens).
    Assumes sent_token_lens[i] == token length of sentences[i] (already computed in batch).
    """
    passages = []
    i, n = 0, len(sentences)
    while i < n:
        cur_sents, cur_tok = [], 0
        start_i = i

        while i < n:
            s_tok = sent_token_lens[i]
            if s_tok > token_limit:
                # Split the single oversized sentence strictly by tokens (rare)
                parts = split_by_tokens(sentences[i], max_tokens=token_limit, overlap_tokens=overlap_tokens // 2)
                for part in parts:
                    passages.append((part, i, i, count_tokens(part)))
                i += 1
                start_i = i
                break

            if cur_tok + s_tok <= token_limit:
                cur_sents.append(sentences[i])
                cur_tok += s_tok
                i += 1
            else:
                break

        if cur_sents:
            text = " ".join(cur_sents).strip()
            # Safety: enforce token budget in case join changed tokenization
            tlen = count_tokens(text)
            if tlen > token_limit:
                parts = split_by_tokens(text, max_tokens=token_limit, overlap_tokens=overlap_tokens // 2)
                for part in parts:
                    passages.append((part, start_i, i - 1, count_tokens(part)))
            else:
                passages.append((text, start_i, i - 1, tlen))

        # prepare overlap by sentences (approximate by token sum)
        if i < n and cur_sents:
            ov_tok = 0
            back = len(cur_sents) - 1
            while back >= 0 and ov_tok + sent_token_lens[start_i + back] <= overlap_tokens:
                ov_tok += sent_token_lens[start_i + back]
                back -= 1
            overlap_sents = (len(cur_sents) - 1) - back
            if overlap_sents > 0:
                i = max(i - overlap_sents, start_i)

    # merge tiny tail if possible
    if len(passages) >= 2:
        last_text, ls, le, ltok = passages[-1]
        if ltok < min_chunk_tokens:
            prev_text, ps, pe, ptok = passages[-2]
            merged = (prev_text + " " + last_text).strip()
            mlen = count_tokens(merged)
            if mlen <= token_limit:
                passages[-2] = (merged, ps, le, mlen)
                passages.pop()

    # Final enforce (paranoid mode)
    safe = []
    for text, s, e, tlen in passages:
        if tlen <= token_limit:
            safe.append((text, s, e, tlen))
        else:
            parts = split_by_tokens(text, max_tokens=token_limit, overlap_tokens=overlap_tokens // 2)
            for p in parts:
                safe.append((p, s, e, count_tokens(p)))
    return safe


#### 3) Sentence fallback (if your df lacks sentences)

In [10]:
def ensure_sentences(row) -> List[str]:
    if isinstance(row.get("sentences"), list) and row["sentences"]:
        return row["sentences"]
    # fallback: lightweight multilingual splitter
    if text_to_sentences is not None:
        return [s.strip() for s in text_to_sentences(row.get("clean_text","")).split("\n") if s.strip()]
    # last-resort: split on punctuation (rough)
    import re
    txt = (row.get("clean_text") or "").strip()
    parts = re.split(r'(?<=[.!?。\u3002！？])\s+', txt)
    return [p for p in parts if p]

#### 4) Sentence-wise greedy packing into passages

- Pack whole sentences until the token_limit would be exceeded.
- Keep a small sentence overlap (by tokens) to avoid cutting context.
- If a single sentence is longer than the limit, we split that sentence on whitespace to fit.
- Ensure the final chunk isn’t tiny (merge forward/back when possible).

In [11]:
def pack_sentences_to_passages(
    sentences,
    token_limit: int,
    overlap_tokens: int = 40,
    min_chunk_tokens: int = 120
):
    """
    Returns list of (passage_text, start_sent_idx, end_sent_idx_inclusive),
    guaranteeing each passage <= token_limit (tokenizer-accurate).
    """
    passages = []
    i, n = 0, len(sentences)

    while i < n:
        cur_sents, cur_tok = [], 0
        start_i = i

        while i < n:
            s = sentences[i]
            s_tok = count_tokens(s)

            # If a single sentence is too long, split it by tokens now.
            if s_tok > token_limit:
                long_parts = split_by_tokens(s, max_tokens=token_limit, overlap_tokens=overlap_tokens // 2)
                for part in long_parts:
                    passages.append((part, i, i))
                i += 1
                start_i = i  # reset packing window after forcing splits
                break

            # Try to add sentence; if it would exceed, emit current chunk
            if cur_tok + s_tok <= token_limit:
                cur_sents.append(s)
                cur_tok += s_tok
                i += 1
            else:
                break

        if cur_sents:
            chunk_text = " ".join(cur_sents).strip()
            # Safety: if the joined chunk still exceeds (rare), split by tokens
            if count_tokens(chunk_text) > token_limit:
                parts = split_by_tokens(chunk_text, max_tokens=token_limit, overlap_tokens=overlap_tokens // 2)
                for part in parts:
                    passages.append((part, start_i, i - 1))
            else:
                passages.append((chunk_text, start_i, i - 1))

        # Prepare sentence-level overlap for next window
        if i < n and passages and cur_sents:
            # choose last sentences whose total tokens ≈ overlap_tokens
            ov_sents, ov_tok = [], 0
            for s in reversed(cur_sents):
                t = count_tokens(s)
                if ov_tok + t > overlap_tokens and ov_sents:
                    break
                ov_sents.insert(0, s)
                ov_tok += t
            if ov_sents:
                i = max(i - len(ov_sents), start_i)

    # Merge a too-small tail into the previous chunk (if it keeps budget)
    if len(passages) >= 2:
        last_text, ls, le = passages[-1]
        if count_tokens(last_text) < min_chunk_tokens:
            prev_text, ps, pe = passages[-2]
            merged = (prev_text + " " + last_text).strip()
            if count_tokens(merged) <= token_limit:
                passages[-2] = (merged, ps, le)
                passages.pop()

    # Final safety: enforce budget on every chunk (handles corner cases)
    safe = []
    for text, s, e in passages:
        if count_tokens(text) <= token_limit:
            safe.append((text, s, e))
        else:
            parts = split_by_tokens(text, max_tokens=token_limit, overlap_tokens=overlap_tokens // 2)
            for p in parts:
                safe.append((p, s, e))
    return safe


In [12]:
def split_long_sentence(sent: str, token_limit: int) -> List[str]:
    """If one sentence exceeds token_limit, split it on whitespace to fit."""
    words = sent.split()
    chunks, cur, cur_tok = [], [], 0
    for w in words:
        t = count_tokens(w + (" " if cur else ""))
        if cur_tok + t > token_limit and cur:
            chunks.append(" ".join(cur))
            cur, cur_tok = [], 0
        cur.append(w)
        cur_tok += t
    if cur:
        chunks.append(" ".join(cur))
    return chunks if chunks else [sent]  # fallback

#### 5) Apply to your preprocessed DataFrame (batched)

In [13]:
def explode_to_passages(
    df: pd.DataFrame,
    token_limit: int = 350,
    overlap_tokens: int = 40,
    min_chunk_tokens: int = 120,
    batch_size: int = 5000
) -> pd.DataFrame:
    rows = []
    total = len(df)
    for start in tqdm(range(0, total, batch_size), desc="Chunking to passages"):
        part = df.iloc[start:start+batch_size]
        for doc_id, row in part.iterrows():
            sents = ensure_sentences(row)
            chunks = pack_sentences_to_passages(
                sents,
                token_limit=token_limit,
                overlap_tokens=overlap_tokens,
                min_chunk_tokens=min_chunk_tokens
            )
            for chunk_idx, (text, s_start, s_end) in enumerate(chunks):
                rows.append({
                    "doc_id": doc_id,
                    "chunk_id": chunk_idx,
                    "_site": row.get("_site", None),
                    "title": row.get("title", None),
                    "lang": row.get("lang", None),
                    "chunk_text": text,
                    "chunk_tokens": count_tokens(text),
                    "sent_start": int(s_start),
                    "sent_end": int(s_end),
                    # (optional) keep a small snippet for quick previews
                    "preview": text[:160].replace("\n"," ") + ("…" if len(text) > 160 else "")
                })
    return pd.DataFrame(rows)

In [14]:
# Choose your target within the model budget (e.g., ~350) but cap at TOKEN_BUDGET
TARGET_SIZE = 350
EFFECTIVE_LIMIT = min(TARGET_SIZE, TOKEN_BUDGET)

In [15]:
from pathlib import Path

def explode_to_passages_fast_batched(
    df: pd.DataFrame,
    token_limit: int,
    overlap_tokens: int = 40,
    min_chunk_tokens: int = 120,
    batch_size: int = 5000,
    out_dir: str = "data/passages_parts",
    out_prefix: str = "passages_part",
    resume: bool = True,  # skip parts that already exist
) -> pd.DataFrame:
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    total_docs = len(df)
    part_files = []

    pbar = tqdm(range(0, total_docs, batch_size), desc="Chunking batches", unit="docs")
    part_idx = 0
    for start in pbar:
        end = min(start + batch_size, total_docs)
        part = df.iloc[start:end].copy()
        part_file = os.path.join(out_dir, f"{out_prefix}_{part_idx:04d}.parquet")
        if resume and os.path.exists(part_file):
            part_files.append(part_file)
            part_idx += 1
            continue

        t0 = time.time()
        out_rows = []

        # Precompute sentence token lengths per doc with **batch tokenization**
        for doc_id, row in part.iterrows():
            sents = row["sentences"] if isinstance(row.get("sentences"), list) else []
            if not sents:
                # fallback from clean_text if needed
                sents = [row.get("clean_text","")] if row.get("clean_text") else []

            sent_lens = batch_count_tokens(sents) if sents else [0]

            chunks = pack_sentences_to_passages_fast(
                sentences=sents,
                sent_token_lens=sent_lens,
                token_limit=token_limit,
                overlap_tokens=overlap_tokens,
                min_chunk_tokens=min_chunk_tokens,
            )

            for chunk_idx, (text, s_start, s_end, tlen) in enumerate(chunks):
                out_rows.append({
                    "doc_id": doc_id,
                    "chunk_id": chunk_idx,
                    "site": row.get("site"),
                    "title": row.get("title"),
                    "lang": row.get("lang"),
                    "chunk_text": text,
                    "chunk_tokens": tlen,
                    "sent_start": int(s_start),
                    "sent_end": int(s_end),
                    "preview": (text[:160].replace("\n"," ") + ("…" if len(text) > 160 else "")),
                })

        df_part = pd.DataFrame(out_rows)
        df_part.to_parquet(part_file, index=False, compression="zstd")
        part_files.append(part_file)

        dt = time.time() - t0
        pbar.set_postfix({
            "docs": f"{end}/{total_docs}",
            "chunks": len(df_part),
            "sec/batch": f"{dt:.1f}",
            "chunks/s": f"{(len(df_part)/(dt+1e-9)):.1f}",
        })
        part_idx += 1

    # Combine parts (optional; or keep parts for sharded indexing)
    df_all = pd.concat((pd.read_parquet(f) for f in part_files), ignore_index=True)
    return df_all


### <b>Run tokenization process on pre-processes data</b>

#### Load preprocessed data.

In [16]:
# Example: load the duplicates file
df_processed = pd.read_parquet("../shared-data-library/out/df_processed.parquet")

#### Validate preprocessed data.

In [17]:
df_processed.head()

Unnamed: 0,site,title,text,clean_text,lang,sentences,tokens,n_sentences,n_tokens
0,armenia__textcontent_article,Երբեք չէի պատկերացնի,<p>“I have never thought that I can do importa...,"""I have never thought that I can do important ...",en,"[""I have never thought that I can do important...","["", I, have, never, thought, that, I, can, do,...",9,260
1,armenia__textcontent_article,Երբեք չէի պատկերացնի,<p>We spoke to Heghine for a long time and she...,We spoke to Heghine for a long time and she of...,en,[We spoke to Heghine for a long time and she o...,"[We, spoke, to, Heghine, for, a, long, time, a...",13,357
2,armenia__textcontent_article,Աղետներին պատրաստ դպրոց` սահմանամերձ գյուղում,<p>Տավուշի մարզի Ներքին Ծաղկավան գյուղի դպրոցի...,Տավուշի մարզի Ներքին Ծաղկավան գյուղի դպրոցի VI...,hy,[Տավուշի մարզի Ներքին Ծաղկավան գյուղի դպրոցի V...,"[Տավուշի, մարզի, Ներքին, Ծաղկավան, գյուղի, դպր...",1,96
3,armenia__textcontent_article,Աղետներին պատրաստ դպրոց` սահմանամերձ գյուղում,<p>Վերջին երեք տարիներին ՅՈՒՆԻՍԵՖ-ն այս ուղղու...,Վերջին երեք տարիներին ՅՈՒՆԻՍԵՖ-ն այս ուղղությա...,hy,[Վերջին երեք տարիներին ՅՈՒՆԻՍԵՖ-ն այս ուղղությ...,"[Վերջին, երեք, տարիներին, ՅՈՒՆԻՍԵՖ, -, ն, այս,...",1,109
4,armenia__textcontent_article,Աղետներին պատրաստ դպրոց` սահմանամերձ գյուղում,<p>ՅՈՒՆԻՍԵՖ-ի ԱՌՆ ծրագրի ղեկավար Տիգրան Թովմաս...,ՅՈՒՆԻՍԵՖ-ի ԱՌՆ ծրագրի ղեկավար Տիգրան Թովմասյան...,hy,[ՅՈՒՆԻՍԵՖ-ի ԱՌՆ ծրագրի ղեկավար Տիգրան Թովմասյա...,"[ՅՈՒՆԻՍԵՖ, -, ի, ԱՌՆ, ծրագրի, ղեկավար, Տիգրան,...",2,270


In [18]:
# Run main function to tokenize processed documents.
df_passages = explode_to_passages_fast_batched(
    df=df_processed,
    token_limit=EFFECTIVE_LIMIT,
    overlap_tokens=40,
    min_chunk_tokens=120,
    batch_size=BATCH_SIZE,
    out_dir="../shared-data-library/passages",
    out_prefix="passages_mE5_350",
    resume=True,   # reruns will skip completed parts
)

Chunking batches:   0%|          | 0/5 [00:00<?, ?docs/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (519 > 512). Running this sequence through the model will result in indexing errors


In [19]:
print(f"✅ Created {len(df_passages):,} passages from {len(df_processed):,} docs.")
print("Max tokens:", df_passages['chunk_tokens'].max(), "| Budget:", TOKEN_BUDGET)

✅ Created 51,968 passages from 21,845 docs.
Max tokens: 350 | Budget: 496


In [20]:
df_passages.head()

Unnamed: 0,doc_id,chunk_id,site,title,lang,chunk_text,chunk_tokens,sent_start,sent_end,preview
0,0,0,armenia__textcontent_article,Երբեք չէի պատկերացնի,en,"""I have never thought that I can do important ...",288,0,0,"""I have never thought that I can do important ..."
1,1,0,armenia__textcontent_article,Երբեք չէի պատկերացնի,en,We spoke to Heghine for a long time and she of...,350,0,0,We spoke to Heghine for a long time and she of...
2,1,1,armenia__textcontent_article,Երբեք չէի պատկերացնի,en,"responsibility, this is her opportunity to als...",68,0,0,"responsibility, this is her opportunity to als..."
3,2,0,armenia__textcontent_article,Աղետներին պատրաստ դպրոց` սահմանամերձ գյուղում,hy,Տավուշի մարզի Ներքին Ծաղկավան գյուղի դպրոցի VI...,174,0,0,Տավուշի մարզի Ներքին Ծաղկավան գյուղի դպրոցի VI...
4,3,0,armenia__textcontent_article,Աղետներին պատրաստ դպրոց` սահմանամերձ գյուղում,hy,Վերջին երեք տարիներին ՅՈՒՆԻՍԵՖ-ն այս ուղղությա...,181,0,0,Վերջին երեք տարիներին ՅՈՒՆԻՍԵՖ-ն այս ուղղությա...


#### 6) Save passages (for embeddings)

In [21]:
OUT_PARQ.parent.mkdir(parents=True, exist_ok=True)
df_passages.to_parquet(OUT_PARQ, index=False, compression="zstd")

if OUT_JSONL:
    OUT_JSONL.parent.mkdir(parents=True, exist_ok=True)
    with open(OUT_JSONL, "w", encoding="utf-8") as f:
        for rec in df_passages.to_dict(orient="records"):
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")

OUT_PARQ, OUT_JSONL

(PosixPath('../shared-data-library/out/df_passages.parquet'), None)

In [22]:
print("⏱ Notebook executed in %.2f minutes" % ((time.time()-nb_start)/60))

⏱ Notebook executed in 6.54 minutes
