# <b>Embedding and Indexing Pipeline</b>

****

## <b>Data preparation and text preprocessing pipeline</b>

#### 1) Setup (run once per notebook/kernel)

In [None]:
# If you see "tokenizers parallelism" warnings from other libs, silence them:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"


#### 2) Imports & config

In [None]:
import os
import re
import json
import unicodedata
from pathlib import Path
from typing import List, Dict

import pandas as pd
from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
import ftfy
import langid
from blingfire import text_to_sentences, text_to_words

# Optional tokenizers (used if available)
try:
    import jieba  # zh
except Exception:
    jieba = None

try:
    from fugashi import Tagger  # ja
    _ja_tagger = Tagger()
except Exception:
    _ja_tagger = None

try:
    import pythainlp.tokenize as thai_tok  # th
except Exception:
    thai_tok = None

# ---- Configure your paths here ----
IN_DIR    = Path("datasets/mdocuments")                 # folder containing many *.csv like site_a.csv, site_b.csv
OUT_PARQ  = Path("data/multilingual_processed.parquet")
OUT_JSONL = None  # e.g., Path("data/multilingual_processed.jsonl") if you also want JSONL
BATCH_SIZE = 5000  # 2k–10k works well; adjust to memory/CPU


#### 3) Cleaning & tokenization helpers (Unicode-safe, multilingual)

In [None]:
# Pre-compiled regexes
URL_RE = re.compile(r'https?://\S+|www\.\S+', flags=re.IGNORECASE)
CTRL_RE = re.compile(r'[\u0000-\u001f\u007f\u200b\u200c\u200d]')  # control + zero-width chars
MULTISPACE_RE = re.compile(r'\s+')

def normalize_text(text: str) -> str:
    """Unicode-safe normalization & light cleaning that respects multilingual scripts."""
    if not isinstance(text, str):
        return ""
    s = ftfy.fix_text(text)                           # fix mojibake
    s = unicodedata.normalize("NFC", s)               # compose accents consistently
    s = BeautifulSoup(s, "html.parser").get_text(" ") # strip HTML safely
    s = URL_RE.sub(" <URL> ", s)                      # keep a URL placeholder (useful for boundaries)
    s = CTRL_RE.sub(" ", s)                           # drop control / zero-width chars
    s = MULTISPACE_RE.sub(" ", s).strip()
    # Intentionally NOT lowercasing (Turkish I/ı, German ß, proper nouns)
    return s

def detect_lang(text: str) -> str:
    if not text:
        return "unk"
    code, _ = langid.classify(text)
    return code

def sent_tokenize(text: str) -> List[str]:
    if not text:
        return []
    sents = text_to_sentences(text).split("\n")
    return [s.strip() for s in sents if s.strip()]

def word_tokenize(text: str, lang: str) -> List[str]:
    if not text:
        return []
    lang = (lang or "").split("_")[0]
    if lang == "zh" and jieba is not None:
        return [t.strip() for t in jieba.cut(text) if t.strip()]
    if lang == "ja" and _ja_tagger is not None:
        return [w.surface for w in _ja_tagger(text) if w.surface.strip()]
    if lang == "th" and thai_tok is not None:
        return [t.strip() for t in thai_tok.word_tokenize(text) if t.strip()]
    # Default fast multilingual fallback
    return [w for w in text_to_words(text).split() if w]

def preprocess_row(row: Dict) -> Dict:
    raw = row.get("text", "")
    clean = normalize_text(raw)
    lang = detect_lang(clean) if clean else "unk"
    sents = sent_tokenize(clean)
    tokens = word_tokenize(clean, lang)
    return {
        "site": row.get("_site", ""),
        "title": row.get("title", ""),
        "text": raw,
        "clean_text": clean,
        "lang": lang,
        "sentences": sents,
        "tokens": tokens,
        "n_sentences": len(sents),
        "n_tokens": len(tokens),
    }


#### 4) Load many CSVs (keep only title, text, add site)

In [None]:
def load_many_csvs(in_dir: Path) -> pd.DataFrame:
    paths = sorted(in_dir.glob("*.csv"))
    if not paths:
        raise FileNotFoundError(f"No CSV files found in: {in_dir.resolve()}")

    frames = []
    for p in tqdm(paths, desc="Loading CSVs"):
        site = p.stem  # from "site_name.csv"
        try:
            df = pd.read_csv(
                p,
                encoding="utf-8-sig",
                on_bad_lines="skip",
                dtype={"title": "string", "text": "string"},
                usecols=lambda c: c in ("title", "text"),
            )
        except Exception:
            # Fallback if columns/encodings are messy
            df = pd.read_csv(p, encoding="utf-8-sig", on_bad_lines="skip")
            for col in ("title", "text"):
                if col not in df.columns:
                    df[col] = pd.NA
            df = df[["title", "text"]]

        df["title"] = df["title"].astype("string")
        df["text"]  = df["text"].astype("string")
        df["_site"] = site
        frames.append(df)

    all_df = pd.concat(frames, ignore_index=True)
    # Drop rows where both title and text are empty
    mask_empty = all_df["title"].fillna("").str.strip().eq("") & all_df["text"].fillna("").str.strip().eq("")
    all_df = all_df[~mask_empty].reset_index(drop=True)
    return all_df


#### 5) Remove duplicate texts (Language-safe)

In [None]:
import re

def make_dedupe_key(s: str) -> str:
    """
    Create a canonical key for deduplication:
    - normalize with normalize_text (fix mojibake, NFC, strip HTML, keep <URL>, remove control chars)
    - collapse whitespace
    - casefold for robust, language-aware case-insensitivity
    """
    s = normalize_text(s if isinstance(s, str) else "")
    s = re.sub(r"\s+", " ", s).strip()
    return s.casefold()

def dedupe_by_text(df: pd.DataFrame):
    df = df.copy()
    # Build key on the raw text (so we can drop before heavy processing)
    df["__dedupe_key"] = df["text"].fillna("").map(make_dedupe_key)

    # Mark dupes; keep the first occurrence
    dup_mask = df["__dedupe_key"].duplicated(keep="first")
    n_dups = int(dup_mask.sum())

    df_dups   = df.loc[dup_mask, ["_site", "title", "text", "__dedupe_key"]].reset_index(drop=True)
    df_nodup  = df.loc[~dup_mask].drop(columns=["__dedupe_key"]).reset_index(drop=True)

    return df_nodup, df_dups, n_dups

#### 6) Batch processing (single process—safe in notebooks)

In [None]:
def process_dataframe_batched(df: pd.DataFrame, batch_size: int = 5000) -> pd.DataFrame:
    df = df[["_site", "title", "text"]].copy()
    n = len(df)
    results = []

    for start in tqdm(range(0, n, batch_size), desc="Processing batches"):
        end = min(n, start + batch_size)
        records = df.iloc[start:end].to_dict(orient="records")
        out = [preprocess_row(r) for r in records]
        part = pd.DataFrame(out)
        part["lang"] = part["lang"].astype("category")
        results.append(part)

    processed = pd.concat(results, ignore_index=True)

    # If memory becomes an issue, consider these toggles:
    # processed["tokens"] = processed["tokens"].apply(lambda t: " ".join(t))  # store tokens as a single string
    # processed = processed.drop(columns=["sentences"])                        # drop sentence list
    return processed

In [None]:
df_raw = load_many_csvs(IN_DIR)
print(f"Loaded {len(df_raw):,} rows from {IN_DIR}")
df_raw.head(3)

df_nodup, df_dups, n_dups = dedupe_by_text(df_raw)
print(f"🔁 Removed {n_dups:,} duplicate texts. Remaining: {len(df_nodup):,} rows.")

# (Optional) Save a log of removed duplicates for auditing
df_dups.to_parquet("data/duplicates_removed.parquet", index=False)

In [None]:
df_processed = process_dataframe_batched(df_nodup, batch_size=BATCH_SIZE)
print(f"Processed rows: {len(df_processed):,}")
df_processed.head(3)

#### 7) Save outputs (Parquet is best for Unicode + lists)

In [None]:
OUT_PARQ.parent.mkdir(parents=True, exist_ok=True)
df_processed.to_parquet(OUT_PARQ, index=False, compression="zstd")

if OUT_JSONL:
    OUT_JSONL.parent.mkdir(parents=True, exist_ok=True)
    with open(OUT_JSONL, "w", encoding="utf-8") as f:
        for rec in df_processed.to_dict(orient="records"):
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")

OUT_PARQ, OUT_JSONL


### Review duplicates

In [None]:
# Example: load the duplicates file
df_dups = pd.read_parquet("data/duplicates_removed.parquet")

# Or load the main processed file
df_processed = pd.read_parquet("data/multilingual_processed.parquet")

In [None]:
print(f"Found {len(df_dups):,} duplicates removed")

# Peek at first few duplicates
df_dups.head(10)


In [None]:
# Group duplicates to see clusters of texts that were considered the same
dup_groups = df_dups.groupby("__dedupe_key")["text"].apply(list).reset_index()
dup_groups.head(5)

In [None]:
def make_dedupe_key(s: str) -> str:
    import re, unicodedata, ftfy
    from bs4 import BeautifulSoup
    s = ftfy.fix_text(s if isinstance(s, str) else "")
    s = unicodedata.normalize("NFC", s)
    s = BeautifulSoup(s, "html.parser").get_text(" ")
    s = re.sub(r'https?://\S+|www\.\S+', " <URL> ", s)
    s = re.sub(r'[\u0000-\u001f\u007f\u200b\u200c\u200d]', " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s.casefold()

df_dups["__dedupe_key"] = df_dups["text"].fillna("").map(make_dedupe_key)


******

## <b>Document chunking and embedding pipeline.</b>

#### 1) Install and Import

In [None]:
# %pip install -q transformers  # for accurate token counting with your HF model
# If you didn't use my earlier cells:
# %pip install -q blingfire ftfy beautifulsoup4 langid tqdm pyarrow

from typing import List, Dict, Optional, Tuple
import pandas as pd
from tqdm.notebook import tqdm

# If you need sentence splitting fallback:
try:
    from blingfire import text_to_sentences
except Exception:
    text_to_sentences = None

# Hugging Face tokenizer for accurate token counts (choose your embedding model)
from transformers import AutoTokenizer


#### 2) Choose a tokenizer (match your embedding model)

Pick the tokenizer that matches the embedding model you’ll use in retrieval (so chunk sizes reflect the true token budget).

In [None]:
# Use the tokenizer that matches your embedding model:
# Examples:
# TOKENIZER_MODEL = "intfloat/multilingual-e5-large"
# TOKENIZER_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
TOKENIZER_MODEL = "intfloat/multilingual-e5-large"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_MODEL)

def model_token_budget(tok, headroom=16, cap_default=512):
    """
    Returns a safe per-chunk token budget:
    - Uses tok.model_max_length if valid, else cap_default.
    - Leaves a small headroom for special tokens/prefixes.
    """
    max_len = getattr(tok, "model_max_length", None)
    if max_len is None or max_len > 100_000_000:  # some tokenizers set a huge sentinel
        max_len = cap_default
    return max(32, int(max_len - headroom))

TOKEN_BUDGET = model_token_budget(tokenizer, headroom=16, cap_default=512)
TOKEN_BUDGET


In [None]:
from typing import List, Tuple, Dict, Any
import math, time, os
import pandas as pd
from tqdm.notebook import tqdm

# Assumes you already defined `tokenizer`, TOKEN_BUDGET, and count_tokens() previously

def batch_count_tokens(texts: List[str]) -> List[int]:
    """
    Much faster than looping tokenizer.encode for each string.
    """
    if not texts:
        return []
    enc = tokenizer(texts, add_special_tokens=False, padding=False, truncation=False, return_length=True)
    # HF returns `length` for each item
    return [int(x) for x in enc["length"]]

2) Token helpers (accurate counting + token-level splitting)

In [None]:
def count_tokens(text: str) -> int:
    return len(tokenizer.encode(text, add_special_tokens=False))

def split_by_tokens(text: str, max_tokens: int, overlap_tokens: int = 0):
    """
    Split text strictly by tokenizer tokens so each segment <= max_tokens.
    Overlap is in *tokens*, not characters, and works with any language (CJK too).
    """
    ids = tokenizer.encode(text, add_special_tokens=False)
    out = []
    i, n = 0, len(ids)
    while i < n:
        j = min(i + max_tokens, n)
        seg_ids = ids[i:j]
        out.append(tokenizer.decode(seg_ids, skip_special_tokens=True).strip())
        if j == n:
            break
        # move start forward, keeping overlap tokens
        i = j - overlap_tokens if overlap_tokens and (j - overlap_tokens) > i else j
    return [s for s in out if s]


In [None]:
def pack_sentences_to_passages_fast(
    sentences: List[str],
    sent_token_lens: List[int],
    token_limit: int,
    overlap_tokens: int = 40,
    min_chunk_tokens: int = 120,
) -> List[Tuple[str, int, int, int]]:
    """
    Returns list of (passage_text, start_sent_idx, end_sent_idx_inclusive, chunk_tokens).
    Assumes sent_token_lens[i] == token length of sentences[i] (already computed in batch).
    """
    passages = []
    i, n = 0, len(sentences)
    while i < n:
        cur_sents, cur_tok = [], 0
        start_i = i

        while i < n:
            s_tok = sent_token_lens[i]
            if s_tok > token_limit:
                # Split the single oversized sentence strictly by tokens (rare)
                parts = split_by_tokens(sentences[i], max_tokens=token_limit, overlap_tokens=overlap_tokens // 2)
                for part in parts:
                    passages.append((part, i, i, count_tokens(part)))
                i += 1
                start_i = i
                break

            if cur_tok + s_tok <= token_limit:
                cur_sents.append(sentences[i])
                cur_tok += s_tok
                i += 1
            else:
                break

        if cur_sents:
            text = " ".join(cur_sents).strip()
            # Safety: enforce token budget in case join changed tokenization
            tlen = count_tokens(text)
            if tlen > token_limit:
                parts = split_by_tokens(text, max_tokens=token_limit, overlap_tokens=overlap_tokens // 2)
                for part in parts:
                    passages.append((part, start_i, i - 1, count_tokens(part)))
            else:
                passages.append((text, start_i, i - 1, tlen))

        # prepare overlap by sentences (approximate by token sum)
        if i < n and cur_sents:
            ov_tok = 0
            back = len(cur_sents) - 1
            while back >= 0 and ov_tok + sent_token_lens[start_i + back] <= overlap_tokens:
                ov_tok += sent_token_lens[start_i + back]
                back -= 1
            overlap_sents = (len(cur_sents) - 1) - back
            if overlap_sents > 0:
                i = max(i - overlap_sents, start_i)

    # merge tiny tail if possible
    if len(passages) >= 2:
        last_text, ls, le, ltok = passages[-1]
        if ltok < min_chunk_tokens:
            prev_text, ps, pe, ptok = passages[-2]
            merged = (prev_text + " " + last_text).strip()
            mlen = count_tokens(merged)
            if mlen <= token_limit:
                passages[-2] = (merged, ps, le, mlen)
                passages.pop()

    # Final enforce (paranoid mode)
    safe = []
    for text, s, e, tlen in passages:
        if tlen <= token_limit:
            safe.append((text, s, e, tlen))
        else:
            parts = split_by_tokens(text, max_tokens=token_limit, overlap_tokens=overlap_tokens // 2)
            for p in parts:
                safe.append((p, s, e, count_tokens(p)))
    return safe


#### 3) Sentence fallback (if your df lacks sentences)

In [None]:
def ensure_sentences(row) -> List[str]:
    if isinstance(row.get("sentences"), list) and row["sentences"]:
        return row["sentences"]
    # fallback: lightweight multilingual splitter
    if text_to_sentences is not None:
        return [s.strip() for s in text_to_sentences(row.get("clean_text","")).split("\n") if s.strip()]
    # last-resort: split on punctuation (rough)
    import re
    txt = (row.get("clean_text") or "").strip()
    parts = re.split(r'(?<=[.!?。\u3002！？])\s+', txt)
    return [p for p in parts if p]


#### 4) Sentence-wise greedy packing into passages

- Pack whole sentences until the token_limit would be exceeded.
- Keep a small sentence overlap (by tokens) to avoid cutting context.
- If a single sentence is longer than the limit, we split that sentence on whitespace to fit.
- Ensure the final chunk isn’t tiny (merge forward/back when possible).

In [None]:
def pack_sentences_to_passages(
    sentences,
    token_limit: int,
    overlap_tokens: int = 40,
    min_chunk_tokens: int = 120
):
    """
    Returns list of (passage_text, start_sent_idx, end_sent_idx_inclusive),
    guaranteeing each passage <= token_limit (tokenizer-accurate).
    """
    passages = []
    i, n = 0, len(sentences)

    while i < n:
        cur_sents, cur_tok = [], 0
        start_i = i

        while i < n:
            s = sentences[i]
            s_tok = count_tokens(s)

            # If a single sentence is too long, split it by tokens now.
            if s_tok > token_limit:
                long_parts = split_by_tokens(s, max_tokens=token_limit, overlap_tokens=overlap_tokens // 2)
                for part in long_parts:
                    passages.append((part, i, i))
                i += 1
                start_i = i  # reset packing window after forcing splits
                break

            # Try to add sentence; if it would exceed, emit current chunk
            if cur_tok + s_tok <= token_limit:
                cur_sents.append(s)
                cur_tok += s_tok
                i += 1
            else:
                break

        if cur_sents:
            chunk_text = " ".join(cur_sents).strip()
            # Safety: if the joined chunk still exceeds (rare), split by tokens
            if count_tokens(chunk_text) > token_limit:
                parts = split_by_tokens(chunk_text, max_tokens=token_limit, overlap_tokens=overlap_tokens // 2)
                for part in parts:
                    passages.append((part, start_i, i - 1))
            else:
                passages.append((chunk_text, start_i, i - 1))

        # Prepare sentence-level overlap for next window
        if i < n and passages and cur_sents:
            # choose last sentences whose total tokens ≈ overlap_tokens
            ov_sents, ov_tok = [], 0
            for s in reversed(cur_sents):
                t = count_tokens(s)
                if ov_tok + t > overlap_tokens and ov_sents:
                    break
                ov_sents.insert(0, s)
                ov_tok += t
            if ov_sents:
                i = max(i - len(ov_sents), start_i)

    # Merge a too-small tail into the previous chunk (if it keeps budget)
    if len(passages) >= 2:
        last_text, ls, le = passages[-1]
        if count_tokens(last_text) < min_chunk_tokens:
            prev_text, ps, pe = passages[-2]
            merged = (prev_text + " " + last_text).strip()
            if count_tokens(merged) <= token_limit:
                passages[-2] = (merged, ps, le)
                passages.pop()

    # Final safety: enforce budget on every chunk (handles corner cases)
    safe = []
    for text, s, e in passages:
        if count_tokens(text) <= token_limit:
            safe.append((text, s, e))
        else:
            parts = split_by_tokens(text, max_tokens=token_limit, overlap_tokens=overlap_tokens // 2)
            for p in parts:
                safe.append((p, s, e))
    return safe


In [None]:
def split_long_sentence(sent: str, token_limit: int) -> List[str]:
    """If one sentence exceeds token_limit, split it on whitespace to fit."""
    words = sent.split()
    chunks, cur, cur_tok = [], [], 0
    for w in words:
        t = count_tokens(w + (" " if cur else ""))
        if cur_tok + t > token_limit and cur:
            chunks.append(" ".join(cur))
            cur, cur_tok = [], 0
        cur.append(w)
        cur_tok += t
    if cur:
        chunks.append(" ".join(cur))
    return chunks if chunks else [sent]  # fallback

#### 5) Apply to your preprocessed DataFrame (batched)

In [None]:
def explode_to_passages(
    df: pd.DataFrame,
    token_limit: int = 350,
    overlap_tokens: int = 40,
    min_chunk_tokens: int = 120,
    batch_size: int = 5000
) -> pd.DataFrame:
    rows = []
    total = len(df)
    for start in tqdm(range(0, total, batch_size), desc="Chunking to passages"):
        part = df.iloc[start:start+batch_size]
        for doc_id, row in part.iterrows():
            sents = ensure_sentences(row)
            chunks = pack_sentences_to_passages(
                sents,
                token_limit=token_limit,
                overlap_tokens=overlap_tokens,
                min_chunk_tokens=min_chunk_tokens
            )
            for chunk_idx, (text, s_start, s_end) in enumerate(chunks):
                rows.append({
                    "doc_id": doc_id,
                    "chunk_id": chunk_idx,
                    "_site": row.get("_site", None),
                    "title": row.get("title", None),
                    "lang": row.get("lang", None),
                    "chunk_text": text,
                    "chunk_tokens": count_tokens(text),
                    "sent_start": int(s_start),
                    "sent_end": int(s_end),
                    # (optional) keep a small snippet for quick previews
                    "preview": text[:160].replace("\n"," ") + ("…" if len(text) > 160 else "")
                })
    return pd.DataFrame(rows)

In [None]:
# Choose your target within the model budget (e.g., ~350) but cap at TOKEN_BUDGET
TARGET_SIZE = 350
EFFECTIVE_LIMIT = min(TARGET_SIZE, TOKEN_BUDGET)

In [None]:
EFFECTIVE_LIMIT

In [None]:
from pathlib import Path

def explode_to_passages_fast_batched(
    df: pd.DataFrame,
    token_limit: int,
    overlap_tokens: int = 40,
    min_chunk_tokens: int = 120,
    batch_size: int = 5000,
    out_dir: str = "data/passages_parts",
    out_prefix: str = "passages_part",
    resume: bool = True,  # skip parts that already exist
) -> pd.DataFrame:
    Path(out_dir).mkdir(parents=True, exist_ok=True)
    total_docs = len(df)
    part_files = []

    pbar = tqdm(range(0, total_docs, batch_size), desc="Chunking batches", unit="docs")
    part_idx = 0
    for start in pbar:
        end = min(start + batch_size, total_docs)
        part = df.iloc[start:end].copy()
        part_file = os.path.join(out_dir, f"{out_prefix}_{part_idx:04d}.parquet")
        if resume and os.path.exists(part_file):
            part_files.append(part_file)
            part_idx += 1
            continue

        t0 = time.time()
        out_rows = []

        # Precompute sentence token lengths per doc with **batch tokenization**
        for doc_id, row in part.iterrows():
            sents = row["sentences"] if isinstance(row.get("sentences"), list) else []
            if not sents:
                # fallback from clean_text if needed
                sents = [row.get("clean_text","")] if row.get("clean_text") else []

            sent_lens = batch_count_tokens(sents) if sents else [0]

            chunks = pack_sentences_to_passages_fast(
                sentences=sents,
                sent_token_lens=sent_lens,
                token_limit=token_limit,
                overlap_tokens=overlap_tokens,
                min_chunk_tokens=min_chunk_tokens,
            )

            for chunk_idx, (text, s_start, s_end, tlen) in enumerate(chunks):
                out_rows.append({
                    "doc_id": doc_id,
                    "chunk_id": chunk_idx,
                    "_site": row.get("_site"),
                    "title": row.get("title"),
                    "lang": row.get("lang"),
                    "chunk_text": text,
                    "chunk_tokens": tlen,
                    "sent_start": int(s_start),
                    "sent_end": int(s_end),
                    "preview": (text[:160].replace("\n"," ") + ("…" if len(text) > 160 else "")),
                })

        df_part = pd.DataFrame(out_rows)
        df_part.to_parquet(part_file, index=False, compression="zstd")
        part_files.append(part_file)

        dt = time.time() - t0
        pbar.set_postfix({
            "docs": f"{end}/{total_docs}",
            "chunks": len(df_part),
            "sec/batch": f"{dt:.1f}",
            "chunks/s": f"{(len(df_part)/(dt+1e-9)):.1f}",
        })
        part_idx += 1

    # Combine parts (optional; or keep parts for sharded indexing)
    df_all = pd.concat((pd.read_parquet(f) for f in part_files), ignore_index=True)
    return df_all

# Use your model budget (from previous step)
TARGET_SIZE = 350
EFFECTIVE_LIMIT = min(TARGET_SIZE, TOKEN_BUDGET)


In [None]:
df_passages = explode_to_passages_fast_batched(
    df=df_processed,
    token_limit=EFFECTIVE_LIMIT,
    overlap_tokens=40,
    min_chunk_tokens=120,
    batch_size=5000,
    out_dir="data/passages_parts",
    out_prefix="passages_mE5_350",
    resume=True,   # reruns will skip completed parts
)

print(f"✅ Created {len(df_passages):,} passages from {len(df_processed):,} docs.")
print("Max tokens:", df_passages['chunk_tokens'].max(), "| Budget:", TOKEN_BUDGET)

#### 6) Save passages (for indexing)

In [None]:
out_parq_passages = "data/passages_mE5_350.parquet"  # include model/limit in name for clarity
pd.Series(df_passages.columns.tolist(), name="columns")  # quick glance
df_passages.to_parquet(out_parq_passages, index=False, compression="zstd")
out_parq_passages


********

## <b>Multilingual embedding pipeline:</b>

#### 1) Imports, device, and model choice

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"   # was "false" earlier for safety

In [59]:
# 0) Config
import os, math, time, numpy as np, pandas as pd, torch
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModel

os.environ.setdefault("TOKENIZERS_PARALLELISM", "true")  # speed up tokenization
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DTYPE  = torch.float16 if DEVICE.type == "cuda" else torch.float32

# Choose model (speed tiers):
# FAST:    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" (384d)
# BALANCE: "intfloat/multilingual-e5-base" (768d)
# QUALITY: "intfloat/multilingual-e5-large" (1024d)
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModel.from_pretrained(MODEL_NAME, torch_dtype=(torch.float16 if DEVICE.type=="cuda" else None))
model.to(DEVICE).eval()

# Enable fused attention (BetterTransformer) if available
try:
    from optimum.bettertransformer import BetterTransformer
    model = BetterTransformer.transform(model)
except Exception:
    pass  # it's fine if not installed

def model_token_budget(tok, headroom=16, cap_default=512):
    ml = getattr(tok, "model_max_length", None)
    if ml is None or ml > 100_000_000: ml = cap_default
    return max(32, int(ml - headroom))
TOKEN_BUDGET = model_token_budget(tokenizer)

def add_passage_prefix(texts):
    # Only E5 needs "passage: " prefix
    return [f"passage: {t}" for t in texts] if "intfloat/multilingual-e5" in MODEL_NAME.lower() else texts

@torch.no_grad()
def mean_pool(last_hidden_state, attention_mask):
    mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)
    summed = (last_hidden_state * mask).sum(dim=1)
    counts = mask.sum(dim=1).clamp(min=1e-6)
    return summed / counts

def embed_batch(texts, max_len=TOKEN_BUDGET):
    enc = tokenizer(
        texts, padding=True, truncation=True, max_length=max_len,
        return_tensors="pt"
    )
    enc = {k: v.to(DEVICE, non_blocking=True) for k, v in enc.items()}
    with torch.inference_mode(), (
        torch.autocast(device_type=DEVICE.type, dtype=torch.float16) if DEVICE.type=="cuda" else torch.no_grad()
    ):
        out = model(**enc)
        pooled = mean_pool(out.last_hidden_state, enc["attention_mask"])
        pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
    # Keep float32 for FAISS stability downstream
    return pooled.to(torch.float32).cpu().numpy()


BetterTransformer is deprecated and will be removed in Optimum v2.0.


In [60]:
# 1) Inputs & outputs
texts = df_passages["chunk_text"].astype(str).tolist()
N = len(texts)

# Choose batch size
BATCH = 768 if DEVICE.type=="cuda" else 256   # tune: 512–1024 (GPU), 64–128 (CPU)
OUT_DIR = "data/embeddings_fast"
os.makedirs(OUT_DIR, exist_ok=True)

# Determine embedding dimensionality once (dry run on 1 example)
test_vec = embed_batch(add_passage_prefix([texts[0]]))
DIM = test_vec.shape[1]

# Use a memory-mapped array to write incrementally (resumable)
mmap_path = os.path.join(OUT_DIR, f"{MODEL_NAME.split('/')[-1]}_{DIM}d_50k_float32.mm")
embs = np.memmap(mmap_path, dtype="float32", mode="w+", shape=(N, DIM))

# Optional: resume support — check how many rows already filled (NaNs if unwritten)
# For a fresh run, start = 0. If resuming, detect start index from a sidecar file.
start = 0
sidecar = mmap_path + ".idx"
if os.path.exists(sidecar):
    try:
        start = int(open(sidecar).read().strip())
    except Exception:
        start = 0

t0 = time.time()
for i in tqdm(range(start, N, BATCH), desc=f"Embedding on {DEVICE}", unit="batch"):
    j = min(i + BATCH, N)
    batch = add_passage_prefix(texts[i:j])
    vecs = embed_batch(batch, max_len=TOKEN_BUDGET)
    embs[i:j, :] = vecs
    # Flush progress & write checkpoint index
    embs.flush()
    with open(sidecar, "w") as f:
        f.write(str(j))
    # Lightweight throughput display
    done = j
    dt = time.time() - t0
    if dt > 0:
        tqdm.write(f"done {done}/{N} | {(done/dt):.1f} chunks/s | ETA {(N-done)/(done/dt+1e-9):.1f}s")

# Convert memmap to .npy cleanly
final_npy = os.path.join(OUT_DIR, f"{MODEL_NAME.split('/')[-1]}_{DIM}d_50k_float32.npy")
np.save(final_npy, np.asarray(embs))
os.remove(sidecar)  # cleanup resume marker
print("Saved:", final_npy)

Embedding on cpu:   0%|          | 0/198 [00:00<?, ?batch/s]

done 1696/51968 | 46.1 chunks/s | ETA 1089.5s
done 1952/51968 | 24.8 chunks/s | ETA 2016.6s
done 2208/51968 | 18.5 chunks/s | ETA 2690.3s
done 2464/51968 | 15.4 chunks/s | ETA 3220.0s
done 2720/51968 | 13.7 chunks/s | ETA 3586.0s
done 2976/51968 | 12.5 chunks/s | ETA 3927.1s
done 3232/51968 | 11.5 chunks/s | ETA 4238.4s
done 3488/51968 | 10.8 chunks/s | ETA 4487.0s
done 3744/51968 | 10.3 chunks/s | ETA 4697.6s
done 4000/51968 | 9.6 chunks/s | ETA 4976.9s
done 4256/51968 | 9.3 chunks/s | ETA 5118.1s
done 4512/51968 | 9.0 chunks/s | ETA 5250.0s
done 4768/51968 | 8.8 chunks/s | ETA 5337.4s
done 5024/51968 | 8.6 chunks/s | ETA 5430.3s
done 5280/51968 | 8.5 chunks/s | ETA 5511.0s
done 5536/51968 | 8.3 chunks/s | ETA 5589.1s
done 5792/51968 | 8.2 chunks/s | ETA 5634.5s
done 6048/51968 | 8.0 chunks/s | ETA 5773.9s
done 6304/51968 | 7.9 chunks/s | ETA 5816.1s
done 6560/51968 | 7.8 chunks/s | ETA 5850.6s
done 6816/51968 | 7.7 chunks/s | ETA 5883.8s
done 7072/51968 | 7.6 chunks/s | ETA 5940.6s
d

******

## <b>Indexing the embedded vectors</b>

In [71]:
# df_passages must already exist with at least:
# ["doc_id","chunk_id","chunk_text","preview","lang","_site","title","chunk_tokens"]

# 1) Stable global ID to align all stores (FAISS / Elasticsearch / SQLite)
df_passages = df_passages.copy()
df_passages["global_chunk_id"] = (
    df_passages["doc_id"].astype(str) + ":" + df_passages["chunk_id"].astype(int).astype(str)
)

In [73]:
df_passages.head()

Unnamed: 0,doc_id,chunk_id,_site,title,lang,chunk_text,chunk_tokens,sent_start,sent_end,preview,global_chunk_id
0,0,0,,Երբեք չէի պատկերացնի,en,"""I have never thought that I can do important ...",288,0,0,"""I have never thought that I can do important ...",0:0
1,1,0,,Երբեք չէի պատկերացնի,en,We spoke to Heghine for a long time and she of...,350,0,0,We spoke to Heghine for a long time and she of...,1:0
2,1,1,,Երբեք չէի պատկերացնի,en,"responsibility, this is her opportunity to als...",68,0,0,"responsibility, this is her opportunity to als...",1:1
3,2,0,,Աղետներին պատրաստ դպրոց` սահմանամերձ գյուղում,hy,Տավուշի մարզի Ներքին Ծաղկավան գյուղի դպրոցի VI...,174,0,0,Տավուշի մարզի Ներքին Ծաղկավան գյուղի դպրոցի VI...,2:0
4,3,0,,Աղետներին պատրաստ դպրոց` սահմանամերձ գյուղում,hy,Վերջին երեք տարիներին ՅՈՒՆԻՍԵՖ-ն այս ուղղությա...,181,0,0,Վերջին երեք տարիներին ՅՈՒՆԻՍԵՖ-ն այս ուղղությա...,3:0


In [74]:
# 2) Choose the columns you’ll want at retrieval time (add more if you need)
meta_cols = [
    "global_chunk_id", "doc_id", "chunk_id", "_site", "lang",
    "title", "preview", "chunk_tokens"
]
meta = df_passages[meta_cols].rename(columns={"_site":"site"})

In [76]:
meta.head()

Unnamed: 0,global_chunk_id,doc_id,chunk_id,site,lang,title,preview,chunk_tokens
0,0:0,0,0,,en,Երբեք չէի պատկերացնի,"""I have never thought that I can do important ...",288
1,1:0,1,0,,en,Երբեք չէի պատկերացնի,We spoke to Heghine for a long time and she of...,350
2,1:1,1,1,,en,Երբեք չէի պատկերացնի,"responsibility, this is her opportunity to als...",68
3,2:0,2,0,,hy,Աղետներին պատրաստ դպրոց` սահմանամերձ գյուղում,Տավուշի մարզի Ներքին Ծաղկավան գյուղի դպրոցի VI...,174
4,3:0,3,0,,hy,Աղետներին պատրաստ դպրոց` սահմանամերձ գյուղում,Վերջին երեք տարիներին ՅՈՒՆԻՍԵՖ-ն այս ուղղությա...,181


In [77]:
# 3) Save to Parquet (this is the file you’ll later load as meta)
MODEL_TAG = "paraphrase-multilingual-MiniLM-L12-v2"   # or your chosen model name
out_dir = "data/embeddings_fast"
os.makedirs(out_dir, exist_ok=True)
meta_path = os.path.join(out_dir, f"{MODEL_TAG}_passages_meta.parquet")
meta.to_parquet(meta_path, index=False)
print("Saved meta:", meta_path, "| rows:", len(meta))

Saved meta: data/embeddings_fast/paraphrase-multilingual-MiniLM-L12-v2_passages_meta.parquet | rows: 51968


#### 1) Dense index (Semantic) with FAISS.

FAISS is perfect for 50K - 500K vectors on one machine.

In [78]:
embedding_file_name = f"{MODEL_NAME.split('/')[-1]}_{DIM}d_50k_float32.npy"
metadata_file_name = f"{MODEL_TAG}_passages_meta.parquet"

In [79]:
import numpy as np, faiss

# Load your embeddings and metadata (from earlier step)
emb = np.load(f"data/embeddings_fast/{embedding_file_name}").astype("float32")  # (N, d) L2-normalized
meta = pd.read_parquet(f"data/embeddings_fast/{metadata_file_name}")  # includes global_chunk_id, etc.

In [86]:
d = emb.shape[1]
index = faiss.IndexFlatIP(d)   # use IP since vectors are L2-normalized -> cosine
index.add(emb)

faiss.write_index(index, "data/faiss_index/passages_flatip.faiss")

Query side:


In [87]:
def faiss_search(query_vecs: np.ndarray, k=10):
    # query_vecs must be L2-normalized float32 (m x d)
    D, I = index.search(query_vecs.astype("float32"), k)
    # Map to metadata rows
    hits = []
    for q, (scores, idxs) in enumerate(zip(D, I)):
        rows = meta.iloc[idxs].copy()
        rows["dense_score"] = scores
        rows["q"] = q
        hits.append(rows)
    return pd.concat(hits, ignore_index=True)


******

# <b>Keyword based search</b>

### <b>2. Prepare data & bulk-ingest passages</b>

In [88]:
import pandas as pd

# Ensure stable IDs
df_passages = df_passages.copy()
df_passages["global_chunk_id"] = (
    df_passages["doc_id"].astype(str) + ":" + df_passages["chunk_id"].astype(int).astype(str)
)

# Minimal columns for ES
to_index = df_passages.rename(columns={"_site": "site"})[
    ["global_chunk_id","doc_id","chunk_id","site","lang","title","chunk_text","preview","chunk_tokens"]
].fillna({"title":"", "preview":"", "chunk_text":""})


In [92]:
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(
    "http://localhost:9200",
    basic_auth=("elastic", "DfZP9TzO")   # 👈 add this
)

def gen_actions(df):
    for r in df.itertuples(index=False):
        yield {
            "_index": "passages_bm25",
            "_id": r.global_chunk_id,
            "_source": r._asdict()
        }

# Ingest
helpers.bulk(es, gen_actions(to_index), request_timeout=180)
es.indices.refresh(index="passages_bm25")


  helpers.bulk(es, gen_actions(to_index), request_timeout=180)


ObjectApiResponse({'_shards': {'total': 2, 'successful': 1, 'failed': 0}})

### <b>3. Keyword search functions (single & batch)

In [94]:
def bm25_search(query_str: str, k: int = 50) -> pd.DataFrame:
    body = {
      "size": k,
      "query": { "multi_match": {
        "query": query_str,
        "fields": ["title^2", "chunk_text"]
      }}
    }
    res = es.search(index="passages_bm25", body=body)
    rows = []
    for rank, hit in enumerate(res["hits"]["hits"], start=1):
        src = hit["_source"]
        rows.append({
            "query": query_str,
            "rank": rank,
            "sparse_score": hit["_score"],
            "global_chunk_id": src["global_chunk_id"],
            "doc_id": src["doc_id"],
            "chunk_id": src["chunk_id"],
            "title": src.get("title",""),
            "site": src.get("site",""),
            "lang": src.get("lang",""),
            "preview": src.get("preview","")
        })
    return pd.DataFrame(rows)

def bm25_search_batch(queries: list[str], k: int = 50) -> pd.DataFrame:
    all_rows = []
    for qid, q in enumerate(queries):
        df = bm25_search(q, k=k)
        if df.empty:
            continue
        df.insert(0, "query_id", qid)
        all_rows.append(df)
    return pd.concat(all_rows, ignore_index=True) if all_rows else pd.DataFrame()


#### <b>4. Convert chunk results → doc results (best chunk per doc) </b>

In [95]:
def bm25_doc_level(df_hits: pd.DataFrame, top_k_docs: int = 10) -> pd.DataFrame:
    # keep best chunk per (query_id, doc_id)
    sub = (df_hits
           .sort_values(["query_id","doc_id","sparse_score"], ascending=[True, True, False])
           .groupby(["query_id","doc_id"], as_index=False)
           .first())
    # rerank docs per query
    sub["rank"] = sub.groupby("query_id")["sparse_score"].rank(ascending=False, method="first").astype(int)
    sub = sub.sort_values(["query_id","rank"]).groupby("query_id").head(top_k_docs).reset_index(drop=True)
    return sub


### <b> 5. Run the baseline for a list of queries.</b>

In [102]:
# Option A: directly pass a python list
queries = [
    "child immunization reduces mortality",
    "política de vacunación infantil",
    "टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण",
    "儿童免疫接种 对 死亡率 的 影响"
]

# Option B: load from CSV with mapping (query_id, query_text)
# queries_df = pd.read_csv("data/queries.csv")
# queries = queries_df["query_text"].tolist()

df_sparse_chunks = bm25_search_batch(queries, k=50)
df_sparse_docs   = bm25_doc_level(df_sparse_chunks, top_k_docs=10)

In [103]:
# Inspect
df_sparse_chunks[df_sparse_chunks['query'] == 'टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण'].head(10)[["query_id","query","rank","sparse_score","doc_id","chunk_id","title","lang","preview"]]

Unnamed: 0,query_id,query,rank,sparse_score,doc_id,chunk_id,title,lang,preview
100,2,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,1,162.91196,15900,0,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,hi,चिकित्सा के क्षेत्र में टीकाकरण का अहम योगदान ...
101,2,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,2,162.91196,15901,0,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,hi,टीकाकरण जिंदगियां बचाता है मॉर्डन टीकों और ओरल...
102,2,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,3,162.91196,15902,0,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,hi,टीकाकरण अगली पीढ़ी की रक्षा करता है टीकाकरण ने...
103,2,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,4,162.91196,15903,0,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,hi,पैसे बचाने और आर्थिक स्थिति ठीक रखने में मददगा...
104,2,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,5,162.91196,15904,0,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,hi,समय पर टीकाकरण करवा कर बच्चों में रोकी जा सकती...
105,2,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,6,162.91196,15905,0,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,hi,आपके परिजनों और दोस्तों की सुरक्षा के लिए टीका...
106,2,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,7,53.344685,15911,1,बच्चों के टीकाकरण से संबंधित अक्सर पूछे जाने व...,hi,) और मातृ संचारित हेपेटाइटिस बी (हेप बी टीका) ...
107,2,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,8,43.010044,15912,0,बच्चों के टीकाकरण से संबंधित अक्सर पूछे जाने व...,hi,कुछ टीके उन बच्चों को क्यों नहीं लगाए जाते जो ...
108,2,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,9,42.75853,15925,0,बाधाओं से परे घर-घर टीकाकरण करने पहुंचते हैं स...,hi,राजस्थान के सिरोही जिले में गरासिया नामक आदिवा...
109,2,टीकाकरण क्यों जरूरी है इसके पांच मुख्य कारण,10,40.111618,15910,0,बच्चों के टीकाकरण से संबंधित अक्सर पूछे जाने व...,hi,भारत में टीकाकरण ने न सिर्फ समुदायों को चिकित्...


### 6) Evaluate (re-using your earlier evaluator)

If you’ve plugged in the evaluation helpers I gave you, this is straightforward. We’ll evaluate at doc-level (recommended for document search).

In [None]:
# Ground truth CSV with columns: query_id, doc_id, relevant
GT_PATH = "data/ground/ground_truth_sample.csv"

# Reuse the evaluate_retrieval() function you already have
per_q, summary = evaluate_retrieval(
    df_hits=df_sparse_docs.rename(columns={"sparse_score":"score"}),  # evaluator expects 'score' or 'dense_score'
    gt_path_or_df=GT_PATH,
    id_level="doc",       # evaluate documents
    k_list=(1,3,5,10),
    gain_scheme="exp2"
)

display(per_q.head())
display(summary)


In [None]:
df_sparse_chunks.to_parquet("eval/sparse_chunks.parquet", index=False)
df_sparse_docs.to_parquet("eval/sparse_docs.parquet", index=False)
per_q.to_parquet("eval/metrics_sparse_per_query.parquet", index=False)
summary.to_csv("eval/metrics_sparse_summary.csv", index=False)


## <b>Project preparation steps</b>

### Import required libraries.

In [None]:
# Data processing libraries.
import pandas as pd
import numpy as np
import re
import string
import unicodedata
import emoji
import math

# Nltk libraries for text cleaning and processing.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer

import stopwordsiso as stopwords
from unidecode import unidecode

# Import libraries for text cleaning.
from bs4 import BeautifulSoup

# Thread pooling.
from multiprocessing.dummy import Pool

# Import system specific libraries.
import os
import glob
import yaml
from tqdm import tqdm

# Import fast text library for language detection.
import fasttext

# Import libraries for performance evaluation and measurements.
import time
import torch

# Import FAISS library for indexing embedded vectors.
import faiss

# Sentence transformer based models.
from sentence_transformers import SentenceTransformer

# Import pickle for saving and loading objects.
import pickle

# Import sqlite3 library for storing metadata.
import sqlite3

# Ignore future and deprecated warnings to get cleaner output.
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Logs.
import logging

In [None]:
# Download the required nltk packages.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

### Configuration logs

In [None]:
# Configure logging
logging.basicConfig(
    filename="pipeline.log",         # log file name
    level=logging.INFO,              # logging level (INFO, DEBUG, ERROR)
    format="%(asctime)s - %(levelname)s - %(message)s",  # log format
    filemode="w"                     # overwrite log file each run ("a" to append)
)

logger = logging.getLogger(__name__)

### Define constants

In [None]:
CONFIG_DIRECTORY_PATH = "config"
DATASET_DIRECTORY_PATH = "datasets"
DATA_DIRECTORY_PATH = "data"
MULTILINGUAL_DOCUMENTS_DIRECTORY_PATH = "datasets/multilingual_documents"

### Helping functions

#### 1. Load configuration file.

In [None]:
# Load project specific configuration file.
def load_config(filename):
    config_file_path = f"{CONFIG_DIRECTORY_PATH}/{filename}.yml"
    with open(config_file_path, "r") as f:
        config = yaml.safe_load(f)
    
    # Return config file.
    return config

#### 2. Get Language detection model.

In [None]:
# Get pre-trained language detection model.
def get_langauge_detection_model(language_detection_config):
    model = language_detection_config['model']
    pre_trained_model_filepath = f"{DATA_DIRECTORY_PATH}/{model}"
    if not os.path.exists(pre_trained_model_filepath):
        raise FileNotFoundError(f"{pre_trained_model_filepath} not found. Download it from model's website.")
    else:
        return fasttext.load_model(pre_trained_model_filepath)

### Global variables

In [None]:
# Load configured data.
site_metadata_config = load_config('sites-metadata');
project_config = load_config('project');

# Load fast track model for language detection.
fast_track_language_detection_model = get_langauge_detection_model(project_config['language_detection']);

### 3. Get metadata

In [None]:
def get_metadata():
    return project_config['metadata']

*****

# <b>Text preprocessing pipeline</b>

### Helper functions to preprocess text data

#### 1. Clean text.

In [None]:
# Clean text to remove html formattings, emojis, puntuations and normalize spaces.
def clean_text(text: str) -> str:
    if not isinstance(text, str):
        return ""
    
    # Remove html formats.
    text = BeautifulSoup(text, "html.parser").get_text()
    
    # Remove emojis.
    text = emoji.replace_emoji(text, replace="")
    
    # Remove puntuations.
    text = re.sub(r"[^\w\s]", " ", text)
    
    # Remove normalize spaces.
    text = re.sub(r"\s+", " ", text).strip()

    # Remove url links.
    text = re.sub(r"http\S+|www\S+", "", text)

    return text

#### 2. Text normalisation.

In [None]:
# Normalize text.
def normalize_text(text: str) -> str:
    text = text.lower()
    text = unidecode(text)
    return text

#### 3. Tokenization and filter.

In [None]:
# Tokenize texts in sentences and words, and remove stopwords.
def tokenize_and_filter(row, axis = 1):
    text = row['cleaned_text']
    lang = row['language']

    # Sentence tokenize.
    sentences = sent_tokenize(text)

    # Word tokenize
    words = word_tokenize(text)

    # remove stopwords if available for that language.
    if stopwords.has_lang(lang):
        sw = stopwords.stopwords(lang)
        words = [w for w in words if w not in sw]

    return pd.Series({"sentences": sentences, "tokens": words})
    

#### 4. Comibined all processeing steps in a single function.

In [None]:
# ---------------- Text Cleaning ----------------
def process_text(text: str):
    if not isinstance(text, str) or not text.strip():
        return None
    
    text = clean_text(text)
    text = normalize_text(text)
    
    return text

#### 4. Language detection process

In [None]:
# ---------------- Single-Batch Language Detection ----------------
def detect_language_batch(text_batch):
    labels, _ = fast_track_language_detection_model.predict(text_batch, k=1)
    return [lbl[0].replace("__label__", "") if lbl else "unknown" for lbl in labels]

4. Thread based batch language detection process.

In [None]:
# ---------------- Threaded Batch Language Detection ----------------
def batch_detect_language_parallel(texts, batch_size=1000):
    chunks = [texts[i:i+batch_size] for i in range(0, len(texts), batch_size)]
    languages = []

    with Pool() as pool:  # ThreadPool
        for batch_result in tqdm(pool.imap(detect_language_batch, chunks), total=len(chunks), desc="Language Detection"):
            languages.extend(batch_result)
    
    return languages

#### 5. Preprocessing single dataframe.

In [None]:
# ---------------- Preprocess Single DataFrame ----------------
def preprocess_dataframe_parallel(df, metadata):
    """
    Takes a dataframe and a text column, returns new Dataframe with 
    cleaned, normalized, language, tokens.
    """
    initial_count = len(df)
    text_column = metadata['text_column']
    batch_size = metadata['batch_size']
    
    # Remove duplicates
    tqdm.pandas(desc="Removing duplicates....")
    df = df.drop_duplicates(subset=[text_column])
    duplicates_removed = initial_count - len(df)
    
    # Clean text with progress bar
    tqdm.pandas(desc="Cleaning text....")
    df['cleaned_text'] = df[text_column].progress_apply(process_text)

    
    # Remove empty cleaned text
    df = df[df['cleaned_text'].notna()].reset_index(drop=True)
    
    # Detect languages in parallel
    texts = df['cleaned_text'].tolist()
    df['language'] = batch_detect_language_parallel(texts, batch_size=batch_size)

    # Remove unknown languages
    df = df[df['language'] != "unknown"].reset_index(drop=True)

    # tokenization, processin.
    tqdm.pandas(desc="Tokenizing text....")
    df_tokens = df.progress_apply(tokenize_and_filter, axis = 1)
    df = df.join(df_tokens)
    
    # Capture site stats
    site_stats = {
        "total_rows": initial_count,
        "duplicates_removed": duplicates_removed,
        "rows_kept": len(df),
        "languages_detected": df['language'].unique().tolist()
    }
    
    return df, site_stats

*****

# <b> Data preparation pipeline.</b>

### Multiple csv files reading and processing with summary.

In [None]:
# ---------------- Multi-Site CSV Pipeline with Summary ----------------
def process_all_sites_with_summary(metadata):

    # Get list of sites.
    sites = site_metadata_config.get('sites', [])

    # Prepare return variables.
    all_dfs = []
    summary_list = []

    for site_csv in sites:
        csv_directory = f"{MULTILINGUAL_DOCUMENTS_DIRECTORY_PATH}/{site_csv}"
        print(csv_directory)

        # ---------------- Get list of CSV files ----------------
        csv_files = glob.glob(os.path.join(csv_directory, "*.csv"))
        logger.info(f"Found {len(csv_files)} CSV files.")

        # ---------------- Read all CSVs and combine ----------------
        for file in csv_files:
            if os.path.exists(file):
                input_df = pd.read_csv(file)
                df = input_df.copy()
                logger.info(f"[INFO] Processing site: {site_csv} ({len(df)} rows)")

                # Trigger cleaning of dataframes.
                df_cleaned, site_stats = preprocess_dataframe_parallel(df, metadata)
                logger.info(f"[INFO] Done {site_csv}: {site_stats['duplicates_removed']} duplicates removed, {site_stats['rows_kept']} rows kept")
                site_stats["site"] = site_csv
                summary_list.append(site_stats)
                all_dfs.append(df_cleaned)
            else:
                
                logger.info(f"[WARNING] File not found: {file}")
    
    # Merge all cleaned DataFrames
    if all_dfs:
        merged_df = pd.concat(all_dfs, ignore_index=True)
        merged_df = merged_df.drop_duplicates(subset=[metadata['text_column'], 'cleaned_text']).reset_index(drop=True)
        logger.info(f"[INFO] Merged DataFrame contains {len(merged_df)} unique rows after deduplication")
        
        # Save if requested
        save_path = f"{DATA_DIRECTORY_PATH}/{metadata['processed_file_name']}.csv"
        save_format = metadata['data_file_format']
        if save_path:
            if save_format.lower() == "csv":
                merged_df.to_csv(save_path, index=False)
            elif save_format.lower() == "parquet":
                merged_df.to_parquet(save_path, index=False)
            else:
                print(f"[WARNING] Unknown save_format '{save_format}'. Skipping save.")
            logger.info(f"[INFO] Saved merged DataFrame to {save_path}")
        
        # Create summary DataFrame
        summary_df = pd.DataFrame(summary_list)
        print("\n[INFO] Site Summary Table:")
        print(summary_df)
        
        return merged_df, summary_df
    else:
        print("[INFO] No valid data found in any site CSVs.")
        return pd.DataFrame(), pd.DataFrame()
    

In [None]:
# Get the list of sites and candidate languages configurations.
metadata = get_metadata()

# Trigger pipeline.
processed_df, summary_df = process_all_sites_with_summary(metadata)


### 1. Data collection

#### Sites
1. Global (https://www.unicef.org/)
2. Armenia (https://www.unicef.org/armenia/)
3. Bangladesh (https://www.unicef.org/bangladesh/)
4. Cambodia (https://www.unicef.org/cambodia/)
5. China (https://www.unicef.org/china/)
6. ECA (https://www.unicef.org/eca/)
7. India (https://www.unicef.org/india/)
8. Myanmar (https://www.unicef.org/myanmar)
9. Peru (https://www.unicef.org/peru/)
10. Vietnam (https://www.unicef.org/vietnam/)

#### Low level Language

#### Collect text data of press releases and articles from all candidate sites.

#### Get CSV data into dataframe and execute pre-processing steps.

1. Read data from csv files.

******

## Load processed and cleaned data.

In [None]:
save_path = f"{DATA_DIRECTORY_PATH}/{project_config['metadata']['processed_file_name']}.csv"
df = pd.read_csv(save_path)

In [None]:
df

In [None]:
import re
import unicodedata
import pandas as pd
from bs4 import BeautifulSoup
import ftfy
import langid
from tqdm.auto import tqdm
from blingfire import text_to_sentences, text_to_words

# ---- Optional tokenizers (if available) for better segmentation in some languages
try:
    import jieba             # zh
except Exception:
    jieba = None

try:
    from fugashi import Tagger  # ja
    _ja_tagger = Tagger() if 'Tagger' in globals() else None
except Exception:
    _ja_tagger = None

try:
    import pythainlp.tokenize as thai_tok  # th
except Exception:
    thai_tok = None

tqdm.pandas()  # enable progress bars on DataFrame.apply

URL_RE = re.compile(r'https?://\S+|www\.\S+', flags=re.IGNORECASE)
CTRL_RE = re.compile(r'[\u0000-\u001f\u007f\u200b\u200c\u200d]')  # control + ZW* chars

def normalize_text(text: str) -> str:
    """Unicode-safe normalization & light cleaning that respects multilingual scripts."""
    if not isinstance(text, str):
        return ""
    # Fix mojibake & odd encodings, normalize Unicode
    s = ftfy.fix_text(text)
    s = unicodedata.normalize("NFC", s)

    # Strip HTML if any
    s = BeautifulSoup(s, "html.parser").get_text(" ")

    # Remove URLs (optional): replace with a token so sentence boundaries remain sane
    s = URL_RE.sub(" <URL> ", s)

    # Remove control chars (keep emojis & CJK intact)
    s = CTRL_RE.sub(" ", s)

    # Collapse whitespace
    s = re.sub(r"\s+", " ", s).strip()

    # DO NOT lowercase by default (can harm German ß, Turkish I/ı, proper nouns, etc.)
    # If you need case-insensitive search later, use .casefold() at query time.
    return s

def detect_lang(text: str) -> str:
    """Fast, offline language detection."""
    if not text:
        return "unk"
    code, _ = langid.classify(text)
    return code

def sent_tokenize(text: str) -> list[str]:
    """Multilingual sentence segmentation via blingfire (robust & very fast)."""
    if not text:
        return []
    # blingfire returns '\n' separated sentences
    sents = text_to_sentences(text).split("\n")
    # Clean up any blanks
    return [s.strip() for s in sents if s.strip()]

def word_tokenize(text: str, lang: str) -> list[str]:
    """
    Word tokenization with multilingual fallbacks:
    - zh: jieba if available, else blingfire
    - ja: fugashi if available, else blingfire
    - th: PyThaiNLP if available, else blingfire
    - others: blingfire
    (blingfire respects CJK/emoji and is a sensible default.)
    """
    if not text:
        return []

    lang = (lang or "").split("_")[0]  # normalize like 'zh', 'ja', 'th', 'en', etc.

    if lang == "zh" and jieba is not None:
        return [t.strip() for t in jieba.cut(text) if t.strip()]
    if lang == "ja" and _ja_tagger is not None:
        return [w.surface for w in _ja_tagger(text) if w.surface.strip()]
    if lang == "th" and thai_tok is not None:
        return [t.strip() for t in thai_tok.word_tokenize(text) if t.strip()]

    # Default: blingfire word breaker -> returns a space-separated string
    return [w for w in text_to_words(text).split() if w]

def preprocess_row(row):
    """
    Apply the full pipeline to a single row with columns:
    - 'title' (kept as-is)
    - 'text'  (processed)
    Returns a dict to be expanded into new columns.
    """
    raw = row.get("text", "")
    clean = normalize_text(raw)
    lang = detect_lang(clean) if clean else "unk"
    sents = sent_tokenize(clean)
    tokens = word_tokenize(clean, lang)

    return {
        "ai_clean_text": clean,
        "ai_lang": lang,
        "ai_sentences": sents,
        "ai_tokens": tokens,
        "ai_n_sentences": len(sents),
        "ai_n_tokens": len(tokens),
    }

# ------------- Example usage -------------
# Suppose you already have a DataFrame df with columns: 'title', 'text'
# df = pd.read_csv("your_multilingual.csv")  # for example

# Process with a progress bar:
# (If you see tokenizer parallelism warnings anywhere else in your stack, you can:
#   import os; os.environ["TOKENIZERS_PARALLELISM"] = "false"
# They typically come from Hugging Face tokenizers, not used here.)
results = df.progress_apply(preprocess_row, axis=1, result_type="expand")

# Merge back with original columns (keeping title/text)
df_processed = pd.concat([df[["title", "text"]], results], axis=1)

# Optionally, persist efficiently:
# df_processed.to_parquet("multilingual_processed.parquet", index=False)
# Or CSV (loses Python list types unless you json-encode them):
# df_processed.to_json("multilingual_processed.jsonl", orient="records", lines=True, force_ascii=False)


In [None]:
df_processed

In [None]:
summary_df

******

## <b>Model Embedding Pipeline</b>

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # avoids fork warning

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from concurrent.futures import ThreadPoolExecutor, as_completed

class EmbeddingPipeline:
    def __init__(self, model_name="all-MiniLM-L6-v2", index_dir="./index_store", batch_size=512, n_workers=4):
        self.model = SentenceTransformer(model_name)
        self.index_dir = index_dir
        os.makedirs(index_dir, exist_ok=True)
        self.index = None
        self.batch_size = batch_size
        self.dim = None
        self.texts = []
        self.n_workers = n_workers

    def generate_embeddings_batch(self, texts, save_path=None):
        """Generate embeddings in batches, normalize for cosine similarity, optionally save to disk."""
        all_embeddings = []
        for start in range(0, len(texts), self.batch_size):
            batch_texts = texts[start:start+self.batch_size]
            batch_embeddings = self.model.encode(batch_texts, convert_to_numpy=True)
            faiss.normalize_L2(batch_embeddings)
            all_embeddings.append(batch_embeddings)
            print(f"Processed batch {start}-{start+len(batch_texts)}")
        all_embeddings = np.vstack(all_embeddings)
        print(f"Total embeddings shape: {all_embeddings.shape}")

        if save_path:
            np.save(save_path, all_embeddings)
            print(f"Saved embeddings to {save_path}")

        self.dim = all_embeddings.shape[1]
        self.texts = texts
        return all_embeddings

    def build_index(self, embeddings, index_type="flat", **kwargs):
        """Build FAISS index (flat, hnsw, or ivf) with cosine similarity."""
        if self.dim is None:
            self.dim = embeddings.shape[1]

        if index_type == "flat":
            self.index = faiss.IndexFlatIP(self.dim)

        elif index_type == "hnsw":
            M = kwargs.get("M", 16)
            efConstruction = kwargs.get("efConstruction", 100)
            self.index = faiss.IndexHNSWFlat(self.dim, M, faiss.METRIC_INNER_PRODUCT)
            self.index.hnsw.efConstruction = efConstruction

        elif index_type == "ivf":
            nlist = kwargs.get("nlist", 100)
            quantizer = faiss.IndexFlatIP(self.dim)
            self.index = faiss.IndexIVFFlat(quantizer, self.dim, nlist, faiss.METRIC_INNER_PRODUCT)
            print("Training IVF index...")
            self.index.train(embeddings)

        else:
            raise ValueError("Unsupported index type. Use 'flat', 'hnsw', or 'ivf'.")

        # Add in batches
        for i in range(0, embeddings.shape[0], self.batch_size):
            self.index.add(embeddings[i:i+self.batch_size])

    def save_index(self, name="index.faiss"):
        path = os.path.join(self.index_dir, name)
        faiss.write_index(self.index, path)
        print(f"Index saved at {path}")

    def load_index(self, name="index.faiss"):
        path = os.path.join(self.index_dir, name)
        self.index = faiss.read_index(path)
        print(f"Index loaded from {path}")

    def search(self, query, top_k=5, as_df=True):
        query_embedding = self.model.encode([query], convert_to_numpy=True)
        faiss.normalize_L2(query_embedding)
        D, I = self.index.search(query_embedding, top_k)

        if as_df:
            results = []
            for rank, (score, idx) in enumerate(zip(D[0], I[0]), start=1):
                results.append({
                    "query": query,
                    "rank": rank,
                    "index": int(idx),
                    "cosine_similarity": float(score),
                    "text": self.texts[idx] if self.texts else None
                })
            return pd.DataFrame(results)
        return D, I

    def search_batch(self, queries, top_k=5):
        query_embeddings = self.model.encode(queries, convert_to_numpy=True)
        faiss.normalize_L2(query_embeddings)
        D, I = self.index.search(query_embeddings, top_k)

        all_results = []
        for q_idx, query in enumerate(queries):
            for rank, (score, idx) in enumerate(zip(D[q_idx], I[q_idx]), start=1):
                all_results.append({
                    "query": query,
                    "rank": rank,
                    "index": int(idx),
                    "cosine_similarity": float(score),
                    "text": self.texts[idx] if self.texts else None
                })
        return pd.DataFrame(all_results)

In [None]:
pipeline = EmbeddingPipeline(batch_size=1024, n_workers=8)

In [None]:
sentences = df['cleaned_text']

In [None]:
# Step 1: Generate embeddings in batches
embeddings = pipeline.generate_embeddings_batch(sentences.to_list(), save_path="embeddings_100k.npy")

In [None]:
# Build HNSW index using threads
pipeline.build_index(embeddings, index_type="flat", M=16, efConstruction=100)

In [None]:

# Step 4: Save index
pipeline.save_index("faiss_hnsw_parallel.faiss")

### Load saved vector indexed data.

In [None]:
pipeline = EmbeddingPipeline()
pipeline.load("large_index.faiss", "metadata_large.pkl")

In [None]:
# Step 3: Search batch queries
queries = ["Unicef kids", "उम्मीदों की नई सुबह", "mundo"]
df_results = pipeline.search_batch(queries, top_k=5)

In [None]:
df_results

*****

# Query Processing Pipeline

### Load index for query processing.

In [None]:
# Later reload:
index = faiss.read_index("paraphrase-multilingual-MiniLM-L12-v2__indexes.faiss")

#### Query to indexed data.

In [None]:
# Query FAISS
query = "situation reports in hindi"
query_vec = model.encode([query], convert_to_numpy=True).astype("float32")
D, I = index.search(query_vec, k=3)

## Print results.

In [None]:
# Fetch metadata from SQLite
print("\nSearch results:")
I[0][0]

In [104]:
pip freeze > requirements_new.txt


Note: you may need to restart the kernel to use updated packages.


In [105]:
conda env export --no-builds > environment.yml



Note: you may need to restart the kernel to use updated packages.
