# <b>Multilingual Indexing</b>

In [None]:
import os

# Turn tokenizer parallelism on for the whole session
# (the flag was "false" earlier for safety).
os.environ.update({"TOKENIZERS_PARALLELISM": "true"})

In [None]:
# 0) Config
import os, math, time, numpy as np, pandas as pd, torch
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModel

In [None]:
# Keep tokenizer parallelism on unless the environment already set the flag.
if "TOKENIZERS_PARALLELISM" not in os.environ:
    os.environ["TOKENIZERS_PARALLELISM"] = "true"  # speed up tokenization

# Use the GPU with half precision when CUDA is available; otherwise CPU with float32.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
    DTYPE = torch.float16
else:
    DEVICE = torch.device("cpu")
    DTYPE = torch.float32

### Choose model

In [None]:
# Choose model (speed tiers):
# FAST:    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" (384d)
# BALANCE: "intfloat/multilingual-e5-base" (768d)
# QUALITY: "intfloat/multilingual-e5-large" (1024d)
# MODEL_NAME is passed to AutoTokenizer/AutoModel.from_pretrained, decides whether
# add_passage_prefix prepends "passage: ", and its last path component is used in
# the names of the embedding output files.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

In [None]:
# Load the tokenizer and encoder for MODEL_NAME. On CUDA the weights are requested
# as float16; on CPU torch_dtype=None leaves the library default in place.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModel.from_pretrained(MODEL_NAME, torch_dtype=(torch.float16 if DEVICE.type=="cuda" else None))
model.to(DEVICE).eval()

# Enable fused attention (BetterTransformer) if available.
# This stays best-effort: a missing `optimum` package is expected and ignored,
# but any other failure is reported instead of being swallowed silently, so a
# broken install or an unsupported architecture is visible to the reader.
try:
    from optimum.bettertransformer import BetterTransformer
    model = BetterTransformer.transform(model)
except ImportError:
    pass  # it's fine if not installed
except Exception as exc:
    print(f"BetterTransformer not applied ({type(exc).__name__}: {exc}); continuing with the stock model.")

In [None]:
def model_token_budget(tok, headroom=16, cap_default=512):
    """Return the token limit to use when truncating inputs for ``tok``.

    Args:
        tok: Object exposing an optional ``model_max_length`` attribute.
        headroom: Number of tokens held back from the declared maximum.
        cap_default: Maximum assumed when ``model_max_length`` is missing or
            implausibly large (above 100_000_000).

    Returns:
        ``maximum - headroom`` with a floor of 32, but never more than the
        maximum itself.
    """
    ml = getattr(tok, "model_max_length", None)
    if ml is None or ml > 100_000_000:
        ml = cap_default
    # The floor of 32 must not lift the budget above the declared maximum;
    # without the outer min() a maximum below 32 would yield a budget of 32.
    return min(int(ml), max(32, int(ml - headroom)))
# Token limit for the loaded tokenizer; used as the default max_len of embed_batch.
TOKEN_BUDGET = model_token_budget(tokenizer)

In [None]:
def add_passage_prefix(texts):
    """Prefix each text with ``"passage: "`` when MODEL_NAME is a multilingual E5 model.

    For any other model the input object is returned unchanged.
    """
    uses_e5 = "intfloat/multilingual-e5" in MODEL_NAME.lower()
    if not uses_e5:
        return texts
    # Only E5 needs "passage: " prefix
    return [f"passage: {t}" for t in texts]

In [None]:
@torch.no_grad()
def mean_pool(last_hidden_state, attention_mask):
    """Average the hidden states over dim 1, weighting each position by the mask.

    The mask gets a trailing axis and is cast to the hidden-state type so it
    broadcasts across the last axis. The divisor is clamped to 1e-6 so a row
    whose mask is all zeros does not divide by zero.
    """
    weights = attention_mask.unsqueeze(-1).type_as(last_hidden_state)
    weighted_total = torch.sum(last_hidden_state * weights, dim=1)
    weight_total = torch.clamp(torch.sum(weights, dim=1), min=1e-6)
    return torch.div(weighted_total, weight_total)

In [None]:
def embed_batch(texts, max_len=TOKEN_BUDGET):
    """Tokenize ``texts``, run the model, and return L2-normalized mean-pooled vectors.

    Args:
        texts: Batch of strings handed to the tokenizer.
        max_len: Truncation length passed to the tokenizer as ``max_length``.

    Returns:
        A float32 NumPy array holding the pooled, normalized vectors.
    """
    encoded = tokenizer(
        texts,
        max_length=max_len,
        truncation=True,
        padding=True,
        return_tensors="pt",
    )
    inputs = {name: tensor.to(DEVICE, non_blocking=True) for name, tensor in encoded.items()}

    # float16 autocast on CUDA; a plain no-grad context on any other device.
    if DEVICE.type == "cuda":
        precision_ctx = torch.autocast(device_type=DEVICE.type, dtype=torch.float16)
    else:
        precision_ctx = torch.no_grad()

    with torch.inference_mode(), precision_ctx:
        hidden = model(**inputs).last_hidden_state
        sentence_vecs = mean_pool(hidden, inputs["attention_mask"])
        sentence_vecs = torch.nn.functional.normalize(sentence_vecs, p=2, dim=1)

    # Keep float32 for FAISS stability downstream
    return sentence_vecs.to(torch.float32).cpu().numpy()

### <b>Create embeddings:</b>

Load the chunked and tokenized passages.

In [None]:
# Load the passages table; later cells read its `chunk_text`, `doc_id` and `chunk_id` columns.
df_passages = pd.read_parquet("../shared-data-library/out/df_passages.parquet")

In [None]:
# Preview the first rows of the loaded passages.
df_passages.head()

Create the inputs and outputs.

In [None]:
# 1) Inputs & outputs
# Cast the chunk column to str and materialize it as a plain Python list.
texts = (
    df_passages["chunk_text"]
    .astype(str)
    .to_list()
)
N = len(texts)

In [None]:
# Choose batch size
# tune: 512–1024 (GPU), 64–128 (CPU)
# NOTE(review): the CPU default below (256) is above the CPU range suggested here — confirm.
if DEVICE.type == "cuda":
    BATCH = 768
else:
    BATCH = 256

# Directory that receives the embedding files; created on first use.
OUT_DIR = "data/embed"
os.makedirs(OUT_DIR, exist_ok=True)

In [None]:
# Determine embedding dimensionality once (dry run on 1 example)
test_vec = embed_batch(
    add_passage_prefix([texts[0]])
)
# The last axis of the returned array is the embedding width.
DIM = test_vec.shape[-1]

In [None]:
# Use a memory-mapped array to write incrementally (resumable)
mmap_path = os.path.join(OUT_DIR, f"{MODEL_NAME.split('/')[-1]}_{DIM}d_50k_float32.mm")

# Mode "w+" creates or overwrites the file, which would wipe the rows written by
# an interrupted run while the checkpoint marker still says they are done.
# Reopen with "r+" only when a marker exists AND the file on disk has exactly the
# expected size; otherwise start from a fresh file and drop any stale marker so
# the resume logic begins at row 0.
# NOTE(review): a matching size does not prove the file was built from the same
# passages — delete the .mm/.idx pair manually after changing the input data.
_marker_path = mmap_path + ".idx"
_expected_bytes = N * DIM * np.dtype("float32").itemsize
_can_resume = (
    os.path.exists(_marker_path)
    and os.path.exists(mmap_path)
    and os.path.getsize(mmap_path) == _expected_bytes
)
if not _can_resume and os.path.exists(_marker_path):
    os.remove(_marker_path)
embs = np.memmap(mmap_path, dtype="float32", mode="r+" if _can_resume else "w+", shape=(N, DIM))

In [None]:
# Optional: resume support — the sidecar file stores how many rows an earlier
# run has already written. For a fresh run (no sidecar) start = 0.
start = 0
sidecar = mmap_path + ".idx"
if os.path.exists(sidecar):
    try:
        # Context manager so the file handle is closed right after reading.
        with open(sidecar) as f:
            start = int(f.read().strip())
    except (OSError, ValueError):
        start = 0  # unreadable or corrupt marker: redo everything
    # A marker outside [0, N] cannot describe this dataset (a negative value
    # would even index rows from the end of the array), so ignore it.
    if not 0 <= start <= N:
        start = 0

In [None]:
# Embed the passages batch by batch, checkpointing after every batch so an
# interrupted run can continue from `start`.
t0 = time.time()
for i in tqdm(range(start, N, BATCH), desc=f"Embedding on {DEVICE}", unit="batch"):
    j = min(i + BATCH, N)
    batch = add_passage_prefix(texts[i:j])
    vecs = embed_batch(batch, max_len=TOKEN_BUDGET)
    embs[i:j, :] = vecs
    # Flush the vectors first, then publish the checkpoint, so the marker never
    # points past rows that have been flushed.
    embs.flush()
    # Write the marker to a temp file and rename it into place: a crash in the
    # middle of the write leaves the previous marker intact, not a truncated one.
    tmp_sidecar = sidecar + ".tmp"
    with open(tmp_sidecar, "w") as f:
        f.write(str(j))
    os.replace(tmp_sidecar, sidecar)
    # Lightweight throughput display. The rate counts only rows embedded in this
    # session; rows restored from a checkpoint would otherwise inflate it.
    done = j
    dt = time.time() - t0
    if dt > 0:
        rate = (done - start) / dt
        tqdm.write(f"done {done}/{N} | {rate:.1f} chunks/s | ETA {(N-done)/(rate+1e-9):.1f}s")

#### Write the embeddings to a `.npy` file.

In [None]:
# Convert memmap to .npy cleanly
final_npy = os.path.join(OUT_DIR, f"{MODEL_NAME.split('/')[-1]}_{DIM}d_50k_float32.npy")
np.save(final_npy, np.asarray(embs))

# Clean up the resume marker. It can legitimately be absent (this cell was
# re-run, or no batch was embedded), so a missing file must not raise.
if os.path.exists(sidecar):
    os.remove(sidecar)
print("Saved:", final_npy)

### <b>Create DENSE index using FAISS.</b>

Stable global ID to align all stores (FAISS / Elasticsearch / SQLite)

In [60]:
# 1) Stable global ID to align all stores (FAISS / Elasticsearch / SQLite)
# Format: "<doc_id>:<chunk_id>", with chunk_id cast to int before it is stringified.
df_passages["global_chunk_id"] = (
    df_passages["doc_id"].astype(str)
    .add(":")
    .add(df_passages["chunk_id"].astype(int).astype(str))
)

In [62]:
# Preview the frame again, now including the global_chunk_id column.
df_passages.head()

Unnamed: 0,doc_id,chunk_id,site,title,lang,chunk_text,chunk_tokens,sent_start,sent_end,preview,global_chunk_id
0,0,0,armenia__textcontent_article,Երբեք չէի պատկերացնի,en,"""I have never thought that I can do important ...",288,0,0,"""I have never thought that I can do important ...",0:0
1,1,0,armenia__textcontent_article,Երբեք չէի պատկերացնի,en,We spoke to Heghine for a long time and she of...,350,0,0,We spoke to Heghine for a long time and she of...,1:0
2,1,1,armenia__textcontent_article,Երբեք չէի պատկերացնի,en,"responsibility, this is her opportunity to als...",68,0,0,"responsibility, this is her opportunity to als...",1:1
3,2,0,armenia__textcontent_article,Աղետներին պատրաստ դպրոց` սահմանամերձ գյուղում,hy,Տավուշի մարզի Ներքին Ծաղկավան գյուղի դպրոցի VI...,174,0,0,Տավուշի մարզի Ներքին Ծաղկավան գյուղի դպրոցի VI...,2:0
4,3,0,armenia__textcontent_article,Աղետներին պատրաստ դպրոց` սահմանամերձ գյուղում,hy,Վերջին երեք տարիներին ՅՈՒՆԻՍԵՖ-ն այս ուղղությա...,181,0,0,Վերջին երեք տարիներին ՅՈՒՆԻՍԵՖ-ն այս ուղղությա...,3:0


#### Prepare metadata: Choose the columns you’ll want at retrieval time (add more if you need)

In [65]:
# 2) Choose the columns you’ll want at retrieval time (add more if you need)
metadata_columns = [
    "global_chunk_id", "doc_id", "chunk_id", "site", "lang",
    "title", "preview", "chunk_tokens"
]

# The site column is normalized from "_site" to "site" BEFORE selecting.
# Renaming after the selection can never take effect, because the selection
# itself already requires a column called "site".
meta_source = df_passages
if "site" not in meta_source.columns and "_site" in meta_source.columns:
    meta_source = meta_source.rename(columns={"_site": "site"})

# Fail with an explicit list of what is missing (for example when the cell that
# adds global_chunk_id has not been run) instead of a generic pandas KeyError.
missing_columns = [col for col in metadata_columns if col not in meta_source.columns]
if missing_columns:
    raise KeyError(f"df_passages is missing metadata columns: {missing_columns}")

df_meta = meta_source[metadata_columns].copy()

In [66]:
# Preview the metadata frame that will be saved.
df_meta.head()

Unnamed: 0,global_chunk_id,doc_id,chunk_id,site,lang,title,preview,chunk_tokens
0,0:0,0,0,armenia__textcontent_article,en,Երբեք չէի պատկերացնի,"""I have never thought that I can do important ...",288
1,1:0,1,0,armenia__textcontent_article,en,Երբեք չէի պատկերացնի,We spoke to Heghine for a long time and she of...,350
2,1:1,1,1,armenia__textcontent_article,en,Երբեք չէի պատկերացնի,"responsibility, this is her opportunity to als...",68
3,2:0,2,0,armenia__textcontent_article,hy,Աղետներին պատրաստ դպրոց` սահմանամերձ գյուղում,Տավուշի մարզի Ներքին Ծաղկավան գյուղի դպրոցի VI...,174
4,3:0,3,0,armenia__textcontent_article,hy,Աղետներին պատրաստ դպրոց` սահմանամերձ գյուղում,Վերջին երեք տարիներին ՅՈՒՆԻՍԵՖ-ն այս ուղղությա...,181


#### Save the metadata under a file name bound to the model name.

In [67]:
# 3) Save to Parquet (this is the file you’ll later load as meta)
# Derive the tag from the chosen model instead of repeating it by hand, so the
# metadata file name cannot drift from the embedding file names, which use the
# same last path component of MODEL_NAME.
MODEL_TAG = MODEL_NAME.split("/")[-1]

In [None]:
# Write the metadata next to the other shared artifacts. The directory is
# created first because to_parquet does not create missing parent folders.
meta_dir = '../shared-data-library/metadata/'
os.makedirs(meta_dir, exist_ok=True)
meta_path = os.path.join(meta_dir, f"{MODEL_TAG}__meta.parquet")
df_meta.to_parquet(meta_path, index=False)

print("Saved meta:", meta_path, "| rows:", len(df_meta))

Saved meta: data/embed/paraphrase-multilingual-MiniLM-L12-v2__meta.parquet | rows: 51968


****