# <b>Creating DENSE and SPARSE indexes with different indexing techniques.</b>

#### 1) Dense index (Semantic) with FAISS.

FAISS is perfect for 50K - 500K vectors on one machine.

In [12]:
import numpy as np, faiss
from pathlib import Path
import os, math, time, numpy as np, pandas as pd, torch

from elasticsearch import Elasticsearch, helpers

In [13]:
# Tag of the embedding model; it prefixes the metadata, embedding and index filenames in this notebook.
MODEL_TAG = "paraphrase-multilingual-MiniLM-L12-v2"
# Parquet file with the per-chunk metadata for this model tag.
METADATA_FILENAME = f"{MODEL_TAG}__meta.parquet"
METADATA_FILENAME

'paraphrase-multilingual-MiniLM-L12-v2__meta.parquet'

In [3]:
# Per-chunk metadata table; columns shown in the preview below are
# global_chunk_id, doc_id, chunk_id, site, lang, title, preview, chunk_tokens.
df_meta = pd.read_parquet(f"data/embed/{METADATA_FILENAME}")  # includes global_chunk_id, etc.

In [None]:
# Peek at the first rows to sanity-check the metadata columns.
df_meta.head()

Unnamed: 0,global_chunk_id,doc_id,chunk_id,site,lang,title,preview,chunk_tokens
0,0:0,0,0,armenia__textcontent_article,en,Երբեք չէի պատկերացնի,"""I have never thought that I can do important ...",288
1,1:0,1,0,armenia__textcontent_article,en,Երբեք չէի պատկերացնի,We spoke to Heghine for a long time and she of...,350
2,1:1,1,1,armenia__textcontent_article,en,Երբեք չէի պատկերացնի,"responsibility, this is her opportunity to als...",68
3,2:0,2,0,armenia__textcontent_article,hy,Աղետներին պատրաստ դպրոց` սահմանամերձ գյուղում,Տավուշի մարզի Ներքին Ծաղկավան գյուղի դպրոցի VI...,174
4,3:0,3,0,armenia__textcontent_article,hy,Աղետներին պատրաստ դպրոց` սահմանամերձ գյուղում,Վերջին երեք տարիներին ՅՈՒՆԻՍԵՖ-ն այս ուղղությա...,181


Load the embedding file that corresponds to the metadata loaded above.

In [5]:
# Embedding matrix stored as .npy. The filename suggests 384-dim float32
# vectors for ~50k chunks — TODO confirm against the array actually loaded.
EMBED_FILENAME = f"{MODEL_TAG}_384d_50k_float32.npy"
EMBED_FILENAME

'paraphrase-multilingual-MiniLM-L12-v2_384d_50k_float32.npy'

In [6]:
# Load the embedding matrix produced by the earlier embedding step.
# np.ascontiguousarray returns a C-contiguous float32 array (the layout FAISS
# consumes) and, unlike .astype("float32"), skips the extra full copy when the
# file already has that dtype and layout.
embeddings = np.ascontiguousarray(
    np.load(f"data/embed/{EMBED_FILENAME}"), dtype=np.float32
)  # (N, d); expected to be L2-normalized upstream — not verified here


In [7]:
# Validate dimensions.
# The index needs a non-empty 2-D (n_vectors, dim) matrix; check it here so a
# bad file fails with a clear message instead of an IndexError on shape[1] or
# an opaque error later when the vectors are added to the index.
if embeddings.ndim != 2 or 0 in embeddings.shape:
    raise ValueError(
        f"Expected a non-empty 2-D embedding matrix, got shape {embeddings.shape}"
    )
dim = embeddings.shape[1]

## Build the FAISS index and write it to disk.

In [8]:
## Filename of the serialized FAISS index, prefixed with the model tag.
INDEX_FILENAME = f"{MODEL_TAG}__indexes.faiss"

In [9]:
# Display the resolved index filename.
INDEX_FILENAME

'paraphrase-multilingual-MiniLM-L12-v2__indexes.faiss'

In [10]:
# Use an inner-product (IP) flat index: on L2-normalized vectors the inner
# product equals cosine similarity.
# NOTE(review): normalization is assumed from the upstream embedding step; it
# is not checked in this notebook.
index = faiss.IndexFlatIP(dim)

# Add all embedding rows to the index.
index.add(embeddings)

In [11]:
# Write index to the shared data library.
# Create the target directory first: faiss.write_index does not create missing
# parent directories, so the write would fail on a fresh checkout.
index_path = Path("../shared-data-library/indexes/faiss") / INDEX_FILENAME
index_path.parent.mkdir(parents=True, exist_ok=True)
faiss.write_index(index, str(index_path))

****

## <b>Keyword based indexes (SPARSE)</b>

### <b>2. Prepare data & bulk-ingest passages</b>

In [16]:
# Load the passage chunks that will be ingested into Elasticsearch.
# NOTE(review): the previous comment called this "the duplicates file" —
# confirm whether df_passages.parquet is deduplicated or still holds duplicates.
df_passages = pd.read_parquet("../shared-data-library/out/df_passages.parquet")

In [17]:
# Ensure stable IDs of the form "<doc_id>:<chunk_id>".
# assign() returns a new frame, so the object loaded from parquet is not mutated.
doc_part = df_passages["doc_id"].astype(str)
chunk_part = df_passages["chunk_id"].astype(int).astype(str)
df_passages = df_passages.assign(global_chunk_id=doc_part + ":" + chunk_part)

In [19]:
# Minimal columns for ES.
# (The former rename(columns={"site": "site"}) mapped a column to itself and
# was a no-op, so it is dropped.)
es_columns = [
    "global_chunk_id", "doc_id", "chunk_id", "site", "lang",
    "title", "chunk_text", "preview", "chunk_tokens",
]
# Only the three text fields get an empty-string default; missing values in
# the other columns are left as they are.
to_index = df_passages[es_columns].fillna({"title": "", "preview": "", "chunk_text": ""})

In [22]:
# Create the Elasticsearch client.
# Credentials come from the environment so that no secret is stored in the
# notebook (or in its saved outputs / version control):
#   ES_URL       optional, defaults to the local single-node instance
#   ES_USERNAME  optional, defaults to "elastic"
#   ES_PASSWORD  required
es_password = os.environ.get("ES_PASSWORD")
if not es_password:
    raise RuntimeError(
        "Set the ES_PASSWORD environment variable before creating the Elasticsearch client."
    )
es = Elasticsearch(
    os.environ.get("ES_URL", "http://localhost:9200"),
    basic_auth=(os.environ.get("ES_USERNAME", "elastic"), es_password),
)


In [24]:
def gen_actions(df, index_name="passages_bm25"):
    """Yield one Elasticsearch bulk action per row of ``df``.

    Args:
        df: DataFrame with a ``global_chunk_id`` column; every column of the
            row is copied into the document body.
        index_name: Target index. Defaults to ``"passages_bm25"``, the name
            that used to be hard-coded, so existing calls behave the same.

    Yields:
        dict: An action with ``_index``, ``_id`` (the row's
        ``global_chunk_id``) and ``_source`` (column name -> row value).
    """
    for row in df.itertuples(index=False):
        yield {
            "_index": index_name,
            # The chunk id doubles as the document id.
            "_id": row.global_chunk_id,
            "_source": row._asdict(),
        }

In [25]:
# Ingest.
# Per-request transport options such as request_timeout are set through
# es.options(); passing them as API keyword arguments is deprecated in the
# 8.x Python client and triggers a DeprecationWarning.
# The cell's value is the (success_count, errors) tuple returned by bulk().
helpers.bulk(es.options(request_timeout=180), gen_actions(to_index))

  helpers.bulk(es, gen_actions(to_index), request_timeout=180)


(51968, [])

In [27]:
# Refresh the sparse index so the documents ingested above are visible to
# searches immediately rather than after the next periodic refresh.
es.indices.refresh(index="passages_bm25")

ObjectApiResponse({'_shards': {'total': 2, 'successful': 1, 'failed': 0}})

******