In [1]:
import os
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
# Install dependencies; comment out the next line if chromadb, transformers and torch are already installed
!pip install --quiet chromadb transformers torch

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.6/19.6 MB[0m [31m81.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m92.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m71.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━

In [2]:

import os
import json
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import chromadb

# Config
# Model identifier handed to the tokenizer/model `from_pretrained` loaders.
MODEL_NAME = 'intfloat/e5-base-v2'

# Directory of the persistent Chroma store and the collection kept inside it.
CHROMA_PATH = 'hajj_e5_chroma'
COLLECTION_NAME = 'hajj_e5'

# Prefixes (one for passages, one for queries)
PASSAGE_PREFIX, QUERY_PREFIX = 'passage: ', 'query: '

# Paths to load data: chunk records (JSON) and their precomputed embeddings (NPY)
# live in the same input directory.
_INPUT_DIR = '/kaggle/input/rijvxetjkr3wkr'
JSON_PATH = f'{_INPUT_DIR}/hajj_chunks_e5.json'
NPY_PATH = f'{_INPUT_DIR}/hajj_embeddings_e5.npy'


In [4]:

# Load model and tokenizer for query encoding
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# `.to()` and `.eval()` both return the module itself, so loading, device
# placement and switching to inference mode are chained into one statement.
model = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()

# Helper to encode and normalise text
def embed_query(text: str):
    """Embed a single query string as an L2-normalised vector.

    The text is prefixed with ``QUERY_PREFIX``, tokenised (truncated to at
    most 512 tokens) and passed through ``model`` without gradient tracking.
    Token embeddings are mean-pooled using the attention mask as weights, so
    only positions the mask marks with 1 contribute to the average.

    Args:
        text: Raw query text, without the query prefix.

    Returns:
        A NumPy array with the batch dimension removed, scaled to unit L2
        norm; an all-zero vector is returned unscaled.
    """
    batch = tokenizer(QUERY_PREFIX + text, return_tensors='pt', truncation=True, max_length=512)
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    with torch.no_grad():
        hidden_states = model(**batch).last_hidden_state
        # Extra trailing axis lets the mask broadcast over the hidden dimension.
        weights = batch['attention_mask'].unsqueeze(-1)
        pooled = (hidden_states * weights).sum(dim=1) / weights.sum(dim=1)
        vector = pooled.squeeze(0).cpu().numpy()

    # Normalise, leaving a zero vector untouched to avoid dividing by zero.
    length = np.linalg.norm(vector)
    return vector / length if length > 0 else vector


In [5]:

# Load chunks and embeddings
with open(JSON_PATH, 'r', encoding='utf-8') as f:
    chunks = json.load(f)
embeddings = np.load(NPY_PATH)

print(f"Loaded {len(chunks)} chunks and embeddings {embeddings.shape}")

# Each chunk must be paired with exactly one embedding row; stop here with a
# clear message instead of indexing misaligned data.
if embeddings.ndim != 2 or embeddings.shape[0] != len(chunks):
    raise ValueError(
        f"Expected one embedding row per chunk, got {len(chunks)} chunks "
        f"and embeddings of shape {embeddings.shape}"
    )

# Initialise Chroma client
client = chromadb.PersistentClient(path=CHROMA_PATH)

# Drop existing collection if needed, so re-running this cell starts from an
# empty collection. Deletion stays best-effort (the call is expected to fail
# when the collection does not exist yet), but the reason is reported rather
# than silently discarded.
try:
    client.delete_collection(name=COLLECTION_NAME)
except Exception as exc:
    print(f"Collection '{COLLECTION_NAME}' was not dropped: {exc!r}")

collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    metadata={'hnsw:space': 'cosine'}
)

# Prepare ids, documents, metadatas
ids = [f"chunk_{c['chunk_id']}" for c in chunks]
texts = [c['text'] for c in chunks]
metadatas = [{'start_token': c['start_token'], 'end_token': c['end_token']} for c in chunks]

# Add to collection in one call (small dataset)
collection.add(
    ids=ids,
    documents=texts,
    metadatas=metadatas,
    embeddings=embeddings.tolist()
)

print(f"Collection '{COLLECTION_NAME}' now has {collection.count()} embeddings.")


Loaded 400 chunks and embeddings (400, 768)
Collection 'hajj_e5' now has 400 embeddings.


In [9]:

# Semantic search with optional lexical re-ranking
def search(query_str, top_k=10, re_rank=True):
    """Query the collection for the chunks closest to ``query_str``.

    The query is embedded with ``embed_query`` and sent to ``collection`` as a
    single query embedding. Every hit is annotated with a lexical overlap
    score: the number of distinct lower-cased, whitespace-separated tokens the
    query and the chunk text have in common.

    Args:
        query_str: Free-text query.
        top_k: Number of results requested from the collection.
        re_rank: When True, hits are reordered by descending lexical score.
            When False, the order returned by the collection is kept.

    Returns:
        A list of dicts with keys ``id``, ``distance``, ``text``, ``metadata``
        and ``lexical_score``. The score is always present, so callers can
        read it regardless of ``re_rank``.
    """
    query_embed = embed_query(query_str)
    result = collection.query(query_embeddings=[query_embed.tolist()], n_results=top_k)

    query_tokens = set(query_str.lower().split())
    hits = []
    # A single query embedding was sent, so each field holds one inner list.
    for id_, dist, doc, meta in zip(
        result['ids'][0],
        result['distances'][0],
        result['documents'][0],
        result['metadatas'][0],
    ):
        # Tolerate a missing document so scoring never fails when re_rank is off.
        text_tokens = set((doc or '').lower().split())
        hits.append({
            'id': id_,
            'distance': dist,
            'text': doc,
            'metadata': meta,
            'lexical_score': len(query_tokens & text_tokens),
        })

    # Lexical re-ranking based on word overlap. The sort is stable, so hits
    # with equal scores keep the order in which the collection returned them.
    if re_rank:
        hits.sort(key=lambda h: h['lexical_score'], reverse=True)
    return hits


In [13]:
# Perform a search
query = "Tawaf steps"
hits = search(query, top_k=20, re_rank=True)

print(f"Top results for query: '{query}'")
# Only the first five hits are shown; line breaks inside a chunk are replaced
# by spaces so each result prints as a single paragraph.
for i, h in enumerate(hits[:5], start=1):
    snippet = " ".join(h['text'].split("\n"))
    print(f"\nRank {i}: \ndistance={h['distance']:.4f}, \nlexical_score={h['lexical_score']}, \ntext snippet={snippet}\n\n")


Top results for query: 'Tawaf steps'

Rank 1: 
distance=0.1527, 
lexical_score=2, 
text snippet=##cautionary measures - special wheelchairs available for elderly and those in need sunnahs of tawaf : - al - ithtiba : men expose right shoulder during tawaf - ar - ramal : fast walking with small steps during first three circuits if possible - supplication : increase supplications, especially say between yemeni corner and black stone : " rabbanaa aatinaa fid - dunyaa hassanatan wa fil aakhirati hassanah, waqinaa'adhaab - an - naar " two rak'aas after tawaf



Rank 2: 
distance=0.1275, 
lexical_score=1, 
text snippet=- don't push in crowds - avoid placing feet on side brushes tawaf detailed instructions starting and ending tawaf : - begin from the black stone corner ( green sign indicates this on upper floors ) - perform takbeer ( allaahu akbar ) when passing the black stone each round - point toward black stone with hand, then begin tawaf with ka'ba to your left ( counterclockwise ) - kiss