In [7]:
import os
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'
# Uncomment if chromadb is not installed
!pip install --quiet chromadb transformers torch

In [8]:

import os
import json
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import chromadb

# --- Embedding model and persistent vector store ---------------------------
MODEL_NAME = "intfloat/e5-base-v2"
CHROMA_PATH = "hajj_e5_chroma"
COLLECTION_NAME = "hajj_e5"

# --- Text prefixes ---------------------------------------------------------
# QUERY_PREFIX is prepended to every query before it is encoded.
# PASSAGE_PREFIX is not used by the visible cells; presumably it was applied
# when the stored passage embeddings were computed -- TODO confirm.
PASSAGE_PREFIX, QUERY_PREFIX = "passage: ", "query: "

# --- Input files: chunk texts (JSON) and their embeddings (NPY) -------------
_DATASET_DIR = "/kaggle/input/rijvxetjkr3wkr"
JSON_PATH = _DATASET_DIR + "/hajj_chunks_e5.json"
NPY_PATH = _DATASET_DIR + "/hajj_embeddings_e5.npy"


In [9]:

# Query encoder: tokenizer and model for MODEL_NAME, placed on the GPU when
# one is available and on the CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# .to() and .eval() both return the module itself, so the calls are chained.
# The model is only used for inference here, hence eval mode.
model = AutoModel.from_pretrained(MODEL_NAME).to(device).eval()

# Helper to encode and normalise text
def embed_query(text: str):
    """Encode a query string into a normalised embedding vector.

    The text is prefixed with ``QUERY_PREFIX``, tokenized (truncated to at
    most 512 tokens), passed through ``model``, and the token embeddings are
    averaged over the positions selected by the attention mask.

    Args:
        text: Raw query text (without the prefix).

    Returns:
        numpy.ndarray: The pooled embedding with the batch dimension squeezed
        out, scaled to unit L2 norm. A zero vector is returned unscaled.
    """
    batch = tokenizer(QUERY_PREFIX + text, return_tensors='pt', truncation=True, max_length=512)
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        hidden = model(**batch).last_hidden_state
        # Broadcast the mask over the embedding axis so masked-out positions
        # contribute nothing to the sum
        keep = batch['attention_mask'].unsqueeze(-1)
        pooled = (hidden * keep).sum(dim=1) / keep.sum(dim=1)
    vector = pooled.squeeze(0).cpu().numpy()
    # Scale to unit length; skip the division for an all-zero vector
    length = np.linalg.norm(vector)
    return vector / length if length > 0 else vector


In [10]:
# Optional cross-encoder used to re-score retrieved passages.
# Loading is best-effort: on any failure the reason is printed and the
# USE_CROSS_ENCODER flag is set to False so the rest of the notebook still runs.
try:
    from transformers import (
        AutoTokenizer as CETokenizer,
        AutoModelForSequenceClassification,
    )
    ce_model_name = 'cross-encoder/ms-marco-MiniLM-L-6-v2'
    ce_tokenizer = CETokenizer.from_pretrained(ce_model_name)
    ce_model = AutoModelForSequenceClassification.from_pretrained(ce_model_name)
    ce_model.to(device)
    ce_model.eval()
except Exception as load_error:
    print(f"Failed to load cross-encoder model: {load_error}")
    USE_CROSS_ENCODER = False
else:
    # Reached only when every step above succeeded
    USE_CROSS_ENCODER = True

# Define re-ranking function using cross-encoder
def rerank_cross_encoder(query_str, candidates):
    """
    Score each candidate against the query with the cross-encoder and reorder.

    Every candidate dict gains a ``cross_score`` key, and the list is sorted
    in place, best score first. When the cross-encoder is unavailable or the
    list is empty, the input is returned untouched.

    Args:
        query_str (str): The user query.
        candidates (list[dict]): Hits from vector search; each needs a 'text' key.
    Returns:
        list[dict]: The same list object, ordered by descending ``cross_score``.
    """
    if not (USE_CROSS_ENCODER and len(candidates)):
        return candidates
    # Pair the query with every passage and encode them as one padded batch
    passages = [cand['text'] for cand in candidates]
    batch = ce_tokenizer(
        [query_str] * len(passages),
        passages,
        padding=True,
        truncation=True,
        return_tensors='pt',
    ).to(device)
    with torch.no_grad():
        raw = ce_model(**batch).logits.squeeze()
    # squeeze() gives a 0-d tensor for a single candidate; flatten() makes the
    # result 1-D again so it can be zipped with the candidates
    for cand, value in zip(candidates, raw.cpu().numpy().flatten().tolist()):
        cand['cross_score'] = float(value)
    # Higher cross-encoder score means more relevant
    candidates.sort(key=lambda cand: cand.get('cross_score', 0), reverse=True)
    return candidates


In [11]:

# Load chunks and embeddings
with open(JSON_PATH, 'r', encoding='utf-8') as f:
    chunks = json.load(f)
embeddings = np.load(NPY_PATH)

print(f"Loaded {len(chunks)} chunks and embeddings {embeddings.shape}")

# Prepare ids, documents, metadatas.
# This is done (and validated) BEFORE the stored collection is dropped below:
# a malformed chunk or a chunk/embedding mismatch then fails here instead of
# inside `collection.add`, which would leave the persisted collection empty.
ids = [f"chunk_{c['chunk_id']}" for c in chunks]
texts = [c['text'] for c in chunks]
metadatas = [{'start_token': c['start_token'], 'end_token': c['end_token']} for c in chunks]

if embeddings.ndim != 2 or embeddings.shape[0] != len(chunks):
    raise ValueError(
        f"Expected one embedding row per chunk, got {len(chunks)} chunks "
        f"and embeddings of shape {embeddings.shape}"
    )
if len(set(ids)) != len(ids):
    raise ValueError("chunk_id values must be unique to be used as collection ids")

# Initialise Chroma client (persists to CHROMA_PATH)
client = chromadb.PersistentClient(path=CHROMA_PATH)

# Drop any collection left over from a previous run so the rebuild starts
# empty. Deliberately best-effort: a failure here (presumably "collection does
# not exist" on a first run) is ignored.
try:
    client.delete_collection(name=COLLECTION_NAME)
except Exception:
    pass

collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    metadata={'hnsw:space': 'cosine'}
)

# Add to collection in one call (small dataset)
collection.add(
    ids=ids,
    documents=texts,
    metadatas=metadatas,
    embeddings=embeddings.tolist()
)

print(f"Collection '{COLLECTION_NAME}' now has {collection.count()} embeddings.")


Loaded 400 chunks and embeddings (400, 768)
Collection 'hajj_e5' now has 400 embeddings.


In [21]:
# Search helper: vector retrieval with optional re-ranking
def search(query_str, top_k=10, re_rank=True):
    """
    Retrieve passages for a query from the vector store, optionally re-ranked.

    Args:
        query_str (str): User query.
        top_k (int): Number of nearest neighbours requested from the collection.
        re_rank (bool): When True, reorder the hits with the cross-encoder, or
            by lexical overlap if the cross-encoder is unavailable.
    Returns:
        list[dict]: One dict per hit with keys 'id', 'distance', 'text',
        'metadata' and 'lexical_score'. Without re-ranking the hits keep the
        order returned by the collection.
    """
    response = collection.query(
        query_embeddings=[embed_query(query_str).tolist()],
        n_results=top_k,
    )
    # Lexical score = number of distinct lowercase, whitespace-separated
    # query words that also occur in the passage
    wanted = set(query_str.lower().split())
    # A single query was sent, so only the first result row is read
    hits = [
        {
            'id': hit_id,
            'distance': float(distance),
            'text': passage,
            'metadata': metadata,
            'lexical_score': len(wanted.intersection(passage.lower().split())),
        }
        for hit_id, distance, passage, metadata in zip(
            response['ids'][0],
            response['distances'][0],
            response['documents'][0],
            response['metadatas'][0],
        )
    ]
    if not re_rank:
        return hits
    # Cross-encoder scoring first; it hands the hits back unchanged when the
    # model is unavailable, in which case lexical overlap decides the order
    hits = rerank_cross_encoder(query_str, hits)
    if not USE_CROSS_ENCODER:
        hits.sort(key=lambda hit: hit['lexical_score'], reverse=True)
    return hits


In [32]:
# Run a sample query end-to-end and print the five best-ranked passages
query = "buy food"
hits = search(query, top_k=40, re_rank=True)
print(f"Top results for query: '{query}'")
for i, h in enumerate(hits[:5], start=1):
    # Flatten newlines so each passage prints as a single line (full text, not truncated)
    snippet = h['text'].replace("\n", " ")
    print(
        f"\nRank {i}: \n"
        f"distance={h['distance']:.4f}, \n"
        f"lexical_score={h['lexical_score']}, \n"
        f"text snippet={snippet}\n\n"
    )


Top results for query: 'buy food'

Rank 1: 
distance=0.1793, 
lexical_score=2, 
text snippet=when going out during the day, avoid direct exposure to the sun and use a light coloured parasol - try to get enough sleep at night and avoid staying up late, because lack of sleep exposes the body to stress and lowers resistance protect yourself from food poisoning prevention guidelines : - avoid storing cooked food or eating it a long time after buying it, especially while moving for long periods - be careful when storing cooked food at room temperature for more than two hours as this leads to the proliferation of germs and the possibility of food poisoning - make sure to wash fruits and vegetables thoroughly before eating them - do not buy food from street vendors -



Rank 2: 
distance=0.2429, 
lexical_score=2, 
text snippet=eat regularly and drink plenty of fluids to prevent dehydration - do not buy food from street vendors face mask usage usage requirements : - prevent spread of infectiou