In [1]:
# Install required packages (if not already installed)
!pip install --quiet chromadb transformers torch llama-cpp-python


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.6/50.6 MB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.6/19.6 MB[0m [31m95.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [2]:

import os
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

import json
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import chromadb
from llama_cpp import Llama

# Config for embedding model and Chroma
MODEL_NAME = 'intfloat/e5-base-v2'
CHROMA_PATH = '/kaggle/input/database-haj/hajj_e5_chroma_backup'
COLLECTION_NAME = 'hajj_e5'
PASSAGE_PREFIX = 'passage: '
QUERY_PREFIX = 'query: '
JSON_PATH = '/kaggle/input/database-haj/hajj_chunks_e5.json'
NPY_PATH = '/kaggle/input/database-haj/hajj_embeddings_e5.npy'

# Path to your quantised LLM file (gguf format)
#LLM_PATH = '/path/to/mistral-7b-instruct-q4_k_m.gguf'  # TODO: replace with actual path on your Pi

# Maximum tokens for generation and context
MAX_TOKENS = 256

# Device selection
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [5]:

# Load E5 model and tokenizer for query encoding
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()

def embed_query(text: str):
    """Encode a query string into an embedding vector using E5 and normalise it."""
    input_text = QUERY_PREFIX + text
    encoded = tokenizer(input_text, return_tensors='pt', truncation=True, max_length=512)
    encoded = {k: v.to(device) for k, v in encoded.items()}
    with torch.no_grad():
        out = model(**encoded)
        token_embeds = out.last_hidden_state
        mask = encoded['attention_mask'].unsqueeze(-1)
        sum_embeds = (token_embeds * mask).sum(dim=1)
        sum_mask = mask.sum(dim=1)
        embed = (sum_embeds / sum_mask).squeeze(0).cpu().numpy()
    norm = np.linalg.norm(embed)
    if norm > 0:
        embed = embed / norm
    return embed


In [7]:

# Copy the folder into a writable location (if it came from a read-only dataset)
import shutil
shutil.copytree('/kaggle/input/database-haj/hajj_e5_chroma_backup', '/kaggle/working/hajj_e5_chroma')

# Then point Chroma at the copy
CHROMA_PATH = '/kaggle/working/hajj_e5_chroma'
client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = client.get_or_create_collection(name='hajj_e5', metadata={'hnsw:space': 'cosine'})

# Helper search function with lexical re-ranking as fallback
def search(query_str: str, top_k: int = 10, re_rank: bool = True):
    query_embed = embed_query(query_str)
    result = collection.query(query_embeddings=[query_embed.tolist()], n_results=top_k)
    ids = result['ids'][0]
    dists = result['distances'][0]
    docs = result['documents'][0]
    metas = result['metadatas'][0]
    hits = []
    for id_, dist, doc, meta in zip(ids, dists, docs, metas):
        hits.append({'id': id_, 'distance': float(dist), 'text': doc, 'metadata': meta})
    if re_rank:
        query_tokens = set(query_str.lower().split())
        for h in hits:
            text_tokens = set(h['text'].lower().split())
            h['lexical_score'] = len(query_tokens & text_tokens)
        hits.sort(key=lambda x: x['lexical_score'], reverse=True)
    return hits


In [27]:

def build_prompt(question: str, sources: list):
    """Construct a prompt for the LLM using the question and retrieved sources."""
    prompt_lines = []
    prompt_lines.append("You are an assistant answering questions about Hajj and Umrah.")
    prompt_lines.append("Answer concisely in plain English so that the response can be read aloud.")
    prompt_lines.append("Keep the answer to no more than 3–4 sentences.")
    prompt_lines.append(f"Question: {question}")
    prompt_lines.append("Sources:")
    for i, src in enumerate(sources, 1):
        text = src['text'].replace("", " ").strip()
        if len(text) > 300:
            text = text[:297] + '...'
        prompt_lines.append(f"[{i}] {text}")
    prompt_lines.append("Answer:")
    return "".join(prompt_lines)


In [16]:
!pip -q install llama-cpp-python huggingface_hub

In [23]:
from huggingface_hub import hf_hub_download, list_repo_files

repo = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"

files = [f for f in list_repo_files(repo) if f.lower().endswith(".gguf")]
candidates = [f for f in files if "q4_k_m" in f.lower()]

if not candidates:
    raise RuntimeError("No Q4_K_M .gguf found in the repo. Available: " + ", ".join(files))

filename = candidates[0]  
print("Selected file:", filename)

LLM_PATH = hf_hub_download(repo_id=repo, filename=filename)
print("Downloaded to:", LLM_PATH)

Selected file: mistral-7b-instruct-v0.2.Q4_K_M.gguf
Downloaded to: /root/.cache/huggingface/hub/models--TheBloke--Mistral-7B-Instruct-v0.2-GGUF/snapshots/3a6fbf4a41a1d52e415a4958cde6856d34b2db93/mistral-7b-instruct-v0.2.Q4_K_M.gguf


In [24]:

# Load quantised local model using llama-cpp-python
# Ensure the .gguf model file exists at LLM_PATH on your Raspberry Pi
llm = Llama(model_path=LLM_PATH, n_ctx=2048)

def generate_answer(question: str, top_k: int = 5):
    """Retrieve sources and generate an answer using the local LLM."""
    hits = search(question, top_k=top_k, re_rank=True)
    prompt = build_prompt(question, hits)
    result = llm(prompt, max_tokens=MAX_TOKENS, temperature=0.2, top_p=0.95, stop=["Sources:", "Question:"])
    answer = result['choices'][0]['text'].strip()
    return answer, hits


llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /root/.cache/huggingface/hub/models--TheBloke--Mistral-7B-Instruct-v0.2-GGUF/snapshots/3a6fbf4a41a1d52e415a4958cde6856d34b2db93/mistral-7b-instruct-v0.2.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loa

In [28]:

# Example question
question = "What are the steps of Umrah?"
answer, sources = generate_answer(question, top_k=5)
print("Answer:", answer)


Llama.generate: 45 prefix-match hit, remaining 781 prompt tokens to eval
llama_perf_context_print:        load time =   86818.14 ms
llama_perf_context_print: prompt eval time =   79613.16 ms /   781 tokens (  101.94 ms per token,     9.81 tokens per second)
llama_perf_context_print:        eval time =   25778.43 ms /    76 runs   (  339.19 ms per token,     2.95 tokens per second)
llama_perf_context_print:       total time =  105436.06 ms /   857 tokens
llama_perf_context_print:    graphs reused =         72


Answer: The steps of Umrah include wearing Ihram at the Miqat, entering the holy area of Masjid Al Haram, performing Tawaf around the Kaaba seven times, and performing Sa'ee between Safa and Marwa seven times. Additionally, there are official requirements such as identity verification and permit issuance to perform Umrah at specific times.


In [30]:
print("Sources used:")

for i, src in enumerate(sources, 1):
    print(f"[{i}] {src['text'][:150].replace('','')}")

Sources used:
[1] ##jj and umrah consecutively ; for they remove poverty and sin as the bellows removes impurity from iron. " - repeating umrah expiates the sins commit
[2] umrah, the pilgrim turns toward the house of allah with his heart, tongue, limbs, and seeks his mercy and pleasure. umrah is an act of worshipping all
[3] for umrah in the hajj - months ( shawwal, dhul - qi'dah and dhul - hijjah ) - when he reaches makkah, he performs tawaf and sa'i for his umrah, shaves
[4] official for exact umrah timing - know your residence location and save the address - remember your bus stop and meeting point - check gate panels for
[5] 2. identity verification 3. permit issued according to available date must perform umrah at specified times shown in permit. choosing less crowded tim
