In [1]:
# Install required packages (if not already installed)
!pip install --quiet chromadb transformers torch


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.6/50.6 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.6/19.6 MB[0m [31m90.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m100.4 MB/s[0m eta [36m0:0

In [None]:
!pip -q install --upgrade openai

In [2]:
import os
import zipfile

# Archive to unpack and the folder it is unpacked into
# (extract_path is reused later as the Chroma directory).
zip_path = "/content/hajj_e5_chroma_backup.zip"
extract_path = "/content/my_unzipped"

# Unpack every member of the archive under extract_path.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_path)

In [3]:

import os
# Set before the third-party imports below (presumably so protobuf picks the
# pure-Python implementation when it is first imported -- confirm if changing order).
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

import getpass
import json
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import chromadb
from openai import OpenAI  # needed here: `client` below is built in this cell

# llama_cpp is not installed by the install cell and is not used by the
# OpenAI-backed pipeline, so a missing package must not break this cell.
try:
    from llama_cpp import Llama
except ImportError:
    Llama = None

# Never hardcode the key in the notebook: read it from the environment and
# fall back to an interactive prompt when it is not set.
api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API key: ")
client = OpenAI(api_key=api_key)

# Embedding model and Chroma settings
MODEL_NAME = 'intfloat/e5-base-v2'   # Hugging Face model id loaded for query encoding
CHROMA_PATH = extract_path           # folder the backup archive was unpacked into
COLLECTION_NAME = 'hajj_e5'
PASSAGE_PREFIX = 'passage: '
QUERY_PREFIX = 'query: '             # prepended to every query before it is embedded

# Path to your quantised LLM file (gguf format)
#LLM_PATH = '/path/to/mistral-7b-instruct-q4_k_m.gguf'  # TODO: replace with actual path on your Pi

# Upper bound on generated tokens passed to the chat completion call
MAX_TOKENS = 256

# Run the embedding model on the GPU when one is available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


In [4]:

# Tokenizer and encoder used to embed queries; the encoder is moved to the
# selected device and switched to inference mode.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)
model = model.to(device)
model.eval()

def embed_query(text: str):
    """Embed a query string and return it as a unit-length NumPy vector.

    The text is prefixed with QUERY_PREFIX, tokenised (truncated to 512 tokens),
    run through the encoder without gradients, and mean-pooled over the tokens
    selected by the attention mask. The pooled vector is divided by its L2 norm
    unless that norm is zero.
    """
    batch = tokenizer(QUERY_PREFIX + text, return_tensors='pt', truncation=True, max_length=512)
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    with torch.no_grad():
        hidden = model(**batch).last_hidden_state
        # Broadcast the attention mask over the hidden dimension so that
        # masked-out positions contribute nothing to the average.
        weights = batch['attention_mask'].unsqueeze(-1)
        pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1)
        vector = pooled.squeeze(0).cpu().numpy()
    length = np.linalg.norm(vector)
    return vector / length if length > 0 else vector


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [6]:

# Copy the folder into a writable location (if it came from a read-only dataset)
import os
import shutil

CHROMA_PATH = '/content/hajj_e5_chroma'
# Copy only once: copytree raises FileExistsError if the destination already
# exists, which would make this cell fail on every re-run.
if not os.path.exists(CHROMA_PATH):
    shutil.copytree(extract_path, CHROMA_PATH)

# Then point Chroma at the copy.
# Named chroma_client so it does not overwrite the OpenAI `client` that
# generate_answer() and the transcription / speech calls rely on.
chroma_client = chromadb.PersistentClient(path=CHROMA_PATH)
collection = chroma_client.get_or_create_collection(name=COLLECTION_NAME, metadata={'hnsw:space': 'cosine'})

# Vector search helper with optional lexical re-ranking
def search(query_str: str, top_k: int = 10, re_rank: bool = True):
    """Query the collection for the passages nearest to `query_str`.

    Args:
        query_str: Free-text query; embedded with embed_query().
        top_k: Number of results requested from the collection.
        re_rank: When True, each hit gets a 'lexical_score' (count of distinct
            lower-cased, whitespace-split query tokens also present in the hit
            text) and hits are sorted by that score, highest first.

    Returns:
        A list of dicts with keys 'id', 'distance', 'text', 'metadata', plus
        'lexical_score' when re_rank is True.
    """
    vector = embed_query(query_str).tolist()
    response = collection.query(query_embeddings=[vector], n_results=top_k)
    # One query was sent, so take the first (only) row of each result field.
    hits = [
        {'id': doc_id, 'distance': float(distance), 'text': body, 'metadata': info}
        for doc_id, distance, body, info in zip(
            response['ids'][0],
            response['distances'][0],
            response['documents'][0],
            response['metadatas'][0],
        )
    ]
    if not re_rank:
        return hits

    wanted = set(query_str.lower().split())
    for hit in hits:
        hit['lexical_score'] = len(wanted & set(hit['text'].lower().split()))
    # list.sort is stable, so hits with equal scores keep the order in which
    # the collection returned them.
    hits.sort(key=lambda hit: hit['lexical_score'], reverse=True)
    return hits


In [7]:

def build_prompt(question: str, sources: list):
    """Construct a prompt for the LLM using the question and retrieved sources.

    Args:
        question: The user's question, inserted verbatim after "Question: ".
        sources: Sequence of dicts that each carry the passage under 'text'.

    Returns:
        A newline-separated prompt: three instruction lines, the question,
        a "Sources:" header, one numbered line per source, and a trailing
        "Answer:" line.
    """
    prompt_lines = [
        "You are an assistant answering questions about Hajj and Umrah.",
        "Answer concisely in plain English so that the response can be read aloud.",
        "Keep the answer to no more than 3–4 sentences.",
        f"Question: {question}",
        "Sources:",
    ]
    for i, src in enumerate(sources, 1):
        # Flatten line breaks so every source occupies exactly one prompt line.
        # (Replacing the empty string would put a space between every character.)
        text = src['text'].replace("\n", " ").strip()
        # Cap each source at 300 characters, ellipsis included.
        if len(text) > 300:
            text = text[:297] + '...'
        prompt_lines.append(f"[{i}] {text}")
    prompt_lines.append("Answer:")
    # Join with newlines; an empty separator would run all sections together.
    return "\n".join(prompt_lines)


In [8]:
def generate_answer(question: str, top_k: int = 5):
    """Answer `question` from retrieved passages using the OpenAI chat API.

    Retrieves `top_k` re-ranked hits with search(), builds the prompt with
    build_prompt(), and sends it through the module-level OpenAI `client`.

    Args:
        question: The user's question.
        top_k: Number of passages to retrieve and include in the prompt.

    Returns:
        A tuple (answer, hits): the stripped answer text (an empty string if
        the API returns no text content) and the list of hits used as sources.
    """
    hits = search(question, top_k=top_k, re_rank=True)
    prompt = build_prompt(question, hits)
    result = client.chat.completions.create(
        model="gpt-4o-mini",   # or "gpt-4o", "gpt-4-turbo"
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.2,
        max_tokens=MAX_TOKENS,
        # Stop if the model starts echoing the prompt's section headers.
        stop=["Sources:", "Question:"]
    )
    # message.content is nullable in the API response; avoid calling .strip() on None.
    content = result.choices[0].message.content
    answer = (content or "").strip()
    return answer, hits

#main

In [13]:
# ---- 1) Setup
from google.colab import output, userdata
from IPython.display import Javascript, display
from base64 import b64decode
from openai import OpenAI
from openai import OpenAI
from io import BytesIO
from google.colab import userdata
from IPython.display import HTML, display
from base64 import b64encode
import time
buf = BytesIO()
# ---- 2) JS widget to record audio from the mic
# RECORD_JS defines an async JavaScript function `recordAudio()` that:
#   * requests microphone access and creates a MediaRecorder on the stream,
#   * adds "Start recording" / "Stop" buttons to the page body,
#   * collects recorded chunks while recording, and when recording stops,
#     wraps them in an 'audio/webm' Blob, releases the microphone tracks,
#     removes the buttons, and
#   * resolves with the Blob encoded as a base64 data: URL.
# It is a raw string so the JavaScript is passed through unchanged.
RECORD_JS = r"""
async function recordAudio(){
  const stream = await navigator.mediaDevices.getUserMedia({audio:true});
  const mediaRecorder = new MediaRecorder(stream);
  let chunks = [];
  const startBtn = document.createElement('button');
  const stopBtn  = document.createElement('button');
  startBtn.textContent = '🎙️ Start recording';
  stopBtn.textContent  = '⏹️ Stop';
  stopBtn.disabled = true;
  document.body.appendChild(startBtn);
  document.body.appendChild(stopBtn);

  return await new Promise((resolve) => {
    mediaRecorder.ondataavailable = e => chunks.push(e.data);
    mediaRecorder.onstop = async () => {
      const blob = new Blob(chunks, {type: 'audio/webm'});
      const reader = new FileReader();
      reader.onload = () => resolve(reader.result);  // data: URL (base64)
      reader.readAsDataURL(blob);
      stream.getTracks().forEach(t => t.stop());
      startBtn.remove(); stopBtn.remove();
    };

    startBtn.onclick = () => { mediaRecorder.start(); startBtn.disabled = true; stopBtn.disabled = false; };
    stopBtn.onclick  = () => { mediaRecorder.stop();  stopBtn.disabled  = true; };
  });
}
"""

def record_to_file(out_path="mic.webm"):
  """Record from the browser microphone and write the audio to `out_path`.

  Injects RECORD_JS into the cell output, blocks until `recordAudio()`
  resolves with a base64 data: URL, decodes the part after the first comma,
  and writes the bytes to disk.

  Returns:
    The path that was written (`out_path`).
  """
  display(Javascript(RECORD_JS))
  recording_url = output.eval_js("recordAudio()")  # returns once the user presses Stop
  # A data: URL has the form "<header>,<base64 payload>"; only the payload is kept.
  _prefix, payload = recording_url.split(',', 1)
  with open(out_path, "wb") as sink:
    sink.write(b64decode(payload))
  return out_path

In [None]:

# ---- 3) Record, then transcribe with gpt-4o-transcribe
audio_path = record_to_file("mic.webm")
print("Saved:", audio_path)

with open(audio_path, "rb") as af:
  # Supported models include gpt-4o-transcribe and gpt-4o-mini-transcribe
  # API: https://platform.openai.com/docs/api-reference/audio
  transcript = client.audio.transcriptions.create(
      model="gpt-4o-transcribe",
      file=af,
      response_format="text"
  )
print(transcript,"\n")

# ---- 4) Answer the transcribed question from the retrieved sources
answer, sources = generate_answer(transcript, top_k=5)
print("Answer:", answer)

# ---- 5) Speak the answer
# Start from an empty buffer on every run; reusing one buffer across runs
# would append this answer's audio to the audio of all previous answers.
buf = BytesIO()
with client.audio.speech.with_streaming_response.create(
    model="gpt-4o-mini-tts",
    voice="coral",
    input=answer,
    instructions="British accent",
) as resp:
    for chunk in resp.iter_bytes():
        buf.write(chunk)

# Embed the audio as a base64 data URL so it plays inline in the cell output.
mp3_bytes = buf.getvalue()
b64 = b64encode(mp3_bytes).decode("ascii")
html = f"""
<audio controls autoplay>
  <source src="data:audio/mpeg;base64,{b64}" type="audio/mpeg">
  Your browser does not support the audio element.
</audio>
"""
display(HTML(html))


In [None]:
print("Sources used:")

for i, src in enumerate(sources, 1):
    # Show the first 150 characters with line breaks flattened so each source
    # stays on one line. Done outside the f-string because backslashes are not
    # allowed inside f-string expressions before Python 3.12.
    preview = src['text'][:150].replace("\n", " ")
    print(f"[{i}] {preview}")