In [1]:
!pip install -q supabase

In [10]:
from google.colab import userdata
import numpy as np
from contextvars import ContextVar
from supabase import create_client
from dotenv import load_dotenv
import os

In [3]:
load_dotenv()

SUPABASE_URL = userdata.get("SUPABASE_URL")
SUPABASE_KEY = userdata.get("SUPABASE_KEY")

supabase = create_client(SUPABASE_URL, SUPABASE_KEY)


> Check if the Supabase Conenction is Live

In [4]:
supabase.table("surahs").select("*").limit(5).execute()

APIResponse(data=[{'id': 1, 'number': 1, 'name_simple': 'Al-Faatiha', 'name_arabic': 'سُورَةُ ٱلْفَاتِحَةِ', 'name_english': 'The Opening', 'revelation_place': 'Meccan', 'verses_count': 7}, {'id': 2, 'number': 2, 'name_simple': 'Al-Baqara', 'name_arabic': 'سُورَةُ البَقَرَةِ', 'name_english': 'The Cow', 'revelation_place': 'Medinan', 'verses_count': 286}, {'id': 3, 'number': 3, 'name_simple': 'Aal-i-Imraan', 'name_arabic': 'سُورَةُ آلِ عِمۡرَانَ', 'name_english': 'The Family of Imraan', 'revelation_place': 'Medinan', 'verses_count': 200}, {'id': 4, 'number': 4, 'name_simple': 'An-Nisaa', 'name_arabic': 'سُورَةُ النِّسَاءِ', 'name_english': 'The Women', 'revelation_place': 'Medinan', 'verses_count': 176}, {'id': 5, 'number': 5, 'name_simple': 'Al-Maaida', 'name_arabic': 'سُورَةُ المَائـِدَةِ', 'name_english': 'The Table', 'revelation_place': 'Medinan', 'verses_count': 120}], count=None)

# Verse Chunking

To support Quran verse chunking, we will store the chunks in the database using the following table:

### Table: `verse_chunks`

| Column Name     | Data Type               | Constraints                  | Description |
|-----------------|------------------------|-------------------------------|-------------|
| `id`            | `bigserial`            | `NOT NULL`, Primary Key       | Unique identifier for each chunk |
| `chunk_key`     | `character varying(50)`| `NOT NULL`                    | A unique key for the chunk (e.g., `2:1-5`) |
| `surah_id`      | `integer`              | `NOT NULL`                    | ID of the Surah the chunk belongs to |
| `start_verse`   | `integer`              | `NOT NULL`                    | Starting verse number of the chunk |
| `end_verse`     | `integer`              | `NOT NULL`                    | Ending verse number of the chunk |
| `text_uthmani`  | `text`                 | `NOT NULL`                    | Uthmani Arabic text of the chunk |
| `text_simple`   | `text`                 | `NOT NULL`                    | Simplified Arabic text of the chunk |
| `text_english`  | `text`                 | `NOT NULL`                    | English translation of the chunk |

### Description

- **Purpose:** Store chunks of Quran verses for easier retrieval and processing.  
- **Chunking Strategy:** Each chunk may consist of a range of verses (`start_verse` → `end_verse`).  
- **Unique Key:** `chunk_key` provides a quick reference for queries (e.g., `"2:1-5"`).  
- **Text Fields:** All three editions of the Quran (Uthmani, Simple, English) are stored for multi-lingual access.


In [5]:
def fetch_data(table_name,
               fields=[],
               filters={},
               order_by=None,
               desc=False,
               offset=0,
               limit=1000):

  fields = fields or "*"
  query = supabase.table(table_name).select(fields)

  for key, value in filters.items():
      query = query.eq(key, value)

  if order_by:
      query = query.order(order_by, desc=desc)

  response = query.limit(limit).offset(offset).execute()

  if not response or not response.data:
    return []

  return response.data


In [18]:
fetch_data("verses", limit=5)

[{'id': 1,
  'surah_id': 1,
  'verse_number': 1,
  'verse_key': '1:1',
  'juz_number': 1,
  'ruku_number': 1,
  'text_uthmani': '\ufeffبِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ',
  'text_simple': '\ufeffبِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ',
  'text_english': 'In the name of Allah, the Entirely Merciful, the Especially Merciful.'},
 {'id': 2,
  'surah_id': 1,
  'verse_number': 2,
  'verse_key': '1:2',
  'juz_number': 1,
  'ruku_number': 1,
  'text_uthmani': 'ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ',
  'text_simple': 'الْحَمْدُ لِلَّهِ رَبِّ الْعَالَمِينَ',
  'text_english': '[All] praise is [due] to Allah, Lord of the worlds -'},
 {'id': 3,
  'surah_id': 1,
  'verse_number': 3,
  'verse_key': '1:3',
  'juz_number': 1,
  'ruku_number': 1,
  'text_uthmani': 'ٱلرَّحْمَٰنِ ٱلرَّحِيمِ',
  'text_simple': 'الرَّحْمَٰنِ الرَّحِيمِ',
  'text_english': 'The Entirely Merciful, the Especially Merciful,'},
 {'id': 4,
  'surah_id': 1,
  'verse_number': 4,
  'verse_key': '1:4',
  'juz_number': 1,
  'ruk

In [14]:
def generate_verse_chunks(context_window=5):
    """
    Fetches Quran verses, chunks them using a sliding window, and inserts them
    into the `verse_chunks` table with separate fields for current verse text,
    context window text, and current verse number.

    Args:
        context_window (int): Number of verses in each context window.
    """
    print(f"Starting verse chunking with context window: {context_window}")

    # Fetch all surahs
    surahs_data = fetch_data("surahs", limit=200)
    if not surahs_data:
        print("No surahs found. Aborting chunking process.")
        return

    all_chunks_to_insert = []

    for surah in surahs_data:
        surah_id = surah['id']
        surah_name = surah['name_simple']
        surah_verses_count = surah['verses_count']

        print(f"Processing Surah: {surah_name} (ID: {surah_id}, Verses: {surah_verses_count})")

        # Fetch all verses for current surah
        verses = fetch_data(
            "verses",
            filters={"surah_id": surah_id},
            order_by="verse_number",
            desc=False,
            limit=surah_verses_count
        )

        if not verses:
            print(f"No verses found for Surah {surah_name}. Skipping.")
            continue

        # Clean BOM from text fields
        for v in verses:
            for field in ['text_uthmani', 'text_simple', 'text_english']:
                if v.get(field):
                    v[field] = v[field].lstrip('\ufeff')

        num_verses = len(verses)

        # Generate chunks using sliding window
        for i in range(num_verses):
            current_verse = verses[i]

            # Determine context window slice
            context_start = i
            context_end = min(i + context_window, num_verses)
            context_verses = verses[context_start:context_end]

            # Build context text
            context_text_uthmani = " ".join([v['text_uthmani'] for v in context_verses])
            context_text_simple = " ".join([v['text_simple'] for v in context_verses])
            context_text_english = " ".join([v['text_english'] for v in context_verses])

            chunk_data = {
                "chunk_key": f"{surah_id}:{current_verse['verse_number']}",
                "surah_id": surah_id,
                "start_verse": current_verse['verse_number'],
                "end_verse": current_verse['verse_number'],
                "current_verse": current_verse['verse_number'],
                "text_uthmani": current_verse['text_uthmani'],
                "text_simple": current_verse['text_simple'],
                "text_english": current_verse['text_english'],
                "context_text_uthmani": context_text_uthmani,
                "context_text_simple": context_text_simple,
                "context_text_english": context_text_english
            }

            all_chunks_to_insert.append(chunk_data)

    batch_size = 500
    total_batches = (len(all_chunks_to_insert) + batch_size - 1) // batch_size

    for i in range(0, len(all_chunks_to_insert), batch_size):
        batch = all_chunks_to_insert[i:i + batch_size]
        try:
            supabase.table("verse_chunks").insert(batch).execute()
            print(f"  Inserted batch {i//batch_size + 1}/{total_batches} ({len(batch)} chunks).")
        except Exception as e:
            print(f"  Error inserting batch starting at index {i}. Details: {e}")

    print("Verse chunking process completed successfully.")


In [15]:
generate_verse_chunks(context_window=5)

Starting verse chunking with context window: 5
Processing Surah: Al-Faatiha (ID: 1, Verses: 7)
Processing Surah: Al-Baqara (ID: 2, Verses: 286)
Processing Surah: Aal-i-Imraan (ID: 3, Verses: 200)
Processing Surah: An-Nisaa (ID: 4, Verses: 176)
Processing Surah: Al-Maaida (ID: 5, Verses: 120)
Processing Surah: Al-An'aam (ID: 6, Verses: 165)
Processing Surah: Al-A'raaf (ID: 7, Verses: 206)
Processing Surah: Al-Anfaal (ID: 8, Verses: 75)
Processing Surah: At-Tawba (ID: 9, Verses: 129)
Processing Surah: Yunus (ID: 10, Verses: 109)
Processing Surah: Hud (ID: 11, Verses: 123)
Processing Surah: Yusuf (ID: 12, Verses: 111)
Processing Surah: Ar-Ra'd (ID: 13, Verses: 43)
Processing Surah: Ibrahim (ID: 14, Verses: 52)
Processing Surah: Al-Hijr (ID: 15, Verses: 99)
Processing Surah: An-Nahl (ID: 16, Verses: 128)
Processing Surah: Al-Israa (ID: 17, Verses: 111)
Processing Surah: Al-Kahf (ID: 18, Verses: 110)
Processing Surah: Maryam (ID: 19, Verses: 98)
Processing Surah: Taa-Haa (ID: 20, Verses: 13

# Testing Different Embeddings

As the chunking process is finished, we now move to **testing different embeddings**.

### Embedding Table Schema

We create the table `verse_embeddings` to store embeddings for each chunk:

```sql
CREATE TABLE public.verse_embeddings (
    id BIGSERIAL NOT NULL,
    chunk_id BIGINT NOT NULL,
    embedding EXTENSIONS.VECTOR NOT NULL,
    model_name VARCHAR(150) NOT NULL,
    created_at TIMESTAMP WITHOUT TIME ZONE DEFAULT NOW()
) TABLESPACE pg_default;
```

### Next Steps
We will use the previously chunked data, generate embeddings for each chunk, and store them in this table according to the defined structure.

---

In [7]:
import torch
from transformers import AutoTokenizer, AutoModel
from datetime import datetime

In [9]:
MODEL_NAME = "sentence-transformers/all-mpnet-base-v2" # 768 Dimensions
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [12]:
def get_embedding(text: str) -> np.ndarray:
    """
    Converts a string into a fixed-size embedding using the transformer model.
    """
    with torch.no_grad():
        encoded_input = tokenizer(text,
                                  padding=True,
                                  truncation=True,
                                  return_tensors="pt").to(device)

        model_output = model(**encoded_input)

        # Mean pooling
        token_embeddings = model_output.last_hidden_state  # [batch_size, seq_len, hidden_size]
        attention_mask = encoded_input['attention_mask'].unsqueeze(-1)
        masked_embeddings = token_embeddings * attention_mask
        summed = masked_embeddings.sum(dim=1)
        counts = attention_mask.sum(dim=1)
        mean_pooled = summed / counts
        return mean_pooled[0].cpu().numpy()

In [19]:
def generate_and_store_embeddings_batched(
    table_name="verse_chunks",
    text_field="text_english",
    embedding_table="verse_embeddings",
    model_name="sentence-transformers/all-mpnet-base-v2",
    batch_size=250,
    fetch_limit=500
):
    """
    Generates embeddings for verse chunks from a database table in batches
    and inserts them into the embeddings table.

    Args:
        table_name (str): Table to fetch chunks from.
        text_field (str): Text field to embed ('text_english', 'text_simple', 'text_uthmani').
        embedding_table (str): Table to store embeddings.
        model_name (str): HuggingFace transformer model.
        batch_size (int): Number of embeddings to insert per batch.
        fetch_limit (int): Number of rows to fetch per iteration from DB.
    """

    print(f"Loading embedding model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    offset = 0
    total_processed = 0

    while True:
        # Fetch batch of verse_chunks
        chunks = fetch_data(table_name,
                            offset=offset,
                            limit=fetch_limit)
        if not chunks:
            print("No more rows to fetch. Finished processing all chunks.")
            break

        embeddings_to_insert = []

        for chunk in chunks:
            chunk_id = chunk["id"]
            text_to_embed = chunk.get(text_field, "")
            if not text_to_embed:
                continue

            # Remove BOM
            text_to_embed = text_to_embed.lstrip('\ufeff')
            embedding_vector = get_embedding(text_to_embed)

            embeddings_to_insert.append({
                "chunk_id": chunk_id,
                "embedding": embedding_vector.tolist(),
                "model_name": model_name,
                "created_at": datetime.utcnow().isoformat()
            })

        for i in range(0, len(embeddings_to_insert), batch_size):
            batch = embeddings_to_insert[i:i + batch_size]
            try:
                supabase.table(embedding_table).insert(batch).execute()
                print(f"Inserted batch of {len(batch)} embeddings (offset {offset})")
            except Exception as e:
                print(f"Error inserting batch starting at offset {offset}: {e}")

        offset += fetch_limit
        total_processed += len(chunks)
        print(f"Processed {total_processed} rows so far.")

    print("Embedding generation and insertion completed successfully.")


In [20]:
generate_and_store_embeddings_batched(
    table_name="verse_chunks",
    text_field="text_english",
    embedding_table="verse_embeddings",
    model_name="sentence-transformers/all-mpnet-base-v2",
    batch_size=250,
    fetch_limit=500
)

Loading embedding model: sentence-transformers/all-mpnet-base-v2


  "created_at": datetime.utcnow().isoformat()


Inserted batch of 250 embeddings (offset 0)
Inserted batch of 250 embeddings (offset 0)
Processed 500 rows so far.
Inserted batch of 250 embeddings (offset 500)
Inserted batch of 250 embeddings (offset 500)
Processed 1000 rows so far.
Inserted batch of 250 embeddings (offset 1000)
Inserted batch of 250 embeddings (offset 1000)
Processed 1500 rows so far.
Inserted batch of 250 embeddings (offset 1500)
Inserted batch of 250 embeddings (offset 1500)
Processed 2000 rows so far.
Inserted batch of 250 embeddings (offset 2000)
Inserted batch of 250 embeddings (offset 2000)
Processed 2500 rows so far.
Inserted batch of 250 embeddings (offset 2500)
Inserted batch of 250 embeddings (offset 2500)
Processed 3000 rows so far.
Inserted batch of 250 embeddings (offset 3000)
Inserted batch of 250 embeddings (offset 3000)
Processed 3500 rows so far.
Inserted batch of 250 embeddings (offset 3500)
Inserted batch of 250 embeddings (offset 3500)
Processed 4000 rows so far.
Inserted batch of 250 embeddings 