<a href="https://colab.research.google.com/github/kairamilanifitria/PurpleBox-Intern/blob/main/03_05_VectorStore_Supabase.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## chunking

In [1]:
import json
import re
import torch
from langchain.text_splitter import MarkdownHeaderTextSplitter
from sentence_transformers import SentenceTransformer

# Load Markdown file
file_path = "/content/drive/MyDrive/document_rag/md/17.md"
with open(file_path, "r", encoding="utf-8") as file:
    markdown_text = file.read()

# Step 1: Document-Specific Chunking
headers_to_split_on = [("#", "Header 1"), ("##", "Header 2")]
splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)
documents = splitter.split_text(markdown_text)
chunks = [doc.page_content for doc in documents]

# Load Hugging Face Embedding Model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def is_table(chunk):
    """Checks if a chunk contains a Markdown table."""
    return bool(re.search(r'^\|.*\|\n\|[-| ]+\|\n(\|.*\|\n)*', chunk, re.MULTILINE))

def extract_table(chunk):
    """Extracts tables from markdown and converts them into structured JSON format."""
    lines = chunk.strip().split("\n")

    # Find the table header
    header = None
    table_rows = []
    for i, line in enumerate(lines):
        if re.match(r'^\|[-| ]+\|$', line):  # Detect separator line (---|---)
            header = lines[i - 1].strip("|").split("|")
            header = [h.strip() for h in header]
            continue
        if header:
            row_data = line.strip("|").split("|")
            row_data = [cell.strip() for cell in row_data]
            table_rows.append(row_data)

    if not header or not table_rows:
        return None  # Return None if the table extraction fails

    return {"headers": header, "rows": table_rows}

def needs_semantic_chunking(chunk, max_tokens=300):
    """Checks if the text chunk is too long and needs further splitting."""
    return not is_table(chunk) and len(chunk.split()) > max_tokens

def semantic_split(text, max_sentences=5, similarity_threshold=0.6, min_tokens=100):
    """Splits long text chunks based on semantic similarity."""
    sentences = re.split(r'(?<=[.!?])\s+', text)
    if len(sentences) <= max_sentences:
        return [text]

    embeddings = model.encode(sentences, convert_to_tensor=True)
    similarities = torch.nn.functional.cosine_similarity(embeddings[:-1], embeddings[1:], dim=1)
    split_points = [i+1 for i, sim in enumerate(similarities) if sim < similarity_threshold]

    sub_chunks, start = [], 0
    for split in split_points:
        chunk_text = " ".join(sentences[start:split])
        if len(chunk_text.split()) >= min_tokens:
            sub_chunks.append(chunk_text)
        start = split

    last_chunk = " ".join(sentences[start:])
    if len(last_chunk.split()) >= min_tokens:
        sub_chunks.append(last_chunk)
    elif sub_chunks:
        sub_chunks[-1] += " " + last_chunk  # Merge with previous if too short

    return sub_chunks if sub_chunks else [text]

def is_references_section(chunk):
    """Checks if the chunk is part of the References section."""
    return chunk.strip().lower().startswith("## references")

def extract_section_title(chunk):
    """Extracts section headers from chunks for metadata."""
    match = re.match(r'^(#+)\s+(.*)', chunk.strip())
    return match.group(2) if match else None

# Step 3: Apply Chunking
final_chunks = []
is_references = False
for chunk in chunks:
    if is_references_section(chunk):
        is_references = True
    if is_references:
        final_chunks.append(chunk)
    elif is_table(chunk):
        table_data = extract_table(chunk)
        if table_data:
            final_chunks.append({"table": table_data})  # Store table separately
    elif needs_semantic_chunking(chunk):
        final_chunks.extend(semantic_split(chunk))
    else:
        final_chunks.append(chunk)

# Step 4: Merge Small Chunks (Ensure Minimum 100 Tokens)
merged_chunks = []
i = 0
while i < len(final_chunks):
    chunk = final_chunks[i]
    if isinstance(chunk, dict):  # If it's a table, store it separately
        merged_chunks.append(chunk)
        i += 1
        continue

    while i + 1 < len(final_chunks) and isinstance(chunk, str) and len(chunk.split()) < 100:
        next_chunk = final_chunks[i + 1]
        if isinstance(next_chunk, dict):  # Don't merge tables into text
            break
        chunk += "\n" + next_chunk
        i += 1

    merged_chunks.append(chunk)
    i += 1

# Step 5: Convert Chunks to JSON Format
json_chunks = []
source_filename = file_path.split("/")[-1]  # Extract filename for metadata

for idx, chunk in enumerate(merged_chunks):
    if isinstance(chunk, dict):  # Handle table separately
        json_chunks.append({
            "chunk_id": idx + 1,
            "table": chunk["table"],
            "metadata": {
                "source": source_filename,
                "section": "Table",  # You can add better logic here
                "position": idx + 1
            }
        })
    else:
        section_title = extract_section_title(chunk)
        json_chunks.append({
            "chunk_id": idx + 1,
            "content": chunk.strip(),
            "metadata": {
                "source": source_filename,
                "section": section_title if section_title else "Unknown",
                "position": idx + 1
            }
        })

# Save JSON output
output_file = "/content/17.json"
with open(output_file, "w", encoding="utf-8") as json_file:
    json.dump(json_chunks, json_file, indent=4, ensure_ascii=False)

print(f"Chunking completed. JSON saved to: {output_file}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Chunking completed. JSON saved to: /content/17.json


## vector store embedding

In [None]:
!pip install supabase

### **sentence-transformer**

In [None]:
import json
import torch
from sentence_transformers import SentenceTransformer
from supabase import create_client

# Supabase Configuration
SUPABASE_URL = "https://vptbbrmqaqpsynvpizih.supabase.co"
SUPABASE_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InZwdGJicm1xYXFwc3ludnBpemloIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NDEwNjU2NzMsImV4cCI6MjA1NjY0MTY3M30.XVOsjwisyi39awcbC3TMf46uMbdlwUkY-wfyo31UthI"
supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

# Load Embedding Model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Load JSON chunks
json_file_path = "/content/17_chunks_v2.json"
with open(json_file_path, "r", encoding="utf-8") as json_file:
    json_chunks = json.load(json_file)

def store_chunks_in_supabase(chunks):
    for chunk in chunks:
        content = chunk.get("content") or json.dumps(chunk.get("table"), ensure_ascii=False)
        embedding = model.encode(content).tolist()
        chunk["embedding"] = embedding

        data = {
            "content": content,
            "embedding": embedding,
            "metadata": chunk["metadata"]
        }
        supabase.table("documents").insert(data).execute()

# Store in Supabase
store_chunks_in_supabase(json_chunks)

print("Chunks with embeddings stored successfully in Supabase!")


Chunks with embeddings stored successfully in Supabase!


### **BAAI/bge-m3**

In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModel
from supabase import create_client

# Supabase Configuration
SUPABASE_URL = "https://vptbbrmqaqpsynvpizih.supabase.co"
SUPABASE_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InZwdGJicm1xYXFwc3ludnBpemloIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NDEwNjU2NzMsImV4cCI6MjA1NjY0MTY3M30.XVOsjwisyi39awcbC3TMf46uMbdlwUkY-wfyo31UthI"
supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

# Load BAAI/bge-m3 Embedding Model
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-m3")
model = AutoModel.from_pretrained("BAAI/bge-m3").to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

In [None]:
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().tolist()

# Load JSON chunks
json_file_path = "/content/17_chunks_v2.json"
with open(json_file_path, "r", encoding="utf-8") as json_file:
    json_chunks = json.load(json_file)

def store_chunks_in_supabase(chunks):
    for chunk in chunks:
        content = chunk.get("content") or json.dumps(chunk.get("table"), ensure_ascii=False)
        embedding = get_embedding(content)
        chunk["embedding"] = embedding

        data = {
            "content": content,
            "embedding": embedding,
            "metadata": chunk["metadata"]
        }
        supabase.table("documents").insert(data).execute()

# Store in Supabase
store_chunks_in_supabase(json_chunks)

print("Chunks with embeddings stored successfully in Supabase!")


Chunks with embeddings stored successfully in Supabase!


### **Alibaba-NLP/gte-multilingual-base**

In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModel
from supabase import create_client

# Supabase Configuration
SUPABASE_URL = "https://vptbbrmqaqpsynvpizih.supabase.co"
SUPABASE_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InZwdGJicm1xYXFwc3ludnBpemloIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NDEwNjU2NzMsImV4cCI6MjA1NjY0MTY3M30.XVOsjwisyi39awcbC3TMf46uMbdlwUkY-wfyo31UthI"
supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

# Load Embedding Model

tokenizer = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-multilingual-base")
model = AutoModel.from_pretrained("Alibaba-NLP/gte-multilingual-base").to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

In [None]:
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().tolist()

# Load JSON chunks
json_file_path = "/content/17_chunks_v2.json"
with open(json_file_path, "r", encoding="utf-8") as json_file:
    json_chunks = json.load(json_file)

def store_chunks_in_supabase(chunks):
    for chunk in chunks:
        content = chunk.get("content") or json.dumps(chunk.get("table"), ensure_ascii=False)
        embedding = get_embedding(content)
        chunk["embedding"] = embedding

        data = {
            "content": content,
            "embedding": embedding,
            "metadata": chunk["metadata"]
        }
        supabase.table("documents").insert(data).execute()

# Store in Supabase
store_chunks_in_supabase(json_chunks)

print("Chunks with embeddings stored successfully in Supabase!")


### emptying the table in supabase :

> `DELETE FROM documents;`



### TESTING

### **sentence-transformer**

In [None]:
import json
import torch
from sentence_transformers import SentenceTransformer
from supabase import create_client

# Supabase Configuration
SUPABASE_URL = "https://vptbbrmqaqpsynvpizih.supabase.co"
SUPABASE_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InZwdGJicm1xYXFwc3ludnBpemloIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NDEwNjU2NzMsImV4cCI6MjA1NjY0MTY3M30.XVOsjwisyi39awcbC3TMf46uMbdlwUkY-wfyo31UthI"
supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

# Load Embedding Model
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

def search_matching_documents(query, top_k=3):
    query_embedding = model.encode(query).tolist()

    # Perform similarity search in Supabase
    response = supabase.rpc(
        "match_documents",
        {"query_embedding": query_embedding, "match_count": top_k}
    ).execute()

    if response.data:
        print("\nMatching Documents:")
        for idx, entry in enumerate(response.data, start=1):
            print(f"{idx}. {entry['content']}\n")
    else:
        print("No matching documents found.")


In [None]:
# Example Query
query = "What is SAAFP?"
search_matching_documents(query)


Matching Documents with Similarity Scores:
1. Similarity: 0.2784
   Content: ## Reflection on workshop  
The group appreciated the richness of the discussion and the value  of  having  a  variety  of  countries  represented  in  the workshop. The group members expressed feeling encouraged and felt motivated to use 'small moments, little bits, part of the mini-CEX' during learning interactions in the workplace (teachable  moments).  This  will  allow  the  supervisors  and registrars to be 'more real' in the workplace, as opposed to striving for the hard-to-reach  perfect or ideal learning interactions. It will necessitate a more honest and pragmatic approach  to  harness  these  learning  moments.  Ongoing discussions  are  needed  around  the  validity  of  continuous assessments  in  the  workplace  for  national  examinations, such as the Fellowship of the College of Family Physicians of South Africa (FCFP[SA]), and the contribution of the learning portfolio to exit examination res

### **huggingface BAAI**

In [None]:
def search_matching_documents(query, top_k=3):
    query_embedding = get_embedding(query)

    # Perform similarity search in Supabase
    response = supabase.rpc(
        "match_documents",
        {"query_embedding": query_embedding, "match_count": top_k}
    ).execute()

    if response.data:
        print("\nMatching Documents with Similarity Scores:")
        for idx, entry in enumerate(response.data, start=1):
            print(f"{idx}. Similarity: {entry['similarity']:.4f}")
            print(f"   Content: {entry['content']}\n")
    else:
        print("No matching documents found.")

In [None]:
# Example Query
query = "What is SAAFP?"
search_matching_documents(query)


Matching Documents with Similarity Scores:
1. Similarity: 0.6064
   Content: 7  
The World Organisation of Family Doctors and the South African Academy of Family Physicians (SAAFP)  have  established  standards  for  the  postgraduate  training  of  family  physicians. 8,9  
However,  family  medicine  is  a  relatively  new  specialty  in many African countries, which adds to the challenges around training and supervision in the context of large rural areas, massive health needs and minimal resources. 10  
The  aim  of  the  workshop  was  to  understand  how  family medicine registrars (postgraduate trainees in family medicine) in  Africa  learn  in  the  workplace. We  particularly  wanted  to explore  the  interaction  between  the  registrar  and  supervisor in the workplace, captured in a portfolio of learning, and in the African  context. We  sought  a  clearer  understanding  of  what it  means  to  be  observed  while  conducting  a  consultation  or performing  a  procedure,

### **Alibaba-NLP/gte-multilingual-base**

In [6]:
def search_matching_documents(query, top_k=3):
    query_embedding = get_embedding(query)

    # Perform similarity search in Supabase
    response = supabase.rpc(
        "match_documents",
        {"query_embedding": query_embedding, "match_count": top_k}
    ).execute()

    if response.data:
        print("\nMatching Documents with Similarity Scores:")
        for idx, entry in enumerate(response.data, start=1):
            print(f"{idx}. Similarity: {entry['similarity']:.4f}")
            print(f"   Content: {entry['content']}\n")
    else:
        print("No matching documents found.")

In [7]:
# Example Query
query = "Participants in Kenya?"
search_matching_documents(query)


Matching Documents with Similarity Scores:
1. Similarity: 0.8272
   Content: ## Participants and process  
Thirty-five people  participated  in  a  2-h  workshop  and included trainers and trainees from nine African countries, the United Kingdom, United States and Sweden (see Table 1). South  Africa  was  represented  by  the  universities  of  Cape Town,  Limpopo,  Pretoria,  Sefako  Makgatho,  Stellenbosch, Walter Sisulu and Witwatersrand.  
We started with an introduction and then divided into buzz pairs (pairs were allowed to form spontaneously, regardless of the trainer or trainee status of the participants). In the buzz pairs, we explored the questions of how do I teach or learn, supervise  or  be  supervised,  and  assess  or  be  assessed. This was followed by an interactive focus group discussion on  the  reflections  created  by  the  buzz  pair  discussions (a guiding style was employed to facilitate this discussion). The  group  reflections  were  captured  on  a  flip  ch

## Try using other language = italian

### **huggingface BAAI**

In [None]:
!pip install supabase

In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModel
from supabase import create_client

# Supabase Configuration
SUPABASE_URL = "https://vptbbrmqaqpsynvpizih.supabase.co"
SUPABASE_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InZwdGJicm1xYXFwc3ludnBpemloIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NDEwNjU2NzMsImV4cCI6MjA1NjY0MTY3M30.XVOsjwisyi39awcbC3TMf46uMbdlwUkY-wfyo31UthI"
supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

# Load BAAI/bge-m3 Embedding Model
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-m3")
model = AutoModel.from_pretrained("BAAI/bge-m3").to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

In [None]:
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().tolist()

# Load JSON chunks
json_file_path = "/content/ManualeRotomarr.json"
with open(json_file_path, "r", encoding="utf-8") as json_file:
    json_chunks = json.load(json_file)

def store_chunks_in_supabase(chunks):
    for chunk in chunks:
        content = chunk.get("content") or json.dumps(chunk.get("table"), ensure_ascii=False)
        embedding = get_embedding(content)
        chunk["embedding"] = embedding

        data = {
            "content": content,
            "embedding": embedding,
            "metadata": chunk["metadata"]
        }
        supabase.table("documents").insert(data).execute()

# Store in Supabase
store_chunks_in_supabase(json_chunks)

print("Chunks with embeddings stored successfully in Supabase!")


Chunks with embeddings stored successfully in Supabase!


### **huggingface BAAI**

In [None]:
def search_matching_documents(query, top_k=3):
    query_embedding = get_embedding(query)

    # Perform similarity search in Supabase
    response = supabase.rpc(
        "match_documents",
        {"query_embedding": query_embedding, "match_count": top_k}
    ).execute()

    if response.data:
        print("\nMatching Documents with Similarity Scores:")
        for idx, entry in enumerate(response.data, start=1):
            print(f"{idx}. Similarity: {entry['similarity']:.4f}")
            print(f"   Content: {entry['content']}\n")
    else:
        print("No matching documents found.")

In [None]:
# Example Query
query = "INFORMAZIONI SULLA SICUREZZA"
search_matching_documents(query)


Matching Documents with Similarity Scores:
1. Similarity: 0.6989
   Content: ## 1. INFORMAZIONI SULLA SICUREZZA
## 3. DESTINAZIONE D'USO E UTILIZZATORI  
Il  presente  manuale  contiene  indicazioni  ed  informazioni fondamentali per il corretto utilizzo del GIRACASTAGNE AUTOMATICO (CUOCI CALDARROSTE) ROTOMARR .  
- -Leggere il manuale nella sua completezza per comprendere l'utilizzo della macchina;
- -Tenere questo manuale per future consultazioni in un luogo sicuro;
- -Osservare le istruzioni indicate in questo manuale per garantire la sicurezza dell'utilizzatore;
- -La non osservanza delle indicazioni elencate in questo manuale comporter√† l'annullamento della garanzia;
- -MECTRONICA S.r.l. non √® responsabile per danni o lesioni causate dalla non osservanza delle informazioni elencate nel presente manuale.

2. Similarity: 0.6657
   Content: ## 5. PULIZIA &amp; MANUTENZIONE  
40010 Bentivoglio (BO) Italia  
Tel. +39 0516641440 Fax. +39 0518909108  
Al  termine  di  ogni  utilizzo, 

try query in english

In [None]:
# Example Query
query = "safety information"
search_matching_documents(query)


Matching Documents with Similarity Scores:
1. Similarity: 0.6521
   Content: ## 1. INFORMAZIONI SULLA SICUREZZA
## 3. DESTINAZIONE D'USO E UTILIZZATORI  
Il  presente  manuale  contiene  indicazioni  ed  informazioni fondamentali per il corretto utilizzo del GIRACASTAGNE AUTOMATICO (CUOCI CALDARROSTE) ROTOMARR .  
- -Leggere il manuale nella sua completezza per comprendere l'utilizzo della macchina;
- -Tenere questo manuale per future consultazioni in un luogo sicuro;
- -Osservare le istruzioni indicate in questo manuale per garantire la sicurezza dell'utilizzatore;
- -La non osservanza delle indicazioni elencate in questo manuale comporter√† l'annullamento della garanzia;
- -MECTRONICA S.r.l. non √® responsabile per danni o lesioni causate dalla non osservanza delle informazioni elencate nel presente manuale.

2. Similarity: 0.6195
   Content: ## 6. RICAMBI  
Nelle seguenti pagine saranno indicate a disegno le componenti meccaniche con i loro codici.  
Qualora  sia necessario ordinar

### **Alibaba-NLP/gte-multilingual-base**

In [None]:
import json
import torch
from transformers import AutoTokenizer, AutoModel
from supabase import create_client

# Supabase Configuration
SUPABASE_URL = "https://vptbbrmqaqpsynvpizih.supabase.co"
SUPABASE_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InZwdGJicm1xYXFwc3ludnBpemloIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NDEwNjU2NzMsImV4cCI6MjA1NjY0MTY3M30.XVOsjwisyi39awcbC3TMf46uMbdlwUkY-wfyo31UthI"
supabase = create_client(SUPABASE_URL, SUPABASE_KEY)

# Load Embedding Model

tokenizer = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-multilingual-base")
model = AutoModel.from_pretrained("Alibaba-NLP/gte-multilingual-base").to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

In [None]:
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().tolist()

# Load JSON chunks
json_file_path = "/content/ManualeRotomarr.json"
with open(json_file_path, "r", encoding="utf-8") as json_file:
    json_chunks = json.load(json_file)

def store_chunks_in_supabase(chunks):
    for chunk in chunks:
        content = chunk.get("content") or json.dumps(chunk.get("table"), ensure_ascii=False)
        embedding = get_embedding(content)
        chunk["embedding"] = embedding

        data = {
            "content": content,
            "embedding": embedding,
            "metadata": chunk["metadata"]
        }
        supabase.table("documents").insert(data).execute()

# Store in Supabase
store_chunks_in_supabase(json_chunks)

print("Chunks with embeddings stored successfully in Supabase!")


Chunks with embeddings stored successfully in Supabase!


### **Alibaba-NLP/gte-multilingual-base**

In [None]:
def search_matching_documents(query, top_k=3):
    query_embedding = get_embedding(query)

    # Perform similarity search in Supabase
    response = supabase.rpc(
        "match_documents",
        {"query_embedding": query_embedding, "match_count": top_k}
    ).execute()

    if response.data:
        print("\nMatching Documents with Similarity Scores:")
        for idx, entry in enumerate(response.data, start=1):
            print(f"{idx}. Similarity: {entry['similarity']:.4f}")
            print(f"   Content: {entry['content']}\n")
    else:
        print("No matching documents found.")

In [None]:
# Example Query
query = "INFORMAZIONI SULLA SICUREZZA"
search_matching_documents(query)


Matching Documents with Similarity Scores:
1. Similarity: 0.7864
   Content: ## ROTOMARR  
- 2) l'utilizzo della macchina su fornelli di dimensioni superiori ai 70 cm di diametro  
Utilizzare  l'apparecchio  solo  su  un  fornello  a gas dal diametro max. di 70mm.  
![Image](/content/markdown2/ManualeRotomarr_artifacts/image_000004_e70e7737d2a45c3228fd944f2ed7008935b52b5367d10e1135a35711464aba0a.png)  
- 3) il lavaggio della macchina in lavastoviglie
- 4) il lavaggio  della macchina  con  getto  d'acqua pieno
- 5) l'apertura dei ripari o una qualsiasi manomissione della macchina
- 6) l'utilizzo all'esterno in caso di cattive condizioni meteorologiche  (pioggia,  neve,  grandine,  vento forte)
- 7) l'utilizzo  in  locali  con  pericolo  di  esplosione  o incendio  o  in  presenza  di  grandi  quantitativi  di materiale infiammabile

2. Similarity: 0.7763
   Content: ## 1. INFORMAZIONI SULLA SICUREZZA
## 3. DESTINAZIONE D'USO E UTILIZZATORI  
Il  presente  manuale  contiene  indicazioni

try query in english

In [None]:
# Example Query
query = "safety information"
search_matching_documents(query)


Matching Documents with Similarity Scores:
1. Similarity: 0.7606
   Content: ## 1. INFORMAZIONI SULLA SICUREZZA
## 3. DESTINAZIONE D'USO E UTILIZZATORI  
Il  presente  manuale  contiene  indicazioni  ed  informazioni fondamentali per il corretto utilizzo del GIRACASTAGNE AUTOMATICO (CUOCI CALDARROSTE) ROTOMARR .  
- -Leggere il manuale nella sua completezza per comprendere l'utilizzo della macchina;
- -Tenere questo manuale per future consultazioni in un luogo sicuro;
- -Osservare le istruzioni indicate in questo manuale per garantire la sicurezza dell'utilizzatore;
- -La non osservanza delle indicazioni elencate in questo manuale comporter√† l'annullamento della garanzia;
- -MECTRONICA S.r.l. non √® responsabile per danni o lesioni causate dalla non osservanza delle informazioni elencate nel presente manuale.

2. Similarity: 0.7529
   Content: ## ROTOMARR  
- 2) l'utilizzo della macchina su fornelli di dimensioni superiori ai 70 cm di diametro  
Utilizzare  l'apparecchio  solo  su  u

## testing: Cosine Similarity

BAAI

In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# Load Embedding Model
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-m3")
model = AutoModel.from_pretrained("BAAI/bge-m3").to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

def get_embedding(text):
    """Generate embedding using the BAAI/bge-m3 model."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Move inputs to the same device as the model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    # Move the tensor to the CPU before converting to NumPy
    return outputs.last_hidden_state[:, 0, :].cpu().squeeze().numpy()  # Use CLS token embedding

def avg_cosine_similarity(query, retrieved_docs, doc_embeddings):
    """Compute the average cosine similarity between a query and retrieved documents."""
    query_embedding = get_embedding(query).reshape(1, -1)
    doc_vectors = np.array([doc_embeddings[doc] for doc in retrieved_docs])

    similarities = cosine_similarity(query_embedding, doc_vectors)[0]
    return np.mean(similarities)

# Example Query
query = "What is SAAFP?"

# Example Retrieved Documents
retrieved_docs = {
    "doc_1": "The World Organisation of Family Doctors and the South African Academy of Family Physicians (SAAFP)  have  established  standards  for  the  postgraduate  training  of  family  physicians. 8,9 However,  family  medicine  is  a  relatively  new  specialty  in many African countries, which adds to the challenges around training and supervision in the context of large rural areas, massive health needs and minimal resources. 10 The  aim  of  the  workshop  was  to  understand  how  family medicine registrars (postgraduate trainees in family medicine) in  Africa  learn  in  the  workplace. We  particularly  wanted  to explore  the  interaction between  the  registrar  and  supervisor in the workplace, captured in a portfolio of learning, and in the African  context. We  sought  a  clearer  understanding  of  what it  means  to  be  observed  while  conducting  a  consultation  or performing  a  procedure,  as  well  as  understanding  the  local experience of giving or receiving feedback, and how various educational meetings are conducted.",
    "doc_2": "It was clear from this workshop discussion that the training  of  family  physicians  across  Africa  shares  many common  themes.  However,  there  are  also  big  differences among the various countries and even programmes within countries. The way forward would include exploring the  local  contextual  enablers  that  influence  the  learning conversations between trainees and their supervisors. Family medicine  training  institutions  and  organisations  (such  as WONCA Africa and SAAFP) have a critical role to play in supporting  trainees  and  trainers  towards  developing  local competencies that facilitate learning in the clinical workplace dominated by service delivery pressures. ## Acknowledgements The  authors  would  like  to  thank  and  acknowledge  the  35 trainers and trainees who participated in the workshop.",
    "doc_3": "Thirty-five people  participated  in  a  2-h  workshop  and included trainers and trainees from nine African countries, the United Kingdom, United States and Sweden (see Table 1). South  Africa  was  represented  by  the  universities  of  Cape Town,  Limpopo,  Pretoria,  Sefako  Makgatho,  Stellenbosch, Walter Sisulu and Witwatersrand. We started with an introduction and then divided into buzz pairs (pairs were allowed to form spontaneously, regardless of the trainer or trainee status of the participants). In the buzz pairs, we explored the questions of how do I teach or learn, supervise  or  be  supervised,  and  assess  or  be  assessed. This was followed by an interactive focus group discussion on  the  reflections  created  by  the  buzz  pair  discussions (a guiding style was employed to facilitate this discussion). The  group  reflections  were  captured  on  a  flip  chart  by  a scribe.  Common  themes  were  identified.  Clarification  was sought and  validated immediately  with the workshop participants. A  preliminary  draft  of  this  report  was  shared with the workshop participants after the conference."
}

# Compute embeddings for documents
doc_embeddings = {doc: get_embedding(content) for doc, content in retrieved_docs.items()}

# Compute Average Cosine Similarity
avg_sim = avg_cosine_similarity(query, retrieved_docs.keys(), doc_embeddings)

print("Average Cosine Similarity:", avg_sim)


Average Cosine Similarity: 0.37750697


Alibaba

In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

tokenizer = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-multilingual-base")
model = AutoModel.from_pretrained("Alibaba-NLP/gte-multilingual-base").to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

def get_embedding(text):
    """Generate embedding using the Alibaba-NLP/gte-multilingual-base."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Move inputs to the same device as the model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    # Move the tensor to the CPU before converting to NumPy
    return outputs.last_hidden_state[:, 0, :].cpu().squeeze().numpy()  # Use CLS token embedding

def avg_cosine_similarity(query, retrieved_docs, doc_embeddings):
    """Compute the average cosine similarity between a query and retrieved documents."""
    query_embedding = get_embedding(query).reshape(1, -1)
    doc_vectors = np.array([doc_embeddings[doc] for doc in retrieved_docs])

    similarities = cosine_similarity(query_embedding, doc_vectors)[0]
    return np.mean(similarities)

# Example Query
query = "What is SAAFP?"

# Example Retrieved Documents
retrieved_docs = {
    "doc_1": "The World Organisation of Family Doctors and the South African Academy of Family Physicians (SAAFP)  have  established  standards  for  the  postgraduate  training  of  family  physicians. 8,9 However,  family  medicine  is  a  relatively  new  specialty  in many African countries, which adds to the challenges around training and supervision in the context of large rural areas, massive health needs and minimal resources. 10 The  aim  of  the  workshop  was  to  understand  how  family medicine registrars (postgraduate trainees in family medicine) in  Africa  learn  in  the  workplace. We  particularly  wanted  to explore  the  interaction  between  the  registrar  and  supervisor in the workplace, captured in a portfolio of learning, and in the African  context. We  sought  a  clearer  understanding  of  what it  means  to  be  observed  while  conducting  a  consultation  or performing  a  procedure,  as  well  as  understanding  the  local experience of giving or receiving feedback, and how various educational meetings are conducted.",
    "doc_2": "The group appreciated the richness of the discussion and the value  of  having  a  variety  of  countries  represented  in  the workshop. The group members expressed feeling encouraged and felt motivated to use 'small moments, little bits, part of the mini-CEX' during learning interactions in the workplace (teachable  moments).  This  will  allow  the  supervisors  and registrars to be 'more real' in the workplace, as opposed to striving for the hard-to-reach  perfect or ideal learning interactions. It will necessitate a more honest and pragmatic approach  to  harness  these  learning  moments.  Ongoing discussions  are  needed  around  the  validity  of  continuous assessments  in  the  workplace  for  national  examinations, such as the Fellowship of the College of Family Physicians of South Africa (FCFP[SA]), and the contribution of the learning portfolio to exit examination results. Collaborative training projects, like Training the Clinical Trainers (TCT) project and 'FamLEAP'  initiative,  are  trying  to  address  the  need  for training of supervisors in South Africa and also now Malawi and  other  countries  in  Africa  in  basic  workplace-based educational skills, such as formative assessment and giving feedback. 13",
    "doc_3": "Louis Jenkins, louis.jenkins@westerncape. gov.za ## Dates:Received: 28 Sept. 2017 Accepted: 09 Nov. 2017 Published: 12 Apr. 2018 How to cite this article: Jenkins LS, Von Pressentin K. Family medicine training in Africa: Views of clinical trainers and trainees. Afr J Prm Health Care Fam Med. 2018;10(1), a1638. https:// doi.org/10.4102/phcfm. v10i1.1638 # Copyright: ¬© 2018. The Authors. Licensee: AOSIS. This work is licensed under the Creative Commons Attribution License. ![Image]/content/drive/MyDrive/document_rag/md/17_artifacts/image_000005_e2cece3be96aa05931eea2488c7312b12d82969056fd50762e3f32ae19090fd2.png) *Image Description:* This image features a QR code with instructions to Scan this QR code with your smart phone or mobile device to read online. It also mentions that online reading can be done by Read online. The text ¬© 2018. The Authors. License: AOSIS. This work is licensed under the Creative Commons Attribution License.The design and information suggest it's related to learning or training programs for family medicine registrars in Africa. Objectives :  The  aim  of  the  workshop  was  to  understand  how  family  medicine  registrars (postgraduate trainees in family medicine) in Africa learn in the workplace. Methods : Thirty-five  trainers  and  registrars  from  nine  African  countries,  the  United Kingdom,  United  States  and  Sweden  participated.  South  Africa  was  represented  by  the universities of Cape Town, Limpopo, Pretoria, Sefako Makgatho, Stellenbosch, Walter Sisulu and Witwatersrand. Results: Six  major  themes  were  identified:  (1)  context  is  critical,  (2)  learning  style  of  the registrar and (teaching style) of the supervisor, (3) learning portfolio is utilised, (4) interactions between registrar and supervisor, (5) giving and receiving feedback and (6) the competence of the supervisor. Conclusion :  The  training of family physicians across Africa shares many common themes. However, there are also big differences among the various countries and even programmes within countries. The way forward would include exploring the local contextual enablers that influence the learning conversations between trainees and their supervisors. Family medicine training  institutions  and  organisations  (such  as  WONCA  Africa  and  the  South  African Academy of Family Physicians) have a critical role to play in supporting trainees and trainers towards  developing  local  competencies  which  facilitate  learning  in  the  clinical  workplace dominated by service delivery pressures."
}

# Compute embeddings for documents
doc_embeddings = {doc: get_embedding(content) for doc, content in retrieved_docs.items()}

# Compute Average Cosine Similarity
avg_sim = avg_cosine_similarity(query, retrieved_docs.keys(), doc_embeddings)

print("Average Cosine Similarity:", avg_sim)


tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

The repository for Alibaba-NLP/gte-multilingual-base contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/Alibaba-NLP/gte-multilingual-base.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


configuration.py:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


The repository for Alibaba-NLP/gte-multilingual-base contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/Alibaba-NLP/gte-multilingual-base.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


modeling.py:   0%|          | 0.00/59.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/611M [00:00<?, ?B/s]

Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: {'classifier.weight', 'classifier.bias'}
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Average Cosine Similarity: 0.63800275


# trial : separate data tables

In [None]:
!pip install supabase numpy psycopg2

In [9]:
import os
import json
import numpy as np
from supabase import create_client, Client

# Initialize Supabase
SUPABASE_URL = "https://vptbbrmqaqpsynvpizih.supabase.co"
SUPABASE_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6InZwdGJicm1xYXFwc3ludnBpemloIiwicm9sZSI6ImFub24iLCJpYXQiOjE3NDEwNjU2NzMsImV4cCI6MjA1NjY0MTY3M30.XVOsjwisyi39awcbC3TMf46uMbdlwUkY-wfyo31UthI"

supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)


In [33]:
import json
import torch
import uuid  # Import the missing module
from transformers import AutoTokenizer, AutoModel
from supabase import create_client

# Load Embedding Model
tokenizer = AutoTokenizer.from_pretrained("Alibaba-NLP/gte-multilingual-base")
model = AutoModel.from_pretrained("Alibaba-NLP/gte-multilingual-base").to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)

    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().tolist()

# Load JSON chunks
json_file_path = "/content/17.json"
with open(json_file_path, "r", encoding="utf-8") as json_file:
    json_chunks = json.load(json_file)

def store_chunks_in_supabase(chunks):
    """Stores chunks into Supabase, differentiating between text and tables."""
    document_entries = []
    table_entries = []

    for chunk in chunks:
        chunk_id = str(uuid.uuid4())  # Generate unique chunk_id

        if "content" in chunk and chunk["content"]:
            content = chunk["content"]
            embedding = get_embedding(content)

            document_entries.append({
                "chunk_id": chunk_id,
                "content": content,
                "embedding": embedding,
                "metadata": chunk["metadata"],
                "type": "text"
            })

        if "table" in chunk and chunk["table"]:
            table_data = chunk["table"]
            table_entries.append({
                "chunk_id": chunk_id,
                "table_data": json.dumps(table_data, ensure_ascii=False),
                "metadata": chunk["metadata"]
            })

    # Batch insert into Supabase for efficiency
    if document_entries:
        supabase.table("documents").insert(document_entries).execute()
    if table_entries:
        supabase.table("tables").insert(table_entries).execute()

# Store chunks in Supabase
store_chunks_in_supabase(json_chunks)

print("Chunks with embeddings stored successfully in Supabase!")

The repository for Alibaba-NLP/gte-multilingual-base contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/Alibaba-NLP/gte-multilingual-base.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: {'classifier.bias', 'classifier.weight'}
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Chunks with embeddings stored successfully in Supabase!


In [49]:
import ast
import numpy as np
import re
from scipy.spatial.distance import cosine

def query_supabase(user_query):
    """Retrieves both text and table chunks based on query."""

    #### üîπ Step 1: Retrieve Text Chunks (Vector Search) ####
    query_embedding = np.array(get_embedding(user_query), dtype=np.float32).flatten()
    print(f"Query embedding shape: {query_embedding.shape}")  # Debugging

    response_text = supabase.table("documents").select("chunk_id, content, embedding").execute()
    text_results = []

    for record in response_text.data:
        chunk_embedding = record["embedding"]

        # Convert stored string embeddings to list if needed
        if isinstance(chunk_embedding, str):
            chunk_embedding = ast.literal_eval(chunk_embedding)

        chunk_embedding = np.array(chunk_embedding, dtype=np.float32).flatten()
        print(f"Chunk {record['chunk_id']} embedding shape: {chunk_embedding.shape}")  # Debugging

        if chunk_embedding.shape == query_embedding.shape:
            similarity = 1 - cosine(query_embedding, chunk_embedding)
            text_results.append((record["chunk_id"], "text", record["content"], similarity))
        else:
            print(f"‚ö†Ô∏è Skipping chunk {record['chunk_id']} due to shape mismatch.")

    #### üîπ Step 2: Retrieve Table Chunks (Improved Keyword Search) ####
    response_tables = supabase.table("tables").select("chunk_id, table_data").execute()
    table_results = []

    query_words = set(re.findall(r'\w+', user_query.lower()))  # Extract words from query

    for record in response_tables.data:
        table_data = record["table_data"].lower()
        table_words = set(re.findall(r'\w+', table_data))  # Extract words from table

        common_words = query_words.intersection(table_words)  # Count overlapping words
        match_score = len(common_words) / max(len(query_words), 1)  # Normalize score

        if match_score > 0:  # Only include tables with at least one match
            table_results.append((record["chunk_id"], "table", table_data, match_score))

    #### üîπ Step 3: Merge & Sort Results ####
    all_results = text_results + table_results
    all_results.sort(key=lambda x: x[3], reverse=True)  # Sort by relevance

    return all_results[:5]  # Return top 5 results


In [50]:
# Example usage
user_query = "Number of participants from Ireland"
retrieved_chunks = query_supabase(user_query)

# Display results
for chunk in retrieved_chunks:
    print(f"Chunk ID: {chunk[0]}\nType: {chunk[1]}\nContent: {chunk[2][:300]}...\nRelevance: {chunk[3]:.4f}\n")

Query embedding shape: (768,)
Chunk fb1b5844-c39b-4e8d-932b-739da8ad3515 embedding shape: (768,)
Chunk 21d7a8e4-cb1e-4560-b6e8-28fdb09f5cbc embedding shape: (768,)
Chunk fb16a6de-7485-4e48-afc5-75ca4c111915 embedding shape: (768,)
Chunk 11616d14-946f-4466-9473-187f9ef7b626 embedding shape: (768,)
Chunk d41877dd-5367-4d06-a7ed-26c7f31053b7 embedding shape: (768,)
Chunk faded987-2845-4c51-ade1-1c2fc493cf90 embedding shape: (768,)
Chunk 8e80b26a-f377-4a18-9095-afd50039f62e embedding shape: (768,)
Chunk 930f8f67-116a-4fc8-8c0e-06854026612c embedding shape: (768,)
Chunk 3f892d71-e76d-4b19-9f23-581cdecac427 embedding shape: (768,)
Chunk e69f6085-fd6b-4bf8-ad5d-1d917cff9d9b embedding shape: (768,)
Chunk 2e817a14-f423-4735-bcca-3f16166822a1 embedding shape: (768,)
Chunk 2ef565f1-4690-4f1b-b643-836a82b6289c embedding shape: (768,)
Chunk ba7cc1be-498e-45ff-b2c4-fa5cfc1b19ce embedding shape: (768,)
Chunk 0fdabd9b-c8c4-4304-a104-9a917ba3e4bf embedding shape: (768,)
Chunk ID: d5bbc782-e294-476d-b6e

In [51]:
import ast
import numpy as np
from scipy.spatial.distance import cosine

def query_supabase(user_query):
    """Retrieves both text and table chunks based on the query."""

    #### üîπ Step 1: Retrieve Text Chunks (Vector Search) ####
    query_embedding = np.array(get_embedding(user_query), dtype=np.float32).flatten()
    print(f"Query embedding shape: {query_embedding.shape}")  # Debugging

    response_text = supabase.table("documents").select("chunk_id, content, embedding").execute()
    text_results = []

    for record in response_text.data:
        chunk_embedding = record["embedding"]

        # Convert stored string embeddings to list if needed
        if isinstance(chunk_embedding, str):
            chunk_embedding = ast.literal_eval(chunk_embedding)

        chunk_embedding = np.array(chunk_embedding, dtype=np.float32).flatten()
        print(f"Chunk {record['chunk_id']} embedding shape: {chunk_embedding.shape}")  # Debugging

        if chunk_embedding.shape == query_embedding.shape:
            similarity = 1 - cosine(query_embedding, chunk_embedding)
            text_results.append((record["chunk_id"], "text", record["content"], similarity))
        else:
            print(f"‚ö†Ô∏è Skipping chunk {record['chunk_id']} due to shape mismatch.")

    #### üîπ Step 2: Retrieve Table Chunks (Keyword Search) ####
    response_tables = supabase.table("tables").select("chunk_id, table_data").execute()
    table_results = []

    for record in response_tables.data:
        table_data = record["table_data"]

        # Check if query terms exist in table (basic keyword match)
        if user_query.lower() in table_data.lower():
            table_results.append((record["chunk_id"], "table", table_data, 0.85))  # Assigning a fixed relevance score

    #### üîπ Step 3: Merge & Sort Results ####
    all_results = text_results + table_results
    all_results.sort(key=lambda x: x[3], reverse=True)  # Sort by relevance

    return all_results[:5]  # Return top 5 results

In [52]:
# Example usage
user_query = "Number of participants from Ireland"
retrieved_chunks = query_supabase(user_query)

# Display results
for chunk in retrieved_chunks:
    print(f"Chunk ID: {chunk[0]}\nType: {chunk[1]}\nContent: {chunk[2][:300]}...\nRelevance: {chunk[3]:.4f}\n")

Query embedding shape: (768,)
Chunk fb1b5844-c39b-4e8d-932b-739da8ad3515 embedding shape: (768,)
Chunk 21d7a8e4-cb1e-4560-b6e8-28fdb09f5cbc embedding shape: (768,)
Chunk fb16a6de-7485-4e48-afc5-75ca4c111915 embedding shape: (768,)
Chunk 11616d14-946f-4466-9473-187f9ef7b626 embedding shape: (768,)
Chunk d41877dd-5367-4d06-a7ed-26c7f31053b7 embedding shape: (768,)
Chunk faded987-2845-4c51-ade1-1c2fc493cf90 embedding shape: (768,)
Chunk 8e80b26a-f377-4a18-9095-afd50039f62e embedding shape: (768,)
Chunk 930f8f67-116a-4fc8-8c0e-06854026612c embedding shape: (768,)
Chunk 3f892d71-e76d-4b19-9f23-581cdecac427 embedding shape: (768,)
Chunk e69f6085-fd6b-4bf8-ad5d-1d917cff9d9b embedding shape: (768,)
Chunk 2e817a14-f423-4735-bcca-3f16166822a1 embedding shape: (768,)
Chunk 2ef565f1-4690-4f1b-b643-836a82b6289c embedding shape: (768,)
Chunk ba7cc1be-498e-45ff-b2c4-fa5cfc1b19ce embedding shape: (768,)
Chunk 0fdabd9b-c8c4-4304-a104-9a917ba3e4bf embedding shape: (768,)
Chunk ID: faded987-2845-4c51-ade

In [53]:
# Example usage
user_query = "Number of participants"
retrieved_chunks = query_supabase(user_query)

# Display results
for chunk in retrieved_chunks:
    print(f"Chunk ID: {chunk[0]}\nType: {chunk[1]}\nContent: {chunk[2][:300]}...\nRelevance: {chunk[3]:.4f}\n")

Query embedding shape: (768,)
Chunk fb1b5844-c39b-4e8d-932b-739da8ad3515 embedding shape: (768,)
Chunk 21d7a8e4-cb1e-4560-b6e8-28fdb09f5cbc embedding shape: (768,)
Chunk fb16a6de-7485-4e48-afc5-75ca4c111915 embedding shape: (768,)
Chunk 11616d14-946f-4466-9473-187f9ef7b626 embedding shape: (768,)
Chunk d41877dd-5367-4d06-a7ed-26c7f31053b7 embedding shape: (768,)
Chunk faded987-2845-4c51-ade1-1c2fc493cf90 embedding shape: (768,)
Chunk 8e80b26a-f377-4a18-9095-afd50039f62e embedding shape: (768,)
Chunk 930f8f67-116a-4fc8-8c0e-06854026612c embedding shape: (768,)
Chunk 3f892d71-e76d-4b19-9f23-581cdecac427 embedding shape: (768,)
Chunk e69f6085-fd6b-4bf8-ad5d-1d917cff9d9b embedding shape: (768,)
Chunk 2e817a14-f423-4735-bcca-3f16166822a1 embedding shape: (768,)
Chunk 2ef565f1-4690-4f1b-b643-836a82b6289c embedding shape: (768,)
Chunk ba7cc1be-498e-45ff-b2c4-fa5cfc1b19ce embedding shape: (768,)
Chunk 0fdabd9b-c8c4-4304-a104-9a917ba3e4bf embedding shape: (768,)
Chunk ID: d5bbc782-e294-476d-b6e

In [54]:
# Example usage
user_query = "Ireland"
retrieved_chunks = query_supabase(user_query)

# Display results
for chunk in retrieved_chunks:
    print(f"Chunk ID: {chunk[0]}\nType: {chunk[1]}\nContent: {chunk[2][:300]}...\nRelevance: {chunk[3]:.4f}\n")

Query embedding shape: (768,)
Chunk fb1b5844-c39b-4e8d-932b-739da8ad3515 embedding shape: (768,)
Chunk 21d7a8e4-cb1e-4560-b6e8-28fdb09f5cbc embedding shape: (768,)
Chunk fb16a6de-7485-4e48-afc5-75ca4c111915 embedding shape: (768,)
Chunk 11616d14-946f-4466-9473-187f9ef7b626 embedding shape: (768,)
Chunk d41877dd-5367-4d06-a7ed-26c7f31053b7 embedding shape: (768,)
Chunk faded987-2845-4c51-ade1-1c2fc493cf90 embedding shape: (768,)
Chunk 8e80b26a-f377-4a18-9095-afd50039f62e embedding shape: (768,)
Chunk 930f8f67-116a-4fc8-8c0e-06854026612c embedding shape: (768,)
Chunk 3f892d71-e76d-4b19-9f23-581cdecac427 embedding shape: (768,)
Chunk e69f6085-fd6b-4bf8-ad5d-1d917cff9d9b embedding shape: (768,)
Chunk 2e817a14-f423-4735-bcca-3f16166822a1 embedding shape: (768,)
Chunk 2ef565f1-4690-4f1b-b643-836a82b6289c embedding shape: (768,)
Chunk ba7cc1be-498e-45ff-b2c4-fa5cfc1b19ce embedding shape: (768,)
Chunk 0fdabd9b-c8c4-4304-a104-9a917ba3e4bf embedding shape: (768,)
Chunk ID: d5bbc782-e294-476d-b6e