<a href="https://colab.research.google.com/github/jessiechd/RAG_Model/blob/main/0219_supabase_setup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting up: Supabase on Colab
- [Supabase guide: Google Colab](https://supabase.com/docs/guides/ai/google-colab)
- [Example notebook: vector_hello_world.ipynb](https://colab.research.google.com/github/supabase/supabase/blob/master/examples/ai/vector_hello_world.ipynb)

In [1]:
!pip install vecs

Collecting vecs
  Downloading vecs-0.4.5.tar.gz (22 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pgvector==0.3.* (from vecs)
  Downloading pgvector-0.3.6-py3-none-any.whl.metadata (13 kB)
Collecting psycopg2-binary==2.9.* (from vecs)
  Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting flupy==1.* (from vecs)
  Downloading flupy-1.2.1.tar.gz (12 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading pgvector-0.3.6-py3-none-any.whl (24 kB)
Downloading psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m30.0 MB/s[0m eta [36m0:00:

In [2]:
import os
from getpass import getpass

import vecs

# Never hardcode database credentials in the notebook. The connection string is
# read from the SUPABASE_DB_URL environment variable; if that is unset or empty,
# it is requested through a hidden prompt so it never appears in the saved file.
# Expected form:
#   postgresql://postgres.<project-ref>:<password>@aws-0-ap-southeast-1.pooler.supabase.com:5432/postgres
DB_CONNECTION = os.environ.get("SUPABASE_DB_URL") or getpass("Supabase Postgres connection string: ")

# create vector store client
vx = vecs.create_client(DB_CONNECTION)

In [None]:
# Fetch the demo collection, creating it on first use; its vectors have 3 dimensions.
collection = vx.get_or_create_collection(name="colab_collection", dimension=3)

# Each record is a tuple of:
#   (identifier, vector as a list or np.array, associated metadata)
demo_records = [
    ("vec0", [0.1, 0.2, 0.3], {"year": 1973}),
    ("vec1", [0.7, 0.8, 0.9], {"year": 2012}),
]

collection.upsert(records=demo_records)


In [None]:
# Vector to search with (same dimension as the collection's vectors).
query_vector = [0.4, 0.5, 0.6]

# With distance values and metadata both excluded, the result is a list of record ids.
collection.query(
    data=query_vector,           # required
    limit=5,                     # return at most this many records
    filters={},                  # no metadata filtering
    measure="cosine_distance",   # distance measure used for ranking
    include_value=False,         # omit the distance values
    include_metadata=False,      # omit the record metadata
)




['vec1', 'vec0']

# Uploading Chunks (JSON) as Vector embeddings to Supabase

In [3]:
!pip install langchain langchainhub langchain-community sentence-transformers


Collecting langchainhub
  Downloading langchainhub-0.1.21-py3-none-any.whl.metadata (659 bytes)
Collecting langchain-community
  Downloading langchain_community-0.3.17-py3-none-any.whl.metadata (2.4 kB)
Collecting types-requests<3.0.0.0,>=2.31.0.2 (from langchainhub)
  Downloading types_requests-2.32.0.20241016-py3-none-any.whl.metadata (1.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain

In [7]:
import os
import json
from langchain_community.embeddings import HuggingFaceEmbeddings
from vecs import create_client  # Import Supabase Vector Client

def embed_chunks_from_directory(directory: str, db_connection: str, collection_name: str):
    """Embed the text chunks of every ``.json`` file in a directory and upsert them into a vecs collection.

    Each JSON file is expected to contain a list of objects that each have a
    ``"text"`` key. Every text is embedded with the
    ``sentence-transformers/all-MiniLM-L6-v2`` model and stored under the id
    ``"<filename>_vec<i>"`` with metadata ``{"source_file": <filename>}``.
    Only the vector and that metadata are stored, not the chunk text itself.

    Files that are not valid JSON, or whose content does not have the expected
    structure, are reported with ``print`` and skipped so that one bad file
    does not discard the embeddings already computed for the others.

    Args:
        directory: Path of the directory to scan (not recursive).
        db_connection: Postgres connection string passed to ``create_client``.
        collection_name: Name of the collection to get or create. It is
            created with ``dimension=384``, which must match the size of the
            vectors produced by the embedding model.

    Returns:
        None. Progress and results are reported with ``print``.
    """
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

    # Create vector store client
    vx = create_client(db_connection)
    try:
        collection = vx.get_or_create_collection(name=collection_name, dimension=384)  # Adjust dimension as needed

        embedded_records = []

        # Sorted so files are processed in a reproducible order
        # (os.listdir returns entries in arbitrary order).
        for filename in sorted(os.listdir(directory)):
            if not filename.endswith(".json"):
                continue
            file_path = os.path.join(directory, filename)

            try:
                # Close the file as soon as it is parsed instead of holding it
                # open while the embeddings are computed.
                with open(file_path, "r", encoding="utf-8") as f:
                    data = json.load(f)

                # Extract text chunks (modify key based on JSON structure)
                chunks = [chunk["text"] for chunk in data]
            except json.JSONDecodeError as e:
                print(f"Error reading {filename}: {e}")
                continue
            except (KeyError, TypeError) as e:
                # Valid JSON, but not a list of objects with a "text" key.
                print(f"Skipping {filename}: unexpected JSON structure ({e!r})")
                continue

            if not chunks:
                # Nothing to embed for this file.
                continue

            # Embed text chunks
            embeddings = embedding_model.embed_documents(chunks)

            # Prepare records for upserting
            for i, emb in enumerate(embeddings):
                embedded_records.append((f"{filename}_vec{i}", emb, {"source_file": filename}))

        if not embedded_records:
            # Avoid an empty upsert and a misleading success message.
            print(f"No embeddings to store for Supabase collection: {collection_name}")
            return

        # Upsert records into Supabase
        collection.upsert(records=embedded_records)
        print(f"Embeddings stored in Supabase collection: {collection_name}")
    finally:
        # Release the database connection held by this function's own client.
        vx.disconnect()

In [9]:
# Embed every JSON chunk file under /content/json_files into the "chunks1" collection.
embed_chunks_from_directory(
    directory="/content/json_files",
    db_connection=DB_CONNECTION,
    collection_name="chunks1",
)


Embeddings stored in Supabase collection: chunks1
