supabase vectorstore - first cut #3100

Merged
merged 12 commits on Apr 20, 2023
212 changes: 212 additions & 0 deletions langchain/vectorstores/supabase.py
@@ -0,0 +1,212 @@
from itertools import repeat
from typing import Any, Iterable, List, Optional, Tuple, Type, Union

from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores.base import VectorStore


class SupabaseVectorStore(VectorStore):
    """VectorStore for a Supabase postgres database. Assumes you have the `pgvector`
    extension installed and a `match_documents` (or similar) function. For more details:
    https://js.langchain.com/docs/modules/indexes/vector_stores/integrations/supabase

    You can implement your own `match_documents` function in order to limit the search
    space to a subset of documents based on your own authorization or business logic.

    Note that the Supabase Python client does not yet support async operations.
    """

    _client: Any
    # This is the embedding function. Don't confuse it with the embedding vectors.
    # We should perhaps rename the underlying Embeddings base class to
    # EmbeddingFunction or something similar.
    _embedding: Embeddings
    table_name: str
    query_name: str

    def __init__(
        self,
        client: Any,
Contributor

we can support type checking on optional dependencies with optional imports. langchain/vectorstores/chroma.py is one example. basically do something like:

if TYPE_CHECKING:
    import supabase

class SupabaseVectorStore:
    def __init__(self, client: supabase.client.Client, ...):

        embedding: Embeddings,
        table_name: str,
        query_name: Union[str, None] = None,
    ) -> None:
        """Initialize with supabase client."""
        try:
            import supabase
        except ImportError:
            raise ValueError(
                "Could not import supabase python package. "
                "Please install it with `pip install supabase`."
            )

        if not isinstance(client, supabase.client.Client):
            raise ValueError("client should be an instance of supabase.client.Client")

        self._client = client
        self._embedding: Embeddings = embedding
        self.table_name = table_name or "documents"
        self.query_name = query_name or "match_documents"

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict[Any, Any]]] = None,
    ) -> List[str]:
        docs = self._texts_to_documents(texts, metadatas)

        vectors = self._embedding.embed_documents(list(texts))
        return self.add_vectors(vectors, docs)

    @classmethod
    def from_texts(
        cls: Type["SupabaseVectorStore"],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict[Any, Any]]],
Contributor

i think adding new explicit kwargs shouldn't be an issue for the linter, but changing a kwarg to an arg might. so metadatas should have a default value of None
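
As a toy illustration of the point (not the PR's actual method), giving the new parameter a None default keeps existing call styles working:

from typing import Any, List, Optional

def from_texts(texts: List[str], metadatas: Optional[List[dict[Any, Any]]] = None, **kwargs: Any) -> List[str]:
    # With a default, callers that omit metadatas (or pass it by keyword) are unaffected.
    return texts

from_texts(["a", "b"])
from_texts(["a", "b"], metadatas=[{"source": "x"}, {"source": "y"}])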

        client: Any,
        table_name: str,
        query_name: Union[str, None] = None,
        **kwargs: Any,
    ) -> "SupabaseVectorStore":
        """Return VectorStore initialized from texts and embeddings."""

        embeddings = embedding.embed_documents(texts)
        docs = cls._texts_to_documents(texts, metadatas)
        _ids = cls._add_vectors(client, table_name, embeddings, docs)

        return cls(
            client=client,
            embedding=embedding,
            table_name=table_name,
            query_name=query_name,
        )

    @classmethod
    def from_documents(
        cls: Type["SupabaseVectorStore"],
        documents: List[Document],
        embedding: Embeddings,
        client: Any,
        table_name: str,
        query_name: Union[str, None] = None,
    ) -> "SupabaseVectorStore":
        """Return VectorStore initialized from Documents."""

        texts = [doc.page_content for doc in documents]
        embeddings = embedding.embed_documents(texts)
        _ids = cls._add_vectors(client, table_name, embeddings, documents)

        return cls(
            client=client,
            embedding=embedding,
            table_name=table_name,
            query_name=query_name,
        )

    def add_vectors(
        self, vectors: List[List[float]], documents: List[Document]
    ) -> List[str]:
        return self._add_vectors(self._client, self.table_name, vectors, documents)

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        vectors = self._embedding.embed_documents([query])
        return self.similarity_search_by_vector(vectors[0], k)

    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        result = self.similarity_search_by_vector_with_relevance_scores(embedding, k)

        documents = [doc for doc, _ in result]

        return documents

    def similarity_search_with_relevance_scores(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        vectors = self._embedding.embed_documents([query])
        return self.similarity_search_by_vector_with_relevance_scores(vectors[0], k)

    def similarity_search_by_vector_with_relevance_scores(
        self, query: List[float], k: int
    ) -> List[Tuple[Document, float]]:
        match_documents_params = dict(query_embedding=query, match_count=k)
        res = self._client.rpc(self.query_name, match_documents_params).execute()

        match_result = [
            (
                Document(
                    metadata=search.get("metadata", {}),  # type: ignore
                    page_content=search.get("content", ""),
                ),
                search.get("similarity", 0.0),
            )
            for search in res.data
            if search.get("content")
        ]

        return match_result

    @staticmethod
    def _texts_to_documents(
        texts: Iterable[str], metadatas: Optional[Iterable[dict[Any, Any]]] = None
    ) -> List[Document]:
        """Return list of Documents from list of texts and metadatas."""
        if metadatas is None:
            metadatas = repeat({})

        docs = [
            Document(page_content=text, metadata=metadata)
            for text, metadata in zip(texts, metadatas)
        ]

        return docs

    @staticmethod
    def _add_vectors(
        client: Any,
        table_name: str,
        vectors: List[List[float]],
        documents: List[Document],
    ) -> List[str]:
        """Add vectors to Supabase table."""
        try:
            import supabase
        except ImportError:
            raise ValueError(
                "Could not import supabase python package. "
                "Please install it with `pip install supabase`."
            )

        if not isinstance(client, supabase.client.Client):
            raise ValueError("client should be an instance of supabase.client.Client")

        rows: List[dict[str, Any]] = [
            {
                "content": documents[idx].page_content,
                "embedding": embedding,
                "metadata": documents[idx].metadata,  # type: ignore
            }
            for idx, embedding in enumerate(vectors)
        ]

        # According to the SupabaseVectorStore JS implementation, the best chunk size is 500
        chunk_size = 500
        id_list: List[str] = []
        for i in range(0, len(rows), chunk_size):
            chunk = rows[i : i + chunk_size]

            result = client.from_(table_name).insert(chunk).execute()  # type: ignore

            if len(result.data) == 0:
                raise Exception("Error inserting: No rows added")

            # VectorStore.add_vectors returns ids as strings
            ids = [str(i.get("id")) for i in result.data if i.get("id")]

            id_list.extend(ids)

        return id_list
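
For context, a minimal usage sketch, not part of this diff: it presumes a Supabase project with the pgvector extension and a `match_documents` function already set up as the class docstring describes, uses OpenAIEmbeddings purely as an example embedding function, and the environment variable names are placeholders.

import os

from supabase import Client, create_client

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.supabase import SupabaseVectorStore

# Placeholder environment variable names; use whatever your project defines.
supabase_client: Client = create_client(
    os.environ["SUPABASE_URL"], os.environ["SUPABASE_SERVICE_KEY"]
)

# Embed the texts, insert them into the "documents" table, and return the store.
vector_store = SupabaseVectorStore.from_texts(
    texts=["LangChain now supports Supabase as a vector store."],
    embedding=OpenAIEmbeddings(),
    metadatas=[{"source": "example"}],
    client=supabase_client,
    table_name="documents",
    query_name="match_documents",
)

# Query via the match_documents RPC defined in the database.
docs = vector_store.similarity_search("Which vector stores does LangChain support?", k=1)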