-
Notifications
You must be signed in to change notification settings - Fork 13.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
supabase vectorstore - first cut #3100
Merged
hwchase17
merged 12 commits into
langchain-ai:master
from
danielchalef:supabase-vectorstore
Apr 20, 2023
Merged
Changes from 1 commit
Commits
Show all changes
12 commits
Select commit
Hold shift + click to select a range
f737e14
first cut of a supabase vectorstore
4b23024
add max_marginal_relevance_search
33e7148
type hints for supabase client and fix from_texts args
ae061f8
missed some type hints
b67fa6c
from_documents already implemented by superclass
d5252dc
add_texts kwargs
ef0b60e
remove explicit dict typing
91e396a
fixing type hints and line lengths
92cae66
Merge branch 'hwchase17:master' into supabase-vectorstore
danielchalef ad809b3
grammar and formatting
5231128
default values for from_texts
a11082e
SupabaseVectorStore notebook
File filter
Filter by extension
Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,212 @@ | ||
from itertools import repeat | ||
from typing import Any, Iterable, List, Optional, Tuple, Type, Union | ||
|
||
from langchain.docstore.document import Document | ||
from langchain.embeddings.base import Embeddings | ||
from langchain.vectorstores.base import VectorStore | ||
|
||
|
||
class SupabaseVectorStore(VectorStore):
    """VectorStore backed by a Supabase Postgres database.

    Assumes you have the `pgvector` extension installed and a `match_documents`
    (or similar) function. For more details:
    https://js.langchain.com/docs/modules/indexes/vector_stores/integrations/supabase

    You can implement your own `match_documents` function in order to limit the
    search space to a subset of documents based on your own authorization or
    business logic.

    Note that the Supabase Python client does not yet support async operations.
    """

    _client: Any
    # This is the embedding function. Don't confuse with the embedding vectors.
    # We should perhaps rename the underlying Embedding base class to
    # EmbeddingFunction or something.
    _embedding: Embeddings
    table_name: str
    query_name: str

    def __init__(
        self,
        client: Any,
        embedding: Embeddings,
        table_name: str,
        query_name: Union[str, None] = None,
    ) -> None:
        """Initialize with a Supabase client.

        Args:
            client: A `supabase.client.Client` instance.
            embedding: Embedding function used for documents and queries.
            table_name: Table that stores the documents; an empty value falls
                back to "documents".
            query_name: Name of the RPC function used for similarity search;
                falls back to "match_documents".

        Raises:
            ValueError: If the `supabase` package is not installed or `client`
                is not a `supabase.client.Client`.
        """
        self._validate_client(client)

        self._client = client
        self._embedding: Embeddings = embedding
        self.table_name = table_name or "documents"
        self.query_name = query_name or "match_documents"

    @staticmethod
    def _validate_client(client: Any) -> None:
        """Raise ValueError unless `client` is a usable Supabase client."""
        try:
            import supabase
        except ImportError as err:
            raise ValueError(
                "Could not import supabase python package. "
                "Please install it with `pip install supabase`."
            ) from err

        if not isinstance(client, supabase.client.Client):
            raise ValueError("client should be an instance of supabase.client.Client")

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed `texts` and insert them, returning the ids of the new rows.

        Args:
            texts: Texts to store.
            metadatas: Optional metadata dicts, paired with `texts` by position.
            **kwargs: Accepted for interface compatibility; currently unused.
        """
        # Materialize once: `texts` may be a one-shot iterator, and it is needed
        # both to build the documents and to compute the embeddings.
        texts = list(texts)
        docs = self._texts_to_documents(texts, metadatas)
        vectors = self._embedding.embed_documents(texts)
        return self.add_vectors(vectors, docs)

    @classmethod
    def from_texts(
        cls: Type["SupabaseVectorStore"],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        client: Any = None,
        table_name: str = "documents",
        query_name: Union[str, None] = None,
        **kwargs: Any,
    ) -> "SupabaseVectorStore":
        """Return VectorStore initialized from texts and embeddings.

        Args:
            texts: Texts to embed and store.
            embedding: Embedding function.
            metadatas: Optional metadata dicts, paired with `texts` by position.
            client: A `supabase.client.Client` instance (required in practice).
            table_name: Table that stores the documents.
            query_name: Name of the RPC function used for similarity search.
            **kwargs: Accepted for interface compatibility; currently unused.

        Raises:
            ValueError: If `client` is missing or not a Supabase client.
        """
        # Every parameter after `embedding` has a default on purpose (PR review
        # feedback): adding new explicit keyword arguments is fine for the
        # linter, but turning a keyword argument into a required positional one
        # may not be.
        #
        # Validate before embedding so a bad client does not cost an embedding
        # call first.
        cls._validate_client(client)

        embeddings = embedding.embed_documents(texts)
        docs = cls._texts_to_documents(texts, metadatas)
        cls._add_vectors(client, table_name, embeddings, docs)

        return cls(
            client=client,
            embedding=embedding,
            table_name=table_name,
            query_name=query_name,
        )

    @classmethod
    def from_documents(
        cls: Type["SupabaseVectorStore"],
        documents: List[Document],
        embedding: Embeddings,
        client: Any = None,
        table_name: str = "documents",
        query_name: Union[str, None] = None,
    ) -> "SupabaseVectorStore":
        """Return VectorStore initialized from Documents.

        Args:
            documents: Documents to embed and store.
            embedding: Embedding function.
            client: A `supabase.client.Client` instance (required in practice).
            table_name: Table that stores the documents.
            query_name: Name of the RPC function used for similarity search.

        Raises:
            ValueError: If `client` is missing or not a Supabase client.
        """
        # Fail fast on a bad client before paying for the embedding call.
        cls._validate_client(client)

        texts = [doc.page_content for doc in documents]
        embeddings = embedding.embed_documents(texts)
        cls._add_vectors(client, table_name, embeddings, documents)

        return cls(
            client=client,
            embedding=embedding,
            table_name=table_name,
            query_name=query_name,
        )

    def add_vectors(
        self, vectors: List[List[float]], documents: List[Document]
    ) -> List[str]:
        """Insert precomputed `vectors` with their `documents`; return row ids."""
        return self._add_vectors(self._client, self.table_name, vectors, documents)

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return up to `k` documents most similar to the text `query`."""
        # NOTE(review): the query is embedded with `embed_documents`; confirm
        # whether the Embeddings interface's query-specific method should be
        # used here instead.
        vectors = self._embedding.embed_documents([query])
        return self.similarity_search_by_vector(vectors[0], k)

    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return up to `k` documents most similar to the vector `embedding`."""
        scored = self.similarity_search_by_vector_with_relevance_scores(embedding, k)
        return [doc for doc, _ in scored]

    def similarity_search_with_relevance_scores(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Return up to `k` (document, similarity) pairs for the text `query`."""
        vectors = self._embedding.embed_documents([query])
        return self.similarity_search_by_vector_with_relevance_scores(vectors[0], k)

    def similarity_search_by_vector_with_relevance_scores(
        self, query: List[float], k: int
    ) -> List[Tuple[Document, float]]:
        """Call the configured RPC function and map its rows to scored Documents.

        Rows without a truthy "content" field are skipped. A missing
        "metadata" defaults to an empty dict and a missing "similarity"
        defaults to 0.0.
        """
        match_documents_params = {"query_embedding": query, "match_count": k}
        res = self._client.rpc(self.query_name, match_documents_params).execute()

        return [
            (
                Document(
                    metadata=search.get("metadata", {}),  # type: ignore
                    page_content=search.get("content", ""),
                ),
                search.get("similarity", 0.0),
            )
            for search in res.data
            if search.get("content")
        ]

    @staticmethod
    def _texts_to_documents(
        texts: Iterable[str], metadatas: Optional[Iterable[dict]] = None
    ) -> List[Document]:
        """Return list of Documents from list of texts and metadatas.

        When `metadatas` is omitted, every document gets its own empty dict.
        When it is given, texts and metadatas are paired by position and the
        result is as long as the shorter of the two.
        """
        if metadatas is None:
            # A fresh dict per document: sharing one dict would make a metadata
            # change on one document show up on all of them.
            return [Document(page_content=text, metadata={}) for text in texts]

        return [
            Document(page_content=text, metadata=metadata)
            for text, metadata in zip(texts, metadatas)
        ]

    @staticmethod
    def _add_vectors(
        client: Any,
        table_name: str,
        vectors: List[List[float]],
        documents: List[Document],
    ) -> List[str]:
        """Add vectors to Supabase table.

        Args:
            client: A `supabase.client.Client` instance.
            table_name: Table to insert into.
            vectors: Embedding vectors, one per document.
            documents: Documents paired with `vectors` by position.

        Returns:
            The ids of the inserted rows, as strings.

        Raises:
            ValueError: If the client is invalid or the two lists differ in
                length.
            Exception: If an insert reports no rows added.
        """
        SupabaseVectorStore._validate_client(client)

        # Pairing by position only makes sense for equal lengths; otherwise
        # documents would be dropped without any error.
        if len(vectors) != len(documents):
            raise ValueError(
                f"Got {len(vectors)} vectors for {len(documents)} documents; "
                "the two lists must have the same length."
            )

        rows: List[dict] = [
            {
                "content": document.page_content,
                "embedding": vector,
                "metadata": document.metadata,  # type: ignore
            }
            for vector, document in zip(vectors, documents)
        ]

        # According to the SupabaseVectorStore JS implementation, the best
        # chunk size is 500.
        chunk_size = 500
        id_list: List[str] = []
        for start in range(0, len(rows), chunk_size):
            chunk = rows[start : start + chunk_size]

            result = client.from_(table_name).insert(chunk).execute()  # type: ignore

            if not result.data:
                raise Exception("Error inserting: No rows added")

            # VectorStore.add_vectors returns ids as strings. Compare against
            # None so that a legitimate id of 0 is not discarded.
            id_list.extend(
                str(row.get("id")) for row in result.data if row.get("id") is not None
            )

        return id_list
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can support type checking on optional dependencies with optional imports.
langchain/vectorstores/chroma.py
is one example. Basically, do something like: