From 41433e65394e4189b80a238bdede02d1c14d9421 Mon Sep 17 00:00:00 2001
From: anna-charlotte
Date: Mon, 24 Apr 2023 13:40:22 +0200
Subject: [PATCH 01/19] feat: add in-memory and hnswlib vectorstore

Signed-off-by: anna-charlotte
---
 langchain/vectorstores/hnsw_lib.py            | 271 +++++++++++++++++
 langchain/vectorstores/in_memory.py           | 231 +++++++++++++++
 poetry.lock                                   | 245 ++++++++++++------
 pyproject.toml                                |   7 +-
 .../vectorstores/test_hnsw_lib.py             |  54 ++++
 .../vectorstores/test_in_memory.py            |  48 ++++
 6 files changed, 769 insertions(+), 87 deletions(-)
 create mode 100644 langchain/vectorstores/hnsw_lib.py
 create mode 100644 langchain/vectorstores/in_memory.py
 create mode 100644 tests/integration_tests/vectorstores/test_hnsw_lib.py
 create mode 100644 tests/integration_tests/vectorstores/test_in_memory.py

diff --git a/langchain/vectorstores/hnsw_lib.py b/langchain/vectorstores/hnsw_lib.py
new file mode 100644
index 00000000000000..6974133c9891b5
--- /dev/null
+++ b/langchain/vectorstores/hnsw_lib.py
@@ -0,0 +1,271 @@
+"""Wrapper around HnswLib DocArray store."""
+from __future__ import annotations
+
+from operator import itemgetter
+from typing import List, Optional, Any, Tuple, Iterable, Type, Callable, Sequence, TYPE_CHECKING
+
+from langchain.embeddings.base import Embeddings
+from langchain.schema import Document
+from langchain.vectorstores import VectorStore
+from langchain.vectorstores.base import VST
+from langchain.vectorstores.utils import maximal_marginal_relevance
+
+from docarray import BaseDoc
+from docarray.typing import NdArray
+
+
+class HnswLib(VectorStore):
+    """Wrapper around HnswLib storage.
+
+    To use it, you should have the ``docarray`` package with version >=0.30.0 installed.
+    """
+    def __init__(
+        self,
+        work_dir: str,
+        n_dim: int,
+        texts: List[str],
+        embedding: Embeddings,
+        metadatas: Optional[List[dict]],
+        sim_metric: str = 'cosine',
+        kwargs: Optional[dict] = None
+    ) -> None:
+        """Initialize HnswLib store.
+
+        Args:
+            work_dir: Passed to ``HnswDocumentIndex`` as its ``work_dir``.
+            n_dim: Passed as ``dim`` for the embedding field.
+            texts: Texts to embed and index right away.
+            embedding: Embedding function used for documents and queries.
+            metadatas: Optional list of metadatas, one per text.
+            sim_metric: Passed as ``space`` for the embedding field.
+            kwargs: Currently unused.
+
+        Raises:
+            ValueError: If the installed docarray is older than 0.30.0.
+            ImportError: If docarray or protobuf cannot be imported.
+        """
+        try:
+            import docarray
+            da_version = docarray.__version__.split('.')
+            # Reject every 0.x release below 0.30, matching the error message.
+            if int(da_version[0]) == 0 and int(da_version[1]) < 30:
+                raise ValueError(
+                    f'To use the HnswLib VectorStore the docarray version >=0.30.0 is expected, '
+                    f'received: {docarray.__version__}. '
+                    f'To upgrade, please run: `pip install -U docarray`.'
+                )
+            else:
+                from docarray import DocList
+                from docarray.index import HnswDocumentIndex
+        except ImportError:
+            raise ImportError(
+                "Could not import docarray python package. "
+                "Please install it with `pip install -U docarray`."
+            )
+        try:
+            import google.protobuf  # noqa: F401 -- only checks that protobuf is installed
+        except ImportError:
+            raise ImportError(
+                "Could not import protobuf python package. "
+                "Please install it with `pip install -U protobuf`."
+            )
+
+        if metadatas is None:
+            metadatas = [{} for _ in texts]
+
+        self.embedding = embedding
+
+        self.doc_cls = self._get_doc_cls(n_dim, sim_metric)
+        self.doc_index = HnswDocumentIndex[self.doc_cls](work_dir=work_dir)
+        embeddings = self.embedding.embed_documents(texts)
+        docs = DocList[self.doc_cls](
+            [
+                self.doc_cls(
+                    text=t,
+                    embedding=e,
+                    metadata=m,
+                ) for t, m, e in zip(texts, metadatas, embeddings)
+            ]
+        )
+        self.doc_index.index(docs)
+
+    @staticmethod
+    def _get_doc_cls(n_dim: int, sim_metric: str):
+        """Build the document schema stored in the index."""
+        from pydantic import Field
+
+        class DocArrayDoc(BaseDoc):
+            text: Optional[str]
+            embedding: Optional[NdArray] = Field(dim=n_dim, space=sim_metric)
+            metadata: Optional[dict]
+
+        return DocArrayDoc
+
+    @classmethod
+    def from_texts(
+        cls: Type[VST],
+        texts: List[str],
+        embedding: Embeddings,
+        metadatas: Optional[List[dict]] = None,
+        work_dir: Optional[str] = None,
+        n_dim: Optional[int] = None,
+        **kwargs: Any
+    ) -> HnswLib:
+        """Create an HnswLib store and index the given texts.
+
+        Args:
+            texts: Texts to embed and index.
+            embedding: Embedding function.
+            metadatas: Optional list of metadatas associated with the texts.
+            work_dir: Required. Forwarded to the constructor.
+            n_dim: Required. Forwarded to the constructor.
+
+        Raises:
+            ValueError: If ``work_dir`` or ``n_dim`` is not given.
+        """
+        if work_dir is None:
+            raise ValueError('`work_dir` parameter has not been set.')
+        if n_dim is None:
+            raise ValueError('`n_dim` parameter has not been set.')
+
+        return cls(
+            work_dir=work_dir,
+            n_dim=n_dim,
+            texts=texts,
+            embedding=embedding,
+            metadatas=metadatas,
+            kwargs=kwargs
+        )
+
+    def add_texts(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        **kwargs: Any
+    ) -> List[str]:
+        """Run more texts through the embeddings and add to the vectorstore.
+
+        Args:
+            texts: Iterable of strings to add to the vectorstore.
+            metadatas: Optional list of metadatas associated with the texts.
+
+        Returns:
+            List of ids from adding the texts into the vectorstore.
+        """
+        # Materialize once: a one-shot iterator would otherwise be exhausted by
+        # the first pass and nothing would be embedded or indexed.
+        texts = list(texts)
+        if metadatas is None:
+            metadatas = [{} for _ in texts]
+
+        ids = []
+        embeddings = self.embedding.embed_documents(texts)
+        for t, m, e in zip(texts, metadatas, embeddings):
+            doc = self.doc_cls(
+                text=t,
+                embedding=e,
+                metadata=m
+            )
+            self.doc_index.index(doc)
+            ids.append(doc.id)
+
+        return ids
+
+    def similarity_search_with_score(
+        self, query: str, k: int = 4, **kwargs: Any
+    ) -> List[Tuple[Document, float]]:
+        """Return docs most similar to query.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+
+        Returns:
+            List of Documents most similar to the query and score for each.
+        """
+        # The query embedding must come from the embedding model; a hard-coded
+        # debug vector (and its print) used to overwrite it here.
+        query_embedding = self.embedding.embed_query(query)
+        query_doc = self.doc_cls(embedding=query_embedding)
+        docs, scores = self.doc_index.find(query_doc, search_field='embedding', limit=k)
+
+        result = [
+            (Document(page_content=doc.text, metadata=doc.metadata or {}), score)
+            for doc, score in zip(docs, scores)
+        ]
+        return result
+
+    def similarity_search(
+        self, query: str, k: int = 4, **kwargs: Any
+    ) -> List[Document]:
+        """Return docs most similar to query.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+
+        Returns:
+            List of Documents most similar to the query.
+        """
+        results = self.similarity_search_with_score(query, k)
+        return list(map(itemgetter(0), results))
+
+    def _similarity_search_with_relevance_scores(
+        self,
+        query: str,
+        k: int = 4,
+        **kwargs: Any,
+    ) -> List[Tuple[Document, float]]:
+        """Return docs and relevance scores, normalized on a scale from 0 to 1.
+
+        0 is dissimilar, 1 is most similar.
+        """
+        raise NotImplementedError
+
+    def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwargs: Any) -> List[Document]:
+        """Return docs most similar to embedding vector.
+
+        Args:
+            embedding: Embedding to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+
+        Returns:
+            List of Documents most similar to the query vector.
+        """
+        query_doc = self.doc_cls(embedding=embedding)
+        docs = self.doc_index.find(query_doc, search_field='embedding', limit=k).documents
+
+        result = [Document(page_content=doc.text, metadata=doc.metadata or {}) for doc in docs]
+        return result
+
+    def max_marginal_relevance_search(
+        self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any
+    ) -> List[Document]:
+        """Return docs selected using the maximal marginal relevance.
+
+        Maximal marginal relevance optimizes for similarity to query AND diversity
+        among selected documents.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
+
+        Returns:
+            List of Documents selected by maximal marginal relevance.
+        """
+        query_embedding = self.embedding.embed_query(query)
+        query_doc = self.doc_cls(embedding=query_embedding)
+
+        docs = self.doc_index.find(query_doc, search_field='embedding', limit=fetch_k).documents
+
+        # The schema field is called ``embedding``; ``docs.emb`` does not exist.
+        embeddings = list(docs.embedding)
+        mmr_selected = maximal_marginal_relevance(query_embedding, embeddings, k=k)
+        # The selected indices refer to the candidate list handed to MMR, so index ``docs``.
+        results = [
+            Document(page_content=docs[idx].text, metadata=docs[idx].metadata or {})
+            for idx in mmr_selected
+        ]
+        return results
+
diff --git a/langchain/vectorstores/in_memory.py b/langchain/vectorstores/in_memory.py
new file mode 100644
index 00000000000000..a079b10da7887c
--- /dev/null
+++ b/langchain/vectorstores/in_memory.py
@@ -0,0 +1,231 @@
+"""Wrapper around in-memory DocArray store."""
+from __future__ import annotations
+
+from operator import itemgetter
+from typing import List, Optional, Any, Tuple, Iterable, Type
+
+from langchain.embeddings.base import Embeddings
+from langchain.schema import Document
+from langchain.vectorstores import VectorStore
+from langchain.vectorstores.base import VST
+from langchain.vectorstores.utils import maximal_marginal_relevance
+
+from docarray import BaseDoc
+from docarray.typing import NdArray
+
+
+class InMemory(VectorStore):
+    """Wrapper around in-memory storage.
+
+    To use it, you should have the ``docarray`` package with version >=0.30.0 installed.
+    """
+    def __init__(
+        self,
+        texts: List[str],
+        embedding: Embeddings,
+        metadatas: Optional[List[dict]]
+    ) -> None:
+        """Initialize in-memory store.
+
+        Args:
+            texts: Texts to embed and store right away.
+            embedding: Embedding function used for documents and queries.
+            metadatas: Optional list of metadatas, one per text.
+
+        Raises:
+            ValueError: If the installed docarray is older than 0.30.0.
+            ImportError: If docarray cannot be imported.
+        """
+        try:
+            import docarray
+            da_version = docarray.__version__.split('.')
+            # Reject every 0.x release below 0.30, matching the error message.
+            if int(da_version[0]) == 0 and int(da_version[1]) < 30:
+                raise ValueError(
+                    f'To use the InMemory VectorStore the docarray version >=0.30.0 is expected, '
+                    f'received: {docarray.__version__}. '
+                    f'To upgrade, please run: `pip install -U docarray`.'
+                )
+            else:
+                from docarray import DocList
+
+        except ImportError:
+            raise ImportError(
+                "Could not import docarray python package. "
+                "Please install it with `pip install -U docarray`."
+            )
+        if metadatas is None:
+            metadatas = [{} for _ in texts]
+
+        self.embedding = embedding
+        self.doc_cls = self._get_doc_cls()
+        self.docs = DocList[self.doc_cls](
+            [
+                self.doc_cls(
+                    text=t,
+                    embedding=e,
+                    metadata=m,
+                ) for t, m, e in zip(texts, metadatas, self.embedding.embed_documents(texts))
+            ]
+        )
+
+    @staticmethod
+    def _get_doc_cls():
+        """Build the document schema held in ``self.docs``."""
+        class DocArrayDoc(BaseDoc):
+            text: Optional[str]
+            embedding: Optional[NdArray]
+            metadata: Optional[dict]
+
+        return DocArrayDoc
+
+    @classmethod
+    def from_texts(
+        cls: Type[VST],
+        texts: List[str],
+        embedding: Embeddings,
+        metadatas: Optional[List[dict]] = None,
+        **kwargs: Any
+    ) -> InMemory:
+        """Create an InMemory store holding the given texts."""
+        return cls(
+            texts=texts,
+            embedding=embedding,
+            metadatas=metadatas
+        )
+
+    def add_texts(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        **kwargs: Any
+    ) -> List[str]:
+        """Run more texts through the embeddings and add to the vectorstore.
+
+        Args:
+            texts: Iterable of strings to add to the vectorstore.
+            metadatas: Optional list of metadatas associated with the texts.
+
+        Returns:
+            List of ids from adding the texts into the vectorstore.
+        """
+        # Materialize once: a one-shot iterator would otherwise be exhausted by
+        # the first pass and nothing would be embedded or stored.
+        texts = list(texts)
+        if metadatas is None:
+            metadatas = [{} for _ in texts]
+
+        ids = []
+        embeddings = self.embedding.embed_documents(texts)
+        for t, m, e in zip(texts, metadatas, embeddings):
+            doc = self.doc_cls(
+                text=t,
+                embedding=e,
+                metadata=m
+            )
+            self.docs.append(doc)
+            ids.append(doc.id)
+
+        return ids
+
+    def similarity_search_with_score(
+        self, query: str, k: int = 4, **kwargs: Any
+    ) -> List[Tuple[Document, float]]:
+        """Return docs most similar to query.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+
+        Returns:
+            List of Documents most similar to the query and score for each.
+        """
+        from docarray.utils.find import find
+
+        query_embedding = self.embedding.embed_query(query)
+        query_doc = self.doc_cls(embedding=query_embedding)
+        docs, scores = find(index=self.docs, query=query_doc, limit=k, search_field='embedding')
+
+        result = [
+            (Document(page_content=doc.text, metadata=doc.metadata or {}), score)
+            for doc, score in zip(docs, scores)
+        ]
+        return result
+
+    def similarity_search(
+        self, query: str, k: int = 4, **kwargs: Any
+    ) -> List[Document]:
+        """Return docs most similar to query.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+
+        Returns:
+            List of Documents most similar to the query.
+        """
+        results = self.similarity_search_with_score(query, k)
+        return list(map(itemgetter(0), results))
+
+    def _similarity_search_with_relevance_scores(
+        self,
+        query: str,
+        k: int = 4,
+        **kwargs: Any,
+    ) -> List[Tuple[Document, float]]:
+        """Return docs and relevance scores, normalized on a scale from 0 to 1.
+
+        0 is dissimilar, 1 is most similar.
+        """
+        raise NotImplementedError
+
+    def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwargs: Any) -> List[Document]:
+        """Return docs most similar to embedding vector.
+
+        Args:
+            embedding: Embedding to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+
+        Returns:
+            List of Documents most similar to the query vector.
+        """
+        from docarray.utils.find import find
+
+        query_doc = self.doc_cls(embedding=embedding)
+        result_docs = find(index=self.docs, query=query_doc, limit=k, search_field='embedding').documents
+
+        result = [Document(page_content=doc.text, metadata=doc.metadata or {}) for doc in result_docs]
+        return result
+
+    def max_marginal_relevance_search(
+        self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any
+    ) -> List[Document]:
+        """Return docs selected using the maximal marginal relevance.
+
+        Maximal marginal relevance optimizes for similarity to query AND diversity
+        among selected documents.
+
+        Args:
+            query: Text to look up documents similar to.
+            k: Number of Documents to return. Defaults to 4.
+            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
+
+        Returns:
+            List of Documents selected by maximal marginal relevance.
+        """
+        from docarray.utils.find import find
+
+        query_embedding = self.embedding.embed_query(query)
+        query_doc = self.doc_cls(embedding=query_embedding)
+        # Fetch ``fetch_k`` candidates on the embedding field; MMR then picks ``k`` of them.
+        docs = find(index=self.docs, query=query_doc, limit=fetch_k, search_field='embedding').documents
+
+        embeddings = list(docs.embedding)
+        mmr_selected = maximal_marginal_relevance(query_embedding, embeddings, k=k)
+        # The selected indices refer to the candidate list handed to MMR, so index ``docs``.
+        results = [
+            Document(page_content=docs[idx].text, metadata=docs[idx].metadata or {})
+            for idx in mmr_selected
+        ]
+        return results
+
diff --git a/poetry.lock b/poetry.lock
index 1138b9196a6496..fc785b03aebfb9 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry and should not be changed by hand.
 
[[package]] name = "absl-py" @@ -1515,32 +1515,40 @@ wmi = ["wmi (>=1.5.1,<2.0.0)"] [[package]] name = "docarray" -version = "0.21.0" -description = "The data structure for unstructured data" +version = "0.30.0" +description = "The data structure for multimodal data" category = "main" optional = true -python-versions = "*" +python-versions = ">=3.7,<4.0" files = [ - {file = "docarray-0.21.0.tar.gz", hash = "sha256:3c9f605123800c1b0cdf8c458be3fb19c05e9a81f723e51200ef531b02e689ee"}, + {file = "docarray-0.30.0-py3-none-any.whl", hash = "sha256:739dbe06bfee6f1cbc030156036764ca1c75832dcc01a07c724640c6d464651b"}, + {file = "docarray-0.30.0.tar.gz", hash = "sha256:dd73e9ff20485a1d819ac906a59ee0cbc4382e78a5061286e77eb7d7f8b28a8e"}, ] [package.dependencies] -jina-hubble-sdk = ">=0.24.0" -numpy = "*" -rich = ">=12.0.0" - -[package.extras] -annlite = ["annlite"] -benchmark = ["h5py", "matplotlib", "pandas", "seaborn"] -common = ["Pillow", "fastapi", "lz4", "matplotlib", "protobuf (>=3.13.0)", "pydantic (>=1.9.0)", "requests", "uvicorn"] -elasticsearch = ["elasticsearch (>=8.2.0)"] -full = ["Pillow", "av", "fastapi", "grpcio (>=1.46.0,<1.48.1)", "grpcio-health-checking (>=1.46.0,<1.48.1)", "grpcio-reflection (>=1.46.0,<1.48.1)", "ipython", "lz4", "matplotlib", "protobuf (>=3.13.0)", "pydantic (>=1.9.0)", "requests", "scipy", "strawberry-graphql", "trimesh[easy]", "uvicorn"] -milvus = ["pymilvus (>=2.1.0,<2.2.0)"] -opensearch = ["opensearch-py (==2.0.1)"] -qdrant = ["qdrant-client (>=0.10.3,<0.11.0)"] -redis = ["redis (>=4.3.0)"] -test = ["annlite", "black (==22.3.0)", "datasets", "elasticsearch (>=8.2.0)", "jina", "jupyterlab", "mock", "onnx", "onnxruntime", "opensearch-py (==2.0.1)", "paddlepaddle", "protobuf (>=3.13.0,<=3.20.0)", "pymilvus (==2.1.3)", "pytest", "pytest-cov (==3.0.0)", "pytest-custom_exit_code", "pytest-mock", "pytest-mock", "pytest-repeat", "pytest-reraise", "pytest-timeout", "redis (>=4.3.0)", "tensorflow (==2.7.0)", "torch (==1.9.0)", "torchvision 
(==0.10.0)", "transformers (>=4.16.2)", "weaviate-client (>=3.9.0,<3.10.0)"] -weaviate = ["weaviate-client (>=3.9.0,<3.10.0)"] +numpy = ">=1.17.3" +orjson = ">=3.8.2" +pydantic = ">=1.10.2" +rich = ">=13.1.0" +types-requests = ">=2.28.11.6" +typing-inspect = ">=0.8.0" + +[package.extras] +audio = ["pydub (>=0.25.1,<0.26.0)"] +aws = ["smart-open[s3] (>=6.3.0)"] +elasticsearch = ["elastic-transport (>=8.4.0,<9.0.0)", "elasticsearch (>=7.10.1)"] +full = ["av (>=10.0.0)", "lz4 (>=1.0.0)", "pandas (>=1.1.0)", "pillow (>=9.3.0)", "protobuf (>=3.19.0)", "pydub (>=0.25.1,<0.26.0)", "trimesh[easy] (>=3.17.1)", "types-pillow (>=9.3.0.1)"] +hnswlib = ["hnswlib (>=0.6.2)"] +image = ["pillow (>=9.3.0)", "types-pillow (>=9.3.0.1)"] +jac = ["jina-hubble-sdk (>=0.34.0)"] +mesh = ["trimesh[easy] (>=3.17.1)"] +pandas = ["pandas (>=1.1.0)"] +proto = ["lz4 (>=1.0.0)", "protobuf (>=3.19.0)"] +qdrant = ["qdrant-client (>=1.1.4)"] +torch = ["torch (>=1.0.0)"] +video = ["av (>=10.0.0)"] +weaviate = ["weaviate-client (>=3.15)"] +web = ["fastapi (>=0.87.0)"] [[package]] name = "docker" @@ -1740,7 +1748,7 @@ files = [ name = "exceptiongroup" version = "1.1.1" description = "Backport of PEP 654 (exception groups)" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2018,26 +2026,24 @@ files = [ [[package]] name = "google-api-core" -version = "2.11.0" +version = "2.8.2" description = "Google API client core library" category = "main" optional = true -python-versions = ">=3.7" +python-versions = ">=3.6" files = [ - {file = "google-api-core-2.11.0.tar.gz", hash = "sha256:4b9bb5d5a380a0befa0573b302651b8a9a89262c1730e37bf423cec511804c22"}, - {file = "google_api_core-2.11.0-py3-none-any.whl", hash = "sha256:ce222e27b0de0d7bc63eb043b956996d6dccab14cc3b690aaea91c9cc99dc16e"}, + {file = "google-api-core-2.8.2.tar.gz", hash = "sha256:06f7244c640322b508b125903bb5701bebabce8832f85aba9335ec00b3d02edc"}, + {file = "google_api_core-2.8.2-py3-none-any.whl", hash = 
"sha256:93c6a91ccac79079ac6bbf8b74ee75db970cc899278b97d53bc012f35908cf50"}, ] [package.dependencies] -google-auth = ">=2.14.1,<3.0dev" +google-auth = ">=1.25.0,<3.0dev" googleapis-common-protos = ">=1.56.2,<2.0dev" -protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.0 || >4.21.0,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<5.0.0dev" +protobuf = ">=3.15.0,<5.0.0dev" requests = ">=2.18.0,<3.0.0dev" [package.extras] -grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio (>=1.49.1,<2.0dev)", "grpcio-status (>=1.33.2,<2.0dev)", "grpcio-status (>=1.49.1,<2.0dev)"] -grpcgcp = ["grpcio-gcp (>=0.2.2,<1.0dev)"] -grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0dev)"] +grpc = ["grpcio (>=1.33.2,<2.0dev)", "grpcio-status (>=1.33.2,<2.0dev)"] [[package]] name = "google-api-python-client" @@ -2151,21 +2157,21 @@ requests = "*" [[package]] name = "googleapis-common-protos" -version = "1.59.0" +version = "1.56.4" description = "Common protobufs used in Google APIs" category = "main" optional = true python-versions = ">=3.7" files = [ - {file = "googleapis-common-protos-1.59.0.tar.gz", hash = "sha256:4168fcb568a826a52f23510412da405abd93f4d23ba544bb68d943b14ba3cb44"}, - {file = "googleapis_common_protos-1.59.0-py2.py3-none-any.whl", hash = "sha256:b287dc48449d1d41af0c69f4ea26242b5ae4c3d7249a38b0984c86a4caffff1f"}, + {file = "googleapis-common-protos-1.56.4.tar.gz", hash = "sha256:c25873c47279387cfdcbdafa36149887901d36202cb645a0e4f29686bf6e4417"}, + {file = "googleapis_common_protos-1.56.4-py2.py3-none-any.whl", hash = "sha256:8eb2cbc91b69feaf23e32452a7ae60e791e09967d81d4fcc7fc388182d1bd394"}, ] [package.dependencies] -protobuf = ">=3.19.5,<3.20.0 || >3.20.0,<3.20.1 || >3.20.1,<4.21.1 || >4.21.1,<4.21.2 || >4.21.2,<4.21.3 || >4.21.3,<4.21.4 || >4.21.4,<4.21.5 || >4.21.5,<5.0.0dev" +protobuf = ">=3.15.0,<5.0.0dev" [package.extras] -grpc = ["grpcio (>=1.44.0,<2.0.0dev)"] +grpc = ["grpcio (>=1.0.0,<2.0.0dev)"] [[package]] name = "gptcache" 
@@ -2483,7 +2489,7 @@ numpy = ">=1.14.5" name = "hnswlib" version = "0.7.0" description = "hnswlib" -category = "dev" +category = "main" optional = false python-versions = "*" files = [ @@ -2763,7 +2769,7 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -2955,20 +2961,20 @@ testing = ["Django (<3.1)", "attrs", "colorama", "docopt", "pytest (<7.0.0)"] [[package]] name = "jina" -version = "3.15.0" +version = "3.14.1" description = "Build multimodal AI services via cloud native technologies · Neural Search · Generative AI · MLOps" category = "main" optional = true python-versions = "*" files = [ - {file = "jina-3.15.0.tar.gz", hash = "sha256:18a3be8ddca14ed66a554d8480a277bcb7620ebc6ae11352a9835c91865f9d1e"}, + {file = "jina-3.14.1.tar.gz", hash = "sha256:00b1f5995b13c9a49a2287bd534bd32eb8c05706064752035d569e616a15b411"}, ] [package.dependencies] aiofiles = "*" aiohttp = "*" aiostream = "*" -docarray = ">=0.16.4,<0.30.0" +docarray = ">=0.16.4" docker = "*" fastapi = ">=0.76.0" filelock = "*" @@ -3002,14 +3008,14 @@ websockets = "*" aiofiles = ["aiofiles"] aiohttp = ["aiohttp"] aiostream = ["aiostream"] -all = ["Pillow", "aiofiles", "aiohttp", "aiostream", "black (==22.3.0)", "bs4", "coverage (==6.2)", "docarray (>=0.16.4,<0.30.0)", "docker", "fastapi (>=0.76.0)", "filelock", "flaky", "grpcio (>=1.46.0,<1.48.1)", "grpcio-health-checking (>=1.46.0,<1.48.1)", "grpcio-reflection (>=1.46.0,<1.48.1)", "jcloud (>=0.0.35)", "jina-hubble-sdk (>=0.30.4)", "jsonschema", "kubernetes (>=18.20.0)", "mock", "numpy", "opentelemetry-api (>=1.12.0)", "opentelemetry-exporter-otlp (>=1.12.0)", "opentelemetry-exporter-otlp-proto-grpc (>=1.13.0)", "opentelemetry-exporter-prometheus (>=1.12.0rc1)", "opentelemetry-instrumentation-aiohttp-client (>=0.33b0)", 
"opentelemetry-instrumentation-fastapi (>=0.33b0)", "opentelemetry-instrumentation-grpc (>=0.35b0)", "opentelemetry-sdk (>=1.14.0)", "opentelemetry-test-utils (>=0.33b0)", "packaging (>=20.0)", "pathspec", "portforward (>=0.2.4,<0.4.3)", "prometheus-api-client (>=0.5.1)", "prometheus_client (>=0.12.0)", "protobuf (>=3.19.0)", "psutil", "pydantic", "pytest", "pytest-asyncio", "pytest-cov (==3.0.0)", "pytest-custom_exit_code", "pytest-kind (==22.11.1)", "pytest-lazy-fixture", "pytest-mock", "pytest-repeat", "pytest-reraise", "pytest-timeout", "python-multipart", "pyyaml (>=5.3.1)", "requests", "requests-mock", "scipy (>=1.6.1)", "sgqlc", "strawberry-graphql (>=0.96.0)", "tensorflow (>=2.0)", "torch", "uvicorn[standard]", "uvloop", "watchfiles (>=0.18.0)", "websockets"] +all = ["Pillow", "aiofiles", "aiohttp", "aiostream", "black (==22.3.0)", "bs4", "coverage (==6.2)", "docarray (>=0.16.4)", "docker", "fastapi (>=0.76.0)", "filelock", "flaky", "grpcio (>=1.46.0,<1.48.1)", "grpcio-health-checking (>=1.46.0,<1.48.1)", "grpcio-reflection (>=1.46.0,<1.48.1)", "jcloud (>=0.0.35)", "jina-hubble-sdk (>=0.30.4)", "jsonschema", "kubernetes (>=18.20.0)", "mock", "numpy", "opentelemetry-api (>=1.12.0)", "opentelemetry-exporter-otlp (>=1.12.0)", "opentelemetry-exporter-otlp-proto-grpc (>=1.13.0)", "opentelemetry-exporter-prometheus (>=1.12.0rc1)", "opentelemetry-instrumentation-aiohttp-client (>=0.33b0)", "opentelemetry-instrumentation-fastapi (>=0.33b0)", "opentelemetry-instrumentation-grpc (>=0.35b0)", "opentelemetry-sdk (>=1.14.0)", "opentelemetry-test-utils (>=0.33b0)", "packaging (>=20.0)", "pathspec", "portforward (>=0.2.4)", "prometheus-api-client (>=0.5.1)", "prometheus_client (>=0.12.0)", "protobuf (>=3.19.0)", "psutil", "pydantic", "pytest", "pytest-asyncio", "pytest-cov (==3.0.0)", "pytest-custom_exit_code", "pytest-kind (==22.11.1)", "pytest-lazy-fixture", "pytest-mock", "pytest-repeat", "pytest-reraise", "pytest-timeout", "python-multipart", "pyyaml (>=5.3.1)", 
"requests", "requests-mock", "scipy (>=1.6.1)", "sgqlc", "strawberry-graphql (>=0.96.0)", "tensorflow (>=2.0)", "torch", "uvicorn[standard]", "uvloop", "watchfiles (>=0.18.0)", "websockets"] black = ["black (==22.3.0)"] bs4 = ["bs4"] -cicd = ["bs4", "jsonschema", "portforward (>=0.2.4,<0.4.3)", "sgqlc", "strawberry-graphql (>=0.96.0)", "tensorflow (>=2.0)", "torch"] -core = ["aiostream", "docarray (>=0.16.4,<0.30.0)", "grpcio (>=1.46.0,<1.48.1)", "grpcio-health-checking (>=1.46.0,<1.48.1)", "grpcio-reflection (>=1.46.0,<1.48.1)", "jcloud (>=0.0.35)", "jina-hubble-sdk (>=0.30.4)", "numpy", "opentelemetry-api (>=1.12.0)", "opentelemetry-instrumentation-grpc (>=0.35b0)", "packaging (>=20.0)", "protobuf (>=3.19.0)", "pyyaml (>=5.3.1)"] +cicd = ["bs4", "jsonschema", "portforward (>=0.2.4)", "sgqlc", "strawberry-graphql (>=0.96.0)", "tensorflow (>=2.0)", "torch"] +core = ["docarray (>=0.16.4)", "grpcio (>=1.46.0,<1.48.1)", "grpcio-health-checking (>=1.46.0,<1.48.1)", "grpcio-reflection (>=1.46.0,<1.48.1)", "jcloud (>=0.0.35)", "jina-hubble-sdk (>=0.30.4)", "numpy", "opentelemetry-api (>=1.12.0)", "opentelemetry-instrumentation-grpc (>=0.35b0)", "packaging (>=20.0)", "protobuf (>=3.19.0)", "pyyaml (>=5.3.1)"] coverage = ["coverage (==6.2)"] -devel = ["aiofiles", "aiohttp", "docker", "fastapi (>=0.76.0)", "filelock", "opentelemetry-exporter-otlp (>=1.12.0)", "opentelemetry-exporter-otlp-proto-grpc (>=1.13.0)", "opentelemetry-exporter-prometheus (>=1.12.0rc1)", "opentelemetry-instrumentation-aiohttp-client (>=0.33b0)", "opentelemetry-instrumentation-fastapi (>=0.33b0)", "opentelemetry-sdk (>=1.14.0)", "pathspec", "prometheus_client (>=0.12.0)", "pydantic", "python-multipart", "requests", "sgqlc", "strawberry-graphql (>=0.96.0)", "uvicorn[standard]", "uvloop", "watchfiles (>=0.18.0)", "websockets"] -docarray = ["docarray (>=0.16.4,<0.30.0)"] +devel = ["aiofiles", "aiohttp", "aiostream", "docker", "fastapi (>=0.76.0)", "filelock", "opentelemetry-exporter-otlp (>=1.12.0)", 
"opentelemetry-exporter-otlp-proto-grpc (>=1.13.0)", "opentelemetry-exporter-prometheus (>=1.12.0rc1)", "opentelemetry-instrumentation-aiohttp-client (>=0.33b0)", "opentelemetry-instrumentation-fastapi (>=0.33b0)", "opentelemetry-sdk (>=1.14.0)", "pathspec", "prometheus_client (>=0.12.0)", "pydantic", "python-multipart", "requests", "sgqlc", "strawberry-graphql (>=0.96.0)", "uvicorn[standard]", "uvloop", "watchfiles (>=0.18.0)", "websockets"] +docarray = ["docarray (>=0.16.4)"] docker = ["docker"] fastapi = ["fastapi (>=0.76.0)"] filelock = ["filelock"] @@ -3036,7 +3042,7 @@ packaging = ["packaging (>=20.0)"] pathspec = ["pathspec"] perf = ["opentelemetry-exporter-otlp (>=1.12.0)", "opentelemetry-exporter-otlp-proto-grpc (>=1.13.0)", "opentelemetry-exporter-prometheus (>=1.12.0rc1)", "opentelemetry-instrumentation-aiohttp-client (>=0.33b0)", "opentelemetry-instrumentation-fastapi (>=0.33b0)", "opentelemetry-sdk (>=1.14.0)", "prometheus_client (>=0.12.0)", "uvloop"] pillow = ["Pillow"] -portforward = ["portforward (>=0.2.4,<0.4.3)"] +portforward = ["portforward (>=0.2.4)"] prometheus-api-client = ["prometheus-api-client (>=0.5.1)"] prometheus-client = ["prometheus_client (>=0.12.0)"] protobuf = ["protobuf (>=3.19.0)"] @@ -3058,7 +3064,7 @@ requests = ["requests"] requests-mock = ["requests-mock"] scipy = ["scipy (>=1.6.1)"] sgqlc = ["sgqlc"] -standard = ["aiofiles", "aiohttp", "docker", "fastapi (>=0.76.0)", "filelock", "opentelemetry-exporter-otlp (>=1.12.0)", "opentelemetry-exporter-prometheus (>=1.12.0rc1)", "opentelemetry-instrumentation-aiohttp-client (>=0.33b0)", "opentelemetry-instrumentation-fastapi (>=0.33b0)", "opentelemetry-sdk (>=1.14.0)", "pathspec", "prometheus_client (>=0.12.0)", "pydantic", "python-multipart", "requests", "uvicorn[standard]", "uvloop", "websockets"] +standard = ["aiofiles", "aiohttp", "aiostream", "docker", "fastapi (>=0.76.0)", "filelock", "opentelemetry-exporter-otlp (>=1.12.0)", "opentelemetry-exporter-prometheus (>=1.12.0rc1)", 
"opentelemetry-instrumentation-aiohttp-client (>=0.33b0)", "opentelemetry-instrumentation-fastapi (>=0.33b0)", "opentelemetry-sdk (>=1.14.0)", "pathspec", "prometheus_client (>=0.12.0)", "pydantic", "python-multipart", "requests", "uvicorn[standard]", "uvloop", "websockets"] standrad = ["opentelemetry-exporter-otlp-proto-grpc (>=1.13.0)"] strawberry-graphql = ["strawberry-graphql (>=0.96.0)"] tensorflow = ["tensorflow (>=2.0)"] @@ -4991,6 +4997,72 @@ numpy = ">=1.7" docs = ["numpydoc", "sphinx (==1.2.3)", "sphinx-rtd-theme", "sphinxcontrib-napoleon"] tests = ["pytest", "pytest-cov", "pytest-pep8"] +[[package]] +name = "orjson" +version = "3.8.10" +description = "Fast, correct Python JSON library supporting dataclasses, datetimes, and numpy" +category = "main" +optional = true +python-versions = ">= 3.7" +files = [ + {file = "orjson-3.8.10-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:4dfe0651e26492d5d929bbf4322de9afbd1c51ac2e3947a7f78492b20359711d"}, + {file = "orjson-3.8.10-cp310-cp310-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:bc30de5c7b3a402eb59cc0656b8ee53ca36322fc52ab67739c92635174f88336"}, + {file = "orjson-3.8.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c08b426fae7b9577b528f99af0f7e0ff3ce46858dd9a7d1bf86d30f18df89a4c"}, + {file = "orjson-3.8.10-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bce970f293825e008dbf739268dfa41dfe583aa2a1b5ef4efe53a0e92e9671ea"}, + {file = "orjson-3.8.10-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9b23fb0264bbdd7218aa685cb6fc71f0dcecf34182f0a8596a3a0dff010c06f9"}, + {file = "orjson-3.8.10-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0826ad2dc1cea1547edff14ce580374f0061d853cbac088c71162dbfe2e52205"}, + {file = "orjson-3.8.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:a7bce6e61cea6426309259b04c6ee2295b3f823ea51a033749459fe2dd0423b2"}, + {file = "orjson-3.8.10-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:0b470d31244a6f647e5402aac7d2abaf7bb4f52379acf67722a09d35a45c9417"}, + {file = "orjson-3.8.10-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:48824649019a25d3e52f6454435cf19fe1eb3d05ee697e65d257f58ae3aa94d9"}, + {file = "orjson-3.8.10-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:faee89e885796a9cc493c930013fa5cfcec9bfaee431ddf00f0fbfb57166a8b3"}, + {file = "orjson-3.8.10-cp310-none-win_amd64.whl", hash = "sha256:3cfe32b1227fe029a5ad989fbec0b453a34e5e6d9a977723f7c3046d062d3537"}, + {file = "orjson-3.8.10-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:2073b62822738d6740bd2492f6035af5c2fd34aa198322b803dc0e70559a17b7"}, + {file = "orjson-3.8.10-cp311-cp311-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:b2c4faf20b6bb5a2d7ac0c16f58eb1a3800abcef188c011296d1dc2bb2224d48"}, + {file = "orjson-3.8.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c1825997232a324911d11c75d91e1e0338c7b723c149cf53a5fc24496c048a4"}, + {file = "orjson-3.8.10-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f7e85d4682f3ed7321d36846cad0503e944ea9579ef435d4c162e1b73ead8ac9"}, + {file = "orjson-3.8.10-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b8cdaacecb92997916603ab232bb096d0fa9e56b418ca956b9754187d65ca06"}, + {file = "orjson-3.8.10-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ddabc5e44702d13137949adee3c60b7091e73a664f6e07c7b428eebb2dea7bbf"}, + {file = "orjson-3.8.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27bb26e171e9cfdbec39c7ca4739b6bef8bd06c293d56d92d5e3a3fc017df17d"}, + {file = "orjson-3.8.10-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:1810e5446fe68d61732e9743592da0ec807e63972eef076d09e02878c2f5958e"}, + {file = 
"orjson-3.8.10-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:61e2e51cefe7ef90c4fbbc9fd38ecc091575a3ea7751d56fad95cbebeae2a054"}, + {file = "orjson-3.8.10-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f3e9ac9483c2b4cd794e760316966b7bd1e6afb52b0218f068a4e80c9b2db4f6"}, + {file = "orjson-3.8.10-cp311-none-win_amd64.whl", hash = "sha256:26aee557cf8c93b2a971b5a4a8e3cca19780573531493ce6573aa1002f5c4378"}, + {file = "orjson-3.8.10-cp37-cp37m-macosx_10_7_x86_64.whl", hash = "sha256:11ae68f995a50724032af297c92f20bcde31005e0bf3653b12bff9356394615b"}, + {file = "orjson-3.8.10-cp37-cp37m-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:35d879b46b8029e1e01e9f6067928b470a4efa1ca749b6d053232b873c2dcf66"}, + {file = "orjson-3.8.10-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:345e41abd1d9e3ecfb554e1e75ff818cf42e268bd06ad25a96c34e00f73a327e"}, + {file = "orjson-3.8.10-cp37-cp37m-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:45a5afc9cda6b8aac066dd50d8194432fbc33e71f7164f95402999b725232d78"}, + {file = "orjson-3.8.10-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad632dc330a7b39da42530c8d146f76f727d476c01b719dc6743c2b5701aaf6b"}, + {file = "orjson-3.8.10-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4bf2556ba99292c4dc550560384dd22e88b5cdbe6d98fb4e202e902b5775cf9f"}, + {file = "orjson-3.8.10-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b88afd662190f19c3bb5036a903589f88b1d2c2608fbb97281ce000db6b08897"}, + {file = "orjson-3.8.10-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:abce8d319aae800fd2d774db1106f926dee0e8a5ca85998fd76391fcb58ef94f"}, + {file = "orjson-3.8.10-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:e999abca892accada083f7079612307d94dd14cc105a699588a324f843216509"}, + {file = "orjson-3.8.10-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = 
"sha256:a3fdee68c4bb3c5d6f89ed4560f1384b5d6260e48fbf868bae1a245a3c693d4d"}, + {file = "orjson-3.8.10-cp37-none-win_amd64.whl", hash = "sha256:e5d7f82506212e047b184c06e4bcd48c1483e101969013623cebcf51cf12cad9"}, + {file = "orjson-3.8.10-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:d953e6c2087dcd990e794f8405011369ee11cf13e9aaae3172ee762ee63947f2"}, + {file = "orjson-3.8.10-cp38-cp38-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:81aa3f321d201bff0bd0f4014ea44e51d58a9a02d8f2b0eeab2cee22611be8e1"}, + {file = "orjson-3.8.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7d27b6182f75896dd8c10ea0f78b9265a3454be72d00632b97f84d7031900dd4"}, + {file = "orjson-3.8.10-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1486600bc1dd1db26c588dd482689edba3d72d301accbe4301db4b2b28bd7aa4"}, + {file = "orjson-3.8.10-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:344ea91c556a2ce6423dc13401b83ab0392aa697a97fa4142c2c63a6fd0bbfef"}, + {file = "orjson-3.8.10-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:979f231e3bad1c835627eef1a30db12a8af58bfb475a6758868ea7e81897211f"}, + {file = "orjson-3.8.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fa3a26dcf0f5f2912a8ce8e87273e68b2a9526854d19fd09ea671b154418e88"}, + {file = "orjson-3.8.10-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:b6e79d8864794635974b18821b49a7f27859d17b93413d4603efadf2e92da7a5"}, + {file = "orjson-3.8.10-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ce49999bcbbc14791c61844bc8a69af44f5205d219be540e074660038adae6bf"}, + {file = "orjson-3.8.10-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c2ef690335b24f9272dbf6639353c1ffc3f196623a92b851063e28e9515cf7dd"}, + {file = "orjson-3.8.10-cp38-none-win_amd64.whl", hash = "sha256:5a0b1f4e4fa75e26f814161196e365fc0e1a16e3c07428154505b680a17df02f"}, + {file = 
"orjson-3.8.10-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:af7601a78b99f0515af2f8ab12c955c0072ffcc1e437fb2556f4465783a4d813"}, + {file = "orjson-3.8.10-cp39-cp39-macosx_10_9_x86_64.macosx_11_0_arm64.macosx_10_9_universal2.whl", hash = "sha256:6bbd7b3a3e2030b03c68c4d4b19a2ef5b89081cbb43c05fe2010767ef5e408db"}, + {file = "orjson-3.8.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4355c9aedfefe60904e8bd7901315ebbc8bb828f665e4c9bc94b1432e67cb6f7"}, + {file = "orjson-3.8.10-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b7b0ba074375e25c1594e770e2215941e2017c3cd121889150737fa1123e8bfe"}, + {file = "orjson-3.8.10-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:34b6901c110c06ab9e8d7d0496db4bc9a0c162ca8d77f67539d22cb39e0a1ef4"}, + {file = "orjson-3.8.10-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cb62ec16a1c26ad9487727b529103cb6a94a1d4969d5b32dd0eab5c3f4f5a6f2"}, + {file = "orjson-3.8.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:595e1e7d04aaaa3d41113e4eb9f765ab642173c4001182684ae9ddc621bb11c8"}, + {file = "orjson-3.8.10-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:64ffd92328473a2f9af059410bd10c703206a4bbc7b70abb1bedcd8761e39eb8"}, + {file = "orjson-3.8.10-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b1f648ec89c6a426098868460c0ef8c86b457ce1378d7569ff4acb6c0c454048"}, + {file = "orjson-3.8.10-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:6a286ad379972e4f46579e772f0477e6b505f1823aabcd64ef097dbb4549e1a4"}, + {file = "orjson-3.8.10-cp39-none-win_amd64.whl", hash = "sha256:d2874cee6856d7c386b596e50bc517d1973d73dc40b2bd6abec057b5e7c76b2f"}, + {file = "orjson-3.8.10.tar.gz", hash = "sha256:dcf6adb4471b69875034afab51a14b64f1026bc968175a2bb02c5f6b358bd413"}, +] + [[package]] name = "packaging" version = "23.1" @@ -5373,7 +5445,7 @@ typing-extensions = {version = "*", markers = "python_version <= \"3.8\""} name = 
"pluggy" version = "1.0.0" description = "plugin and hook calling mechanisms for python" -category = "dev" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -5546,37 +5618,36 @@ requests = "*" [[package]] name = "protobuf" -version = "3.19.6" +version = "3.19.0" description = "Protocol Buffers" category = "main" optional = true python-versions = ">=3.5" files = [ - {file = "protobuf-3.19.6-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:010be24d5a44be7b0613750ab40bc8b8cedc796db468eae6c779b395f50d1fa1"}, - {file = "protobuf-3.19.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11478547958c2dfea921920617eb457bc26867b0d1aa065ab05f35080c5d9eb6"}, - {file = "protobuf-3.19.6-cp310-cp310-win32.whl", hash = "sha256:559670e006e3173308c9254d63facb2c03865818f22204037ab76f7a0ff70b5f"}, - {file = "protobuf-3.19.6-cp310-cp310-win_amd64.whl", hash = "sha256:347b393d4dd06fb93a77620781e11c058b3b0a5289262f094379ada2920a3730"}, - {file = "protobuf-3.19.6-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:a8ce5ae0de28b51dff886fb922012dad885e66176663950cb2344c0439ecb473"}, - {file = "protobuf-3.19.6-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:90b0d02163c4e67279ddb6dc25e063db0130fc299aefabb5d481053509fae5c8"}, - {file = "protobuf-3.19.6-cp36-cp36m-win32.whl", hash = "sha256:30f5370d50295b246eaa0296533403961f7e64b03ea12265d6dfce3a391d8992"}, - {file = "protobuf-3.19.6-cp36-cp36m-win_amd64.whl", hash = "sha256:0c0714b025ec057b5a7600cb66ce7c693815f897cfda6d6efb58201c472e3437"}, - {file = "protobuf-3.19.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5057c64052a1f1dd7d4450e9aac25af6bf36cfbfb3a1cd89d16393a036c49157"}, - {file = "protobuf-3.19.6-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:bb6776bd18f01ffe9920e78e03a8676530a5d6c5911934c6a1ac6eb78973ecb6"}, - {file = "protobuf-3.19.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:84a04134866861b11556a82dd91ea6daf1f4925746b992f277b84013a7cc1229"}, - {file = "protobuf-3.19.6-cp37-cp37m-win32.whl", hash = "sha256:4bc98de3cdccfb5cd769620d5785b92c662b6bfad03a202b83799b6ed3fa1fa7"}, - {file = "protobuf-3.19.6-cp37-cp37m-win_amd64.whl", hash = "sha256:aa3b82ca1f24ab5326dcf4ea00fcbda703e986b22f3d27541654f749564d778b"}, - {file = "protobuf-3.19.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:2b2d2913bcda0e0ec9a784d194bc490f5dc3d9d71d322d070b11a0ade32ff6ba"}, - {file = "protobuf-3.19.6-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:d0b635cefebd7a8a0f92020562dead912f81f401af7e71f16bf9506ff3bdbb38"}, - {file = "protobuf-3.19.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a552af4dc34793803f4e735aabe97ffc45962dfd3a237bdde242bff5a3de684"}, - {file = "protobuf-3.19.6-cp38-cp38-win32.whl", hash = "sha256:0469bc66160180165e4e29de7f445e57a34ab68f49357392c5b2f54c656ab25e"}, - {file = "protobuf-3.19.6-cp38-cp38-win_amd64.whl", hash = "sha256:91d5f1e139ff92c37e0ff07f391101df77e55ebb97f46bbc1535298d72019462"}, - {file = "protobuf-3.19.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:c0ccd3f940fe7f3b35a261b1dd1b4fc850c8fde9f74207015431f174be5976b3"}, - {file = "protobuf-3.19.6-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:30a15015d86b9c3b8d6bf78d5b8c7749f2512c29f168ca259c9d7727604d0e39"}, - {file = "protobuf-3.19.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:878b4cd080a21ddda6ac6d1e163403ec6eea2e206cf225982ae04567d39be7b0"}, - {file = "protobuf-3.19.6-cp39-cp39-win32.whl", hash = "sha256:5a0d7539a1b1fb7e76bf5faa0b44b30f812758e989e59c40f77a7dab320e79b9"}, - {file = "protobuf-3.19.6-cp39-cp39-win_amd64.whl", hash = "sha256:bbf5cea5048272e1c60d235c7bd12ce1b14b8a16e76917f371c718bd3005f045"}, - {file = "protobuf-3.19.6-py2.py3-none-any.whl", hash = "sha256:14082457dc02be946f60b15aad35e9f5c69e738f80ebbc0900a19bc83734a5a4"}, - {file = "protobuf-3.19.6.tar.gz", hash = 
"sha256:5f5540d57a43042389e87661c6eaa50f47c19c6176e8cf1c4f287aeefeccb5c4"}, + {file = "protobuf-3.19.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:01a0645ef3acddfbc90237e1cdfae1086130fc7cb480b5874656193afd657083"}, + {file = "protobuf-3.19.0-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:d3861c9721a90ba83ee0936a9cfcc4fa1c4b4144ac9658fb6f6343b38558e9b4"}, + {file = "protobuf-3.19.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b64be5d7270cf5e76375bac049846e8a9543a2d4368b69afe78ab725380a7487"}, + {file = "protobuf-3.19.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:2f6046b9e2feee0dce994493186e8715b4392ed5f50f356280ad9c2f9f93080a"}, + {file = "protobuf-3.19.0-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac2f8ec942d414609aba0331952ae12bb823e8f424bbb6b8c422f1cef32dc842"}, + {file = "protobuf-3.19.0-cp36-cp36m-win32.whl", hash = "sha256:3fea09aa04ef2f8b01fcc9bb87f19509934f8a35d177c865b8f9ee5c32b60c1b"}, + {file = "protobuf-3.19.0-cp36-cp36m-win_amd64.whl", hash = "sha256:d1f4277d321f60456845ca9b882c4845736f1f5c1c69eb778eba22a97977d8af"}, + {file = "protobuf-3.19.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8488c2276f14f294e890cc1260ab342a13e90cd20dcc03319d2eea258f1fd321"}, + {file = "protobuf-3.19.0-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:36bf292f44966c67080e535321501717f4f1eba30faef8f2cd4b0c745a027211"}, + {file = "protobuf-3.19.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c99af73ae34c93e0e2ace57ea2e70243f34fc015c8c23fd39ee93652e726f7e7"}, + {file = "protobuf-3.19.0-cp37-cp37m-win32.whl", hash = "sha256:f7a031cf8e2fc14acc0ba694f6dff0a01e06b70d817eba6edc72ee6cc20517ac"}, + {file = "protobuf-3.19.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d4ca5f0c7bc8d2e6966ca3bbd85e9ebe7191b6e21f067896d4af6b28ecff29fe"}, + {file = "protobuf-3.19.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:9a8a880593015ef2c83f7af797fa4fbf583b2c98b4bd94e46c5b61fee319d84b"}, + {file = "protobuf-3.19.0-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:6f16925f5c977dd7787973a50c242e60c22b1d1182aba6bec7bd02862579c10f"}, + {file = "protobuf-3.19.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f9097327d277b0aa4a3224e61cd6850aef3269172397715299bcffc9f90293c9"}, + {file = "protobuf-3.19.0-cp38-cp38-win32.whl", hash = "sha256:708d04394a63ee9bdc797938b6e15ed5bf24a1cb37743eb3886fd74a5a67a234"}, + {file = "protobuf-3.19.0-cp38-cp38-win_amd64.whl", hash = "sha256:ee4d07d596357f51316b6ecf1cc1927660e9d5e418385bb1c51fd2496cd9bee7"}, + {file = "protobuf-3.19.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:34a77b8fafdeb8f89fee2b7108ae60d8958d72e33478680cc1e05517892ecc46"}, + {file = "protobuf-3.19.0-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:4f93e0f6af796ddd1502225ff8ea25340ced186ca05b601c44d5c88b45ba80a0"}, + {file = "protobuf-3.19.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:942dd6bc8bd2a3c6a156d8ab0f80bd45313f22b78e1176283270054dcc8ca4c2"}, + {file = "protobuf-3.19.0-cp39-cp39-win32.whl", hash = "sha256:7b3867795708ac88fde8d6f34f0d9a50af56087e41f624bdb2e9ff808ea5dda7"}, + {file = "protobuf-3.19.0-cp39-cp39-win_amd64.whl", hash = "sha256:a74432e9d28a6072a2359a0f49f81eb14dd718e7dbbfb6c0789b456c49e1f130"}, + {file = "protobuf-3.19.0-py2.py3-none-any.whl", hash = "sha256:c96e94d3e523a82caa3e5f74b35dd1c4884199358d01c950d95c341255ff48bc"}, + {file = "protobuf-3.19.0.tar.gz", hash = "sha256:6a1dc6584d24ef86f5b104bcad64fa0fe06ed36e5687f426e0445d363a041d18"}, ] [[package]] @@ -6105,7 +6176,7 @@ Pillow = ">=8.0.0" name = "pytest" version = "7.3.1" description = "pytest: simple powerful testing with Python" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -7507,7 +7578,7 @@ files = [ ] [package.dependencies] -greenlet = {version = "!=0.4.17", markers = "python_version 
>= \"3\" and platform_machine == \"aarch64\" or python_version >= \"3\" and platform_machine == \"ppc64le\" or python_version >= \"3\" and platform_machine == \"x86_64\" or python_version >= \"3\" and platform_machine == \"amd64\" or python_version >= \"3\" and platform_machine == \"AMD64\" or python_version >= \"3\" and platform_machine == \"win32\" or python_version >= \"3\" and platform_machine == \"WIN32\""} +greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"} [package.extras] aiomysql = ["aiomysql", "greenlet (!=0.4.17)"] @@ -7759,18 +7830,18 @@ files = [ [[package]] name = "tensorflow-hub" -version = "0.13.0" +version = "0.12.0" description = "TensorFlow Hub is a library to foster the publication, discovery, and consumption of reusable parts of machine learning models." 
category = "main" optional = true python-versions = "*" files = [ - {file = "tensorflow_hub-0.13.0-py2.py3-none-any.whl", hash = "sha256:3544f4fd9fd99e4eeb6da1b5b5320e4a2dbdef7f9bb778f66f76d6790f32dd65"}, + {file = "tensorflow_hub-0.12.0-py2.py3-none-any.whl", hash = "sha256:822fe5f7338c95efcc3a534011c6689e4309ba2459def87194179c4de8a6e1fc"}, ] [package.dependencies] numpy = ">=1.12.0" -protobuf = ">=3.19.6" +protobuf = ">=3.8.0" [package.extras] make-image-classifier = ["keras-preprocessing[image]"] @@ -8132,7 +8203,7 @@ files = [ name = "tomli" version = "2.0.1" description = "A lil' TOML parser" -category = "dev" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -8414,7 +8485,7 @@ types-pyOpenSSL = "*" name = "types-requests" version = "2.28.11.17" description = "Typing stubs for requests" -category = "dev" +category = "main" optional = false python-versions = "*" files = [ @@ -8441,7 +8512,7 @@ files = [ name = "types-urllib3" version = "1.26.25.10" description = "Typing stubs for urllib3" -category = "dev" +category = "main" optional = false python-versions = "*" files = [ @@ -9267,13 +9338,15 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [extras] -all = ["aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"] +all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", 
"jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "docarray", "protobuf"] cohere = ["cohere"] -llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"] +docarray = ["docarray", "protobuf"] +embeddings = ["sentence-transformers"] +llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"] openai = ["openai"] qdrant = ["qdrant-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "ab6ea1c53c7a6e792d5bdcf8865b87e5dcfe4c89080c18b356dc4ed8a17cc3a3" +content-hash = "81e7b09595d12739f056c5f5d34021ad7e3f855a8da711d3ccc23aab72cfbd83" diff --git a/pyproject.toml b/pyproject.toml index 0eec46451897ea..61406f1db2e0c7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,6 +69,10 @@ pytesseract = {version = "^0.3.10", optional=true} html2text = {version="^2020.1.16", optional=true} numexpr = "^2.8.4" duckduckgo-search = {version="^2.8.6", optional=true} +docarray = {version="^0.30.0", optional=true} +protobuf = {version="3.19", optional=true} +hnswlib = {version="^0.7.0", optional=true} +pytest = "^7.3.1" [tool.poetry.group.docs.dependencies] @@ -145,8 +149,9 @@ llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifes qdrant = ["qdrant-client"] openai = ["openai"] cohere = ["cohere"] +docarray = ["docarray", "protobuf"] embeddings = ["sentence-transformers"] -all = 
["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect"] +all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "docarray", "protobuf"] [tool.ruff] select = [ diff --git a/tests/integration_tests/vectorstores/test_hnsw_lib.py b/tests/integration_tests/vectorstores/test_hnsw_lib.py new file mode 100644 index 00000000000000..7aa3481cf19e76 --- /dev/null +++ b/tests/integration_tests/vectorstores/test_hnsw_lib.py @@ -0,0 +1,54 @@ +import pytest + +from langchain.schema import Document +from langchain.vectorstores.hnsw_lib import HnswLib +from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings + + +def test_docarray_hnswlib_vec_store_init(tmp_path) -> None: + """Test end to end construction and simple similarity search.""" + texts = 
["foo", "bar", "baz"] + docsearch = HnswLib.from_texts( + texts, + FakeEmbeddings(), + work_dir=str(tmp_path), + n_dim=10, + sim_metric='cosine', + ) + assert isinstance(docsearch, HnswLib) + + +@pytest.fixture +def docarray_vec_store(tmp_path): + texts = ["foo", "bar", "baz"] + docsearch = HnswLib.from_texts( + texts, + FakeEmbeddings(), + work_dir=str(tmp_path), + n_dim=10, + ) + return docsearch + + +def test_sim_search(docarray_vec_store) -> None: + """Test end to end construction and simple similarity search.""" + + output = docarray_vec_store.similarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + +def test_sim_search_with_score(docarray_vec_store) -> None: + """Test end to end construction and similarity search with score.""" + + output = docarray_vec_store.similarity_search_with_score("foo", k=1) + assert output == [(Document(page_content="foo"), 1.0)] + + +def test_sim_search_by_vector(docarray_vec_store): + """Test end to end construction and similarity search by vector.""" + embedding = [1.0] * 10 + output = docarray_vec_store.similarity_search_by_vector(embedding, k=1) + + assert output == [Document(page_content="bar")] + + diff --git a/tests/integration_tests/vectorstores/test_in_memory.py b/tests/integration_tests/vectorstores/test_in_memory.py new file mode 100644 index 00000000000000..79458727310a86 --- /dev/null +++ b/tests/integration_tests/vectorstores/test_in_memory.py @@ -0,0 +1,48 @@ +import pytest + +from langchain.schema import Document +from langchain.vectorstores.in_memory import InMemory +from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings + + +def test_docarray_vec_store_init() -> None: + """Test end to end construction and simple similarity search.""" + texts = ["foo", "bar", "baz"] + docsearch = InMemory.from_texts( + texts, + FakeEmbeddings(), + ) + assert isinstance(docsearch, InMemory) + + +@pytest.fixture +def docarray_vec_store(): + texts = ["foo", "bar", "baz"] + docsearch = 
InMemory.from_texts( + texts, + FakeEmbeddings(), + ) + return docsearch + + +def test_sim_search(docarray_vec_store) -> None: + """Test end to end construction and simple similarity search.""" + + output = docarray_vec_store.similarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + +def test_sim_search_with_score(docarray_vec_store) -> None: + """Test end to end construction and similarity search with score.""" + + output = docarray_vec_store.similarity_search_with_score("foo", k=1) + assert output == [(Document(page_content="foo"), 1.0)] + + +def test_sim_search_by_vector(docarray_vec_store): + """Test end to end construction and similarity search by vector.""" + embedding = [1.0] * 10 + output = docarray_vec_store.similarity_search_by_vector(embedding, k=1) + + assert output == [Document(page_content="bar")] + From b687fd487f596818f0fe9e7230712a0ca0da7ad5 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 27 Apr 2023 12:22:02 +0200 Subject: [PATCH 02/19] refactor: use abtract VecStoreFromDocIndex for in memory and hnswlib implementation Signed-off-by: anna-charlotte --- langchain/vectorstores/hnsw_lib.py | 204 ++----------- langchain/vectorstores/in_memory.py | 273 +++++++++--------- .../vector_store_from_doc_index.py | 186 ++++++++++++ .../vectorstores/test_hnsw_lib.py | 73 +++-- .../vectorstores/test_in_memory.py | 63 +++- 5 files changed, 448 insertions(+), 351 deletions(-) create mode 100644 langchain/vectorstores/vector_store_from_doc_index.py diff --git a/langchain/vectorstores/hnsw_lib.py b/langchain/vectorstores/hnsw_lib.py index 6974133c9891b5..51c85423ac2b7d 100644 --- a/langchain/vectorstores/hnsw_lib.py +++ b/langchain/vectorstores/hnsw_lib.py @@ -1,52 +1,44 @@ """Wrapper around in-memory DocArray store.""" from __future__ import annotations -from operator import itemgetter from typing import List, Optional, Any, Tuple, Iterable, Type, Callable, Sequence, TYPE_CHECKING +from docarray.typing import NdArray from 
langchain.embeddings.base import Embeddings -from langchain.schema import Document -from langchain.vectorstores import VectorStore from langchain.vectorstores.base import VST -from langchain.vectorstores.utils import maximal_marginal_relevance - -from docarray import BaseDoc -from docarray.typing import NdArray +from langchain.vectorstores.vector_store_from_doc_index import VecStoreFromDocIndex, _check_docarray_import -class HnswLib(VectorStore): +class HnswLib(VecStoreFromDocIndex): """Wrapper around HnswLib storage. - To use it, you should have the ``docarray`` package with version >=0.30.0 installed. + To use it, you should have the ``docarray`` package with version >=0.31.0 installed. """ def __init__( self, - work_dir: str, - n_dim: int, texts: List[str], embedding: Embeddings, + work_dir: str, + n_dim: int, metadatas: Optional[List[dict]], - sim_metric: str = 'cosine', - kwargs: dict = None + dist_metric: str = 'cosine', + **kwargs, ) -> None: - """Initialize HnswLib store.""" - try: - import docarray - da_version = docarray.__version__.split('.') - if int(da_version[0]) == 0 and int(da_version[1]) <= 21: - raise ValueError( - f'To use the HnswLib VectorStore the docarray version >=0.30.0 is expected, ' - f'received: {docarray.__version__}.' - f'To upgrade, please run: `pip install -U docarray`.' - ) - else: - from docarray import DocList - from docarray.index import HnswDocumentIndex - except ImportError: - raise ImportError( - "Could not import docarray python package. " - "Please install it with `pip install -U docarray`." - ) + """Initialize HnswLib store. + + Args: + texts (List[str]): Text data. + embedding (Embeddings): Embedding function. + metadatas (Optional[List[dict]]): Metadata for each text if it exists. + Defaults to None. + work_dir (str): path to the location where all the data will be stored. + n_dim (int): dimension of an embedding. + dist_metric (str): Distance metric for HnswLib can be one of: 'cosine', + 'ip', and 'l2'. 
Defaults to 'cosine'. + """ + _check_docarray_import() + from docarray.index import HnswDocumentIndex + try: import google.protobuf except ImportError: @@ -55,27 +47,13 @@ def __init__( "Please install it with `pip install -U protobuf`." ) - if metadatas is None: - metadatas = [{} for _ in range(len(texts))] - - self.embedding = embedding - - self.doc_cls = self._get_doc_cls(n_dim, sim_metric) - self.doc_index = HnswDocumentIndex[self.doc_cls](work_dir=work_dir) - embeddings = self.embedding.embed_documents(texts) - docs = DocList[self.doc_cls]( - [ - self.doc_cls( - text=t, - embedding=e, - metadata=m, - ) for t, m, e in zip(texts, metadatas, embeddings) - ] - ) - self.doc_index.index(docs) + doc_cls = self._get_doc_cls(n_dim, dist_metric) + doc_index = HnswDocumentIndex[doc_cls](work_dir=work_dir) + super().__init__(doc_index, texts, embedding, metadatas) @staticmethod def _get_doc_cls(n_dim: int, sim_metric: str): + from docarray import BaseDoc from pydantic import Field class DocArrayDoc(BaseDoc): @@ -93,6 +71,7 @@ def from_texts( metadatas: Optional[List[dict]] = None, work_dir: str = None, n_dim: int = None, + dist_metric: str = 'cosine', **kwargs: Any ) -> HnswLib: @@ -107,129 +86,6 @@ def from_texts( texts=texts, embedding=embedding, metadatas=metadatas, - kwargs=kwargs + dist_metric=dist_metric, + kwargs=kwargs, ) - - def add_texts( - self, - texts: Iterable[str], - metadatas: Optional[List[dict]] = None, - **kwargs: Any - ) -> List[str]: - """Run more texts through the embeddings and add to the vectorstore. - - Args: - texts: Iterable of strings to add to the vectorstore. - metadatas: Optional list of metadatas associated with the texts. - - Returns: - List of ids from adding the texts into the vectorstore. 
- """ - if metadatas is None: - metadatas = [{} for _ in range(len(list(texts)))] - - ids = [] - embeddings = self.embedding.embed_documents(texts) - for t, m, e in zip(texts, metadatas, embeddings): - doc = self.doc_cls( - text=t, - embedding=e, - metadata=m - ) - self.doc_index.index(doc) - ids.append(doc.id) # TODO return index of self.docs ? - - return ids - - def similarity_search_with_score( - self, query: str, k: int = 4, **kwargs: Any - ) -> List[Tuple[Document, float]]: - """Return docs most similar to query. - - Args: - query: Text to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - - Returns: - List of Documents most similar to the query and score for each. - """ - query_embedding = self.embedding.embed_query(query) - query_embedding = [1., 1., 1., 1., 1., 1., 1., 1., 1., 0.] - print(f"query_embedding = {query_embedding}") - query_doc = self.doc_cls(embedding=query_embedding) - docs, scores = self.doc_index.find(query_doc, search_field='embedding', limit=k) - - result = [(Document(page_content=doc.text), score) for doc, score in zip(docs, scores)] - return result - - def similarity_search( - self, query: str, k: int = 4, **kwargs: Any - ) -> List[Document]: - """Return docs most similar to query. - - Args: - query: Text to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - - Returns: - List of Documents most similar to the query. - """ - results = self.similarity_search_with_score(query, k) - return list(map(itemgetter(0), results)) - - def _similarity_search_with_relevance_scores( - self, - query: str, - k: int = 4, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """Return docs and relevance scores, normalized on a scale from 0 to 1. - - 0 is dissimilar, 1 is most similar. - """ - raise NotImplementedError - - def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwargs: Any) -> List[Document]: - """Return docs most similar to embedding vector. 
- - Args: - embedding: Embedding to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - - Returns: - List of Documents most similar to the query vector. - """ - - query_doc = self.doc_cls(embedding=embedding) - docs = self.doc_index.find(query_doc, search_field='embedding', limit=k).documents - - result = [Document(page_content=doc.text) for doc in docs] - return result - - def max_marginal_relevance_search( - self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any - ) -> List[Document]: - """Return docs selected using the maximal marginal relevance. - - Maximal marginal relevance optimizes for similarity to query AND diversity - among selected documents. - - Args: - query: Text to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - fetch_k: Number of Documents to fetch to pass to MMR algorithm. - - Returns: - List of Documents selected by maximal marginal relevance. - """ - query_embedding = self.embedding.embed_query(query) - query_doc = self.doc_cls(embedding=query_embedding) - - docs, scores = self.doc_index.find(query_doc, search_field='embedding', limit=fetch_k) - - embeddings = [emb for emb in docs.emb] - - mmr_selected = maximal_marginal_relevance(query_embedding, embeddings, k=k) - results = [Document(page_content=self.doc_index[idx].text) for idx in mmr_selected] - return results - diff --git a/langchain/vectorstores/in_memory.py b/langchain/vectorstores/in_memory.py index a079b10da7887c..7a5139d8984015 100644 --- a/langchain/vectorstores/in_memory.py +++ b/langchain/vectorstores/in_memory.py @@ -1,71 +1,58 @@ """Wrapper around in-memory DocArray store.""" from __future__ import annotations -from operator import itemgetter -from typing import List, Optional, Any, Tuple, Iterable, Type +from typing import List, Optional, Any, Type + +from docarray.typing import NdArray from langchain.embeddings.base import Embeddings from langchain.schema import Document -from langchain.vectorstores 
import VectorStore from langchain.vectorstores.base import VST from langchain.vectorstores.utils import maximal_marginal_relevance - -from docarray import BaseDoc -from docarray.typing import NdArray +from langchain.vectorstores.vector_store_from_doc_index import _check_docarray_import, VecStoreFromDocIndex -class InMemory(VectorStore): +class InMemory(VecStoreFromDocIndex): """Wrapper around in-memory storage. - To use it, you should have the ``docarray`` package with version >=0.30.0 installed. + To use it, you should have the ``docarray`` package with version >=0.31.0 installed. """ def __init__( self, texts: List[str], embedding: Embeddings, - metadatas: Optional[List[dict]] + metadatas: Optional[List[dict]] = None, + metric: str = 'cosine_sim', ) -> None: - """Initialize in-memory store.""" - try: - import docarray - da_version = docarray.__version__.split('.') - if int(da_version[0]) == 0 and int(da_version[1]) <= 21: - raise ValueError( - f'To use the InMemory VectorStore the docarray version >=0.30.0 is expected, ' - f'received: {docarray.__version__}.' - f'To upgrade, please run: `pip install -U docarray`.' - ) - else: - from docarray import DocList - - except ImportError: - raise ImportError( - "Could not import docarray python package. " - "Please install it with `pip install -U docarray`." - ) - if metadatas is None: - metadatas = [{} for _ in range(len(texts))] - - self.embedding = embedding - self.doc_cls = self._get_doc_cls() - self.docs = DocList[self.doc_cls]( - [ - self.doc_cls( - text=t, - embedding=e, - metadata=m, - ) for t, m, e in zip(texts, metadatas, self.embedding.embed_documents(texts)) - ] - ) + """Initialize in-memory store. + + Args: + texts (List[str]): Text data. + embedding (Embeddings): Embedding function. + metadatas (Optional[List[dict]]): Metadata for each text if it exists. + Defaults to None. + metric (str): metric for exact nearest-neighbor search. + Can be one of: 'cosine_sim', 'euclidean_dist' and 'sqeuclidean_dist'. 
+ Defaults to 'cosine_sim'. + + """ + _check_docarray_import() + from docarray.index import InMemoryDocIndex + + doc_cls = self._get_doc_cls(metric) + doc_index = InMemoryDocIndex[doc_cls]() + super().__init__(doc_index, texts, embedding, metadatas) @staticmethod - def _get_doc_cls(): + def _get_doc_cls(sim_metric: str): + from docarray import BaseDoc + from pydantic import Field + class DocArrayDoc(BaseDoc): text: Optional[str] - embedding: Optional[NdArray] + embedding: Optional[NdArray] = Field(space=sim_metric) metadata: Optional[dict] - # DocArrayDoc.update_forward_refs() return DocArrayDoc @classmethod @@ -74,110 +61,112 @@ def from_texts( texts: List[str], embedding: Embeddings, metadatas: Optional[List[dict]] = None, + metric: str = 'cosine_sim', **kwargs: Any ) -> InMemory: return cls( texts=texts, embedding=embedding, - metadatas=metadatas + metadatas=metadatas, + metric=metric, ) - - def add_texts( - self, - texts: Iterable[str], - metadatas: Optional[List[dict]] = None, - **kwargs: Any - ) -> List[str]: - """Run more texts through the embeddings and add to the vectorstore. - - Args: - texts: Iterable of strings to add to the vectorstore. - metadatas: Optional list of metadatas associated with the texts. - - Returns: - List of ids from adding the texts into the vectorstore. - """ - if metadatas is None: - metadatas = [{} for _ in range(len(list(texts)))] - - ids = [] - embeddings = self.embedding.embed_documents(texts) - for t, m, e in zip(texts, metadatas, embeddings): - doc = self.doc_cls( - text=t, - embedding=e, - metadata=m - ) - self.docs.append(doc) - ids.append(doc.id) # TODO return index of self.docs ? - - return ids - - def similarity_search_with_score( - self, query: str, k: int = 4, **kwargs: Any - ) -> List[Tuple[Document, float]]: - """Return docs most similar to query. - - Args: - query: Text to look up documents similar to. - k: Number of Documents to return. Defaults to 4. 
- - Returns: - List of Documents most similar to the query and score for each. - """ - from docarray.utils.find import find # TODO move import - - query_embedding = self.embedding.embed_query(query) - query_doc = self.doc_cls(embedding=query_embedding) - docs, scores = find(index=self.docs, query=query_doc, limit=k, search_field='embedding') - - result = [(Document(page_content=doc.text), score) for doc, score in zip(docs, scores)] - return result - - def similarity_search( - self, query: str, k: int = 4, **kwargs: Any - ) -> List[Document]: - """Return docs most similar to query. - - Args: - query: Text to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - - Returns: - List of Documents most similar to the query. - """ - results = self.similarity_search_with_score(query, k) - return list(map(itemgetter(0), results)) - - def _similarity_search_with_relevance_scores( - self, - query: str, - k: int = 4, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """Return docs and relevance scores, normalized on a scale from 0 to 1. - - 0 is dissimilar, 1 is most similar. - """ - raise NotImplementedError - - def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwargs: Any) -> List[Document]: - """Return docs most similar to embedding vector. - - Args: - embedding: Embedding to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - - Returns: - List of Documents most similar to the query vector. - """ - from docarray.utils.find import find - - query_doc = self.doc_cls(embedding=embedding) - result_docs = find(index=self.docs, query=query_doc, limit=k, search_field='embedding').documents - - result = [Document(page_content=doc.text) for doc in result_docs] - return result + # + # def add_texts( + # self, + # texts: Iterable[str], + # metadatas: Optional[List[dict]] = None, + # **kwargs: Any + # ) -> List[str]: + # """Run more texts through the embeddings and add to the vectorstore. 
+ # + # Args: + # texts: Iterable of strings to add to the vectorstore. + # metadatas: Optional list of metadatas associated with the texts. + # + # Returns: + # List of ids from adding the texts into the vectorstore. + # """ + # if metadatas is None: + # metadatas = [{} for _ in range(len(list(texts)))] + # + # ids = [] + # embeddings = self.embedding.embed_documents(texts) + # for t, m, e in zip(texts, metadatas, embeddings): + # doc = self.doc_cls( + # text=t, + # embedding=e, + # metadata=m + # ) + # self.docs.append(doc) + # ids.append(doc.id) # TODO return index of self.docs ? + # + # return ids + # + # def similarity_search_with_score( + # self, query: str, k: int = 4, **kwargs: Any + # ) -> List[Tuple[Document, float]]: + # """Return docs most similar to query. + # + # Args: + # query: Text to look up documents similar to. + # k: Number of Documents to return. Defaults to 4. + # + # Returns: + # List of Documents most similar to the query and score for each. + # """ + # from docarray.utils.find import find # TODO move import + # + # query_embedding = self.embedding.embed_query(query) + # query_doc = self.doc_cls(embedding=query_embedding) + # docs, scores = find(index=self.docs, query=query_doc, limit=k, search_field='embedding') + # + # result = [(Document(page_content=doc.text), score) for doc, score in zip(docs, scores)] + # return result + # + # def similarity_search( + # self, query: str, k: int = 4, **kwargs: Any + # ) -> List[Document]: + # """Return docs most similar to query. + # + # Args: + # query: Text to look up documents similar to. + # k: Number of Documents to return. Defaults to 4. + # + # Returns: + # List of Documents most similar to the query. 
+ # """ + # results = self.similarity_search_with_score(query, k) + # return list(map(itemgetter(0), results)) + # + # def _similarity_search_with_relevance_scores( + # self, + # query: str, + # k: int = 4, + # **kwargs: Any, + # ) -> List[Tuple[Document, float]]: + # """Return docs and relevance scores, normalized on a scale from 0 to 1. + # + # 0 is dissimilar, 1 is most similar. + # """ + # raise NotImplementedError + # + # def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwargs: Any) -> List[Document]: + # """Return docs most similar to embedding vector. + # + # Args: + # embedding: Embedding to look up documents similar to. + # k: Number of Documents to return. Defaults to 4. + # + # Returns: + # List of Documents most similar to the query vector. + # """ + # from docarray.utils.find import find + # + # query_doc = self.doc_cls(embedding=embedding) + # result_docs = find(index=self.docs, query=query_doc, limit=k, search_field='embedding').documents + # + # result = [Document(page_content=doc.text) for doc in result_docs] + # return result def max_marginal_relevance_search( self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any diff --git a/langchain/vectorstores/vector_store_from_doc_index.py b/langchain/vectorstores/vector_store_from_doc_index.py new file mode 100644 index 00000000000000..a72c883b2e201b --- /dev/null +++ b/langchain/vectorstores/vector_store_from_doc_index.py @@ -0,0 +1,186 @@ +from typing import TYPE_CHECKING, TypeVar, List, Optional, Type, Iterable, Any, Tuple + +from docarray import DocList, BaseDoc +from operator import itemgetter + +from langchain.embeddings.base import Embeddings +from langchain.schema import Document +from langchain.vectorstores import VectorStore + +from docarray.index.abstract import BaseDocIndex + + +T_Doc = TypeVar('T_Doc', bound=BaseDocIndex) + + +def _check_docarray_import(): + try: + import docarray + da_version = docarray.__version__.split('.') + if int(da_version[0]) == 0 and 
int(da_version[1]) <= 21: + raise ValueError( + f'To use the HnswLib VectorStore the docarray version >=0.31.0 is expected, ' + f'received: {docarray.__version__}.' + f'To upgrade, please run: `pip install -U docarray`.' + ) + except ImportError: + raise ImportError( + "Could not import docarray python package. " + "Please install it with `pip install -U docarray`." + ) + + +class VecStoreFromDocIndex(VectorStore): + doc_index: BaseDocIndex = None + doc_cls: Type[BaseDoc] = None + embedding: Embeddings = None + + def __init__( + self, + doc_index: T_Doc, + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]], + ): + self.doc_index = doc_index + self.doc_cls = doc_index._schema + self.embedding = embedding + + embeddings = self.embedding.embed_documents(texts) + if metadatas is None: + metadatas = [{} for _ in range(len(texts))] + + docs = DocList[self.doc_cls]( + [ + self.doc_cls( + text=t, + embedding=e, + metadata=m, + ) for t, m, e in zip(texts, metadatas, embeddings) + ] + ) + if len(docs) > 0: + self.doc_index.index(docs) + + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + **kwargs: Any + ) -> List[str]: + """Run more texts through the embeddings and add to the vectorstore. + + Args: + texts: Iterable of strings to add to the vectorstore. + metadatas: Optional list of metadatas associated with the texts. + + Returns: + List of ids from adding the texts into the vectorstore. + """ + if metadatas is None: + metadatas = [{} for _ in range(len(list(texts)))] + + ids = [] + embeddings = self.embedding.embed_documents(texts) + for t, m, e in zip(texts, metadatas, embeddings): + doc = self.doc_cls( + text=t, + embedding=e, + metadata=m + ) + self.doc_index.index([doc]) + ids.append(doc.id) # TODO return index of self.docs ? + + return ids + + def similarity_search_with_score( + self, query: str, k: int = 4, **kwargs: Any + ) -> List[Tuple[Document, float]]: + """Return docs most similar to query. 
+ + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + + Returns: + List of Documents most similar to the query and score for each. + """ + query_embedding = self.embedding.embed_query(query) + query_doc = self.doc_cls(embedding=query_embedding) + docs, scores = self.doc_index.find(query_doc, search_field='embedding', limit=k) + + result = [(Document(page_content=doc.text), score) for doc, score in zip(docs, scores)] + return result + + def similarity_search( + self, query: str, k: int = 4, **kwargs: Any + ) -> List[Document]: + """Return docs most similar to query. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + + Returns: + List of Documents most similar to the query. + """ + results = self.similarity_search_with_score(query, k) + return list(map(itemgetter(0), results)) + + + def _similarity_search_with_relevance_scores( + self, + query: str, + k: int = 4, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs and relevance scores, normalized on a scale from 0 to 1. + + 0 is dissimilar, 1 is most similar. + """ + raise NotImplementedError + + def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwargs: Any) -> List[Document]: + """Return docs most similar to embedding vector. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + + Returns: + List of Documents most similar to the query vector. + """ + + query_doc = self.doc_cls(embedding=embedding) + docs = self.doc_index.find(query_doc, search_field='embedding', limit=k).documents + + result = [Document(page_content=doc.text) for doc in docs] + return result + + def max_marginal_relevance_search( + self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance. 
+ + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + + Returns: + List of Documents selected by maximal marginal relevance. + """ + query_embedding = self.embedding.embed_query(query) + query_doc = self.doc_cls(embedding=query_embedding) + + docs, scores = self.doc_index.find(query_doc, search_field='embedding', limit=fetch_k) + + embeddings = [emb for emb in docs.emb] + + mmr_selected = maximal_marginal_relevance(query_embedding, embeddings, k=k) + results = [Document(page_content=self.doc_index[idx].text) for idx in mmr_selected] + return results + diff --git a/tests/integration_tests/vectorstores/test_hnsw_lib.py b/tests/integration_tests/vectorstores/test_hnsw_lib.py index 7aa3481cf19e76..58919d37e70944 100644 --- a/tests/integration_tests/vectorstores/test_hnsw_lib.py +++ b/tests/integration_tests/vectorstores/test_hnsw_lib.py @@ -1,3 +1,4 @@ +import numpy as np import pytest from langchain.schema import Document @@ -5,7 +6,7 @@ from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings -def test_docarray_hnswlib_vec_store_init(tmp_path) -> None: +def test_hnswlib_vec_store_from_texts(tmp_path) -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] docsearch = HnswLib.from_texts( @@ -16,39 +17,71 @@ def test_docarray_hnswlib_vec_store_init(tmp_path) -> None: sim_metric='cosine', ) assert isinstance(docsearch, HnswLib) + assert docsearch.doc_index.num_docs() == 3 -@pytest.fixture -def docarray_vec_store(tmp_path): - texts = ["foo", "bar", "baz"] - docsearch = HnswLib.from_texts( - texts, - FakeEmbeddings(), +def test_hnswlib_vec_store_add_texts(tmp_path) -> None: + """Test end to end construction and simple similarity search.""" + docsearch = HnswLib( 
work_dir=str(tmp_path), n_dim=10, + texts=[], + embedding=FakeEmbeddings(), + metadatas=[{}], + sim_metric='cosine', ) - return docsearch + assert isinstance(docsearch, HnswLib) + assert docsearch.doc_index.num_docs() == 0 + texts = ["foo", "bar", "baz"] + docsearch.add_texts(texts=texts) + assert docsearch.doc_index.num_docs() == 3 -def test_sim_search(docarray_vec_store) -> None: - """Test end to end construction and simple similarity search.""" - output = docarray_vec_store.similarity_search("foo", k=1) +@pytest.mark.parametrize('metric', ['cosine', 'ip', 'l2']) +def test_sim_search(metric, tmp_path) -> None: + """Test end to end construction and simple similarity search.""" + texts = ["foo", "bar", "baz"] + hnswlib_vec_store = HnswLib.from_texts( + texts, + FakeEmbeddings(), + work_dir=str(tmp_path), + n_dim=10, + ) + output = hnswlib_vec_store.similarity_search("foo", k=1) assert output == [Document(page_content="foo")] -def test_sim_search_with_score(docarray_vec_store) -> None: - """Test end to end construction and similarity search with score.""" - - output = docarray_vec_store.similarity_search_with_score("foo", k=1) - assert output == [(Document(page_content="foo"), 1.0)] - - -def test_sim_search_by_vector(docarray_vec_store): +@pytest.mark.parametrize('metric', ['cosine', 'ip', 'l2']) +def test_sim_search_by_vector(metric, tmp_path): """Test end to end construction and similarity search by vector.""" + texts = ["foo", "bar", "baz"] + hnswlib_vec_store = HnswLib.from_texts( + texts, + FakeEmbeddings(), + work_dir=str(tmp_path), + n_dim=10, + ) embedding = [1.0] * 10 - output = docarray_vec_store.similarity_search_by_vector(embedding, k=1) + output = hnswlib_vec_store.similarity_search_by_vector(embedding, k=1) assert output == [Document(page_content="bar")] +@pytest.mark.parametrize('metric', ['cosine', 'ip', 'l2']) +def test_sim_search_with_score(metric, tmp_path) -> None: + """Test end to end construction and similarity search with score.""" + texts = 
["foo", "bar", "baz"] + hnswlib_vec_store = HnswLib.from_texts( + texts, + FakeEmbeddings(), + work_dir=str(tmp_path), + n_dim=10, + ) + output = hnswlib_vec_store.similarity_search_with_score("foo", k=1) + assert len(output) == 1 + + out_doc, out_score = output[0] + assert out_doc == Document(page_content="foo") + assert np.isclose(out_score, 0.0, atol=1.e-6) + diff --git a/tests/integration_tests/vectorstores/test_in_memory.py b/tests/integration_tests/vectorstores/test_in_memory.py index 79458727310a86..62834336c7c24b 100644 --- a/tests/integration_tests/vectorstores/test_in_memory.py +++ b/tests/integration_tests/vectorstores/test_in_memory.py @@ -1,3 +1,4 @@ +import numpy as np import pytest from langchain.schema import Document @@ -5,7 +6,7 @@ from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings -def test_docarray_vec_store_init() -> None: +def test_in_memory_vec_store_from_texts() -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] docsearch = InMemory.from_texts( @@ -13,36 +14,68 @@ def test_docarray_vec_store_init() -> None: FakeEmbeddings(), ) assert isinstance(docsearch, InMemory) + assert docsearch.doc_index.num_docs() == 3 -@pytest.fixture -def docarray_vec_store(): - texts = ["foo", "bar", "baz"] - docsearch = InMemory.from_texts( - texts, - FakeEmbeddings(), +def test_in_memory_vec_store_add_texts(tmp_path) -> None: + """Test end to end construction and simple similarity search.""" + docsearch = InMemory( + texts=[], + embedding=FakeEmbeddings(), ) - return docsearch + assert isinstance(docsearch, InMemory) + assert docsearch.doc_index.num_docs() == 0 + + texts = ["foo", "bar", "baz"] + docsearch.add_texts(texts=texts) + assert docsearch.doc_index.num_docs() == 3 -def test_sim_search(docarray_vec_store) -> None: +@pytest.mark.parametrize('metric', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) +def test_sim_search(metric) -> None: """Test end to end construction and 
simple similarity search.""" + texts = ["foo", "bar", "baz"] + in_memory_vec_store = InMemory.from_texts( + texts=texts, + embedding=FakeEmbeddings(), + metric=metric, + ) - output = docarray_vec_store.similarity_search("foo", k=1) + output = in_memory_vec_store.similarity_search("foo", k=1) assert output == [Document(page_content="foo")] -def test_sim_search_with_score(docarray_vec_store) -> None: +@pytest.mark.parametrize('metric', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) +def test_sim_search_with_score(metric) -> None: """Test end to end construction and similarity search with score.""" + texts = ["foo", "bar", "baz"] + in_memory_vec_store = InMemory.from_texts( + texts=texts, + embedding=FakeEmbeddings(), + metric=metric, + ) + + output = in_memory_vec_store.similarity_search_with_score("foo", k=1) - output = docarray_vec_store.similarity_search_with_score("foo", k=1) - assert output == [(Document(page_content="foo"), 1.0)] + out_doc, out_score = output[0] + assert out_doc == Document(page_content="foo") + expected_score = 0.0 if 'dist' in metric else 1.0 + assert np.isclose(out_score, expected_score, atol=1.e-6) -def test_sim_search_by_vector(docarray_vec_store): + +@pytest.mark.parametrize('metric', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) +def test_sim_search_by_vector(metric): """Test end to end construction and similarity search by vector.""" + texts = ["foo", "bar", "baz"] + in_memory_vec_store = InMemory.from_texts( + texts=texts, + embedding=FakeEmbeddings(), + metric=metric, + ) + embedding = [1.0] * 10 - output = docarray_vec_store.similarity_search_by_vector(embedding, k=1) + output = in_memory_vec_store.similarity_search_by_vector(embedding, k=1) assert output == [Document(page_content="bar")] From de262f9ae52affcccc3653128dee3d7cded177dc Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 27 Apr 2023 15:17:26 +0200 Subject: [PATCH 03/19] fix: clean up and add dependencies Signed-off-by: anna-charlotte --- 
langchain/vectorstores/__init__.py | 4 + langchain/vectorstores/hnsw_lib.py | 74 ++++--- langchain/vectorstores/in_memory.py | 195 +++--------------- .../vector_store_from_doc_index.py | 113 +++++----- poetry.lock | 23 ++- pyproject.toml | 9 +- .../vectorstores/test_hnsw_lib.py | 27 ++- .../vectorstores/test_in_memory.py | 20 +- 8 files changed, 186 insertions(+), 279 deletions(-) diff --git a/langchain/vectorstores/__init__.py b/langchain/vectorstores/__init__.py index 30d1ca7ecdc3eb..5360f4b8f25c2c 100644 --- a/langchain/vectorstores/__init__.py +++ b/langchain/vectorstores/__init__.py @@ -7,6 +7,8 @@ from langchain.vectorstores.deeplake import DeepLake from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch from langchain.vectorstores.faiss import FAISS +from langchain.vectorstores.hnsw_lib import HnswLib +from langchain.vectorstores.in_memory import InMemory from langchain.vectorstores.milvus import Milvus from langchain.vectorstores.myscale import MyScale, MyScaleSettings from langchain.vectorstores.opensearch_vector_search import OpenSearchVectorSearch @@ -34,4 +36,6 @@ "MyScaleSettings", "SupabaseVectorStore", "AnalyticDB", + "HnswLib", + "InMemory", ] diff --git a/langchain/vectorstores/hnsw_lib.py b/langchain/vectorstores/hnsw_lib.py index 51c85423ac2b7d..ddc3ec7b6102c9 100644 --- a/langchain/vectorstores/hnsw_lib.py +++ b/langchain/vectorstores/hnsw_lib.py @@ -1,40 +1,38 @@ -"""Wrapper around in-memory DocArray store.""" +"""Wrapper around HnswLib store.""" from __future__ import annotations -from typing import List, Optional, Any, Tuple, Iterable, Type, Callable, Sequence, TYPE_CHECKING -from docarray.typing import NdArray +from typing import List, Optional, Type from langchain.embeddings.base import Embeddings from langchain.vectorstores.base import VST -from langchain.vectorstores.vector_store_from_doc_index import VecStoreFromDocIndex, _check_docarray_import +from langchain.vectorstores.vector_store_from_doc_index import ( + 
VecStoreFromDocIndex, + _check_docarray_import, +) class HnswLib(VecStoreFromDocIndex): """Wrapper around HnswLib storage. - To use it, you should have the ``docarray`` package with version >=0.31.0 installed. + To use it, you should have the ``docarray[hnswlib]`` package with version >=0.31.0 installed. + You can install it with `pip install "langchain[hnswlib]"`. """ + def __init__( self, - texts: List[str], embedding: Embeddings, work_dir: str, n_dim: int, - metadatas: Optional[List[dict]], - dist_metric: str = 'cosine', - **kwargs, + dist_metric: str = "cosine", ) -> None: """Initialize HnswLib store. Args: - texts (List[str]): Text data. embedding (Embeddings): Embedding function. - metadatas (Optional[List[dict]]): Metadata for each text if it exists. - Defaults to None. work_dir (str): path to the location where all the data will be stored. n_dim (int): dimension of an embedding. - dist_metric (str): Distance metric for HnswLib can be one of: 'cosine', - 'ip', and 'l2'. Defaults to 'cosine'. + dist_metric (str): Distance metric for HnswLib can be one of: "cosine", + "ip", and "l2". Defaults to "cosine". """ _check_docarray_import() from docarray.index import HnswDocumentIndex @@ -43,25 +41,13 @@ def __init__( import google.protobuf except ImportError: raise ImportError( - "Could not import protobuf python package. " - "Please install it with `pip install -U protobuf`." + "Could not import all required packages. " + "Please install it with `pip install \"langchain[hnswlib]\"`." 
) - doc_cls = self._get_doc_cls(n_dim, dist_metric) + doc_cls = self._get_doc_cls({"dim": n_dim, "space": dist_metric}) doc_index = HnswDocumentIndex[doc_cls](work_dir=work_dir) - super().__init__(doc_index, texts, embedding, metadatas) - - @staticmethod - def _get_doc_cls(n_dim: int, sim_metric: str): - from docarray import BaseDoc - from pydantic import Field - - class DocArrayDoc(BaseDoc): - text: Optional[str] - embedding: Optional[NdArray] = Field(dim=n_dim, space=sim_metric) - metadata: Optional[dict] - - return DocArrayDoc + super().__init__(doc_index, embedding) @classmethod def from_texts( @@ -71,21 +57,33 @@ def from_texts( metadatas: Optional[List[dict]] = None, work_dir: str = None, n_dim: int = None, - dist_metric: str = 'cosine', - **kwargs: Any + dist_metric: str = "cosine", ) -> HnswLib: + """Create an HnswLib store and insert data. + Args: + texts (List[str]): Text data. + embedding (Embeddings): Embedding function. + metadatas (Optional[List[dict]]): Metadata for each text if it exists. + Defaults to None. + work_dir (str): path to the location where all the data will be stored. + n_dim (int): dimension of an embedding. + dist_metric (str): Distance metric for HnswLib can be one of: "cosine", + "ip", and "l2". Defaults to "cosine". 
+ + Returns: + HnswLib Vector Store + """ if work_dir is None: - raise ValueError('`work_dir` parameter hs not been set.') + raise ValueError("`work_dir` parameter hs not been set.") if n_dim is None: - raise ValueError('`n_dim` parameter has not been set.') + raise ValueError("`n_dim` parameter has not been set.") - return cls( + store = cls( work_dir=work_dir, n_dim=n_dim, - texts=texts, embedding=embedding, - metadatas=metadatas, dist_metric=dist_metric, - kwargs=kwargs, ) + store.add_texts(texts=texts, metadatas=metadatas) + return store diff --git a/langchain/vectorstores/in_memory.py b/langchain/vectorstores/in_memory.py index 7a5139d8984015..07e1f49d82c170 100644 --- a/langchain/vectorstores/in_memory.py +++ b/langchain/vectorstores/in_memory.py @@ -1,59 +1,42 @@ -"""Wrapper around in-memory DocArray store.""" +"""Wrapper around in-memory storage.""" from __future__ import annotations -from typing import List, Optional, Any, Type - -from docarray.typing import NdArray +from typing import List, Optional, Type from langchain.embeddings.base import Embeddings -from langchain.schema import Document from langchain.vectorstores.base import VST -from langchain.vectorstores.utils import maximal_marginal_relevance -from langchain.vectorstores.vector_store_from_doc_index import _check_docarray_import, VecStoreFromDocIndex +from langchain.vectorstores.vector_store_from_doc_index import ( + VecStoreFromDocIndex, + _check_docarray_import, +) class InMemory(VecStoreFromDocIndex): """Wrapper around in-memory storage. To use it, you should have the ``docarray`` package with version >=0.31.0 installed. + You can install it with `pip install "langchain[in_memory_store]"`. """ + def __init__( self, - texts: List[str], embedding: Embeddings, - metadatas: Optional[List[dict]] = None, - metric: str = 'cosine_sim', + metric: str = "cosine_sim", ) -> None: """Initialize in-memory store. Args: - texts (List[str]): Text data. embedding (Embeddings): Embedding function. 
- metadatas (Optional[List[dict]]): Metadata for each text if it exists. - Defaults to None. metric (str): metric for exact nearest-neighbor search. - Can be one of: 'cosine_sim', 'euclidean_dist' and 'sqeuclidean_dist'. - Defaults to 'cosine_sim'. - + Can be one of: "cosine_sim", "euclidean_dist" and "sqeuclidean_dist". + Defaults to "cosine_sim". """ _check_docarray_import() - from docarray.index import InMemoryDocIndex - - doc_cls = self._get_doc_cls(metric) - doc_index = InMemoryDocIndex[doc_cls]() - super().__init__(doc_index, texts, embedding, metadatas) - - @staticmethod - def _get_doc_cls(sim_metric: str): - from docarray import BaseDoc - from pydantic import Field - - class DocArrayDoc(BaseDoc): - text: Optional[str] - embedding: Optional[NdArray] = Field(space=sim_metric) - metadata: Optional[dict] + from docarray.index import InMemoryExactNNIndex - return DocArrayDoc + doc_cls = self._get_doc_cls({"space": metric}) + doc_index = InMemoryExactNNIndex[doc_cls]() + super().__init__(doc_index, embedding) @classmethod def from_texts( @@ -61,139 +44,25 @@ def from_texts( texts: List[str], embedding: Embeddings, metadatas: Optional[List[dict]] = None, - metric: str = 'cosine_sim', - **kwargs: Any + metric: str = "cosine_sim", ) -> InMemory: - return cls( - texts=texts, - embedding=embedding, - metadatas=metadatas, - metric=metric, - ) - # - # def add_texts( - # self, - # texts: Iterable[str], - # metadatas: Optional[List[dict]] = None, - # **kwargs: Any - # ) -> List[str]: - # """Run more texts through the embeddings and add to the vectorstore. - # - # Args: - # texts: Iterable of strings to add to the vectorstore. - # metadatas: Optional list of metadatas associated with the texts. - # - # Returns: - # List of ids from adding the texts into the vectorstore. 
- # """ - # if metadatas is None: - # metadatas = [{} for _ in range(len(list(texts)))] - # - # ids = [] - # embeddings = self.embedding.embed_documents(texts) - # for t, m, e in zip(texts, metadatas, embeddings): - # doc = self.doc_cls( - # text=t, - # embedding=e, - # metadata=m - # ) - # self.docs.append(doc) - # ids.append(doc.id) # TODO return index of self.docs ? - # - # return ids - # - # def similarity_search_with_score( - # self, query: str, k: int = 4, **kwargs: Any - # ) -> List[Tuple[Document, float]]: - # """Return docs most similar to query. - # - # Args: - # query: Text to look up documents similar to. - # k: Number of Documents to return. Defaults to 4. - # - # Returns: - # List of Documents most similar to the query and score for each. - # """ - # from docarray.utils.find import find # TODO move import - # - # query_embedding = self.embedding.embed_query(query) - # query_doc = self.doc_cls(embedding=query_embedding) - # docs, scores = find(index=self.docs, query=query_doc, limit=k, search_field='embedding') - # - # result = [(Document(page_content=doc.text), score) for doc, score in zip(docs, scores)] - # return result - # - # def similarity_search( - # self, query: str, k: int = 4, **kwargs: Any - # ) -> List[Document]: - # """Return docs most similar to query. - # - # Args: - # query: Text to look up documents similar to. - # k: Number of Documents to return. Defaults to 4. - # - # Returns: - # List of Documents most similar to the query. - # """ - # results = self.similarity_search_with_score(query, k) - # return list(map(itemgetter(0), results)) - # - # def _similarity_search_with_relevance_scores( - # self, - # query: str, - # k: int = 4, - # **kwargs: Any, - # ) -> List[Tuple[Document, float]]: - # """Return docs and relevance scores, normalized on a scale from 0 to 1. - # - # 0 is dissimilar, 1 is most similar. 
- # """ - # raise NotImplementedError - # - # def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwargs: Any) -> List[Document]: - # """Return docs most similar to embedding vector. - # - # Args: - # embedding: Embedding to look up documents similar to. - # k: Number of Documents to return. Defaults to 4. - # - # Returns: - # List of Documents most similar to the query vector. - # """ - # from docarray.utils.find import find - # - # query_doc = self.doc_cls(embedding=embedding) - # result_docs = find(index=self.docs, query=query_doc, limit=k, search_field='embedding').documents - # - # result = [Document(page_content=doc.text) for doc in result_docs] - # return result - - def max_marginal_relevance_search( - self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any - ) -> List[Document]: - """Return docs selected using the maximal marginal relevance. - - Maximal marginal relevance optimizes for similarity to query AND diversity - among selected documents. + """Create an in-memory store and insert data. Args: - query: Text to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - fetch_k: Number of Documents to fetch to pass to MMR algorithm. + texts (List[str]): Text data. + embedding (Embeddings): Embedding function. + metadatas (Optional[List[dict]]): Metadata for each text if it exists. + Defaults to None. + metric (str): metric for exact nearest-neighbor search. + Can be one of: "cosine_sim", "euclidean_dist" and "sqeuclidean_dist". + Defaults to "cosine_sim". Returns: - List of Documents selected by maximal marginal relevance. 
- """ - from docarray.utils.find import find - - query_embedding = self.embedding.embed_query(query) - query_doc = self.doc_cls(embedding=query_embedding) - find_res = find(self.docs, query_doc, limit=k) - - embeddings = [emb for emb in find_res.documents.emb] - mmr_selected = maximal_marginal_relevance(query_embedding, embeddings, k=k) - results = [] - for idx in mmr_selected: - results.append(Document(page_content=self.docs[idx].text)) - return results - + InMemory Vector Store + """ + store = cls( + embedding=embedding, + metric=metric, + ) + store.add_texts(texts=texts, metadatas=metadatas) + return store diff --git a/langchain/vectorstores/vector_store_from_doc_index.py b/langchain/vectorstores/vector_store_from_doc_index.py index a72c883b2e201b..a471bfe1cd7038 100644 --- a/langchain/vectorstores/vector_store_from_doc_index.py +++ b/langchain/vectorstores/vector_store_from_doc_index.py @@ -1,72 +1,72 @@ -from typing import TYPE_CHECKING, TypeVar, List, Optional, Type, Iterable, Any, Tuple - -from docarray import DocList, BaseDoc from operator import itemgetter +from typing import Any, Dict, Iterable, List, Optional, Tuple, Type + +try: + from docarray import BaseDoc + from docarray.index.abstract import BaseDocIndex + from docarray.typing import NdArray +except ImportError: + BaseDoc = None + BaseDocIndex = None + NdArray = None from langchain.embeddings.base import Embeddings from langchain.schema import Document from langchain.vectorstores import VectorStore - -from docarray.index.abstract import BaseDocIndex - - -T_Doc = TypeVar('T_Doc', bound=BaseDocIndex) +from langchain.vectorstores.utils import maximal_marginal_relevance -def _check_docarray_import(): +def _check_docarray_import() -> None: try: import docarray - da_version = docarray.__version__.split('.') - if int(da_version[0]) == 0 and int(da_version[1]) <= 21: + + da_version = docarray.__version__.split(".") + if int(da_version[0]) == 0 and int(da_version[1]) <= 30: raise ValueError( - f'To use the 
HnswLib VectorStore the docarray version >=0.31.0 is expected, ' - f'received: {docarray.__version__}.' - f'To upgrade, please run: `pip install -U docarray`.' + f"To use the HnswLib VectorStore the docarray version >=0.31.0 is expected, " + f"received: {docarray.__version__}." + f"To upgrade, please run: `pip install -U docarray`." ) except ImportError: raise ImportError( "Could not import docarray python package. " - "Please install it with `pip install -U docarray`." + "Please install it with `pip install \"langchain[docarray]\"`." ) class VecStoreFromDocIndex(VectorStore): - doc_index: BaseDocIndex = None - doc_cls: Type[BaseDoc] = None - embedding: Embeddings = None + doc_index: BaseDocIndex + doc_cls: Type[BaseDoc] + embedding: Embeddings def __init__( self, - doc_index: T_Doc, - texts: List[str], + doc_index: BaseDocIndex, embedding: Embeddings, - metadatas: Optional[List[dict]], ): + """Initialize a vector store from DocArray's DocIndex.""" self.doc_index = doc_index self.doc_cls = doc_index._schema self.embedding = embedding - embeddings = self.embedding.embed_documents(texts) - if metadatas is None: - metadatas = [{} for _ in range(len(texts))] - - docs = DocList[self.doc_cls]( - [ - self.doc_cls( - text=t, - embedding=e, - metadata=m, - ) for t, m, e in zip(texts, metadatas, embeddings) - ] - ) - if len(docs) > 0: - self.doc_index.index(docs) + @staticmethod + def _get_doc_cls(embeddings_params: Dict[str, Any]) -> Type[BaseDoc]: + """Get docarray Document class describing the schema of DocIndex.""" + from docarray import BaseDoc + from pydantic import Field + + class DocArrayDoc(BaseDoc): + text: Optional[str] + embedding: Optional[NdArray] = Field(**embeddings_params) + metadata: Optional[dict] + + return DocArrayDoc def add_texts( self, texts: Iterable[str], metadatas: Optional[List[dict]] = None, - **kwargs: Any + **kwargs: Any, ) -> List[str]: """Run more texts through the embeddings and add to the vectorstore. 
@@ -80,16 +80,12 @@ def add_texts( if metadatas is None: metadatas = [{} for _ in range(len(list(texts)))] - ids = [] + ids: List[str] = [] embeddings = self.embedding.embed_documents(texts) for t, m, e in zip(texts, metadatas, embeddings): - doc = self.doc_cls( - text=t, - embedding=e, - metadata=m - ) + doc = self.doc_cls(text=t, embedding=e, metadata=m) self.doc_index.index([doc]) - ids.append(doc.id) # TODO return index of self.docs ? + ids.append(str(doc.id)) return ids @@ -107,9 +103,11 @@ def similarity_search_with_score( """ query_embedding = self.embedding.embed_query(query) query_doc = self.doc_cls(embedding=query_embedding) - docs, scores = self.doc_index.find(query_doc, search_field='embedding', limit=k) + docs, scores = self.doc_index.find(query_doc, search_field="embedding", limit=k) - result = [(Document(page_content=doc.text), score) for doc, score in zip(docs, scores)] + result = [ + (Document(page_content=doc.text), score) for doc, score in zip(docs, scores) + ] return result def similarity_search( @@ -127,7 +125,6 @@ def similarity_search( results = self.similarity_search_with_score(query, k) return list(map(itemgetter(0), results)) - def _similarity_search_with_relevance_scores( self, query: str, @@ -140,7 +137,9 @@ def _similarity_search_with_relevance_scores( """ raise NotImplementedError - def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwargs: Any) -> List[Document]: + def similarity_search_by_vector( + self, embedding: List[float], k: int = 4, **kwargs: Any + ) -> List[Document]: """Return docs most similar to embedding vector. 
Args: @@ -152,7 +151,9 @@ def similarity_search_by_vector(self, embedding: List[float], k: int = 4, **kwar """ query_doc = self.doc_cls(embedding=embedding) - docs = self.doc_index.find(query_doc, search_field='embedding', limit=k).documents + docs = self.doc_index.find( + query_doc, search_field="embedding", limit=k + ).documents result = [Document(page_content=doc.text) for doc in docs] return result @@ -176,11 +177,13 @@ def max_marginal_relevance_search( query_embedding = self.embedding.embed_query(query) query_doc = self.doc_cls(embedding=query_embedding) - docs, scores = self.doc_index.find(query_doc, search_field='embedding', limit=fetch_k) + docs = self.doc_index.find( + query_doc, search_field="embedding", limit=fetch_k + ).documents - embeddings = [emb for emb in docs.emb] - - mmr_selected = maximal_marginal_relevance(query_embedding, embeddings, k=k) - results = [Document(page_content=self.doc_index[idx].text) for idx in mmr_selected] + mmr_selected = maximal_marginal_relevance(query_embedding, docs.embedding, k=k) + results = [ + Document(page_content=docs[idx].text, metadata=docs[idx].metadata) + for idx in mmr_selected + ] return results - diff --git a/poetry.lock b/poetry.lock index fc785b03aebfb9..4109d28490ed90 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1515,14 +1515,14 @@ wmi = ["wmi (>=1.5.1,<2.0.0)"] [[package]] name = "docarray" -version = "0.30.0" +version = "0.31.0.dev35" description = "The data structure for multimodal data" category = "main" optional = true python-versions = ">=3.7,<4.0" files = [ - {file = "docarray-0.30.0-py3-none-any.whl", hash = "sha256:739dbe06bfee6f1cbc030156036764ca1c75832dcc01a07c724640c6d464651b"}, - {file = "docarray-0.30.0.tar.gz", hash = "sha256:dd73e9ff20485a1d819ac906a59ee0cbc4382e78a5061286e77eb7d7f8b28a8e"}, + {file = "docarray-0.31.0.dev35-py3-none-any.whl", hash = "sha256:a5c578cbf69853dddd17e845cc3fb2250cb1a0800ef48082d2a40a38bc9a7165"}, + {file = "docarray-0.31.0.dev35.tar.gz", hash = 
"sha256:f918cc5c35ed2df9b9ad7ef0abcc0bf5f3fe38a8f9e33526a33293d26a956f2e"}, ] [package.dependencies] @@ -1748,7 +1748,7 @@ files = [ name = "exceptiongroup" version = "1.1.1" description = "Backport of PEP 654 (exception groups)" -category = "main" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -2769,7 +2769,7 @@ testing = ["flake8 (<5)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-chec name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" -category = "main" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -5445,7 +5445,7 @@ typing-extensions = {version = "*", markers = "python_version <= \"3.8\""} name = "pluggy" version = "1.0.0" description = "plugin and hook calling mechanisms for python" -category = "main" +category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -6176,7 +6176,7 @@ Pillow = ">=8.0.0" name = "pytest" version = "7.3.1" description = "pytest: simple powerful testing with Python" -category = "main" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -8203,7 +8203,7 @@ files = [ name = "tomli" version = "2.0.1" description = "A lil' TOML parser" -category = "main" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -9338,10 +9338,11 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [extras] -all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", 
"html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "docarray", "protobuf"] +all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "docarray", "protobuf", "hnswlib"] cohere = ["cohere"] -docarray = ["docarray", "protobuf"] embeddings = ["sentence-transformers"] +hnswlib = ["docarray", "protobuf", "hnswlib"] +in-memory-store = ["docarray"] llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"] openai = ["openai"] qdrant = ["qdrant-client"] @@ -9349,4 +9350,4 @@ qdrant = ["qdrant-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "81e7b09595d12739f056c5f5d34021ad7e3f855a8da711d3ccc23aab72cfbd83" +content-hash = "5223e3c6bdf37a28e1ee1cfb26e7f8d84fd6bc94893c96ecaca428fb9e8278eb" diff --git a/pyproject.toml b/pyproject.toml index 61406f1db2e0c7..869d5f8d0d4534 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -69,11 +69,9 @@ pytesseract = {version = "^0.3.10", optional=true} html2text = {version="^2020.1.16", optional=true} numexpr = "^2.8.4" duckduckgo-search = {version="^2.8.6", optional=true} -docarray = {version="^0.30.0", optional=true} +docarray = {version="^0.31.0.dev35", optional=true} protobuf = {version="3.19", optional=true} hnswlib = {version="^0.7.0", optional=true} -pytest = "^7.3.1" - 
[tool.poetry.group.docs.dependencies] autodoc_pydantic = "^1.8.0" @@ -149,9 +147,10 @@ llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifes qdrant = ["qdrant-client"] openai = ["openai"] cohere = ["cohere"] -docarray = ["docarray", "protobuf"] +in_memory_store = ["docarray"] +hnswlib = ["docarray", "protobuf", "hnswlib"] embeddings = ["sentence-transformers"] -all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "docarray", "protobuf"] +all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "docarray", "protobuf", "hnswlib"] [tool.ruff] select = [ diff --git a/tests/integration_tests/vectorstores/test_hnsw_lib.py b/tests/integration_tests/vectorstores/test_hnsw_lib.py index 58919d37e70944..fc86321c20dd58 100644 --- 
a/tests/integration_tests/vectorstores/test_hnsw_lib.py +++ b/tests/integration_tests/vectorstores/test_hnsw_lib.py @@ -14,7 +14,7 @@ def test_hnswlib_vec_store_from_texts(tmp_path) -> None: FakeEmbeddings(), work_dir=str(tmp_path), n_dim=10, - sim_metric='cosine', + dist_metric='cosine', ) assert isinstance(docsearch, HnswLib) assert docsearch.doc_index.num_docs() == 3 @@ -25,10 +25,8 @@ def test_hnswlib_vec_store_add_texts(tmp_path) -> None: docsearch = HnswLib( work_dir=str(tmp_path), n_dim=10, - texts=[], embedding=FakeEmbeddings(), - metadatas=[{}], - sim_metric='cosine', + dist_metric='cosine', ) assert isinstance(docsearch, HnswLib) assert docsearch.doc_index.num_docs() == 0 @@ -53,7 +51,7 @@ def test_sim_search(metric, tmp_path) -> None: @pytest.mark.parametrize('metric', ['cosine', 'ip', 'l2']) -def test_sim_search_by_vector(metric, tmp_path): +def test_sim_search_by_vector(metric, tmp_path) -> None: """Test end to end construction and similarity search by vector.""" texts = ["foo", "bar", "baz"] hnswlib_vec_store = HnswLib.from_texts( @@ -85,3 +83,22 @@ def test_sim_search_with_score(metric, tmp_path) -> None: assert out_doc == Document(page_content="foo") assert np.isclose(out_score, 0.0, atol=1.e-6) + +@pytest.mark.parametrize('metric', ['cosine', 'l2']) +def test_max_marginal_relevance_search(metric, tmp_path) -> None: + """Test MRR search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = HnswLib.from_texts( + texts, + FakeEmbeddings(), + metadatas=metadatas, + dist_metric=metric, + work_dir=str(tmp_path), + n_dim=10, + ) + output = docsearch.max_marginal_relevance_search("foo", k=2, fetch_k=3) + assert output == [ + Document(page_content="foo", metadata={"page": 0}), + Document(page_content="bar", metadata={"page": 1}), + ] diff --git a/tests/integration_tests/vectorstores/test_in_memory.py b/tests/integration_tests/vectorstores/test_in_memory.py index 62834336c7c24b..e90c4ed312d213 100644 --- 
a/tests/integration_tests/vectorstores/test_in_memory.py +++ b/tests/integration_tests/vectorstores/test_in_memory.py @@ -20,7 +20,6 @@ def test_in_memory_vec_store_from_texts() -> None: def test_in_memory_vec_store_add_texts(tmp_path) -> None: """Test end to end construction and simple similarity search.""" docsearch = InMemory( - texts=[], embedding=FakeEmbeddings(), ) assert isinstance(docsearch, InMemory) @@ -65,7 +64,7 @@ def test_sim_search_with_score(metric) -> None: @pytest.mark.parametrize('metric', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) -def test_sim_search_by_vector(metric): +def test_sim_search_by_vector(metric) -> None: """Test end to end construction and similarity search by vector.""" texts = ["foo", "bar", "baz"] in_memory_vec_store = InMemory.from_texts( @@ -79,3 +78,20 @@ def test_sim_search_by_vector(metric): assert output == [Document(page_content="bar")] + +@pytest.mark.parametrize('metric', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) +def test_max_marginal_relevance_search(metric) -> None: + """Test MRR search.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = InMemory.from_texts( + texts, + FakeEmbeddings(), + metadatas=metadatas, + metric=metric + ) + output = docsearch.max_marginal_relevance_search("foo", k=2, fetch_k=3) + assert output == [ + Document(page_content="foo", metadata={"page": 0}), + Document(page_content="bar", metadata={"page": 1}), + ] From 30456bc3c30fca7a09cd115e07205a1db997159a Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 27 Apr 2023 15:39:30 +0200 Subject: [PATCH 04/19] Add more configurations for hnswlib Signed-off-by: anna-charlotte --- langchain/vectorstores/hnsw_lib.py | 54 ++++++++++++++++++- .../vectorstores/test_hnsw_lib.py | 51 ++++++++++++++++-- 2 files changed, 101 insertions(+), 4 deletions(-) diff --git a/langchain/vectorstores/hnsw_lib.py b/langchain/vectorstores/hnsw_lib.py index ddc3ec7b6102c9..42f5c902cb5af4 100644 
--- a/langchain/vectorstores/hnsw_lib.py +++ b/langchain/vectorstores/hnsw_lib.py @@ -24,6 +24,13 @@ def __init__( work_dir: str, n_dim: int, dist_metric: str = "cosine", + max_elements: int = 1024, + index: bool = True, + ef_construction: int = 200, + ef: int = 10, + M: int = 16, + allow_replace_deleted: bool = True, + num_threads: int = 1, ) -> None: """Initialize HnswLib store. @@ -33,6 +40,19 @@ def __init__( n_dim (int): dimension of an embedding. dist_metric (str): Distance metric for HnswLib can be one of: "cosine", "ip", and "l2". Defaults to "cosine". + max_elements (int): Maximum number of vectors that can be stored. + Defaults to 1024. + index (bool): Whether an index should be built for this field. + Defaults to True. + ef_construction (int): defines a construction time/accuracy trade-off. + Defaults to 200. + ef (int): parameter controlling query time/accuracy trade-off. + Defaults to 10. + M (int): parameter that defines the maximum number of outgoing + connections in the graph. Defaults to 16. + allow_replace_deleted (bool): Enables replacing of deleted elements + with new added ones. Defaults to True. + num_threads (int): Sets the number of cpu threads to use. Defaults to 1. """ _check_docarray_import() from docarray.index import HnswDocumentIndex @@ -45,7 +65,19 @@ def __init__( "Please install it with `pip install \"langchain[hnswlib]\"`." 
) - doc_cls = self._get_doc_cls({"dim": n_dim, "space": dist_metric}) + doc_cls = self._get_doc_cls( + { + "dim": n_dim, + "space": dist_metric, + "max_elements": max_elements, + "index": index, + "ef_construction": ef_construction, + "ef": ef, + "M": M, + "allow_replace_deleted": allow_replace_deleted, + "num_threads": num_threads, + } + ) doc_index = HnswDocumentIndex[doc_cls](work_dir=work_dir) super().__init__(doc_index, embedding) @@ -58,6 +90,13 @@ def from_texts( work_dir: str = None, n_dim: int = None, dist_metric: str = "cosine", + max_elements: int = 1024, + index: bool = True, + ef_construction: int = 200, + ef: int = 10, + M: int = 16, + allow_replace_deleted: bool = True, + num_threads: int = 1, ) -> HnswLib: """Create an HnswLib store and insert data. @@ -70,6 +109,19 @@ def from_texts( n_dim (int): dimension of an embedding. dist_metric (str): Distance metric for HnswLib can be one of: "cosine", "ip", and "l2". Defaults to "cosine". + max_elements (int): Maximum number of vectors that can be stored. + Defaults to 1024. + index (bool): Whether an index should be built for this field. + Defaults to True. + ef_construction (int): defines a construction time/accuracy trade-off. + Defaults to 200. + ef (int): parameter controlling query time/accuracy trade-off. + Defaults to 10. + M (int): parameter that defines the maximum number of outgoing + connections in the graph. Defaults to 16. + allow_replace_deleted (bool): Enables replacing of deleted elements + with new added ones. Defaults to True. + num_threads (int): Sets the number of cpu threads to use. Defaults to 1. 
Returns: HnswLib Vector Store diff --git a/tests/integration_tests/vectorstores/test_hnsw_lib.py b/tests/integration_tests/vectorstores/test_hnsw_lib.py index fc86321c20dd58..a4a6441eec7794 100644 --- a/tests/integration_tests/vectorstores/test_hnsw_lib.py +++ b/tests/integration_tests/vectorstores/test_hnsw_lib.py @@ -36,7 +36,7 @@ def test_hnswlib_vec_store_add_texts(tmp_path) -> None: assert docsearch.doc_index.num_docs() == 3 -@pytest.mark.parametrize('metric', ['cosine', 'ip', 'l2']) +@pytest.mark.parametrize('metric', ['cosine', 'l2']) def test_sim_search(metric, tmp_path) -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] @@ -45,12 +45,35 @@ def test_sim_search(metric, tmp_path) -> None: FakeEmbeddings(), work_dir=str(tmp_path), n_dim=10, + dist_metric=metric, + ) + output = hnswlib_vec_store.similarity_search("foo", k=1) + assert output == [Document(page_content="foo")] + + +@pytest.mark.parametrize('metric', ['cosine', 'l2']) +def test_sim_search_all_configurations(metric, tmp_path) -> None: + """Test end to end construction and simple similarity search.""" + texts = ["foo", "bar", "baz"] + hnswlib_vec_store = HnswLib.from_texts( + texts, + FakeEmbeddings(), + work_dir=str(tmp_path), + dist_metric=metric, + n_dim=10, + max_elements=8, + index=False, + ef_construction=300, + ef=20, + M=8, + allow_replace_deleted=False, + num_threads=2, ) output = hnswlib_vec_store.similarity_search("foo", k=1) assert output == [Document(page_content="foo")] -@pytest.mark.parametrize('metric', ['cosine', 'ip', 'l2']) +@pytest.mark.parametrize('metric', ['cosine', 'l2']) def test_sim_search_by_vector(metric, tmp_path) -> None: """Test end to end construction and similarity search by vector.""" texts = ["foo", "bar", "baz"] @@ -59,6 +82,7 @@ def test_sim_search_by_vector(metric, tmp_path) -> None: FakeEmbeddings(), work_dir=str(tmp_path), n_dim=10, + dist_metric=metric, ) embedding = [1.0] * 10 output = 
hnswlib_vec_store.similarity_search_by_vector(embedding, k=1) @@ -66,7 +90,7 @@ def test_sim_search_by_vector(metric, tmp_path) -> None: assert output == [Document(page_content="bar")] -@pytest.mark.parametrize('metric', ['cosine', 'ip', 'l2']) +@pytest.mark.parametrize('metric', ['cosine', 'l2']) def test_sim_search_with_score(metric, tmp_path) -> None: """Test end to end construction and similarity search with score.""" texts = ["foo", "bar", "baz"] @@ -75,6 +99,7 @@ def test_sim_search_with_score(metric, tmp_path) -> None: FakeEmbeddings(), work_dir=str(tmp_path), n_dim=10, + dist_metric=metric, ) output = hnswlib_vec_store.similarity_search_with_score("foo", k=1) assert len(output) == 1 @@ -84,6 +109,26 @@ def test_sim_search_with_score(metric, tmp_path) -> None: assert np.isclose(out_score, 0.0, atol=1.e-6) +def test_sim_search_with_score_for_ip_metric(tmp_path) -> None: + """ + Test end to end construction and similarity search with score for ip + (inner-product) metric. + """ + texts = ["foo", "bar", "baz"] + hnswlib_vec_store = HnswLib.from_texts( + texts, + FakeEmbeddings(), + work_dir=str(tmp_path), + n_dim=10, + dist_metric='ip', + ) + output = hnswlib_vec_store.similarity_search_with_score("foo", k=3) + assert len(output) == 3 + + for result in output: + assert result[1] == -8.0 + + @pytest.mark.parametrize('metric', ['cosine', 'l2']) def test_max_marginal_relevance_search(metric, tmp_path) -> None: """Test MRR search.""" From 5d2324a65fb4c8dc78dc7ac5c886d0325ebb3b57 Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Thu, 27 Apr 2023 16:08:05 +0200 Subject: [PATCH 05/19] refactor: rename InMemory to InMemoryExactSearch Signed-off-by: anna-charlotte --- langchain/vectorstores/__init__.py | 4 ++-- ...{in_memory.py => in_memory_exact_search.py} | 12 ++++++------ ...emory.py => test_in_memory_exact_search.py} | 18 +++++++++--------- 3 files changed, 17 insertions(+), 17 deletions(-) rename langchain/vectorstores/{in_memory.py => in_memory_exact_search.py} 
(86%) rename tests/integration_tests/vectorstores/{test_in_memory.py => test_in_memory_exact_search.py} (85%) diff --git a/langchain/vectorstores/__init__.py b/langchain/vectorstores/__init__.py index 5360f4b8f25c2c..ed3982ad7e1abb 100644 --- a/langchain/vectorstores/__init__.py +++ b/langchain/vectorstores/__init__.py @@ -8,7 +8,7 @@ from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch from langchain.vectorstores.faiss import FAISS from langchain.vectorstores.hnsw_lib import HnswLib -from langchain.vectorstores.in_memory import InMemory +from langchain.vectorstores.in_memory_exact_search import InMemoryExactSearch from langchain.vectorstores.milvus import Milvus from langchain.vectorstores.myscale import MyScale, MyScaleSettings from langchain.vectorstores.opensearch_vector_search import OpenSearchVectorSearch @@ -37,5 +37,5 @@ "SupabaseVectorStore", "AnalyticDB", "HnswLib", - "InMemory", + "InMemoryExactSearch", ] diff --git a/langchain/vectorstores/in_memory.py b/langchain/vectorstores/in_memory_exact_search.py similarity index 86% rename from langchain/vectorstores/in_memory.py rename to langchain/vectorstores/in_memory_exact_search.py index 07e1f49d82c170..bbaabe7e11c6bc 100644 --- a/langchain/vectorstores/in_memory.py +++ b/langchain/vectorstores/in_memory_exact_search.py @@ -11,8 +11,8 @@ ) -class InMemory(VecStoreFromDocIndex): - """Wrapper around in-memory storage. +class InMemoryExactSearch(VecStoreFromDocIndex): + """Wrapper around in-memory storage for exact search. To use it, you should have the ``docarray`` package with version >=0.31.0 installed. You can install it with `pip install "langchain[in_memory_store]"`. @@ -23,7 +23,7 @@ def __init__( embedding: Embeddings, metric: str = "cosine_sim", ) -> None: - """Initialize in-memory store. + """Initialize InMemoryExactSearch store. Args: embedding (Embeddings): Embedding function. 
@@ -45,8 +45,8 @@ def from_texts( embedding: Embeddings, metadatas: Optional[List[dict]] = None, metric: str = "cosine_sim", - ) -> InMemory: - """Create an in-memory store and insert data. + ) -> InMemoryExactSearch: + """Create an InMemoryExactSearch store and insert data. Args: texts (List[str]): Text data. @@ -58,7 +58,7 @@ def from_texts( Defaults to "cosine_sim". Returns: - InMemory Vector Store + InMemoryExactSearch Vector Store """ store = cls( embedding=embedding, diff --git a/tests/integration_tests/vectorstores/test_in_memory.py b/tests/integration_tests/vectorstores/test_in_memory_exact_search.py similarity index 85% rename from tests/integration_tests/vectorstores/test_in_memory.py rename to tests/integration_tests/vectorstores/test_in_memory_exact_search.py index e90c4ed312d213..7e0142ec8212ff 100644 --- a/tests/integration_tests/vectorstores/test_in_memory.py +++ b/tests/integration_tests/vectorstores/test_in_memory_exact_search.py @@ -2,27 +2,27 @@ import pytest from langchain.schema import Document -from langchain.vectorstores.in_memory import InMemory +from langchain.vectorstores.in_memory_exact_search import InMemoryExactSearch from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings def test_in_memory_vec_store_from_texts() -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] - docsearch = InMemory.from_texts( + docsearch = InMemoryExactSearch.from_texts( texts, FakeEmbeddings(), ) - assert isinstance(docsearch, InMemory) + assert isinstance(docsearch, InMemoryExactSearch) assert docsearch.doc_index.num_docs() == 3 def test_in_memory_vec_store_add_texts(tmp_path) -> None: """Test end to end construction and simple similarity search.""" - docsearch = InMemory( + docsearch = InMemoryExactSearch( embedding=FakeEmbeddings(), ) - assert isinstance(docsearch, InMemory) + assert isinstance(docsearch, InMemoryExactSearch) assert docsearch.doc_index.num_docs() == 0 texts = ["foo", 
"bar", "baz"] @@ -34,7 +34,7 @@ def test_in_memory_vec_store_add_texts(tmp_path) -> None: def test_sim_search(metric) -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] - in_memory_vec_store = InMemory.from_texts( + in_memory_vec_store = InMemoryExactSearch.from_texts( texts=texts, embedding=FakeEmbeddings(), metric=metric, @@ -48,7 +48,7 @@ def test_sim_search(metric) -> None: def test_sim_search_with_score(metric) -> None: """Test end to end construction and similarity search with score.""" texts = ["foo", "bar", "baz"] - in_memory_vec_store = InMemory.from_texts( + in_memory_vec_store = InMemoryExactSearch.from_texts( texts=texts, embedding=FakeEmbeddings(), metric=metric, @@ -67,7 +67,7 @@ def test_sim_search_with_score(metric) -> None: def test_sim_search_by_vector(metric) -> None: """Test end to end construction and similarity search by vector.""" texts = ["foo", "bar", "baz"] - in_memory_vec_store = InMemory.from_texts( + in_memory_vec_store = InMemoryExactSearch.from_texts( texts=texts, embedding=FakeEmbeddings(), metric=metric, @@ -84,7 +84,7 @@ def test_max_marginal_relevance_search(metric) -> None: """Test MRR search.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": i} for i in range(len(texts))] - docsearch = InMemory.from_texts( + docsearch = InMemoryExactSearch.from_texts( texts, FakeEmbeddings(), metadatas=metadatas, From ecc73b4bb948a9852237957944c5653f94d2b08f Mon Sep 17 00:00:00 2001 From: anna-charlotte Date: Fri, 28 Apr 2023 10:38:25 +0200 Subject: [PATCH 06/19] fix: change space default for hnswlib to l2 --- langchain/vectorstores/hnsw_lib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/langchain/vectorstores/hnsw_lib.py b/langchain/vectorstores/hnsw_lib.py index 42f5c902cb5af4..2857248f0f5aad 100644 --- a/langchain/vectorstores/hnsw_lib.py +++ b/langchain/vectorstores/hnsw_lib.py @@ -89,7 +89,7 @@ def from_texts( metadatas: Optional[List[dict]] = None, 
work_dir: str = None, n_dim: int = None, - dist_metric: str = "cosine", + dist_metric: str = "l2", max_elements: int = 1024, index: bool = True, ef_construction: int = 200, @@ -108,7 +108,7 @@ def from_texts( work_dir (str): path to the location where all the data will be stored. n_dim (int): dimension of an embedding. dist_metric (str): Distance metric for HnswLib can be one of: "cosine", - "ip", and "l2". Defaults to "cosine". + "ip", and "l2". Defaults to "l2". max_elements (int): Maximum number of vectors that can be stored. Defaults to 1024. index (bool): Whether an index should be built for this field. From 3eb3fdc44e62c6b27cb4218d564cbf8c01379819 Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Mon, 8 May 2023 13:42:53 +0200 Subject: [PATCH 07/19] feat: add example notebooks Signed-off-by: jupyterjazz --- .../vectorstores/examples/hsnwlib.ipynb | 234 ++++++++++++++++++ .../examples/inmemoryexactsearch.ipynb | 220 ++++++++++++++++ langchain/vectorstores/hnsw_lib.py | 2 +- poetry.lock | 20 +- pyproject.toml | 2 +- 5 files changed, 466 insertions(+), 12 deletions(-) create mode 100644 docs/modules/indexes/vectorstores/examples/hsnwlib.ipynb create mode 100644 docs/modules/indexes/vectorstores/examples/inmemoryexactsearch.ipynb diff --git a/docs/modules/indexes/vectorstores/examples/hsnwlib.ipynb b/docs/modules/indexes/vectorstores/examples/hsnwlib.ipynb new file mode 100644 index 00000000000000..fd1959cae73a8b --- /dev/null +++ b/docs/modules/indexes/vectorstores/examples/hsnwlib.ipynb @@ -0,0 +1,234 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2ce41f46-5711-4311-b04d-2fe233ac5b1b", + "metadata": {}, + "source": [ + "# HnswLib\n", + "\n", + ">[HnswLib](https://docs.docarray.org/user_guide/storing/index_hnswlib/) is a lightweight Document Index implementation provided by [Docarray](https://docs.docarray.org/) that runs fully locally and is best suited for small- to medium-sized datasets. 
It stores vectors on disk in [hnswlib](https://github.com/nmslib/hnswlib), and stores all other data in [SQLite](https://www.sqlite.org/index.html).\n", + "\n", + "This notebook shows how to use functionality related to the `HnswLib`." + ] + }, + { + "cell_type": "raw", + "id": "6db14de4-b417-4139-8236-2d4e909f2157", + "metadata": { + "tags": [] + }, + "source": [ + "!pip install \"docarray[hnswlib]\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "878f17df-100f-4854-9e87-472cf36d51f3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ········\n" + ] + } + ], + "source": [ + "# get a token: https://platform.openai.com/account/api-keys\n", + "\n", + "from getpass import getpass\n", + "\n", + "OPENAI_API_KEY = getpass()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "82d9984a-6031-403d-a977-6bc98d6be23a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b757afef-ef0a-465d-8e8a-9aadb9c32b88", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinaai/Desktop/langchain/venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import HnswLib\n", + "from langchain.document_loaders import TextLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "605e200e-e711-486b-b36e-cbe5dd2512d7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.document_loaders import TextLoader\n", + "loader = TextLoader('../../../state_of_the_union.txt')\n", + "documents = loader.load()\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)\n", + "\n", + "embeddings = OpenAIEmbeddings()\n", + "\n", + "db = HnswLib.from_documents(docs, embeddings, work_dir='hnswlib_store/', n_dim=1536)" + ] + }, + { + "cell_type": "markdown", + "id": "ed6f905b-4853-4a44-9730-614aa8e22b78", + "metadata": {}, + "source": [ + "## Similarity search" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4d7e742f-2002-449d-a10e-16046890906c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = db.similarity_search(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0da9e26f-1fc2-48e6-95a7-f692c853bbd3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 
\n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n" + ] + } + ], + "source": [ + "print(docs[0].page_content)" + ] + }, + { + "cell_type": "markdown", + "id": "3febb987-e903-416f-af26-6897d84c8d61", + "metadata": {}, + "source": [ + "## Similarity search with score" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "40764fdd-357d-475a-8152-5f1979d61a45", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "docs = db.similarity_search_with_score(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "a479fc46-b299-4330-89b9-e9b5a218ea03", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={}),\n", + " 0.3691615)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4d3d4e97-5d2b-4571-8ff9-e3f6b6778714", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import shutil\n", + "# delete the dir\n", + "shutil.rmtree('hnswlib_store')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/indexes/vectorstores/examples/inmemoryexactsearch.ipynb b/docs/modules/indexes/vectorstores/examples/inmemoryexactsearch.ipynb new file mode 100644 index 00000000000000..e0828919742a6e --- /dev/null +++ b/docs/modules/indexes/vectorstores/examples/inmemoryexactsearch.ipynb @@ -0,0 +1,220 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a3afefb0-7e99-4912-a222-c6b186da11af", + "metadata": {}, + "source": [ + "# InMemoryExactSearch\n", + "\n", + ">[InMemoryExactSearch](https://docs.docarray.org/user_guide/storing/index_in_memory/) is a document index provided by [Docarray](https://docs.docarray.org/) that stores documents in memory. It is a great starting point for small datasets, where you may not want to launch a database server.\n", + "\n", + "This notebook shows how to use functionality related to the `InMemoryExactSearch`." 
+ ] + }, + { + "cell_type": "raw", + "id": "2ac74987-65ae-4298-b439-8ad8dff96b36", + "metadata": { + "tags": [] + }, + "source": [ + "!pip install docarray" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "c6a40ad8-920e-4370-818d-3227e2f506ed", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ········\n" + ] + } + ], + "source": [ + "# get a token: https://platform.openai.com/account/api-keys\n", + "\n", + "from getpass import getpass\n", + "\n", + "OPENAI_API_KEY = getpass()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d2ada2f5-e30e-4aa7-863e-69339a28e825", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e49be085-ddf1-4028-8c0c-97836ce4a873", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinaai/Desktop/langchain/venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import InMemoryExactSearch\n", + "from langchain.document_loaders import TextLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "38222aee-adc5-44c2-913c-97977b394cf5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.document_loaders import TextLoader\n", + "loader = TextLoader('../../../state_of_the_union.txt')\n", + "documents = loader.load()\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)\n", + "\n", + "embeddings = OpenAIEmbeddings()\n", + "\n", + "db = InMemoryExactSearch.from_documents(docs, embeddings)" + ] + }, + { + "cell_type": "markdown", + "id": "efbb6684-3846-4332-a624-ddd4d75844c1", + "metadata": {}, + "source": [ + "## Similarity search" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "aa28a7f8-41d0-4299-84eb-91d1576e8a63", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = db.similarity_search(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1eb16d2a-b466-456a-b412-5e74bb8523dd", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 
\n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n" + ] + } + ], + "source": [ + "print(docs[0].page_content)" + ] + }, + { + "cell_type": "markdown", + "id": "43896697-f99e-47b6-9117-47a25e9afa9c", + "metadata": {}, + "source": [ + "## Similarity search with score" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "8e9eef05-1516-469a-ad36-880c69aef7a9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "docs = db.similarity_search_with_score(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "bd5fb0e4-2a94-4bb4-af8a-27327ecb1a7f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={}),\n", + " 0.8154190158347903)" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/vectorstores/hnsw_lib.py b/langchain/vectorstores/hnsw_lib.py index 2857248f0f5aad..78c686021356c6 100644 --- a/langchain/vectorstores/hnsw_lib.py +++ b/langchain/vectorstores/hnsw_lib.py @@ -127,7 +127,7 @@ def from_texts( HnswLib Vector Store """ if work_dir is None: - raise ValueError("`work_dir` parameter hs not been set.") + raise ValueError("`work_dir` parameter has not been set.") if n_dim is None: raise ValueError("`n_dim` parameter has not been set.") diff --git a/poetry.lock b/poetry.lock index 8693868f93aefd..0dcc62287e4ed1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1560,14 +1560,14 @@ wmi = ["wmi (>=1.5.1,<2.0.0)"] [[package]] name = "docarray" -version = "0.31.0.dev64" +version = "0.31.0" description = "The data structure for multimodal data" category = "main" optional = true python-versions = ">=3.7,<4.0" files = [ - {file = "docarray-0.31.0.dev64-py3-none-any.whl", hash = "sha256:87791cc77c2989a2f54d77c12400443f61fb245ca8b3d2af2d56feee04961128"}, - {file = "docarray-0.31.0.dev64.tar.gz", hash = "sha256:ea3ce816ac1a18d523e589dac962b042517befb2824787686030686ec3fdc328"}, + {file = "docarray-0.31.0-py3-none-any.whl", hash = "sha256:3783e9bdcf0d59b17499660e54577f4e3d202545998afca9306ebcc09cf0e14e"}, + {file = "docarray-0.31.0.tar.gz", hash = 
"sha256:a79d1ed70bd143b3e2a53ff90a62e4b3ce7231d5d237a2fab9b8311d7ae7d245"}, ] [package.dependencies] @@ -2862,14 +2862,14 @@ files = [ [[package]] name = "ipykernel" -version = "6.22.0" +version = "6.23.0" description = "IPython Kernel for Jupyter" category = "dev" optional = false python-versions = ">=3.8" files = [ - {file = "ipykernel-6.22.0-py3-none-any.whl", hash = "sha256:1ae6047c1277508933078163721bbb479c3e7292778a04b4bacf0874550977d6"}, - {file = "ipykernel-6.22.0.tar.gz", hash = "sha256:302558b81f1bc22dc259fb2a0c5c7cf2f4c0bdb21b50484348f7bafe7fb71421"}, + {file = "ipykernel-6.23.0-py3-none-any.whl", hash = "sha256:fc886f1dcdc0ec17f277e4d21fd071c857d381adcb04f3f3735d25325ca323c6"}, + {file = "ipykernel-6.23.0.tar.gz", hash = "sha256:bd6f487d9e2744c84f6e667d46462d7647a4c862e70e08282f05a52b9d4b705f"}, ] [package.dependencies] @@ -4414,14 +4414,14 @@ test = ["flaky", "ipykernel", "ipython", "ipywidgets", "nbconvert (>=7.0.0)", "p [[package]] name = "nbconvert" -version = "7.3.1" +version = "7.4.0" description = "Converting Jupyter Notebooks" category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "nbconvert-7.3.1-py3-none-any.whl", hash = "sha256:d2e95904666f1ff77d36105b9de4e0801726f93b862d5b28f69e93d99ad3b19c"}, - {file = "nbconvert-7.3.1.tar.gz", hash = "sha256:78685362b11d2e8058e70196fe83b09abed8df22d3e599cf271f4d39fdc48b9e"}, + {file = "nbconvert-7.4.0-py3-none-any.whl", hash = "sha256:af5064a9db524f9f12f4e8be7f0799524bd5b14c1adea37e34e83c95127cc818"}, + {file = "nbconvert-7.4.0.tar.gz", hash = "sha256:51b6c77b507b177b73f6729dba15676e42c4e92bcb00edc8cc982ee72e7d89d7"}, ] [package.dependencies] @@ -9756,4 +9756,4 @@ qdrant = ["qdrant-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "b77ce5f91654d8258753dcc527115580f40718537581d1b4727ef31b30ff4b06" +content-hash = "39a61e1f193e60698bf6d3572a06f42cbf6ac93c59982d67826ca99bffa98290" diff --git a/pyproject.toml b/pyproject.toml index 
95027ba9bd7e15..1bec1c4b2f9dd7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,7 +77,7 @@ pexpect = {version = "^4.8.0", optional = true} pyvespa = {version = "^0.33.0", optional = true} O365 = {version = "^2.0.26", optional = true} jq = {version = "^1.4.1", optional = true} -docarray = {version="^0.31.0.dev35", optional=true} +docarray = {version="^0.31.0", optional=true} protobuf = {version="3.19", optional=true} hnswlib = {version="^0.7.0", optional=true} From d38cb10306e46a22c121dd84c05444d023496251 Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Tue, 9 May 2023 10:29:36 +0200 Subject: [PATCH 08/19] refactor: rename classes Signed-off-by: jupyterjazz --- .../vectorstores/examples/lanecdb.ipynb | 2 +- langchain/vectorstores/__init__.py | 8 ++--- .../{hnsw_lib.py => docarray_hnsw_search.py} | 18 +++++----- ...search.py => docarray_in_memory_search.py} | 10 +++--- .../vector_store_from_doc_index.py | 2 +- .../vectorstores/test_hnsw_lib.py | 36 +++++++++---------- .../test_in_memory_exact_search.py | 18 +++++----- 7 files changed, 47 insertions(+), 47 deletions(-) rename langchain/vectorstores/{hnsw_lib.py => docarray_hnsw_search.py} (90%) rename langchain/vectorstores/{in_memory_exact_search.py => docarray_in_memory_search.py} (88%) diff --git a/docs/modules/indexes/vectorstores/examples/lanecdb.ipynb b/docs/modules/indexes/vectorstores/examples/lanecdb.ipynb index eb3bc6c8ad8184..7569e13742373b 100644 --- a/docs/modules/indexes/vectorstores/examples/lanecdb.ipynb +++ b/docs/modules/indexes/vectorstores/examples/lanecdb.ipynb @@ -206,7 +206,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.9.16" } }, "nbformat": 4, diff --git a/langchain/vectorstores/__init__.py b/langchain/vectorstores/__init__.py index 9349a6cc586ceb..242927ee660ab4 100644 --- a/langchain/vectorstores/__init__.py +++ b/langchain/vectorstores/__init__.py @@ -8,8 +8,8 @@ from 
langchain.vectorstores.elastic_vector_search import ElasticVectorSearch from langchain.vectorstores.faiss import FAISS from langchain.vectorstores.lancedb import LanceDB -from langchain.vectorstores.hnsw_lib import HnswLib -from langchain.vectorstores.in_memory_exact_search import InMemoryExactSearch +from langchain.vectorstores.docarray_hnsw_search import DocArrayHnswSearch +from langchain.vectorstores.docarray_in_memory_search import DocArrayInMemorySearch from langchain.vectorstores.milvus import Milvus from langchain.vectorstores.myscale import MyScale, MyScaleSettings from langchain.vectorstores.opensearch_vector_search import OpenSearchVectorSearch @@ -42,6 +42,6 @@ "AnalyticDB", "Tair", "LanceDB", - "HnswLib", - "InMemoryExactSearch", + "DocArrayHnswSearch", + "DocArrayInMemorySearch", ] diff --git a/langchain/vectorstores/hnsw_lib.py b/langchain/vectorstores/docarray_hnsw_search.py similarity index 90% rename from langchain/vectorstores/hnsw_lib.py rename to langchain/vectorstores/docarray_hnsw_search.py index 78c686021356c6..e1e0c9e8ebc50b 100644 --- a/langchain/vectorstores/hnsw_lib.py +++ b/langchain/vectorstores/docarray_hnsw_search.py @@ -1,4 +1,4 @@ -"""Wrapper around HnswLib store.""" +"""Wrapper around DocArrayHnswSearch store.""" from __future__ import annotations from typing import List, Optional, Type @@ -11,8 +11,8 @@ ) -class HnswLib(VecStoreFromDocIndex): - """Wrapper around HnswLib storage. +class DocArrayHnswSearch(VecStoreFromDocIndex): + """Wrapper around DocArrayHnswSearch storage. To use it, you should have the ``docarray[hnswlib]`` package with version >=0.31.0 installed. You can install it with `pip install "langchain[hnswlib]"`. @@ -32,13 +32,13 @@ def __init__( allow_replace_deleted: bool = True, num_threads: int = 1, ) -> None: - """Initialize HnswLib store. + """Initialize DocArrayHnswSearch store. Args: embedding (Embeddings): Embedding function. work_dir (str): path to the location where all the data will be stored. 
n_dim (int): dimension of an embedding. - dist_metric (str): Distance metric for HnswLib can be one of: "cosine", + dist_metric (str): Distance metric for DocArrayHnswSearch can be one of: "cosine", "ip", and "l2". Defaults to "cosine". max_elements (int): Maximum number of vectors that can be stored. Defaults to 1024. @@ -97,8 +97,8 @@ def from_texts( M: int = 16, allow_replace_deleted: bool = True, num_threads: int = 1, - ) -> HnswLib: - """Create an HnswLib store and insert data. + ) -> DocArrayHnswSearch: + """Create an DocArrayHnswSearch store and insert data. Args: texts (List[str]): Text data. @@ -107,7 +107,7 @@ def from_texts( Defaults to None. work_dir (str): path to the location where all the data will be stored. n_dim (int): dimension of an embedding. - dist_metric (str): Distance metric for HnswLib can be one of: "cosine", + dist_metric (str): Distance metric for DocArrayHnswSearch can be one of: "cosine", "ip", and "l2". Defaults to "l2". max_elements (int): Maximum number of vectors that can be stored. Defaults to 1024. @@ -124,7 +124,7 @@ def from_texts( num_threads (int): Sets the number of cpu threads to use. Defaults to 1. Returns: - HnswLib Vector Store + DocArrayHnswSearch Vector Store """ if work_dir is None: raise ValueError("`work_dir` parameter has not been set.") diff --git a/langchain/vectorstores/in_memory_exact_search.py b/langchain/vectorstores/docarray_in_memory_search.py similarity index 88% rename from langchain/vectorstores/in_memory_exact_search.py rename to langchain/vectorstores/docarray_in_memory_search.py index bbaabe7e11c6bc..1bc6cbe35278b3 100644 --- a/langchain/vectorstores/in_memory_exact_search.py +++ b/langchain/vectorstores/docarray_in_memory_search.py @@ -11,7 +11,7 @@ ) -class InMemoryExactSearch(VecStoreFromDocIndex): +class DocArrayInMemorySearch(VecStoreFromDocIndex): """Wrapper around in-memory storage for exact search. To use it, you should have the ``docarray`` package with version >=0.31.0 installed. 
@@ -23,7 +23,7 @@ def __init__( embedding: Embeddings, metric: str = "cosine_sim", ) -> None: - """Initialize InMemoryExactSearch store. + """Initialize DocArrayInMemorySearch store. Args: embedding (Embeddings): Embedding function. @@ -45,8 +45,8 @@ def from_texts( embedding: Embeddings, metadatas: Optional[List[dict]] = None, metric: str = "cosine_sim", - ) -> InMemoryExactSearch: - """Create an InMemoryExactSearch store and insert data. + ) -> DocArrayInMemorySearch: + """Create an DocArrayInMemorySearch store and insert data. Args: texts (List[str]): Text data. @@ -58,7 +58,7 @@ def from_texts( Defaults to "cosine_sim". Returns: - InMemoryExactSearch Vector Store + DocArrayInMemorySearch Vector Store """ store = cls( embedding=embedding, diff --git a/langchain/vectorstores/vector_store_from_doc_index.py b/langchain/vectorstores/vector_store_from_doc_index.py index a471bfe1cd7038..8b76a750f8193d 100644 --- a/langchain/vectorstores/vector_store_from_doc_index.py +++ b/langchain/vectorstores/vector_store_from_doc_index.py @@ -23,7 +23,7 @@ def _check_docarray_import() -> None: da_version = docarray.__version__.split(".") if int(da_version[0]) == 0 and int(da_version[1]) <= 30: raise ValueError( - f"To use the HnswLib VectorStore the docarray version >=0.31.0 is expected, " + f"To use the DocArrayHnswSearch VectorStore the docarray version >=0.31.0 is expected, " f"received: {docarray.__version__}." f"To upgrade, please run: `pip install -U docarray`." 
) diff --git a/tests/integration_tests/vectorstores/test_hnsw_lib.py b/tests/integration_tests/vectorstores/test_hnsw_lib.py index a4a6441eec7794..c2ec6e11eaa187 100644 --- a/tests/integration_tests/vectorstores/test_hnsw_lib.py +++ b/tests/integration_tests/vectorstores/test_hnsw_lib.py @@ -2,33 +2,33 @@ import pytest from langchain.schema import Document -from langchain.vectorstores.hnsw_lib import HnswLib +from langchain.vectorstores.docarray_hnsw_search import DocArrayHnswSearch from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings -def test_hnswlib_vec_store_from_texts(tmp_path) -> None: +def test_docarray_hnsw_search_vec_store_from_texts(tmp_path) -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] - docsearch = HnswLib.from_texts( + docsearch = DocArrayHnswSearch.from_texts( texts, FakeEmbeddings(), work_dir=str(tmp_path), n_dim=10, dist_metric='cosine', ) - assert isinstance(docsearch, HnswLib) + assert isinstance(docsearch, DocArrayHnswSearch) assert docsearch.doc_index.num_docs() == 3 -def test_hnswlib_vec_store_add_texts(tmp_path) -> None: +def test_docarray_hnsw_search_vec_store_add_texts(tmp_path) -> None: """Test end to end construction and simple similarity search.""" - docsearch = HnswLib( + docsearch = DocArrayHnswSearch( work_dir=str(tmp_path), n_dim=10, embedding=FakeEmbeddings(), dist_metric='cosine', ) - assert isinstance(docsearch, HnswLib) + assert isinstance(docsearch, DocArrayHnswSearch) assert docsearch.doc_index.num_docs() == 0 texts = ["foo", "bar", "baz"] @@ -40,14 +40,14 @@ def test_hnswlib_vec_store_add_texts(tmp_path) -> None: def test_sim_search(metric, tmp_path) -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] - hnswlib_vec_store = HnswLib.from_texts( + hnsw_vec_store = DocArrayHnswSearch.from_texts( texts, FakeEmbeddings(), work_dir=str(tmp_path), n_dim=10, dist_metric=metric, ) - output = 
hnswlib_vec_store.similarity_search("foo", k=1) + output = hnsw_vec_store.similarity_search("foo", k=1) assert output == [Document(page_content="foo")] @@ -55,7 +55,7 @@ def test_sim_search(metric, tmp_path) -> None: def test_sim_search_all_configurations(metric, tmp_path) -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] - hnswlib_vec_store = HnswLib.from_texts( + hnsw_vec_store = DocArrayHnswSearch.from_texts( texts, FakeEmbeddings(), work_dir=str(tmp_path), @@ -69,7 +69,7 @@ def test_sim_search_all_configurations(metric, tmp_path) -> None: allow_replace_deleted=False, num_threads=2, ) - output = hnswlib_vec_store.similarity_search("foo", k=1) + output = hnsw_vec_store.similarity_search("foo", k=1) assert output == [Document(page_content="foo")] @@ -77,7 +77,7 @@ def test_sim_search_all_configurations(metric, tmp_path) -> None: def test_sim_search_by_vector(metric, tmp_path) -> None: """Test end to end construction and similarity search by vector.""" texts = ["foo", "bar", "baz"] - hnswlib_vec_store = HnswLib.from_texts( + hnsw_vec_store = DocArrayHnswSearch.from_texts( texts, FakeEmbeddings(), work_dir=str(tmp_path), @@ -85,7 +85,7 @@ def test_sim_search_by_vector(metric, tmp_path) -> None: dist_metric=metric, ) embedding = [1.0] * 10 - output = hnswlib_vec_store.similarity_search_by_vector(embedding, k=1) + output = hnsw_vec_store.similarity_search_by_vector(embedding, k=1) assert output == [Document(page_content="bar")] @@ -94,14 +94,14 @@ def test_sim_search_by_vector(metric, tmp_path) -> None: def test_sim_search_with_score(metric, tmp_path) -> None: """Test end to end construction and similarity search with score.""" texts = ["foo", "bar", "baz"] - hnswlib_vec_store = HnswLib.from_texts( + hnsw_vec_store = DocArrayHnswSearch.from_texts( texts, FakeEmbeddings(), work_dir=str(tmp_path), n_dim=10, dist_metric=metric, ) - output = hnswlib_vec_store.similarity_search_with_score("foo", k=1) + output = 
hnsw_vec_store.similarity_search_with_score("foo", k=1) assert len(output) == 1 out_doc, out_score = output[0] @@ -115,14 +115,14 @@ def test_sim_search_with_score_for_ip_metric(tmp_path) -> None: (inner-product) metric. """ texts = ["foo", "bar", "baz"] - hnswlib_vec_store = HnswLib.from_texts( + hnsw_vec_store = DocArrayHnswSearch.from_texts( texts, FakeEmbeddings(), work_dir=str(tmp_path), n_dim=10, dist_metric='ip', ) - output = hnswlib_vec_store.similarity_search_with_score("foo", k=3) + output = hnsw_vec_store.similarity_search_with_score("foo", k=3) assert len(output) == 3 for result in output: @@ -134,7 +134,7 @@ def test_max_marginal_relevance_search(metric, tmp_path) -> None: """Test MRR search.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": i} for i in range(len(texts))] - docsearch = HnswLib.from_texts( + docsearch = DocArrayHnswSearch.from_texts( texts, FakeEmbeddings(), metadatas=metadatas, diff --git a/tests/integration_tests/vectorstores/test_in_memory_exact_search.py b/tests/integration_tests/vectorstores/test_in_memory_exact_search.py index 7e0142ec8212ff..dd3b8c86fbc169 100644 --- a/tests/integration_tests/vectorstores/test_in_memory_exact_search.py +++ b/tests/integration_tests/vectorstores/test_in_memory_exact_search.py @@ -2,27 +2,27 @@ import pytest from langchain.schema import Document -from langchain.vectorstores.in_memory_exact_search import InMemoryExactSearch +from langchain.vectorstores.docarray_in_memory_search import DocArrayInMemorySearch from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings def test_in_memory_vec_store_from_texts() -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] - docsearch = InMemoryExactSearch.from_texts( + docsearch = DocArrayInMemorySearch.from_texts( texts, FakeEmbeddings(), ) - assert isinstance(docsearch, InMemoryExactSearch) + assert isinstance(docsearch, DocArrayInMemorySearch) assert docsearch.doc_index.num_docs() == 
3 def test_in_memory_vec_store_add_texts(tmp_path) -> None: """Test end to end construction and simple similarity search.""" - docsearch = InMemoryExactSearch( + docsearch = DocArrayInMemorySearch( embedding=FakeEmbeddings(), ) - assert isinstance(docsearch, InMemoryExactSearch) + assert isinstance(docsearch, DocArrayInMemorySearch) assert docsearch.doc_index.num_docs() == 0 texts = ["foo", "bar", "baz"] @@ -34,7 +34,7 @@ def test_in_memory_vec_store_add_texts(tmp_path) -> None: def test_sim_search(metric) -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] - in_memory_vec_store = InMemoryExactSearch.from_texts( + in_memory_vec_store = DocArrayInMemorySearch.from_texts( texts=texts, embedding=FakeEmbeddings(), metric=metric, @@ -48,7 +48,7 @@ def test_sim_search(metric) -> None: def test_sim_search_with_score(metric) -> None: """Test end to end construction and similarity search with score.""" texts = ["foo", "bar", "baz"] - in_memory_vec_store = InMemoryExactSearch.from_texts( + in_memory_vec_store = DocArrayInMemorySearch.from_texts( texts=texts, embedding=FakeEmbeddings(), metric=metric, @@ -67,7 +67,7 @@ def test_sim_search_with_score(metric) -> None: def test_sim_search_by_vector(metric) -> None: """Test end to end construction and similarity search by vector.""" texts = ["foo", "bar", "baz"] - in_memory_vec_store = InMemoryExactSearch.from_texts( + in_memory_vec_store = DocArrayInMemorySearch.from_texts( texts=texts, embedding=FakeEmbeddings(), metric=metric, @@ -84,7 +84,7 @@ def test_max_marginal_relevance_search(metric) -> None: """Test MRR search.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": i} for i in range(len(texts))] - docsearch = InMemoryExactSearch.from_texts( + docsearch = DocArrayInMemorySearch.from_texts( texts, FakeEmbeddings(), metadatas=metadatas, From 8c0a611b8d04c12eefe55e78b40f63462abec84c Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Tue, 9 May 2023 10:36:51 +0200 Subject: 
[PATCH 09/19] refactor: modify notebooks Signed-off-by: jupyterjazz --- ...nwlib.ipynb => docarray_hnsw_search.ipynb} | 38 +++++++++-------- ....ipynb => docarray_in_memory_search.ipynb} | 42 ++++++++++++------- .../vectorstores/examples/lanecdb.ipynb | 2 +- .../vectorstores/docarray_hnsw_search.py | 2 +- 4 files changed, 48 insertions(+), 36 deletions(-) rename docs/modules/indexes/vectorstores/examples/{hsnwlib.ipynb => docarray_hnsw_search.ipynb} (88%) rename docs/modules/indexes/vectorstores/examples/{inmemoryexactsearch.ipynb => docarray_in_memory_search.ipynb} (87%) diff --git a/docs/modules/indexes/vectorstores/examples/hsnwlib.ipynb b/docs/modules/indexes/vectorstores/examples/docarray_hnsw_search.ipynb similarity index 88% rename from docs/modules/indexes/vectorstores/examples/hsnwlib.ipynb rename to docs/modules/indexes/vectorstores/examples/docarray_hnsw_search.ipynb index fd1959cae73a8b..436970611c965f 100644 --- a/docs/modules/indexes/vectorstores/examples/hsnwlib.ipynb +++ b/docs/modules/indexes/vectorstores/examples/docarray_hnsw_search.ipynb @@ -5,26 +5,28 @@ "id": "2ce41f46-5711-4311-b04d-2fe233ac5b1b", "metadata": {}, "source": [ - "# HnswLib\n", + "# DocArrayHnswSearch\n", "\n", - ">[HnswLib](https://docs.docarray.org/user_guide/storing/index_hnswlib/) is a lightweight Document Index implementation provided by [Docarray](https://docs.docarray.org/) that runs fully locally and is best suited for small- to medium-sized datasets. It stores vectors on disk in [hnswlib](https://github.com/nmslib/hnswlib), and stores all other data in [SQLite](https://www.sqlite.org/index.html).\n", + ">[DocArrayHnswSearch](https://docs.docarray.org/user_guide/storing/index_hnswlib/) is a lightweight Document Index implementation provided by [Docarray](https://docs.docarray.org/) that runs fully locally and is best suited for small- to medium-sized datasets. 
It stores vectors on disk in [hnswlib](https://github.com/nmslib/hnswlib), and stores all other data in [SQLite](https://www.sqlite.org/index.html).\n", "\n", - "This notebook shows how to use functionality related to the `HnswLib`." + "This notebook shows how to use functionality related to the `DocArrayHnswSearch`." ] }, { - "cell_type": "raw", - "id": "6db14de4-b417-4139-8236-2d4e909f2157", + "cell_type": "code", + "execution_count": null, + "id": "8ce1b8cb-dbf0-40c3-99ee-04f28143331b", "metadata": { "tags": [] }, + "outputs": [], "source": [ "!pip install \"docarray[hnswlib]\"" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "878f17df-100f-4854-9e87-472cf36d51f3", "metadata": { "tags": [] @@ -48,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "82d9984a-6031-403d-a977-6bc98d6be23a", "metadata": { "tags": [] @@ -62,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "b757afef-ef0a-465d-8e8a-9aadb9c32b88", "metadata": { "tags": [] @@ -80,13 +82,13 @@ "source": [ "from langchain.embeddings.openai import OpenAIEmbeddings\n", "from langchain.text_splitter import CharacterTextSplitter\n", - "from langchain.vectorstores import HnswLib\n", + "from langchain.vectorstores import DocArrayHnswSearch\n", "from langchain.document_loaders import TextLoader" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "605e200e-e711-486b-b36e-cbe5dd2512d7", "metadata": { "tags": [] @@ -101,7 +103,7 @@ "\n", "embeddings = OpenAIEmbeddings()\n", "\n", - "db = HnswLib.from_documents(docs, embeddings, work_dir='hnswlib_store/', n_dim=1536)" + "db = DocArrayHnswSearch.from_documents(docs, embeddings, work_dir='hnswlib_store/', n_dim=1536)" ] }, { @@ -114,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "4d7e742f-2002-449d-a10e-16046890906c", "metadata": { "tags": [] @@ -127,7 +129,7 @@ }, { "cell_type": 
"code", - "execution_count": 7, + "execution_count": 6, "id": "0da9e26f-1fc2-48e6-95a7-f692c853bbd3", "metadata": { "tags": [] @@ -161,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "40764fdd-357d-475a-8152-5f1979d61a45", "metadata": { "tags": [] @@ -173,7 +175,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "a479fc46-b299-4330-89b9-e9b5a218ea03", "metadata": { "tags": [] @@ -183,10 +185,10 @@ "data": { "text/plain": [ "(Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={}),\n", - " 0.3691615)" + " 0.36962226)" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -197,7 +199,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "id": "4d3d4e97-5d2b-4571-8ff9-e3f6b6778714", "metadata": { "tags": [] diff --git a/docs/modules/indexes/vectorstores/examples/inmemoryexactsearch.ipynb b/docs/modules/indexes/vectorstores/examples/docarray_in_memory_search.ipynb similarity index 87% rename from docs/modules/indexes/vectorstores/examples/inmemoryexactsearch.ipynb rename to docs/modules/indexes/vectorstores/examples/docarray_in_memory_search.ipynb index e0828919742a6e..2b2c134f59e0ad 100644 --- a/docs/modules/indexes/vectorstores/examples/inmemoryexactsearch.ipynb +++ b/docs/modules/indexes/vectorstores/examples/docarray_in_memory_search.ipynb @@ -5,26 +5,28 @@ "id": "a3afefb0-7e99-4912-a222-c6b186da11af", "metadata": {}, "source": [ - "# InMemoryExactSearch\n", + "# DocArrayInMemorySearch\n", "\n", - ">[InMemoryExactSearch](https://docs.docarray.org/user_guide/storing/index_in_memory/) is a document index provided by [Docarray](https://docs.docarray.org/) that stores documents in memory. It is a great starting point for small datasets, where you may not want to launch a database server.\n", + ">[DocArrayInMemorySearch](https://docs.docarray.org/user_guide/storing/index_in_memory/) is a document index provided by [Docarray](https://docs.docarray.org/) that stores documents in memory. It is a great starting point for small datasets, where you may not want to launch a database server.\n", "\n", - "This notebook shows how to use functionality related to the `InMemoryExactSearch`." + "This notebook shows how to use functionality related to the `DocArrayInMemorySearch`." 
] }, { - "cell_type": "raw", - "id": "2ac74987-65ae-4298-b439-8ad8dff96b36", + "cell_type": "code", + "execution_count": null, + "id": "7cd7391f-7759-4a21-952a-2ec972d818c6", "metadata": { "tags": [] }, + "outputs": [], "source": [ "!pip install docarray" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "c6a40ad8-920e-4370-818d-3227e2f506ed", "metadata": { "tags": [] @@ -48,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "d2ada2f5-e30e-4aa7-863e-69339a28e825", "metadata": { "tags": [] @@ -62,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "e49be085-ddf1-4028-8c0c-97836ce4a873", "metadata": { "tags": [] @@ -80,13 +82,13 @@ "source": [ "from langchain.embeddings.openai import OpenAIEmbeddings\n", "from langchain.text_splitter import CharacterTextSplitter\n", - "from langchain.vectorstores import InMemoryExactSearch\n", + "from langchain.vectorstores import DocArrayInMemorySearch\n", "from langchain.document_loaders import TextLoader" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "38222aee-adc5-44c2-913c-97977b394cf5", "metadata": { "tags": [] @@ -101,7 +103,7 @@ "\n", "embeddings = OpenAIEmbeddings()\n", "\n", - "db = InMemoryExactSearch.from_documents(docs, embeddings)" + "db = DocArrayInMemorySearch.from_documents(docs, embeddings)" ] }, { @@ -114,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "aa28a7f8-41d0-4299-84eb-91d1576e8a63", "metadata": { "tags": [] @@ -127,7 +129,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "1eb16d2a-b466-456a-b412-5e74bb8523dd", "metadata": { "tags": [] @@ -161,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "8e9eef05-1516-469a-ad36-880c69aef7a9", "metadata": { "tags": [] @@ -173,7 +175,7 @@ }, { "cell_type": "code", - "execution_count": 9, + 
"execution_count": 8, "id": "bd5fb0e4-2a94-4bb4-af8a-27327ecb1a7f", "metadata": { "tags": [] @@ -186,7 +188,7 @@ " 0.8154190158347903)" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -194,6 +196,14 @@ "source": [ "docs[0]" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e5da522-ef0e-4a59-91ea-89e563f7b825", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/docs/modules/indexes/vectorstores/examples/lanecdb.ipynb b/docs/modules/indexes/vectorstores/examples/lanecdb.ipynb index 7569e13742373b..eb3bc6c8ad8184 100644 --- a/docs/modules/indexes/vectorstores/examples/lanecdb.ipynb +++ b/docs/modules/indexes/vectorstores/examples/lanecdb.ipynb @@ -206,7 +206,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/langchain/vectorstores/docarray_hnsw_search.py b/langchain/vectorstores/docarray_hnsw_search.py index e1e0c9e8ebc50b..6aebaedbb05630 100644 --- a/langchain/vectorstores/docarray_hnsw_search.py +++ b/langchain/vectorstores/docarray_hnsw_search.py @@ -12,7 +12,7 @@ class DocArrayHnswSearch(VecStoreFromDocIndex): - """Wrapper around DocArrayHnswSearch storage. + """Wrapper around HnswLib storage. To use it, you should have the ``docarray[hnswlib]`` package with version >=0.31.0 installed. You can install it with `pip install "langchain[hnswlib]"`. 
From b920c15ebf5de4617c3914171cdfccbfe1ff4c2c Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Tue, 9 May 2023 12:43:39 +0200 Subject: [PATCH 10/19] refactor: naming adjustments --- langchain/vectorstores/docarray_hnsw_search.py | 2 +- .../{test_hnsw_lib.py => test_docarray_hnsw_search.py} | 0 ...memory_exact_search.py => test_docarray_in_memory_search.py} | 0 3 files changed, 1 insertion(+), 1 deletion(-) rename tests/integration_tests/vectorstores/{test_hnsw_lib.py => test_docarray_hnsw_search.py} (100%) rename tests/integration_tests/vectorstores/{test_in_memory_exact_search.py => test_docarray_in_memory_search.py} (100%) diff --git a/langchain/vectorstores/docarray_hnsw_search.py b/langchain/vectorstores/docarray_hnsw_search.py index 6aebaedbb05630..19f11a79a11bc0 100644 --- a/langchain/vectorstores/docarray_hnsw_search.py +++ b/langchain/vectorstores/docarray_hnsw_search.py @@ -1,4 +1,4 @@ -"""Wrapper around DocArrayHnswSearch store.""" +"""Wrapper around Hnswlib store.""" from __future__ import annotations from typing import List, Optional, Type diff --git a/tests/integration_tests/vectorstores/test_hnsw_lib.py b/tests/integration_tests/vectorstores/test_docarray_hnsw_search.py similarity index 100% rename from tests/integration_tests/vectorstores/test_hnsw_lib.py rename to tests/integration_tests/vectorstores/test_docarray_hnsw_search.py diff --git a/tests/integration_tests/vectorstores/test_in_memory_exact_search.py b/tests/integration_tests/vectorstores/test_docarray_in_memory_search.py similarity index 100% rename from tests/integration_tests/vectorstores/test_in_memory_exact_search.py rename to tests/integration_tests/vectorstores/test_docarray_in_memory_search.py From d8df4bbcfd81b0aa8f9faa654ec6a23104ee34f2 Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Wed, 10 May 2023 14:18:33 +0200 Subject: [PATCH 11/19] refactor: requested changes Signed-off-by: jupyterjazz --- langchain/vectorstores/__init__.py | 3 +- langchain/vectorstores/docarray/__init__.py 
| 7 + .../base.py} | 51 ++++--- langchain/vectorstores/docarray/hnsw.py | 81 ++++++++++ .../in_memory.py} | 7 +- .../vectorstores/docarray_hnsw_search.py | 141 ------------------ .../vectorstores/test_docarray_hnsw_search.py | 20 +-- .../test_docarray_in_memory_search.py | 19 +-- 8 files changed, 134 insertions(+), 195 deletions(-) create mode 100644 langchain/vectorstores/docarray/__init__.py rename langchain/vectorstores/{vector_store_from_doc_index.py => docarray/base.py} (84%) create mode 100644 langchain/vectorstores/docarray/hnsw.py rename langchain/vectorstores/{docarray_in_memory_search.py => docarray/in_memory.py} (92%) delete mode 100644 langchain/vectorstores/docarray_hnsw_search.py diff --git a/langchain/vectorstores/__init__.py b/langchain/vectorstores/__init__.py index 242927ee660ab4..ade924590c2b59 100644 --- a/langchain/vectorstores/__init__.py +++ b/langchain/vectorstores/__init__.py @@ -5,11 +5,10 @@ from langchain.vectorstores.base import VectorStore from langchain.vectorstores.chroma import Chroma from langchain.vectorstores.deeplake import DeepLake +from langchain.vectorstores.docarray import DocArrayHnswSearch, DocArrayInMemorySearch from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch from langchain.vectorstores.faiss import FAISS from langchain.vectorstores.lancedb import LanceDB -from langchain.vectorstores.docarray_hnsw_search import DocArrayHnswSearch -from langchain.vectorstores.docarray_in_memory_search import DocArrayInMemorySearch from langchain.vectorstores.milvus import Milvus from langchain.vectorstores.myscale import MyScale, MyScaleSettings from langchain.vectorstores.opensearch_vector_search import OpenSearchVectorSearch diff --git a/langchain/vectorstores/docarray/__init__.py b/langchain/vectorstores/docarray/__init__.py new file mode 100644 index 00000000000000..be3d5bde6588ed --- /dev/null +++ b/langchain/vectorstores/docarray/__init__.py @@ -0,0 +1,7 @@ +from langchain.vectorstores.docarray.hnsw 
import DocArrayHnswSearch +from langchain.vectorstores.docarray.in_memory import DocArrayInMemorySearch + +__all__ = [ + "DocArrayHnswSearch", + "DocArrayInMemorySearch", +] diff --git a/langchain/vectorstores/vector_store_from_doc_index.py b/langchain/vectorstores/docarray/base.py similarity index 84% rename from langchain/vectorstores/vector_store_from_doc_index.py rename to langchain/vectorstores/docarray/base.py index 8b76a750f8193d..421e0a3dc05771 100644 --- a/langchain/vectorstores/vector_store_from_doc_index.py +++ b/langchain/vectorstores/docarray/base.py @@ -1,20 +1,20 @@ -from operator import itemgetter -from typing import Any, Dict, Iterable, List, Optional, Tuple, Type +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type -try: - from docarray import BaseDoc - from docarray.index.abstract import BaseDocIndex - from docarray.typing import NdArray -except ImportError: - BaseDoc = None - BaseDocIndex = None - NdArray = None +import numpy as np +from pydantic import Field from langchain.embeddings.base import Embeddings from langchain.schema import Document from langchain.vectorstores import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance +if TYPE_CHECKING: + from docarray import BaseDoc + from docarray.index.abstract import BaseDocIndex +else: + BaseDoc = object + BaseDocIndex = object + def _check_docarray_import() -> None: try: @@ -30,15 +30,11 @@ def _check_docarray_import() -> None: except ImportError: raise ImportError( "Could not import docarray python package. " - "Please install it with `pip install \"langchain[docarray]\"`." + 'Please install it with `pip install "langchain[docarray]"`.' 
) -class VecStoreFromDocIndex(VectorStore): - doc_index: BaseDocIndex - doc_cls: Type[BaseDoc] - embedding: Embeddings - +class DocArrayIndex(VectorStore): def __init__( self, doc_index: BaseDocIndex, @@ -53,7 +49,7 @@ def __init__( def _get_doc_cls(embeddings_params: Dict[str, Any]) -> Type[BaseDoc]: """Get docarray Document class describing the schema of DocIndex.""" from docarray import BaseDoc - from pydantic import Field + from docarray.typing import NdArray class DocArrayDoc(BaseDoc): text: Optional[str] @@ -77,12 +73,10 @@ def add_texts( Returns: List of ids from adding the texts into the vectorstore. """ - if metadatas is None: - metadatas = [{} for _ in range(len(list(texts)))] - ids: List[str] = [] embeddings = self.embedding.embed_documents(texts) - for t, m, e in zip(texts, metadatas, embeddings): + for i, (t, e) in enumerate(zip(texts, embeddings)): + m = metadatas[i] if metadatas else {} doc = self.doc_cls(text=t, embedding=e, metadata=m) self.doc_index.index([doc]) ids.append(str(doc.id)) @@ -106,7 +100,8 @@ def similarity_search_with_score( docs, scores = self.doc_index.find(query_doc, search_field="embedding", limit=k) result = [ - (Document(page_content=doc.text), score) for doc, score in zip(docs, scores) + (Document(page_content=doc.text, metadata=doc.metadata), score) + for doc, score in zip(docs, scores) ] return result @@ -122,8 +117,8 @@ def similarity_search( Returns: List of Documents most similar to the query. 
""" - results = self.similarity_search_with_score(query, k) - return list(map(itemgetter(0), results)) + results = self.similarity_search_with_score(query=query, k=k, **kwargs) + return [doc for doc, _ in results] def _similarity_search_with_relevance_scores( self, @@ -155,7 +150,9 @@ def similarity_search_by_vector( query_doc, search_field="embedding", limit=k ).documents - result = [Document(page_content=doc.text) for doc in docs] + result = [ + Document(page_content=doc.text, metadata=doc.metadata) for doc in docs + ] return result def max_marginal_relevance_search( @@ -181,7 +178,9 @@ def max_marginal_relevance_search( query_doc, search_field="embedding", limit=fetch_k ).documents - mmr_selected = maximal_marginal_relevance(query_embedding, docs.embedding, k=k) + mmr_selected = maximal_marginal_relevance( + np.array(query_embedding), docs.embedding, k=k + ) results = [ Document(page_content=docs[idx].text, metadata=docs[idx].metadata) for idx in mmr_selected diff --git a/langchain/vectorstores/docarray/hnsw.py b/langchain/vectorstores/docarray/hnsw.py new file mode 100644 index 00000000000000..81c7313286e284 --- /dev/null +++ b/langchain/vectorstores/docarray/hnsw.py @@ -0,0 +1,81 @@ +"""Wrapper around Hnswlib store.""" +from __future__ import annotations + +from typing import Any, List, Optional, Type + +from langchain.embeddings.base import Embeddings +from langchain.vectorstores.docarray.base import DocArrayIndex, _check_docarray_import + + +class DocArrayHnswSearch(DocArrayIndex): + """Wrapper around HnswLib storage. + + To use it, you should have the ``docarray[hnswlib]`` package with version >=0.31.0 installed. + You can install it with `pip install "langchain[hnswlib]"`. + """ + + def __init__( + self, embedding: Embeddings, work_dir: str, n_dim: int, **kwargs: Any + ) -> None: + """Initialize DocArrayHnswSearch store. + + Args: + embedding (Embeddings): Embedding function. + work_dir (str): path to the location where all the data will be stored. 
+ n_dim (int): dimension of an embedding. + **kwargs: Other keyword arguments to be passed to the _get_doc_cls method. + """ + _check_docarray_import() + from docarray.index import HnswDocumentIndex + + kwargs.setdefault("dist_metric", "cosine") + kwargs.setdefault("max_elements", 1024) + kwargs.setdefault("index", True) + kwargs.setdefault("ef_construction", 200) + kwargs.setdefault("ef", 10) + kwargs.setdefault("M", 16) + kwargs.setdefault("allow_replace_deleted", True) + kwargs.setdefault("num_threads", 1) + + doc_cls = self._get_doc_cls( + { + "dim": n_dim, + "space": kwargs["dist_metric"], + **{k: v for k, v in kwargs.items() if k != "dist_metric"}, + } + ) + doc_index = HnswDocumentIndex[doc_cls](work_dir=work_dir) + super().__init__(doc_index, embedding) + + @classmethod + def from_texts( + cls: Type[DocArrayHnswSearch], + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + work_dir: str = None, + n_dim: int = None, + **kwargs: Any, + ) -> DocArrayHnswSearch: + """Create an DocArrayHnswSearch store and insert data. + + Args: + texts (List[str]): Text data. + embedding (Embeddings): Embedding function. + metadatas (Optional[List[dict]]): Metadata for each text if it exists. + Defaults to None. + work_dir (str): path to the location where all the data will be stored. + n_dim (int): dimension of an embedding. + **kwargs: Other keyword arguments to be passed to the __init__ method. 
+ + Returns: + DocArrayHnswSearch Vector Store + """ + if work_dir is None: + raise ValueError("`work_dir` parameter has not been set.") + if n_dim is None: + raise ValueError("`n_dim` parameter has not been set.") + + store = cls(work_dir=work_dir, n_dim=n_dim, embedding=embedding, **kwargs) + store.add_texts(texts=texts, metadatas=metadatas) + return store diff --git a/langchain/vectorstores/docarray_in_memory_search.py b/langchain/vectorstores/docarray/in_memory.py similarity index 92% rename from langchain/vectorstores/docarray_in_memory_search.py rename to langchain/vectorstores/docarray/in_memory.py index 1bc6cbe35278b3..1e20cb3053ab3c 100644 --- a/langchain/vectorstores/docarray_in_memory_search.py +++ b/langchain/vectorstores/docarray/in_memory.py @@ -5,13 +5,10 @@ from langchain.embeddings.base import Embeddings from langchain.vectorstores.base import VST -from langchain.vectorstores.vector_store_from_doc_index import ( - VecStoreFromDocIndex, - _check_docarray_import, -) +from langchain.vectorstores.docarray.base import DocArrayIndex, _check_docarray_import -class DocArrayInMemorySearch(VecStoreFromDocIndex): +class DocArrayInMemorySearch(DocArrayIndex): """Wrapper around in-memory storage for exact search. To use it, you should have the ``docarray`` package with version >=0.31.0 installed. diff --git a/langchain/vectorstores/docarray_hnsw_search.py b/langchain/vectorstores/docarray_hnsw_search.py deleted file mode 100644 index 19f11a79a11bc0..00000000000000 --- a/langchain/vectorstores/docarray_hnsw_search.py +++ /dev/null @@ -1,141 +0,0 @@ -"""Wrapper around Hnswlib store.""" -from __future__ import annotations - -from typing import List, Optional, Type - -from langchain.embeddings.base import Embeddings -from langchain.vectorstores.base import VST -from langchain.vectorstores.vector_store_from_doc_index import ( - VecStoreFromDocIndex, - _check_docarray_import, -) - - -class DocArrayHnswSearch(VecStoreFromDocIndex): - """Wrapper around HnswLib storage. 
- - To use it, you should have the ``docarray[hnswlib]`` package with version >=0.31.0 installed. - You can install it with `pip install "langchain[hnswlib]"`. - """ - - def __init__( - self, - embedding: Embeddings, - work_dir: str, - n_dim: int, - dist_metric: str = "cosine", - max_elements: int = 1024, - index: bool = True, - ef_construction: int = 200, - ef: int = 10, - M: int = 16, - allow_replace_deleted: bool = True, - num_threads: int = 1, - ) -> None: - """Initialize DocArrayHnswSearch store. - - Args: - embedding (Embeddings): Embedding function. - work_dir (str): path to the location where all the data will be stored. - n_dim (int): dimension of an embedding. - dist_metric (str): Distance metric for DocArrayHnswSearch can be one of: "cosine", - "ip", and "l2". Defaults to "cosine". - max_elements (int): Maximum number of vectors that can be stored. - Defaults to 1024. - index (bool): Whether an index should be built for this field. - Defaults to True. - ef_construction (int): defines a construction time/accuracy trade-off. - Defaults to 200. - ef (int): parameter controlling query time/accuracy trade-off. - Defaults to 10. - M (int): parameter that defines the maximum number of outgoing - connections in the graph. Defaults to 16. - allow_replace_deleted (bool): Enables replacing of deleted elements - with new added ones. Defaults to True. - num_threads (int): Sets the number of cpu threads to use. Defaults to 1. - """ - _check_docarray_import() - from docarray.index import HnswDocumentIndex - - try: - import google.protobuf - except ImportError: - raise ImportError( - "Could not import all required packages. " - "Please install it with `pip install \"langchain[hnswlib]\"`." 
- ) - - doc_cls = self._get_doc_cls( - { - "dim": n_dim, - "space": dist_metric, - "max_elements": max_elements, - "index": index, - "ef_construction": ef_construction, - "ef": ef, - "M": M, - "allow_replace_deleted": allow_replace_deleted, - "num_threads": num_threads, - } - ) - doc_index = HnswDocumentIndex[doc_cls](work_dir=work_dir) - super().__init__(doc_index, embedding) - - @classmethod - def from_texts( - cls: Type[VST], - texts: List[str], - embedding: Embeddings, - metadatas: Optional[List[dict]] = None, - work_dir: str = None, - n_dim: int = None, - dist_metric: str = "l2", - max_elements: int = 1024, - index: bool = True, - ef_construction: int = 200, - ef: int = 10, - M: int = 16, - allow_replace_deleted: bool = True, - num_threads: int = 1, - ) -> DocArrayHnswSearch: - """Create an DocArrayHnswSearch store and insert data. - - Args: - texts (List[str]): Text data. - embedding (Embeddings): Embedding function. - metadatas (Optional[List[dict]]): Metadata for each text if it exists. - Defaults to None. - work_dir (str): path to the location where all the data will be stored. - n_dim (int): dimension of an embedding. - dist_metric (str): Distance metric for DocArrayHnswSearch can be one of: "cosine", - "ip", and "l2". Defaults to "l2". - max_elements (int): Maximum number of vectors that can be stored. - Defaults to 1024. - index (bool): Whether an index should be built for this field. - Defaults to True. - ef_construction (int): defines a construction time/accuracy trade-off. - Defaults to 200. - ef (int): parameter controlling query time/accuracy trade-off. - Defaults to 10. - M (int): parameter that defines the maximum number of outgoing - connections in the graph. Defaults to 16. - allow_replace_deleted (bool): Enables replacing of deleted elements - with new added ones. Defaults to True. - num_threads (int): Sets the number of cpu threads to use. Defaults to 1. 
- - Returns: - DocArrayHnswSearch Vector Store - """ - if work_dir is None: - raise ValueError("`work_dir` parameter has not been set.") - if n_dim is None: - raise ValueError("`n_dim` parameter has not been set.") - - store = cls( - work_dir=work_dir, - n_dim=n_dim, - embedding=embedding, - dist_metric=dist_metric, - ) - store.add_texts(texts=texts, metadatas=metadatas) - return store diff --git a/tests/integration_tests/vectorstores/test_docarray_hnsw_search.py b/tests/integration_tests/vectorstores/test_docarray_hnsw_search.py index c2ec6e11eaa187..f83e8dd94ef2ac 100644 --- a/tests/integration_tests/vectorstores/test_docarray_hnsw_search.py +++ b/tests/integration_tests/vectorstores/test_docarray_hnsw_search.py @@ -2,7 +2,7 @@ import pytest from langchain.schema import Document -from langchain.vectorstores.docarray_hnsw_search import DocArrayHnswSearch +from langchain.vectorstores.docarray import DocArrayHnswSearch from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings @@ -14,7 +14,7 @@ def test_docarray_hnsw_search_vec_store_from_texts(tmp_path) -> None: FakeEmbeddings(), work_dir=str(tmp_path), n_dim=10, - dist_metric='cosine', + dist_metric="cosine", ) assert isinstance(docsearch, DocArrayHnswSearch) assert docsearch.doc_index.num_docs() == 3 @@ -26,7 +26,7 @@ def test_docarray_hnsw_search_vec_store_add_texts(tmp_path) -> None: work_dir=str(tmp_path), n_dim=10, embedding=FakeEmbeddings(), - dist_metric='cosine', + dist_metric="cosine", ) assert isinstance(docsearch, DocArrayHnswSearch) assert docsearch.doc_index.num_docs() == 0 @@ -36,7 +36,7 @@ def test_docarray_hnsw_search_vec_store_add_texts(tmp_path) -> None: assert docsearch.doc_index.num_docs() == 3 -@pytest.mark.parametrize('metric', ['cosine', 'l2']) +@pytest.mark.parametrize("metric", ["cosine", "l2"]) def test_sim_search(metric, tmp_path) -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] @@ -51,7 +51,7 @@ def 
test_sim_search(metric, tmp_path) -> None: assert output == [Document(page_content="foo")] -@pytest.mark.parametrize('metric', ['cosine', 'l2']) +@pytest.mark.parametrize("metric", ["cosine", "l2"]) def test_sim_search_all_configurations(metric, tmp_path) -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] @@ -73,7 +73,7 @@ def test_sim_search_all_configurations(metric, tmp_path) -> None: assert output == [Document(page_content="foo")] -@pytest.mark.parametrize('metric', ['cosine', 'l2']) +@pytest.mark.parametrize("metric", ["cosine", "l2"]) def test_sim_search_by_vector(metric, tmp_path) -> None: """Test end to end construction and similarity search by vector.""" texts = ["foo", "bar", "baz"] @@ -90,7 +90,7 @@ def test_sim_search_by_vector(metric, tmp_path) -> None: assert output == [Document(page_content="bar")] -@pytest.mark.parametrize('metric', ['cosine', 'l2']) +@pytest.mark.parametrize("metric", ["cosine", "l2"]) def test_sim_search_with_score(metric, tmp_path) -> None: """Test end to end construction and similarity search with score.""" texts = ["foo", "bar", "baz"] @@ -106,7 +106,7 @@ def test_sim_search_with_score(metric, tmp_path) -> None: out_doc, out_score = output[0] assert out_doc == Document(page_content="foo") - assert np.isclose(out_score, 0.0, atol=1.e-6) + assert np.isclose(out_score, 0.0, atol=1.0e-6) def test_sim_search_with_score_for_ip_metric(tmp_path) -> None: @@ -120,7 +120,7 @@ def test_sim_search_with_score_for_ip_metric(tmp_path) -> None: FakeEmbeddings(), work_dir=str(tmp_path), n_dim=10, - dist_metric='ip', + dist_metric="ip", ) output = hnsw_vec_store.similarity_search_with_score("foo", k=3) assert len(output) == 3 @@ -129,7 +129,7 @@ def test_sim_search_with_score_for_ip_metric(tmp_path) -> None: assert result[1] == -8.0 -@pytest.mark.parametrize('metric', ['cosine', 'l2']) +@pytest.mark.parametrize("metric", ["cosine", "l2"]) def test_max_marginal_relevance_search(metric, 
tmp_path) -> None: """Test MRR search.""" texts = ["foo", "bar", "baz"] diff --git a/tests/integration_tests/vectorstores/test_docarray_in_memory_search.py b/tests/integration_tests/vectorstores/test_docarray_in_memory_search.py index dd3b8c86fbc169..5a14679a0bc57d 100644 --- a/tests/integration_tests/vectorstores/test_docarray_in_memory_search.py +++ b/tests/integration_tests/vectorstores/test_docarray_in_memory_search.py @@ -2,7 +2,7 @@ import pytest from langchain.schema import Document -from langchain.vectorstores.docarray_in_memory_search import DocArrayInMemorySearch +from langchain.vectorstores.docarray import DocArrayInMemorySearch from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings @@ -30,7 +30,7 @@ def test_in_memory_vec_store_add_texts(tmp_path) -> None: assert docsearch.doc_index.num_docs() == 3 -@pytest.mark.parametrize('metric', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) +@pytest.mark.parametrize("metric", ["cosine_sim", "euclidean_dist", "sqeuclidean_dist"]) def test_sim_search(metric) -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] @@ -44,7 +44,7 @@ def test_sim_search(metric) -> None: assert output == [Document(page_content="foo")] -@pytest.mark.parametrize('metric', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) +@pytest.mark.parametrize("metric", ["cosine_sim", "euclidean_dist", "sqeuclidean_dist"]) def test_sim_search_with_score(metric) -> None: """Test end to end construction and similarity search with score.""" texts = ["foo", "bar", "baz"] @@ -59,11 +59,11 @@ def test_sim_search_with_score(metric) -> None: out_doc, out_score = output[0] assert out_doc == Document(page_content="foo") - expected_score = 0.0 if 'dist' in metric else 1.0 - assert np.isclose(out_score, expected_score, atol=1.e-6) + expected_score = 0.0 if "dist" in metric else 1.0 + assert np.isclose(out_score, expected_score, atol=1.0e-6) -@pytest.mark.parametrize('metric', 
['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) +@pytest.mark.parametrize("metric", ["cosine_sim", "euclidean_dist", "sqeuclidean_dist"]) def test_sim_search_by_vector(metric) -> None: """Test end to end construction and similarity search by vector.""" texts = ["foo", "bar", "baz"] @@ -79,16 +79,13 @@ def test_sim_search_by_vector(metric) -> None: assert output == [Document(page_content="bar")] -@pytest.mark.parametrize('metric', ['cosine_sim', 'euclidean_dist', 'sqeuclidean_dist']) +@pytest.mark.parametrize("metric", ["cosine_sim", "euclidean_dist", "sqeuclidean_dist"]) def test_max_marginal_relevance_search(metric) -> None: """Test MRR search.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": i} for i in range(len(texts))] docsearch = DocArrayInMemorySearch.from_texts( - texts, - FakeEmbeddings(), - metadatas=metadatas, - metric=metric + texts, FakeEmbeddings(), metadatas=metadatas, metric=metric ) output = docsearch.max_marginal_relevance_search("foo", k=2, fetch_k=3) assert output == [ From 4694bb4eee17df08435c7fe0cce9a737cdb22970 Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Wed, 10 May 2023 17:35:40 +0200 Subject: [PATCH 12/19] style: resolve lint errors Signed-off-by: jupyterjazz --- langchain/vectorstores/docarray/base.py | 25 +++++++++++++------ langchain/vectorstores/docarray/hnsw.py | 10 +++++--- langchain/vectorstores/docarray/in_memory.py | 15 +++++------ .../vectorstores/test_docarray_hnsw_search.py | 18 +++++++------ .../test_docarray_in_memory_search.py | 12 +++++---- 5 files changed, 48 insertions(+), 32 deletions(-) diff --git a/langchain/vectorstores/docarray/base.py b/langchain/vectorstores/docarray/base.py index 421e0a3dc05771..c9fdd3867b3351 100644 --- a/langchain/vectorstores/docarray/base.py +++ b/langchain/vectorstores/docarray/base.py @@ -1,4 +1,5 @@ -from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type +from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, + Type) import 
numpy as np from pydantic import Field @@ -74,10 +75,10 @@ def add_texts( List of ids from adding the texts into the vectorstore. """ ids: List[str] = [] - embeddings = self.embedding.embed_documents(texts) + embeddings = self.embedding.embed_documents(list(texts)) for i, (t, e) in enumerate(zip(texts, embeddings)): m = metadatas[i] if metadatas else {} - doc = self.doc_cls(text=t, embedding=e, metadata=m) + doc = self.doc_cls(text=t, embedding=e, metadata=m) # type: ignore self.doc_index.index([doc]) ids.append(str(doc.id)) @@ -96,7 +97,7 @@ def similarity_search_with_score( List of Documents most similar to the query and score for each. """ query_embedding = self.embedding.embed_query(query) - query_doc = self.doc_cls(embedding=query_embedding) + query_doc = self.doc_cls(embedding=query_embedding) # type: ignore docs, scores = self.doc_index.find(query_doc, search_field="embedding", limit=k) result = [ @@ -145,7 +146,7 @@ def similarity_search_by_vector( List of Documents most similar to the query vector. """ - query_doc = self.doc_cls(embedding=embedding) + query_doc = self.doc_cls(embedding=embedding) # type: ignore docs = self.doc_index.find( query_doc, search_field="embedding", limit=k ).documents @@ -156,7 +157,12 @@ def similarity_search_by_vector( return result def max_marginal_relevance_search( - self, query: str, k: int = 4, fetch_k: int = 20, **kwargs: Any + self, + query: str, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + **kwargs: Any, ) -> List[Document]: """Return docs selected using the maximal marginal relevance. @@ -167,12 +173,15 @@ def max_marginal_relevance_search( query: Text to look up documents similar to. k: Number of Documents to return. Defaults to 4. fetch_k: Number of Documents to fetch to pass to MMR algorithm. - + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. 
Returns: List of Documents selected by maximal marginal relevance. """ query_embedding = self.embedding.embed_query(query) - query_doc = self.doc_cls(embedding=query_embedding) + query_doc = self.doc_cls(embedding=query_embedding) # type: ignore docs = self.doc_index.find( query_doc, search_field="embedding", limit=fetch_k diff --git a/langchain/vectorstores/docarray/hnsw.py b/langchain/vectorstores/docarray/hnsw.py index 81c7313286e284..5736a5768b9231 100644 --- a/langchain/vectorstores/docarray/hnsw.py +++ b/langchain/vectorstores/docarray/hnsw.py @@ -4,7 +4,8 @@ from typing import Any, List, Optional, Type from langchain.embeddings.base import Embeddings -from langchain.vectorstores.docarray.base import DocArrayIndex, _check_docarray_import +from langchain.vectorstores.docarray.base import (DocArrayIndex, + _check_docarray_import) class DocArrayHnswSearch(DocArrayIndex): @@ -44,7 +45,7 @@ def __init__( **{k: v for k, v in kwargs.items() if k != "dist_metric"}, } ) - doc_index = HnswDocumentIndex[doc_cls](work_dir=work_dir) + doc_index = HnswDocumentIndex[doc_cls](work_dir=work_dir) # type: ignore super().__init__(doc_index, embedding) @classmethod @@ -53,12 +54,13 @@ def from_texts( texts: List[str], embedding: Embeddings, metadatas: Optional[List[dict]] = None, - work_dir: str = None, - n_dim: int = None, + work_dir: Optional[str] = None, + n_dim: Optional[int] = None, **kwargs: Any, ) -> DocArrayHnswSearch: """Create an DocArrayHnswSearch store and insert data. + Args: texts (List[str]): Text data. embedding (Embeddings): Embedding function. 
diff --git a/langchain/vectorstores/docarray/in_memory.py b/langchain/vectorstores/docarray/in_memory.py index 1e20cb3053ab3c..48b01015b1d9c2 100644 --- a/langchain/vectorstores/docarray/in_memory.py +++ b/langchain/vectorstores/docarray/in_memory.py @@ -1,11 +1,11 @@ """Wrapper around in-memory storage.""" from __future__ import annotations -from typing import List, Optional, Type +from typing import Any, Dict, List, Optional, Type from langchain.embeddings.base import Embeddings -from langchain.vectorstores.base import VST -from langchain.vectorstores.docarray.base import DocArrayIndex, _check_docarray_import +from langchain.vectorstores.docarray.base import (DocArrayIndex, + _check_docarray_import) class DocArrayInMemorySearch(DocArrayIndex): @@ -32,23 +32,24 @@ def __init__( from docarray.index import InMemoryExactNNIndex doc_cls = self._get_doc_cls({"space": metric}) - doc_index = InMemoryExactNNIndex[doc_cls]() + doc_index = InMemoryExactNNIndex[doc_cls]() # type: ignore super().__init__(doc_index, embedding) @classmethod def from_texts( - cls: Type[VST], + cls: Type[DocArrayInMemorySearch], texts: List[str], embedding: Embeddings, - metadatas: Optional[List[dict]] = None, + metadatas: Optional[List[Dict[Any, Any]]] = None, metric: str = "cosine_sim", + **kwargs: Any, ) -> DocArrayInMemorySearch: """Create an DocArrayInMemorySearch store and insert data. Args: texts (List[str]): Text data. embedding (Embeddings): Embedding function. - metadatas (Optional[List[dict]]): Metadata for each text if it exists. + metadatas (Optional[List[Dict[Any, Any]]]): Metadata for each text if it exists. Defaults to None. metric (str): metric for exact nearest-neighbor search. Can be one of: "cosine_sim", "euclidean_dist" and "sqeuclidean_dist". 
diff --git a/tests/integration_tests/vectorstores/test_docarray_hnsw_search.py b/tests/integration_tests/vectorstores/test_docarray_hnsw_search.py index f83e8dd94ef2ac..f6aa704b002819 100644 --- a/tests/integration_tests/vectorstores/test_docarray_hnsw_search.py +++ b/tests/integration_tests/vectorstores/test_docarray_hnsw_search.py @@ -1,3 +1,5 @@ +from pathlib import Path + import numpy as np import pytest @@ -6,7 +8,7 @@ from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings -def test_docarray_hnsw_search_vec_store_from_texts(tmp_path) -> None: +def test_docarray_hnsw_search_vec_store_from_texts(tmp_path: Path) -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] docsearch = DocArrayHnswSearch.from_texts( @@ -20,7 +22,7 @@ def test_docarray_hnsw_search_vec_store_from_texts(tmp_path) -> None: assert docsearch.doc_index.num_docs() == 3 -def test_docarray_hnsw_search_vec_store_add_texts(tmp_path) -> None: +def test_docarray_hnsw_search_vec_store_add_texts(tmp_path: Path) -> None: """Test end to end construction and simple similarity search.""" docsearch = DocArrayHnswSearch( work_dir=str(tmp_path), @@ -37,7 +39,7 @@ def test_docarray_hnsw_search_vec_store_add_texts(tmp_path) -> None: @pytest.mark.parametrize("metric", ["cosine", "l2"]) -def test_sim_search(metric, tmp_path) -> None: +def test_sim_search(metric: str, tmp_path: Path) -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] hnsw_vec_store = DocArrayHnswSearch.from_texts( @@ -52,7 +54,7 @@ def test_sim_search(metric, tmp_path) -> None: @pytest.mark.parametrize("metric", ["cosine", "l2"]) -def test_sim_search_all_configurations(metric, tmp_path) -> None: +def test_sim_search_all_configurations(metric: str, tmp_path: Path) -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] hnsw_vec_store = DocArrayHnswSearch.from_texts( @@ -74,7 
+76,7 @@ def test_sim_search_all_configurations(metric, tmp_path) -> None: @pytest.mark.parametrize("metric", ["cosine", "l2"]) -def test_sim_search_by_vector(metric, tmp_path) -> None: +def test_sim_search_by_vector(metric: str, tmp_path: Path) -> None: """Test end to end construction and similarity search by vector.""" texts = ["foo", "bar", "baz"] hnsw_vec_store = DocArrayHnswSearch.from_texts( @@ -91,7 +93,7 @@ def test_sim_search_by_vector(metric, tmp_path) -> None: @pytest.mark.parametrize("metric", ["cosine", "l2"]) -def test_sim_search_with_score(metric, tmp_path) -> None: +def test_sim_search_with_score(metric: str, tmp_path: Path) -> None: """Test end to end construction and similarity search with score.""" texts = ["foo", "bar", "baz"] hnsw_vec_store = DocArrayHnswSearch.from_texts( @@ -109,7 +111,7 @@ def test_sim_search_with_score(metric, tmp_path) -> None: assert np.isclose(out_score, 0.0, atol=1.0e-6) -def test_sim_search_with_score_for_ip_metric(tmp_path) -> None: +def test_sim_search_with_score_for_ip_metric(tmp_path: Path) -> None: """ Test end to end construction and similarity search with score for ip (inner-product) metric. 
@@ -130,7 +132,7 @@ def test_sim_search_with_score_for_ip_metric(tmp_path) -> None: @pytest.mark.parametrize("metric", ["cosine", "l2"]) -def test_max_marginal_relevance_search(metric, tmp_path) -> None: +def test_max_marginal_relevance_search(metric: str, tmp_path: Path) -> None: """Test MRR search.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": i} for i in range(len(texts))] diff --git a/tests/integration_tests/vectorstores/test_docarray_in_memory_search.py b/tests/integration_tests/vectorstores/test_docarray_in_memory_search.py index 5a14679a0bc57d..33b4eab17c7db9 100644 --- a/tests/integration_tests/vectorstores/test_docarray_in_memory_search.py +++ b/tests/integration_tests/vectorstores/test_docarray_in_memory_search.py @@ -1,3 +1,5 @@ +from pathlib import Path + import numpy as np import pytest @@ -17,7 +19,7 @@ def test_in_memory_vec_store_from_texts() -> None: assert docsearch.doc_index.num_docs() == 3 -def test_in_memory_vec_store_add_texts(tmp_path) -> None: +def test_in_memory_vec_store_add_texts(tmp_path: Path) -> None: """Test end to end construction and simple similarity search.""" docsearch = DocArrayInMemorySearch( embedding=FakeEmbeddings(), @@ -31,7 +33,7 @@ def test_in_memory_vec_store_add_texts(tmp_path) -> None: @pytest.mark.parametrize("metric", ["cosine_sim", "euclidean_dist", "sqeuclidean_dist"]) -def test_sim_search(metric) -> None: +def test_sim_search(metric: str) -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] in_memory_vec_store = DocArrayInMemorySearch.from_texts( @@ -45,7 +47,7 @@ def test_sim_search(metric) -> None: @pytest.mark.parametrize("metric", ["cosine_sim", "euclidean_dist", "sqeuclidean_dist"]) -def test_sim_search_with_score(metric) -> None: +def test_sim_search_with_score(metric: str) -> None: """Test end to end construction and similarity search with score.""" texts = ["foo", "bar", "baz"] in_memory_vec_store = DocArrayInMemorySearch.from_texts( @@ -64,7 
+66,7 @@ def test_sim_search_with_score(metric) -> None: @pytest.mark.parametrize("metric", ["cosine_sim", "euclidean_dist", "sqeuclidean_dist"]) -def test_sim_search_by_vector(metric) -> None: +def test_sim_search_by_vector(metric: str) -> None: """Test end to end construction and similarity search by vector.""" texts = ["foo", "bar", "baz"] in_memory_vec_store = DocArrayInMemorySearch.from_texts( @@ -80,7 +82,7 @@ def test_sim_search_by_vector(metric) -> None: @pytest.mark.parametrize("metric", ["cosine_sim", "euclidean_dist", "sqeuclidean_dist"]) -def test_max_marginal_relevance_search(metric) -> None: +def test_max_marginal_relevance_search(metric: str) -> None: """Test MRR search.""" texts = ["foo", "bar", "baz"] metadatas = [{"page": i} for i in range(len(texts))] From 45b8c09a29c03091b8f93dd0ac4bfb95359a67e5 Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Wed, 10 May 2023 17:50:37 +0200 Subject: [PATCH 13/19] style: run black Signed-off-by: jupyterjazz --- langchain/vectorstores/docarray/base.py | 3 +-- langchain/vectorstores/docarray/hnsw.py | 3 +-- langchain/vectorstores/docarray/in_memory.py | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/langchain/vectorstores/docarray/base.py b/langchain/vectorstores/docarray/base.py index c9fdd3867b3351..1dab0f53c32a4d 100644 --- a/langchain/vectorstores/docarray/base.py +++ b/langchain/vectorstores/docarray/base.py @@ -1,5 +1,4 @@ -from typing import (TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, - Type) +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type import numpy as np from pydantic import Field diff --git a/langchain/vectorstores/docarray/hnsw.py b/langchain/vectorstores/docarray/hnsw.py index 5736a5768b9231..85b98a98806244 100644 --- a/langchain/vectorstores/docarray/hnsw.py +++ b/langchain/vectorstores/docarray/hnsw.py @@ -4,8 +4,7 @@ from typing import Any, List, Optional, Type from langchain.embeddings.base import Embeddings -from 
langchain.vectorstores.docarray.base import (DocArrayIndex, - _check_docarray_import) +from langchain.vectorstores.docarray.base import DocArrayIndex, _check_docarray_import class DocArrayHnswSearch(DocArrayIndex): diff --git a/langchain/vectorstores/docarray/in_memory.py b/langchain/vectorstores/docarray/in_memory.py index 48b01015b1d9c2..ba0e48f0504ea2 100644 --- a/langchain/vectorstores/docarray/in_memory.py +++ b/langchain/vectorstores/docarray/in_memory.py @@ -4,8 +4,7 @@ from typing import Any, Dict, List, Optional, Type from langchain.embeddings.base import Embeddings -from langchain.vectorstores.docarray.base import (DocArrayIndex, - _check_docarray_import) +from langchain.vectorstores.docarray.base import DocArrayIndex, _check_docarray_import class DocArrayInMemorySearch(DocArrayIndex): From 15c5911902957b0372d460ee0f3bfe49da23744b Mon Sep 17 00:00:00 2001 From: jupyterjazz Date: Wed, 10 May 2023 18:10:36 +0200 Subject: [PATCH 14/19] style: ruff ruff Signed-off-by: jupyterjazz --- langchain/vectorstores/docarray/base.py | 4 ++-- langchain/vectorstores/docarray/hnsw.py | 4 ++-- langchain/vectorstores/docarray/in_memory.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/langchain/vectorstores/docarray/base.py b/langchain/vectorstores/docarray/base.py index 1dab0f53c32a4d..c2d6ed899ed4c2 100644 --- a/langchain/vectorstores/docarray/base.py +++ b/langchain/vectorstores/docarray/base.py @@ -23,8 +23,8 @@ def _check_docarray_import() -> None: da_version = docarray.__version__.split(".") if int(da_version[0]) == 0 and int(da_version[1]) <= 30: raise ValueError( - f"To use the DocArrayHnswSearch VectorStore the docarray version >=0.31.0 is expected, " - f"received: {docarray.__version__}." + f"To use the DocArrayHnswSearch VectorStore the docarray " + f"version >=0.31.0 is expected, received: {docarray.__version__}." f"To upgrade, please run: `pip install -U docarray`." 
) except ImportError: diff --git a/langchain/vectorstores/docarray/hnsw.py b/langchain/vectorstores/docarray/hnsw.py index 85b98a98806244..2d026d621d0b03 100644 --- a/langchain/vectorstores/docarray/hnsw.py +++ b/langchain/vectorstores/docarray/hnsw.py @@ -10,8 +10,8 @@ class DocArrayHnswSearch(DocArrayIndex): """Wrapper around HnswLib storage. - To use it, you should have the ``docarray[hnswlib]`` package with version >=0.31.0 installed. - You can install it with `pip install "langchain[hnswlib]"`. + To use it, you should have the ``docarray[hnswlib]`` package with version >=0.31.0 + installed. You can install it with `pip install "langchain[hnswlib]"`. """ def __init__( diff --git a/langchain/vectorstores/docarray/in_memory.py b/langchain/vectorstores/docarray/in_memory.py index ba0e48f0504ea2..d7935f5afcb1c6 100644 --- a/langchain/vectorstores/docarray/in_memory.py +++ b/langchain/vectorstores/docarray/in_memory.py @@ -48,8 +48,8 @@ def from_texts( Args: texts (List[str]): Text data. embedding (Embeddings): Embedding function. - metadatas (Optional[List[Dict[Any, Any]]]): Metadata for each text if it exists. - Defaults to None. + metadatas (Optional[List[Dict[Any, Any]]]): Metadata for each text + if it exists. Defaults to None. metric (str): metric for exact nearest-neighbor search. Can be one of: "cosine_sim", "euclidean_dist" and "sqeuclidean_dist". Defaults to "cosine_sim". 
From 8b1638bc5ffff49c2a5b486dd9b4e6d77bc24dbb Mon Sep 17 00:00:00 2001 From: Dev 2049 Date: Wed, 10 May 2023 12:23:07 -0700 Subject: [PATCH 15/19] cr --- langchain/vectorstores/docarray/base.py | 32 ++++----- langchain/vectorstores/docarray/hnsw.py | 76 +++++++++++++------- langchain/vectorstores/docarray/in_memory.py | 33 +++++---- 3 files changed, 86 insertions(+), 55 deletions(-) diff --git a/langchain/vectorstores/docarray/base.py b/langchain/vectorstores/docarray/base.py index c2d6ed899ed4c2..7ab55b0fffa720 100644 --- a/langchain/vectorstores/docarray/base.py +++ b/langchain/vectorstores/docarray/base.py @@ -1,3 +1,4 @@ +from abc import ABC from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type import numpy as np @@ -11,9 +12,6 @@ if TYPE_CHECKING: from docarray import BaseDoc from docarray.index.abstract import BaseDocIndex -else: - BaseDoc = object - BaseDocIndex = object def _check_docarray_import() -> None: @@ -34,7 +32,20 @@ def _check_docarray_import() -> None: ) -class DocArrayIndex(VectorStore): +def get_doc_cls(**embeddings_params: Any) -> Type[BaseDoc]: + """Get docarray Document class describing the schema of DocIndex.""" + from docarray import BaseDoc + from docarray.typing import NdArray + + class DocArrayDoc(BaseDoc): + text: Optional[str] + embedding: Optional[NdArray] = Field(**embeddings_params) + metadata: Optional[dict] + + return DocArrayDoc + + +class DocArrayIndex(VectorStore, ABC): def __init__( self, doc_index: BaseDocIndex, @@ -45,19 +56,6 @@ def __init__( self.doc_cls = doc_index._schema self.embedding = embedding - @staticmethod - def _get_doc_cls(embeddings_params: Dict[str, Any]) -> Type[BaseDoc]: - """Get docarray Document class describing the schema of DocIndex.""" - from docarray import BaseDoc - from docarray.typing import NdArray - - class DocArrayDoc(BaseDoc): - text: Optional[str] - embedding: Optional[NdArray] = Field(**embeddings_params) - metadata: Optional[dict] - - return DocArrayDoc - def 
add_texts( self, texts: Iterable[str], diff --git a/langchain/vectorstores/docarray/hnsw.py b/langchain/vectorstores/docarray/hnsw.py index 2d026d621d0b03..3abccc6b14e67d 100644 --- a/langchain/vectorstores/docarray/hnsw.py +++ b/langchain/vectorstores/docarray/hnsw.py @@ -1,10 +1,14 @@ """Wrapper around Hnswlib store.""" from __future__ import annotations -from typing import Any, List, Optional, Type +from typing import Any, List, Literal, Optional from langchain.embeddings.base import Embeddings -from langchain.vectorstores.docarray.base import DocArrayIndex, _check_docarray_import +from langchain.vectorstores.docarray.base import ( + DocArrayIndex, + _check_docarray_import, + get_doc_cls, +) class DocArrayHnswSearch(DocArrayIndex): @@ -14,42 +18,66 @@ class DocArrayHnswSearch(DocArrayIndex): installed. You can install it with `pip install "langchain[hnswlib]"`. """ - def __init__( - self, embedding: Embeddings, work_dir: str, n_dim: int, **kwargs: Any - ) -> None: + @classmethod + def from_params( + cls, + embedding: Embeddings, + work_dir: str, + n_dim: int, + dist_metric: Literal["cosine", "ip", "l2"] = "cosine", + max_elements: int = 1024, + index: bool = True, + ef_construction: int = 200, + ef: int = 10, + M: int = 16, + allow_replace_deleted: bool = True, + num_threads: int = 1, + **kwargs: Any, + ) -> DocArrayHnswSearch: """Initialize DocArrayHnswSearch store. Args: embedding (Embeddings): Embedding function. work_dir (str): path to the location where all the data will be stored. n_dim (int): dimension of an embedding. - **kwargs: Other keyword arguments to be passed to the _get_doc_cls method. + dist_metric (str): Distance metric for DocArrayHnswSearch can be one of: + "cosine", "ip", and "l2". Defaults to "cosine". + max_elements (int): Maximum number of vectors that can be stored. + Defaults to 1024. + index (bool): Whether an index should be built for this field. + Defaults to True. 
+ ef_construction (int): defines a construction time/accuracy trade-off. + Defaults to 200. + ef (int): parameter controlling query time/accuracy trade-off. + Defaults to 10. + M (int): parameter that defines the maximum number of outgoing + connections in the graph. Defaults to 16. + allow_replace_deleted (bool): Enables replacing of deleted elements + with new added ones. Defaults to True. + num_threads (int): Sets the number of cpu threads to use. Defaults to 1. + **kwargs: Other keyword arguments to be passed to the get_doc_cls method. """ _check_docarray_import() from docarray.index import HnswDocumentIndex - kwargs.setdefault("dist_metric", "cosine") - kwargs.setdefault("max_elements", 1024) - kwargs.setdefault("index", True) - kwargs.setdefault("ef_construction", 200) - kwargs.setdefault("ef", 10) - kwargs.setdefault("M", 16) - kwargs.setdefault("allow_replace_deleted", True) - kwargs.setdefault("num_threads", 1) - - doc_cls = self._get_doc_cls( - { - "dim": n_dim, - "space": kwargs["dist_metric"], - **{k: v for k, v in kwargs.items() if k != "dist_metric"}, - } + doc_cls = get_doc_cls( + n_dim=n_dim, + space=dist_metric, + max_elements=max_elements, + index=index, + ef_construction=ef_construction, + ef=ef, + M=M, + allow_replace_deleted=allow_replace_deleted, + num_threads=num_threads, + **kwargs, ) doc_index = HnswDocumentIndex[doc_cls](work_dir=work_dir) # type: ignore - super().__init__(doc_index, embedding) + return cls(doc_index, embedding) @classmethod def from_texts( - cls: Type[DocArrayHnswSearch], + cls, texts: List[str], embedding: Embeddings, metadatas: Optional[List[dict]] = None, @@ -77,6 +105,6 @@ def from_texts( if n_dim is None: raise ValueError("`n_dim` parameter has not been set.") - store = cls(work_dir=work_dir, n_dim=n_dim, embedding=embedding, **kwargs) + store = cls.from_params(embedding, work_dir, n_dim, **kwargs) store.add_texts(texts=texts, metadatas=metadatas) return store diff --git a/langchain/vectorstores/docarray/in_memory.py 
b/langchain/vectorstores/docarray/in_memory.py index d7935f5afcb1c6..a66cf68d273f92 100644 --- a/langchain/vectorstores/docarray/in_memory.py +++ b/langchain/vectorstores/docarray/in_memory.py @@ -1,10 +1,14 @@ """Wrapper around in-memory storage.""" from __future__ import annotations -from typing import Any, Dict, List, Optional, Type +from typing import Any, Dict, List, Literal, Optional from langchain.embeddings.base import Embeddings -from langchain.vectorstores.docarray.base import DocArrayIndex, _check_docarray_import +from langchain.vectorstores.docarray.base import ( + DocArrayIndex, + _check_docarray_import, + get_doc_cls, +) class DocArrayInMemorySearch(DocArrayIndex): @@ -14,11 +18,15 @@ class DocArrayInMemorySearch(DocArrayIndex): You can install it with `pip install "langchain[in_memory_store]"`. """ - def __init__( - self, + @classmethod + def from_params( + cls, embedding: Embeddings, - metric: str = "cosine_sim", - ) -> None: + metric: Literal[ + "cosine_sim", "euclidian_dist", "sgeuclidean_dist" + ] = "cosine_sim", + **kwargs: Any, + ) -> DocArrayInMemorySearch: """Initialize DocArrayInMemorySearch store. Args: @@ -26,21 +34,21 @@ def __init__( metric (str): metric for exact nearest-neighbor search. Can be one of: "cosine_sim", "euclidean_dist" and "sqeuclidean_dist". Defaults to "cosine_sim". + **kwargs: Other keyword arguments to be passed to the get_doc_cls method. 
""" _check_docarray_import() from docarray.index import InMemoryExactNNIndex - doc_cls = self._get_doc_cls({"space": metric}) + doc_cls = get_doc_cls(space=metric, **kwargs) doc_index = InMemoryExactNNIndex[doc_cls]() # type: ignore - super().__init__(doc_index, embedding) + return cls(doc_index, embedding) @classmethod def from_texts( - cls: Type[DocArrayInMemorySearch], + cls, texts: List[str], embedding: Embeddings, metadatas: Optional[List[Dict[Any, Any]]] = None, - metric: str = "cosine_sim", **kwargs: Any, ) -> DocArrayInMemorySearch: """Create an DocArrayInMemorySearch store and insert data. @@ -57,9 +65,6 @@ def from_texts( Returns: DocArrayInMemorySearch Vector Store """ - store = cls( - embedding=embedding, - metric=metric, - ) + store = cls.from_params(embedding, **kwargs) store.add_texts(texts=texts, metadatas=metadatas) return store From cc6e86e473993c13d75010cad037fd72e5770d10 Mon Sep 17 00:00:00 2001 From: Dev 2049 Date: Wed, 10 May 2023 13:58:07 -0700 Subject: [PATCH 16/19] wip --- langchain/vectorstores/docarray/base.py | 15 +++++-- langchain/vectorstores/docarray/hnsw.py | 26 ++++++------- .../vectorstores/docarray/__init__.py | 0 .../test_hnsw.py} | 39 +++++++++---------- .../test_in_memory.py} | 0 5 files changed, 43 insertions(+), 37 deletions(-) create mode 100644 tests/integration_tests/vectorstores/docarray/__init__.py rename tests/integration_tests/vectorstores/{test_docarray_hnsw_search.py => docarray/test_hnsw.py} (80%) rename tests/integration_tests/vectorstores/{test_docarray_in_memory_search.py => docarray/test_in_memory.py} (100%) diff --git a/langchain/vectorstores/docarray/base.py b/langchain/vectorstores/docarray/base.py index 7ab55b0fffa720..65ad63a06535f9 100644 --- a/langchain/vectorstores/docarray/base.py +++ b/langchain/vectorstores/docarray/base.py @@ -32,7 +32,8 @@ def _check_docarray_import() -> None: ) -def get_doc_cls(**embeddings_params: Any) -> Type[BaseDoc]: +# TODO: Find better way of typing output. 
+def get_doc_cls(**embeddings_params: Any) -> Type: """Get docarray Document class describing the schema of DocIndex.""" from docarray import BaseDoc from docarray.typing import NdArray @@ -46,6 +47,9 @@ class DocArrayDoc(BaseDoc): class DocArrayIndex(VectorStore, ABC): + from docarray.index.abstract import BaseDocIndex + from docarray import BaseDoc + def __init__( self, doc_index: BaseDocIndex, @@ -53,9 +57,14 @@ def __init__( ): """Initialize a vector store from DocArray's DocIndex.""" self.doc_index = doc_index - self.doc_cls = doc_index._schema self.embedding = embedding + @property + def doc_cls(self) -> Type[BaseDoc]: + if self.doc_index._schema is None: + raise ValueError("doc_index expected to have non-null _schema attribute.") + return self.doc_index._schema + def add_texts( self, texts: Iterable[str], @@ -75,7 +84,7 @@ def add_texts( embeddings = self.embedding.embed_documents(list(texts)) for i, (t, e) in enumerate(zip(texts, embeddings)): m = metadatas[i] if metadatas else {} - doc = self.doc_cls(text=t, embedding=e, metadata=m) # type: ignore + doc = self.doc_cls(text=t, embedding=e, metadata=m) self.doc_index.index([doc]) ids.append(str(doc.id)) diff --git a/langchain/vectorstores/docarray/hnsw.py b/langchain/vectorstores/docarray/hnsw.py index 3abccc6b14e67d..3d54a87a876dc2 100644 --- a/langchain/vectorstores/docarray/hnsw.py +++ b/langchain/vectorstores/docarray/hnsw.py @@ -42,19 +42,19 @@ def from_params( n_dim (int): dimension of an embedding. dist_metric (str): Distance metric for DocArrayHnswSearch can be one of: "cosine", "ip", and "l2". Defaults to "cosine". - max_elements (int): Maximum number of vectors that can be stored. - Defaults to 1024. - index (bool): Whether an index should be built for this field. - Defaults to True. - ef_construction (int): defines a construction time/accuracy trade-off. - Defaults to 200. - ef (int): parameter controlling query time/accuracy trade-off. - Defaults to 10. 
- M (int): parameter that defines the maximum number of outgoing - connections in the graph. Defaults to 16. - allow_replace_deleted (bool): Enables replacing of deleted elements - with new added ones. Defaults to True. - num_threads (int): Sets the number of cpu threads to use. Defaults to 1. + max_elements (int): Maximum number of vectors that can be stored. + Defaults to 1024. + index (bool): Whether an index should be built for this field. + Defaults to True. + ef_construction (int): defines a construction time/accuracy trade-off. + Defaults to 200. + ef (int): parameter controlling query time/accuracy trade-off. + Defaults to 10. + M (int): parameter that defines the maximum number of outgoing + connections in the graph. Defaults to 16. + allow_replace_deleted (bool): Enables replacing of deleted elements + with new added ones. Defaults to True. + num_threads (int): Sets the number of cpu threads to use. Defaults to 1. **kwargs: Other keyword arguments to be passed to the get_doc_cls method. 
""" _check_docarray_import() diff --git a/tests/integration_tests/vectorstores/docarray/__init__.py b/tests/integration_tests/vectorstores/docarray/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/integration_tests/vectorstores/test_docarray_hnsw_search.py b/tests/integration_tests/vectorstores/docarray/test_hnsw.py similarity index 80% rename from tests/integration_tests/vectorstores/test_docarray_hnsw_search.py rename to tests/integration_tests/vectorstores/docarray/test_hnsw.py index f6aa704b002819..b847cbfa0fd631 100644 --- a/tests/integration_tests/vectorstores/test_docarray_hnsw_search.py +++ b/tests/integration_tests/vectorstores/docarray/test_hnsw.py @@ -1,4 +1,5 @@ from pathlib import Path +from typing import List import numpy as np import pytest @@ -8,40 +9,36 @@ from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings -def test_docarray_hnsw_search_vec_store_from_texts(tmp_path: Path) -> None: +@pytest.fixture +def texts() -> List[str]: + return ["foo", "bar", "baz"] + + +def test_from_texts(texts: List[str], tmp_path: Path) -> None: """Test end to end construction and simple similarity search.""" - texts = ["foo", "bar", "baz"] docsearch = DocArrayHnswSearch.from_texts( texts, FakeEmbeddings(), work_dir=str(tmp_path), n_dim=10, - dist_metric="cosine", ) - assert isinstance(docsearch, DocArrayHnswSearch) assert docsearch.doc_index.num_docs() == 3 -def test_docarray_hnsw_search_vec_store_add_texts(tmp_path: Path) -> None: +def test_add_texts(texts: List[str], tmp_path: Path) -> None: """Test end to end construction and simple similarity search.""" - docsearch = DocArrayHnswSearch( + docsearch = DocArrayHnswSearch.from_params( work_dir=str(tmp_path), n_dim=10, embedding=FakeEmbeddings(), - dist_metric="cosine", ) - assert isinstance(docsearch, DocArrayHnswSearch) - assert docsearch.doc_index.num_docs() == 0 - - texts = ["foo", "bar", "baz"] docsearch.add_texts(texts=texts) assert 
docsearch.doc_index.num_docs() == 3 @pytest.mark.parametrize("metric", ["cosine", "l2"]) -def test_sim_search(metric: str, tmp_path: Path) -> None: +def test_sim_search(metric: str, texts: List[str], tmp_path: Path) -> None: """Test end to end construction and simple similarity search.""" - texts = ["foo", "bar", "baz"] hnsw_vec_store = DocArrayHnswSearch.from_texts( texts, FakeEmbeddings(), @@ -54,9 +51,10 @@ def test_sim_search(metric: str, tmp_path: Path) -> None: @pytest.mark.parametrize("metric", ["cosine", "l2"]) -def test_sim_search_all_configurations(metric: str, tmp_path: Path) -> None: +def test_sim_search_all_configurations( + metric: str, texts: List[str], tmp_path: Path +) -> None: """Test end to end construction and simple similarity search.""" - texts = ["foo", "bar", "baz"] hnsw_vec_store = DocArrayHnswSearch.from_texts( texts, FakeEmbeddings(), @@ -76,9 +74,8 @@ def test_sim_search_all_configurations(metric: str, tmp_path: Path) -> None: @pytest.mark.parametrize("metric", ["cosine", "l2"]) -def test_sim_search_by_vector(metric: str, tmp_path: Path) -> None: +def test_sim_search_by_vector(metric: str, texts: List[str], tmp_path: Path) -> None: """Test end to end construction and similarity search by vector.""" - texts = ["foo", "bar", "baz"] hnsw_vec_store = DocArrayHnswSearch.from_texts( texts, FakeEmbeddings(), @@ -111,12 +108,11 @@ def test_sim_search_with_score(metric: str, tmp_path: Path) -> None: assert np.isclose(out_score, 0.0, atol=1.0e-6) -def test_sim_search_with_score_for_ip_metric(tmp_path: Path) -> None: +def test_sim_search_with_score_for_ip_metric(texts: List[str], tmp_path: Path) -> None: """ Test end to end construction and similarity search with score for ip (inner-product) metric. 
""" - texts = ["foo", "bar", "baz"] hnsw_vec_store = DocArrayHnswSearch.from_texts( texts, FakeEmbeddings(), @@ -132,9 +128,10 @@ def test_sim_search_with_score_for_ip_metric(tmp_path: Path) -> None: @pytest.mark.parametrize("metric", ["cosine", "l2"]) -def test_max_marginal_relevance_search(metric: str, tmp_path: Path) -> None: +def test_max_marginal_relevance_search( + metric: str, texts: List[str], tmp_path: Path +) -> None: """Test MRR search.""" - texts = ["foo", "bar", "baz"] metadatas = [{"page": i} for i in range(len(texts))] docsearch = DocArrayHnswSearch.from_texts( texts, diff --git a/tests/integration_tests/vectorstores/test_docarray_in_memory_search.py b/tests/integration_tests/vectorstores/docarray/test_in_memory.py similarity index 100% rename from tests/integration_tests/vectorstores/test_docarray_in_memory_search.py rename to tests/integration_tests/vectorstores/docarray/test_in_memory.py From 23e9ad634e62f208cda2f0b3468e0f023d43ae61 Mon Sep 17 00:00:00 2001 From: Dev 2049 Date: Wed, 10 May 2023 14:50:40 -0700 Subject: [PATCH 17/19] cr --- .../examples/docarray_hnsw_search.ipynb | 236 ------------------ .../examples/docarray_in_memory_search.ipynb | 230 ----------------- langchain/vectorstores/docarray/base.py | 36 ++- langchain/vectorstores/docarray/hnsw.py | 5 +- langchain/vectorstores/docarray/in_memory.py | 3 +- .../vectorstores/docarray/test_hnsw.py | 2 +- .../vectorstores/docarray/test_in_memory.py | 27 +- 7 files changed, 33 insertions(+), 506 deletions(-) delete mode 100644 docs/modules/indexes/vectorstores/examples/docarray_hnsw_search.ipynb delete mode 100644 docs/modules/indexes/vectorstores/examples/docarray_in_memory_search.ipynb diff --git a/docs/modules/indexes/vectorstores/examples/docarray_hnsw_search.ipynb b/docs/modules/indexes/vectorstores/examples/docarray_hnsw_search.ipynb deleted file mode 100644 index 436970611c965f..00000000000000 --- a/docs/modules/indexes/vectorstores/examples/docarray_hnsw_search.ipynb +++ /dev/null @@ 
-1,236 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "2ce41f46-5711-4311-b04d-2fe233ac5b1b", - "metadata": {}, - "source": [ - "# DocArrayHnswSearch\n", - "\n", - ">[DocArrayHnswSearch](https://docs.docarray.org/user_guide/storing/index_hnswlib/) is a lightweight Document Index implementation provided by [Docarray](https://docs.docarray.org/) that runs fully locally and is best suited for small- to medium-sized datasets. It stores vectors on disk in [hnswlib](https://github.com/nmslib/hnswlib), and stores all other data in [SQLite](https://www.sqlite.org/index.html).\n", - "\n", - "This notebook shows how to use functionality related to the `DocArrayHnswSearch`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8ce1b8cb-dbf0-40c3-99ee-04f28143331b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!pip install \"docarray[hnswlib]\"" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "878f17df-100f-4854-9e87-472cf36d51f3", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdin", - "output_type": "stream", - "text": [ - " ········\n" - ] - } - ], - "source": [ - "# get a token: https://platform.openai.com/account/api-keys\n", - "\n", - "from getpass import getpass\n", - "\n", - "OPENAI_API_KEY = getpass()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "82d9984a-6031-403d-a977-6bc98d6be23a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "b757afef-ef0a-465d-8e8a-9aadb9c32b88", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jinaai/Desktop/langchain/venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "from langchain.embeddings.openai import OpenAIEmbeddings\n", - "from langchain.text_splitter import CharacterTextSplitter\n", - "from langchain.vectorstores import DocArrayHnswSearch\n", - "from langchain.document_loaders import TextLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "605e200e-e711-486b-b36e-cbe5dd2512d7", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.document_loaders import TextLoader\n", - "loader = TextLoader('../../../state_of_the_union.txt')\n", - "documents = loader.load()\n", - "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", - "docs = text_splitter.split_documents(documents)\n", - "\n", - "embeddings = OpenAIEmbeddings()\n", - "\n", - "db = DocArrayHnswSearch.from_documents(docs, embeddings, work_dir='hnswlib_store/', n_dim=1536)" - ] - }, - { - "cell_type": "markdown", - "id": "ed6f905b-4853-4a44-9730-614aa8e22b78", - "metadata": {}, - "source": [ - "## Similarity search" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "4d7e742f-2002-449d-a10e-16046890906c", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "query = \"What did the president say about Ketanji Brown Jackson\"\n", - "docs = db.similarity_search(query)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "0da9e26f-1fc2-48e6-95a7-f692c853bbd3", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 
\n", - "\n", - "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", - "\n", - "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", - "\n", - "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n" - ] - } - ], - "source": [ - "print(docs[0].page_content)" - ] - }, - { - "cell_type": "markdown", - "id": "3febb987-e903-416f-af26-6897d84c8d61", - "metadata": {}, - "source": [ - "## Similarity search with score" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "40764fdd-357d-475a-8152-5f1979d61a45", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "docs = db.similarity_search_with_score(query)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "a479fc46-b299-4330-89b9-e9b5a218ea03", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={}),\n", - " 0.36962226)" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "4d3d4e97-5d2b-4571-8ff9-e3f6b6778714", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import shutil\n", - "# delete the dir\n", - "shutil.rmtree('hnswlib_store')" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/modules/indexes/vectorstores/examples/docarray_in_memory_search.ipynb b/docs/modules/indexes/vectorstores/examples/docarray_in_memory_search.ipynb deleted file mode 100644 index 2b2c134f59e0ad..00000000000000 --- a/docs/modules/indexes/vectorstores/examples/docarray_in_memory_search.ipynb +++ /dev/null @@ -1,230 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "a3afefb0-7e99-4912-a222-c6b186da11af", - "metadata": {}, - "source": [ - "# DocArrayInMemorySearch\n", - "\n", - ">[DocArrayInMemorySearch](https://docs.docarray.org/user_guide/storing/index_in_memory/) is a document index provided by [Docarray](https://docs.docarray.org/) that stores documents in memory. It is a great starting point for small datasets, where you may not want to launch a database server.\n", - "\n", - "This notebook shows how to use functionality related to the `DocArrayInMemorySearch`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7cd7391f-7759-4a21-952a-2ec972d818c6", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "!pip install docarray" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "c6a40ad8-920e-4370-818d-3227e2f506ed", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdin", - "output_type": "stream", - "text": [ - " ········\n" - ] - } - ], - "source": [ - "# get a token: https://platform.openai.com/account/api-keys\n", - "\n", - "from getpass import getpass\n", - "\n", - "OPENAI_API_KEY = getpass()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "d2ada2f5-e30e-4aa7-863e-69339a28e825", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "e49be085-ddf1-4028-8c0c-97836ce4a873", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/jinaai/Desktop/langchain/venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], - "source": [ - "from langchain.embeddings.openai import OpenAIEmbeddings\n", - "from langchain.text_splitter import CharacterTextSplitter\n", - "from langchain.vectorstores import DocArrayInMemorySearch\n", - "from langchain.document_loaders import TextLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "38222aee-adc5-44c2-913c-97977b394cf5", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.document_loaders import TextLoader\n", - "loader = TextLoader('../../../state_of_the_union.txt')\n", - "documents = loader.load()\n", - "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", - "docs = text_splitter.split_documents(documents)\n", - "\n", - "embeddings = OpenAIEmbeddings()\n", - "\n", - "db = DocArrayInMemorySearch.from_documents(docs, embeddings)" - ] - }, - { - "cell_type": "markdown", - "id": "efbb6684-3846-4332-a624-ddd4d75844c1", - "metadata": {}, - "source": [ - "## Similarity search" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "aa28a7f8-41d0-4299-84eb-91d1576e8a63", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "query = \"What did the president say about Ketanji Brown Jackson\"\n", - "docs = db.similarity_search(query)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "1eb16d2a-b466-456a-b412-5e74bb8523dd", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. 
\n", - "\n", - "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", - "\n", - "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", - "\n", - "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n" - ] - } - ], - "source": [ - "print(docs[0].page_content)" - ] - }, - { - "cell_type": "markdown", - "id": "43896697-f99e-47b6-9117-47a25e9afa9c", - "metadata": {}, - "source": [ - "## Similarity search with score" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "8e9eef05-1516-469a-ad36-880c69aef7a9", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "docs = db.similarity_search_with_score(query)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "bd5fb0e4-2a94-4bb4-af8a-27327ecb1a7f", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/plain": [ - "(Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={}),\n", - " 0.8154190158347903)" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "docs[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3e5da522-ef0e-4a59-91ea-89e563f7b825", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.16" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/langchain/vectorstores/docarray/base.py b/langchain/vectorstores/docarray/base.py index 65ad63a06535f9..e5feea2dba4642 100644 --- a/langchain/vectorstores/docarray/base.py +++ b/langchain/vectorstores/docarray/base.py @@ -1,5 +1,5 @@ from abc import ABC -from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type +from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Type import numpy as np from pydantic import Field @@ -32,35 +32,31 @@ def _check_docarray_import() -> None: ) -# TODO: Find better way of typing output. 
-def get_doc_cls(**embeddings_params: Any) -> Type: - """Get docarray Document class describing the schema of DocIndex.""" - from docarray import BaseDoc - from docarray.typing import NdArray - - class DocArrayDoc(BaseDoc): - text: Optional[str] - embedding: Optional[NdArray] = Field(**embeddings_params) - metadata: Optional[dict] - - return DocArrayDoc - - class DocArrayIndex(VectorStore, ABC): - from docarray.index.abstract import BaseDocIndex - from docarray import BaseDoc - def __init__( self, - doc_index: BaseDocIndex, + doc_index: "BaseDocIndex", embedding: Embeddings, ): """Initialize a vector store from DocArray's DocIndex.""" self.doc_index = doc_index self.embedding = embedding + @staticmethod + def _get_doc_cls(**embeddings_params: Any) -> Type["BaseDoc"]: + """Get docarray Document class describing the schema of DocIndex.""" + from docarray import BaseDoc + from docarray.typing import NdArray + + class DocArrayDoc(BaseDoc): + text: Optional[str] + embedding: Optional[NdArray] = Field(**embeddings_params) + metadata: Optional[dict] + + return DocArrayDoc + @property - def doc_cls(self) -> Type[BaseDoc]: + def doc_cls(self) -> Type["BaseDoc"]: if self.doc_index._schema is None: raise ValueError("doc_index expected to have non-null _schema attribute.") return self.doc_index._schema diff --git a/langchain/vectorstores/docarray/hnsw.py b/langchain/vectorstores/docarray/hnsw.py index 3d54a87a876dc2..9e334c3c47b3da 100644 --- a/langchain/vectorstores/docarray/hnsw.py +++ b/langchain/vectorstores/docarray/hnsw.py @@ -7,7 +7,6 @@ from langchain.vectorstores.docarray.base import ( DocArrayIndex, _check_docarray_import, - get_doc_cls, ) @@ -60,8 +59,8 @@ def from_params( _check_docarray_import() from docarray.index import HnswDocumentIndex - doc_cls = get_doc_cls( - n_dim=n_dim, + doc_cls = cls._get_doc_cls( + dim=n_dim, space=dist_metric, max_elements=max_elements, index=index, diff --git a/langchain/vectorstores/docarray/in_memory.py 
b/langchain/vectorstores/docarray/in_memory.py index a66cf68d273f92..8ab664859eb6b3 100644 --- a/langchain/vectorstores/docarray/in_memory.py +++ b/langchain/vectorstores/docarray/in_memory.py @@ -7,7 +7,6 @@ from langchain.vectorstores.docarray.base import ( DocArrayIndex, _check_docarray_import, - get_doc_cls, ) @@ -39,7 +38,7 @@ def from_params( _check_docarray_import() from docarray.index import InMemoryExactNNIndex - doc_cls = get_doc_cls(space=metric, **kwargs) + doc_cls = cls._get_doc_cls(space=metric, **kwargs) doc_index = InMemoryExactNNIndex[doc_cls]() # type: ignore return cls(doc_index, embedding) diff --git a/tests/integration_tests/vectorstores/docarray/test_hnsw.py b/tests/integration_tests/vectorstores/docarray/test_hnsw.py index b847cbfa0fd631..0143660f126e1c 100644 --- a/tests/integration_tests/vectorstores/docarray/test_hnsw.py +++ b/tests/integration_tests/vectorstores/docarray/test_hnsw.py @@ -45,6 +45,7 @@ def test_sim_search(metric: str, texts: List[str], tmp_path: Path) -> None: work_dir=str(tmp_path), n_dim=10, dist_metric=metric, + index=True, ) output = hnsw_vec_store.similarity_search("foo", k=1) assert output == [Document(page_content="foo")] @@ -62,7 +63,6 @@ def test_sim_search_all_configurations( dist_metric=metric, n_dim=10, max_elements=8, - index=False, ef_construction=300, ef=20, M=8, diff --git a/tests/integration_tests/vectorstores/docarray/test_in_memory.py b/tests/integration_tests/vectorstores/docarray/test_in_memory.py index 33b4eab17c7db9..ca556b11cc5b82 100644 --- a/tests/integration_tests/vectorstores/docarray/test_in_memory.py +++ b/tests/integration_tests/vectorstores/docarray/test_in_memory.py @@ -1,4 +1,5 @@ from pathlib import Path +from typing import List import numpy as np import pytest @@ -8,9 +9,13 @@ from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings -def test_in_memory_vec_store_from_texts() -> None: +@pytest.fixture +def texts() -> List[str]: + return ["foo", "bar", "baz"] + + 
+def test_from_texts(texts: List[str]) -> None: """Test end to end construction and simple similarity search.""" - texts = ["foo", "bar", "baz"] docsearch = DocArrayInMemorySearch.from_texts( texts, FakeEmbeddings(), @@ -19,21 +24,18 @@ def test_in_memory_vec_store_from_texts() -> None: assert docsearch.doc_index.num_docs() == 3 -def test_in_memory_vec_store_add_texts(tmp_path: Path) -> None: +def test_add_texts(texts: List[str], tmp_path: Path) -> None: """Test end to end construction and simple similarity search.""" - docsearch = DocArrayInMemorySearch( - embedding=FakeEmbeddings(), - ) + docsearch = DocArrayInMemorySearch.from_params(FakeEmbeddings()) assert isinstance(docsearch, DocArrayInMemorySearch) assert docsearch.doc_index.num_docs() == 0 - texts = ["foo", "bar", "baz"] docsearch.add_texts(texts=texts) assert docsearch.doc_index.num_docs() == 3 @pytest.mark.parametrize("metric", ["cosine_sim", "euclidean_dist", "sqeuclidean_dist"]) -def test_sim_search(metric: str) -> None: +def test_sim_search(metric: str, texts: List[str]) -> None: """Test end to end construction and simple similarity search.""" texts = ["foo", "bar", "baz"] in_memory_vec_store = DocArrayInMemorySearch.from_texts( @@ -47,9 +49,8 @@ def test_sim_search(metric: str) -> None: @pytest.mark.parametrize("metric", ["cosine_sim", "euclidean_dist", "sqeuclidean_dist"]) -def test_sim_search_with_score(metric: str) -> None: +def test_sim_search_with_score(metric: str, texts: List[str]) -> None: """Test end to end construction and similarity search with score.""" - texts = ["foo", "bar", "baz"] in_memory_vec_store = DocArrayInMemorySearch.from_texts( texts=texts, embedding=FakeEmbeddings(), @@ -66,9 +67,8 @@ def test_sim_search_with_score(metric: str) -> None: @pytest.mark.parametrize("metric", ["cosine_sim", "euclidean_dist", "sqeuclidean_dist"]) -def test_sim_search_by_vector(metric: str) -> None: +def test_sim_search_by_vector(metric: str, texts: List[str]) -> None: """Test end to end 
construction and similarity search by vector.""" - texts = ["foo", "bar", "baz"] in_memory_vec_store = DocArrayInMemorySearch.from_texts( texts=texts, embedding=FakeEmbeddings(), @@ -82,9 +82,8 @@ def test_sim_search_by_vector(metric: str) -> None: @pytest.mark.parametrize("metric", ["cosine_sim", "euclidean_dist", "sqeuclidean_dist"]) -def test_max_marginal_relevance_search(metric: str) -> None: +def test_max_marginal_relevance_search(metric: str, texts: List[str]) -> None: """Test MRR search.""" - texts = ["foo", "bar", "baz"] metadatas = [{"page": i} for i in range(len(texts))] docsearch = DocArrayInMemorySearch.from_texts( texts, FakeEmbeddings(), metadatas=metadatas, metric=metric From 672baf3449dc686bd985fa2c257d8f624e75aff5 Mon Sep 17 00:00:00 2001 From: Dev 2049 Date: Wed, 10 May 2023 14:50:55 -0700 Subject: [PATCH 18/19] docs --- .../vectorstores/examples/docarray_hnsw.ipynb | 227 ++++++++++++++++++ .../examples/docarray_in_memory.ipynb | 210 ++++++++++++++++ 2 files changed, 437 insertions(+) create mode 100644 docs/modules/indexes/vectorstores/examples/docarray_hnsw.ipynb create mode 100644 docs/modules/indexes/vectorstores/examples/docarray_in_memory.ipynb diff --git a/docs/modules/indexes/vectorstores/examples/docarray_hnsw.ipynb b/docs/modules/indexes/vectorstores/examples/docarray_hnsw.ipynb new file mode 100644 index 00000000000000..01686c6ab731e9 --- /dev/null +++ b/docs/modules/indexes/vectorstores/examples/docarray_hnsw.ipynb @@ -0,0 +1,227 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2ce41f46-5711-4311-b04d-2fe233ac5b1b", + "metadata": {}, + "source": [ + "# DocArrayHnswSearch\n", + "\n", + ">[DocArrayHnswSearch](https://docs.docarray.org/user_guide/storing/index_hnswlib/) is a lightweight Document Index implementation provided by [DocArray](https://docs.docarray.org/) that runs fully locally and is best suited for small- to medium-sized datasets.
It stores vectors on disk in [hnswlib](https://github.com/nmslib/hnswlib), and stores all other data in [SQLite](https://www.sqlite.org/index.html).\n", + "\n", + "This notebook shows how to use functionality related to the `DocArrayHnswSearch`." + ] + }, + { + "cell_type": "markdown", + "id": "7ee37d28", + "metadata": {}, + "source": [ + "# Setup\n", + "\n", + "Uncomment the cells below to install docarray and get/set your OpenAI API key if you haven't already done so." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8ce1b8cb-dbf0-40c3-99ee-04f28143331b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# !pip install \"docarray[hnswlib]\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "878f17df-100f-4854-9e87-472cf36d51f3", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# Get an OpenAI token: https://platform.openai.com/account/api-keys\n", + "\n", + "# import os\n", + "# from getpass import getpass\n", + "\n", + "# OPENAI_API_KEY = getpass()\n", + "\n", + "# os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY" + ] + }, + { + "cell_type": "markdown", + "id": "8dbb6de2", + "metadata": { + "tags": [] + }, + "source": [ + "# Using DocArrayHnswSearch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b757afef-ef0a-465d-8e8a-9aadb9c32b88", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import DocArrayHnswSearch\n", + "from langchain.document_loaders import TextLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "605e200e-e711-486b-b36e-cbe5dd2512d7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "documents = TextLoader('../../../state_of_the_union.txt').load()\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000,
chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)\n", + "\n", + "embeddings = OpenAIEmbeddings()\n", + "\n", + "db = DocArrayHnswSearch.from_documents(docs, embeddings, work_dir='hnswlib_store/', n_dim=1536)" + ] + }, + { + "cell_type": "markdown", + "id": "ed6f905b-4853-4a44-9730-614aa8e22b78", + "metadata": {}, + "source": [ + "## Similarity search" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4d7e742f-2002-449d-a10e-16046890906c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = db.similarity_search(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0da9e26f-1fc2-48e6-95a7-f692c853bbd3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n" + ] + } + ], + "source": [ + "print(docs[0].page_content)" + ] + }, + { + "cell_type": "markdown", + "id": "3febb987-e903-416f-af26-6897d84c8d61", + "metadata": {}, + "source": [ + "## Similarity search with score" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "40764fdd-357d-475a-8152-5f1979d61a45", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "docs = db.similarity_search_with_score(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a479fc46-b299-4330-89b9-e9b5a218ea03", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={}),\n", + " 0.36962226)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "4d3d4e97-5d2b-4571-8ff9-e3f6b6778714", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import shutil\n", + "# delete the dir\n", + "shutil.rmtree('hnswlib_store')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/modules/indexes/vectorstores/examples/docarray_in_memory.ipynb b/docs/modules/indexes/vectorstores/examples/docarray_in_memory.ipynb new file mode 100644 index 00000000000000..8bc6ffdf2cd19a --- /dev/null +++ b/docs/modules/indexes/vectorstores/examples/docarray_in_memory.ipynb @@ -0,0 +1,210 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a3afefb0-7e99-4912-a222-c6b186da11af", + "metadata": {}, + "source": [ + "# DocArrayInMemorySearch\n", + "\n", + ">[DocArrayInMemorySearch](https://docs.docarray.org/user_guide/storing/index_in_memory/) is a document index provided by [DocArray](https://docs.docarray.org/) that stores documents in memory. It is a great starting point for small datasets, where you may not want to launch a database server.\n", + "\n", + "This notebook shows how to use functionality related to the `DocArrayInMemorySearch`."
+ ] + }, + { + "cell_type": "markdown", + "id": "5031a3ec", + "metadata": {}, + "source": [ + "# Setup\n", + "\n", + "Uncomment the cells below to install docarray and get/set your OpenAI API key if you haven't already done so." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7cd7391f-7759-4a21-952a-2ec972d818c6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# !pip install \"docarray\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6a40ad8-920e-4370-818d-3227e2f506ed", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Get an OpenAI token: https://platform.openai.com/account/api-keys\n", + "\n", + "# import os\n", + "# from getpass import getpass\n", + "\n", + "# OPENAI_API_KEY = getpass()\n", + "\n", + "# os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e49be085-ddf1-4028-8c0c-97836ce4a873", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import DocArrayInMemorySearch\n", + "from langchain.document_loaders import TextLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "38222aee-adc5-44c2-913c-97977b394cf5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "documents = TextLoader('../../../state_of_the_union.txt').load()\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)\n", + "\n", + "embeddings = OpenAIEmbeddings()\n", + "\n", + "db = DocArrayInMemorySearch.from_documents(docs, embeddings)" + ] + }, + { + "cell_type": "markdown", + "id": "efbb6684-3846-4332-a624-ddd4d75844c1", + "metadata": {}, + "source": [ + "## Similarity search" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id":
"aa28a7f8-41d0-4299-84eb-91d1576e8a63", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = db.similarity_search(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1eb16d2a-b466-456a-b412-5e74bb8523dd", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n" + ] + } + ], + "source": [ + "print(docs[0].page_content)" + ] + }, + { + "cell_type": "markdown", + "id": "43896697-f99e-47b6-9117-47a25e9afa9c", + "metadata": {}, + "source": [ + "## Similarity search with score" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8e9eef05-1516-469a-ad36-880c69aef7a9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "docs = db.similarity_search_with_score(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "bd5fb0e4-2a94-4bb4-af8a-27327ecb1a7f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. 
Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={}),\n", + " 0.8154190158347903)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e5da522-ef0e-4a59-91ea-89e563f7b825", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From e97c894b881a070c717a2b8406b31e0cc9e937c6 Mon Sep 17 00:00:00 2001 From: Dev 2049 Date: Wed, 10 May 2023 14:54:47 -0700 Subject: [PATCH 19/19] nit --- langchain/vectorstores/docarray/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/langchain/vectorstores/docarray/base.py b/langchain/vectorstores/docarray/base.py index e5feea2dba4642..d7b2f3c9ac21d0 100644 --- a/langchain/vectorstores/docarray/base.py +++ b/langchain/vectorstores/docarray/base.py @@ -120,7 +120,7 @@ 
def similarity_search( Returns: List of Documents most similar to the query. """ - results = self.similarity_search_with_score(query=query, k=k, **kwargs) + results = self.similarity_search_with_score(query, k=k, **kwargs) return [doc for doc, _ in results] def _similarity_search_with_relevance_scores(