Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make document serializable, create utility to create a docstore #9674

Merged
merged 8 commits into from
Aug 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions libs/langchain/langchain/schema/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@ class Document(Serializable):
documents, etc.).
"""

@property
def lc_serializable(self) -> bool:
    """Documents participate in langchain serialization.

    Returning ``True`` opts this class into ``dumps``/``loads`` round-trips.
    """
    return True


class BaseDocumentTransformer(ABC):
"""Abstract base class for document transformation systems.
Expand Down
3 changes: 3 additions & 0 deletions libs/langchain/langchain/storage/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
The primary goal of these storages is to support implementation of caching.
"""

from langchain.storage._lc_store import create_kv_docstore, create_lc_store
from langchain.storage.encoder_backed import EncoderBackedStore
from langchain.storage.file_system import LocalFileStore
from langchain.storage.in_memory import InMemoryStore
Expand All @@ -16,4 +17,6 @@
"InMemoryStore",
"LocalFileStore",
"RedisStore",
"create_lc_store",
"create_kv_docstore",
]
88 changes: 88 additions & 0 deletions libs/langchain/langchain/storage/_lc_store.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
"""Create a key-value store for any langchain serializable object."""
from typing import Callable, Optional

from langchain.load.dump import dumps
from langchain.load.load import loads
from langchain.load.serializable import Serializable
from langchain.schema import BaseStore, Document
from langchain.storage.encoder_backed import EncoderBackedStore


def _dump_as_bytes(obj: Serializable) -> bytes:
    """Serialize a langchain object to its JSON form, UTF-8 encoded."""
    serialized = dumps(obj)
    return serialized.encode("utf-8")


def _dump_document_as_bytes(obj: Document) -> bytes:
    """Serialize a Document to UTF-8 bytes, rejecting non-Document values."""
    if isinstance(obj, Document):
        return dumps(obj).encode("utf-8")
    # Run-time guard: callers must pass Document instances only.
    raise TypeError("Expected a Document instance")


def _load_document_from_bytes(serialized: bytes) -> Document:
    """Deserialize UTF-8 bytes back into a Document, type-checked at run time."""
    decoded = serialized.decode("utf-8")
    value = loads(decoded)
    if isinstance(value, Document):
        return value
    raise TypeError(f"Expected a Document instance. Got {type(value)}")


def _load_from_bytes(serialized: bytes) -> Serializable:
    """Deserialize a langchain object from its UTF-8 bytes representation."""
    text = serialized.decode("utf-8")
    return loads(text)


def _identity(x: str) -> str:
"""Return the same object."""
return x


# PUBLIC API


def create_lc_store(
    store: BaseStore[str, bytes],
    *,
    key_encoder: Optional[Callable[[str], str]] = None,
) -> BaseStore[str, Serializable]:
    """Create a store for langchain serializable objects from a bytes store.

    Args:
        store: A bytes store to use as the underlying store.
        key_encoder: A function to encode keys; if None uses identity function.

    Returns:
        A key-value store for documents.
    """
    # Fall back to the pass-through encoder when none is supplied.
    encode_key = _identity if key_encoder is None else key_encoder
    return EncoderBackedStore(
        store,
        encode_key,
        _dump_as_bytes,
        _load_from_bytes,
    )


def create_kv_docstore(
    store: BaseStore[str, bytes],
    *,
    key_encoder: Optional[Callable[[str], str]] = None,
) -> BaseStore[str, Document]:
    """Create a store for langchain Document objects from a bytes store.

    This store does run time type checking to ensure that the values are
    Document objects.

    Args:
        store: A bytes store to use as the underlying store.
        key_encoder: A function to encode keys; if None uses identity function.

    Returns:
        A key-value store for documents.
    """
    # Document-specific codecs enforce the run-time type check on load/dump.
    encode_key = key_encoder if key_encoder is not None else _identity
    return EncoderBackedStore(
        store,
        encode_key,
        _dump_document_as_bytes,
        _load_document_from_bytes,
    )
36 changes: 36 additions & 0 deletions libs/langchain/tests/unit_tests/storage/test_lc_store.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import tempfile
from typing import Generator, cast

import pytest

from langchain.schema import Document
from langchain.storage._lc_store import create_kv_docstore, create_lc_store
from langchain.storage.file_system import LocalFileStore


@pytest.fixture
def file_store() -> Generator[LocalFileStore, None, None]:
    """Yield a LocalFileStore rooted in a throwaway temporary directory."""
    with tempfile.TemporaryDirectory() as temp_dir:
        # Directory (and all stored files) is removed when the test finishes.
        yield LocalFileStore(temp_dir)


def test_create_lc_store(file_store: LocalFileStore) -> None:
    """Round-trip a Document through a store built by create_lc_store."""
    docstore = create_lc_store(file_store)
    original = Document(page_content="hello", metadata={"key": "value"})
    docstore.mset([("key1", original)])
    restored = cast(Document, docstore.mget(["key1"])[0])
    assert restored.page_content == "hello"
    assert restored.metadata == {"key": "value"}


def test_create_kv_store(file_store: LocalFileStore) -> None:
    """Round-trip a Document through a store built by create_kv_docstore."""
    docstore = create_kv_docstore(file_store)
    docstore.mset([("key1", Document(page_content="hello", metadata={"key": "value"}))])
    restored = docstore.mget(["key1"])[0]
    # The kv docstore type-checks on load, so this must be a real Document.
    assert isinstance(restored, Document)
    assert restored.page_content == "hello"
    assert restored.metadata == {"key": "value"}