-
Notifications
You must be signed in to change notification settings - Fork 15.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Make document serializable, create utility to create a docstore #9674
Changes from all commits
6483551
5cb334a
e6ab06d
b3a980b
4549ef7
005e087
8e608de
770231c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
"""Create a key-value store for any langchain serializable object.""" | ||
from typing import Callable, Optional | ||
|
||
from langchain.load.dump import dumps | ||
from langchain.load.load import loads | ||
from langchain.load.serializable import Serializable | ||
from langchain.schema import BaseStore, Document | ||
from langchain.storage.encoder_backed import EncoderBackedStore | ||
|
||
|
||
def _dump_as_bytes(obj: Serializable) -> bytes:
    """Return a bytes representation of a langchain serializable object.

    Args:
        obj: The serializable object to dump (not necessarily a document).

    Returns:
        The result of ``dumps(obj)`` encoded as UTF-8 bytes.
    """
    return dumps(obj).encode("utf-8")
|
||
|
||
def _dump_document_as_bytes(obj: Document) -> bytes:
    """Return a bytes representation of a document.

    Args:
        obj: The document to dump.

    Returns:
        The result of ``dumps(obj)`` encoded as UTF-8 bytes.

    Raises:
        TypeError: If ``obj`` is not a ``Document`` instance.
    """
    # The annotation alone is not enforced at run time, so check explicitly.
    # The message reports the offending type, matching the check performed
    # in _load_document_from_bytes.
    if not isinstance(obj, Document):
        raise TypeError(f"Expected a Document instance. Got {type(obj)}")
    return dumps(obj).encode("utf-8")
|
||
|
||
def _load_document_from_bytes(serialized: bytes) -> Document:
    """Return a document from a bytes representation.

    Args:
        serialized: UTF-8 encoded bytes to decode and pass to ``loads``.

    Returns:
        The loaded ``Document``.

    Raises:
        TypeError: If the loaded object is not a ``Document`` instance.
    """
    obj = loads(serialized.decode("utf-8"))
    # ``loads`` may produce other object types; only documents are accepted
    # by this loader.
    if not isinstance(obj, Document):
        raise TypeError(f"Expected a Document instance. Got {type(obj)}")
    return obj
|
||
|
||
def _load_from_bytes(serialized: bytes) -> Serializable:
    """Return a langchain serializable object from a bytes representation.

    Args:
        serialized: UTF-8 encoded bytes to decode and pass to ``loads``.

    Returns:
        Whatever object ``loads`` reconstructs; no type check is performed.
    """
    return loads(serialized.decode("utf-8"))
|
||
|
||
def _identity(x: str) -> str: | ||
"""Return the same object.""" | ||
return x | ||
|
||
|
||
# PUBLIC API | ||
|
||
|
||
def create_lc_store(
    store: BaseStore[str, bytes],
    *,
    key_encoder: Optional[Callable[[str], str]] = None,
) -> BaseStore[str, Serializable]:
    """Create a store for langchain serializable objects from a bytes store.

    Values are converted to bytes with ``_dump_as_bytes`` and converted back
    with ``_load_from_bytes``; no run time type checking is done on values.

    Args:
        store: A bytes store to use as the underlying store.
        key_encoder: A function to encode keys; if None uses identity function.

    Returns:
        A key-value store for langchain serializable objects.
    """
    return EncoderBackedStore(
        store,
        key_encoder or _identity,
        _dump_as_bytes,
        _load_from_bytes,
    )
|
||
|
||
def create_kv_docstore( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Another possibility is below, but it seems like over-abstraction. Objects we'll want to serialize are:
The other possibility, if anyone has time, is to try to introduce a more general class that proxies the embedding class but gets the type signatures right. The tricky thing is getting the type signatures right :) user passes
|
||
store: BaseStore[str, bytes], | ||
*, | ||
key_encoder: Optional[Callable[[str], str]] = None, | ||
) -> BaseStore[str, Document]: | ||
"""Create a store for langchain Document objects from a bytes store. | ||
|
||
This store does run time type checking to ensure that the values are | ||
Document objects. | ||
|
||
Args: | ||
store: A bytes store to use as the underlying store. | ||
key_encoder: A function to encode keys; if None uses identity function. | ||
|
||
Returns: | ||
A key-value store for documents. | ||
""" | ||
return EncoderBackedStore( | ||
store, | ||
key_encoder or _identity, | ||
_dump_document_as_bytes, | ||
_load_document_from_bytes, | ||
) |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
import tempfile | ||
from typing import Generator, cast | ||
|
||
import pytest | ||
|
||
from langchain.schema import Document | ||
from langchain.storage._lc_store import create_kv_docstore, create_lc_store | ||
from langchain.storage.file_system import LocalFileStore | ||
|
||
|
||
@pytest.fixture
def file_store() -> Generator[LocalFileStore, None, None]:
    """Yield a ``LocalFileStore`` rooted at a temporary directory.

    The store is yielded from inside the ``TemporaryDirectory`` context
    manager, so the directory exists for as long as the fixture is in use
    and is cleaned up when the context manager exits.
    """
    # Create a temporary directory for testing
    with tempfile.TemporaryDirectory() as temp_dir:
        # Instantiate the LocalFileStore with the temporary directory as the root path
        store = LocalFileStore(temp_dir)
        yield store
|
||
|
||
def test_create_lc_store(file_store: LocalFileStore) -> None:
    """Test that a serializable-object store round-trips a Document."""
    docstore = create_lc_store(file_store)
    docstore.mset([("key1", Document(page_content="hello", metadata={"key": "value"}))])
    # The store is typed as holding Serializable values, so narrow the
    # fetched value to Document for the attribute checks below.
    fetched_doc = cast(Document, docstore.mget(["key1"])[0])
    assert fetched_doc.page_content == "hello"
    assert fetched_doc.metadata == {"key": "value"}
|
||
|
||
def test_create_kv_store(file_store: LocalFileStore) -> None:
    """Test that a Document store round-trips a Document.

    Unlike the generic serializable store test, the fetched value is checked
    with ``isinstance`` rather than cast.
    """
    docstore = create_kv_docstore(file_store)
    docstore.mset([("key1", Document(page_content="hello", metadata={"key": "value"}))])
    fetched_doc = docstore.mget(["key1"])[0]
    assert isinstance(fetched_doc, Document)
    assert fetched_doc.page_content == "hello"
    assert fetched_doc.metadata == {"key": "value"}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So all that is different about the specialised dump/load functions is the isinstance check?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, the run-time check and the type annotation.