From aa9ccca77509d3d9584c75972cf7635b710d9bcc Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Wed, 20 Mar 2024 13:00:22 -0400 Subject: [PATCH] langchain[patch]: Add tests for indexing (#19342) This PR adds tests for the indexing API --- .../tests/unit_tests/indexes/test_indexing.py | 154 ++++++++++++++++++ 1 file changed, 154 insertions(+) diff --git a/libs/langchain/tests/unit_tests/indexes/test_indexing.py b/libs/langchain/tests/unit_tests/indexes/test_indexing.py index 4d939815e39779..73c906a1850480 100644 --- a/libs/langchain/tests/unit_tests/indexes/test_indexing.py +++ b/libs/langchain/tests/unit_tests/indexes/test_indexing.py @@ -736,6 +736,160 @@ def test_incremental_delete( } +def test_incremental_delete_with_batch_size( + record_manager: SQLRecordManager, vector_store: InMemoryVectorStore +) -> None: + """Test indexing with incremental deletion strategy and batch size.""" + loader = ToyLoader( + documents=[ + Document( + page_content="1", + metadata={"source": "1"}, + ), + Document( + page_content="2", + metadata={"source": "2"}, + ), + Document( + page_content="3", + metadata={"source": "3"}, + ), + Document( + page_content="4", + metadata={"source": "4"}, + ), + ] + ) + + with patch.object( + record_manager, "get_time", return_value=datetime(2021, 1, 2).timestamp() + ): + assert index( + loader, + record_manager, + vector_store, + cleanup="incremental", + source_id_key="source", + batch_size=3, + ) == { + "num_added": 4, + "num_deleted": 0, + "num_skipped": 0, + "num_updated": 0, + } + + doc_texts = set( + # Ignoring type since doc should be in the store and not a None + vector_store.store.get(uid).page_content # type: ignore + for uid in vector_store.store + ) + assert doc_texts == {"1", "2", "3", "4"} + + # Attempt to index again and verify that nothing changes + with patch.object( + record_manager, "get_time", return_value=datetime(2021, 1, 2).timestamp() + ): + assert index( + loader, + record_manager, + vector_store, + cleanup="incremental", + 
source_id_key="source", + batch_size=3, + ) == { + "num_added": 0, + "num_deleted": 0, + "num_skipped": 4, + "num_updated": 0, + } + + # Attempt to index again and verify that nothing changes + with patch.object( + record_manager, "get_time", return_value=datetime(2022, 1, 3).timestamp() + ): + # Docs with same content + docs = [ + Document( + page_content="1", + metadata={"source": "1"}, + ), + Document( + page_content="2", + metadata={"source": "2"}, + ), + ] + assert index( + docs, + record_manager, + vector_store, + cleanup="incremental", + source_id_key="source", + batch_size=1, + ) == { + "num_added": 0, + "num_deleted": 0, + "num_skipped": 2, + "num_updated": 0, + } + + # Attempt to index again and verify that nothing changes + with patch.object( + record_manager, "get_time", return_value=datetime(2023, 1, 3).timestamp() + ): + # Docs with same content + docs = [ + Document( + page_content="1", + metadata={"source": "1"}, + ), + Document( + page_content="2", + metadata={"source": "2"}, + ), + ] + assert index( + docs, + record_manager, + vector_store, + cleanup="incremental", + source_id_key="source", + batch_size=1, + ) == { + "num_added": 0, + "num_deleted": 0, + "num_skipped": 2, + "num_updated": 0, + } + + # Try to index with changed docs now + with patch.object( + record_manager, "get_time", return_value=datetime(2024, 1, 3).timestamp() + ): + # Docs with changed content + docs = [ + Document( + page_content="changed 1", + metadata={"source": "1"}, + ), + Document( + page_content="changed 2", + metadata={"source": "2"}, + ), + ] + assert index( + docs, + record_manager, + vector_store, + cleanup="incremental", + source_id_key="source", + ) == { + "num_added": 2, + "num_deleted": 2, + "num_skipped": 0, + "num_updated": 0, + } + + @pytest.mark.requires("aiosqlite") async def test_aincremental_delete( arecord_manager: SQLRecordManager, vector_store: InMemoryVectorStore