From 6fafde4c56357d683d8588df32769253a009ddce Mon Sep 17 00:00:00 2001 From: gRedHeadphone Date: Mon, 13 Oct 2025 06:59:31 +0000 Subject: [PATCH 1/2] fix(docs): update command to install packages using uv & update port to default port used in tests --- DEVELOPMENT.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 37095a3..b26725d 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -13,7 +13,7 @@ source .venv/bin/activate Install package in editable mode. ```shell -poetry install --with dev,test,lint +uv sync --group test ``` Start PostgreSQL/PGVector. @@ -22,7 +22,7 @@ docker run --rm -it --name pgvector-container \ -e POSTGRES_USER=langchain \ -e POSTGRES_PASSWORD=langchain \ -e POSTGRES_DB=langchain_test \ - -p 6024:5432 pgvector/pgvector:pg16 \ + -p 5432:5432 pgvector/pgvector:pg16 \ postgres -c log_statement=all ``` From 5b3c3c73633c3317dbef5265afbec647a4e3ec27 Mon Sep 17 00:00:00 2001 From: gRedHeadphone Date: Mon, 13 Oct 2025 08:02:34 +0000 Subject: [PATCH 2/2] feat: json metadata filtering --- langchain_postgres/v2/async_vectorstore.py | 41 ++++- .../fixtures/metadata_filtering_data.py | 173 ++++++++++++++++++ .../v2/test_async_pg_vectorstore_search.py | 9 +- .../v2/test_pg_vectorstore_search.py | 11 +- 4 files changed, 221 insertions(+), 13 deletions(-) diff --git a/langchain_postgres/v2/async_vectorstore.py b/langchain_postgres/v2/async_vectorstore.py index 8382b3e..302dcbd 100644 --- a/langchain_postgres/v2/async_vectorstore.py +++ b/langchain_postgres/v2/async_vectorstore.py @@ -2,6 +2,7 @@ from __future__ import annotations import copy +import datetime import json import uuid from typing import Any, Callable, Iterable, Optional, Sequence @@ -54,6 +55,16 @@ .union(SPECIAL_CASED_OPERATORS) ) +PYTHON_TO_POSTGRES_TYPE_MAP = { + int: "INTEGER", + float: "FLOAT", + str: "TEXT", + bool: "BOOLEAN", + datetime.date: "DATE", + datetime.datetime: "TIMESTAMP", + datetime.time: "TIME", +} + class AsyncPGVectorStore(VectorStore): """Postgres Vector Store class""" @@ -1096,19 +1107,33 @@ def _handle_field_filter( operator = "$eq" filter_value = value + field_selector = field + if self.metadata_json_column is not None and field not in self.metadata_columns and field not in ( + self.id_column, + self.content_column, + self.embedding_column + ): + filter_value_type = type(filter_value[0]) if (isinstance(filter_value, list) or isinstance(filter_value, tuple)) else type(filter_value) + postgres_type = PYTHON_TO_POSTGRES_TYPE_MAP.get(filter_value_type) + if postgres_type is None: + raise ValueError(f"Unsupported type: {filter_value_type}") + field_selector = f"{self.metadata_json_column}->>'{field}'" + if postgres_type != "TEXT" and operator != "$exists": + field_selector = f"({field_selector})::{postgres_type}" + suffix_id = str(uuid.uuid4()).split("-")[0] if operator in COMPARISONS_TO_NATIVE: # Then we implement an equality filter # native is trusted input native = COMPARISONS_TO_NATIVE[operator] param_name = f"{field}_{suffix_id}" - return f"{field} {native} :{param_name}", {f"{param_name}": filter_value} + return f"{field_selector} {native} :{param_name}", {f"{param_name}": filter_value} elif operator == "$between": # Use AND with two comparisons low, high = filter_value low_param_name = f"{field}_low_{suffix_id}" high_param_name = f"{field}_high_{suffix_id}" - return f"({field} BETWEEN :{low_param_name} AND :{high_param_name})", { + return f"({field_selector} BETWEEN :{low_param_name} AND :{high_param_name})", { f"{low_param_name}": low, f"{high_param_name}": high, } @@ -1126,18 +1151,18 @@ def _handle_field_filter( ) param_name = f"{field}_{operator.replace('$', '')}_{suffix_id}" if operator == "$in": - return f"{field} = ANY(:{param_name})", {f"{param_name}": filter_value} + return f"{field_selector} = ANY(:{param_name})", {f"{param_name}": filter_value} else: # i.e. $nin - return f"{field} <> ALL (:{param_name})", { + return f"{field_selector} <> ALL (:{param_name})", { f"{param_name}": filter_value } elif operator in {"$like", "$ilike"}: param_name = f"{field}_{operator.replace('$', '')}_{suffix_id}" if operator == "$like": - return f"({field} LIKE :{param_name})", {f"{param_name}": filter_value} + return f"({field_selector} LIKE :{param_name})", {f"{param_name}": filter_value} else: # i.e. $ilike - return f"({field} ILIKE :{param_name})", {f"{param_name}": filter_value} + return f"({field_selector} ILIKE :{param_name})", {f"{param_name}": filter_value} elif operator == "$exists": if not isinstance(filter_value, bool): raise ValueError( @@ -1146,9 +1171,9 @@ def _handle_field_filter( ) else: if filter_value: - return f"({field} IS NOT NULL)", {} + return f"({field_selector} IS NOT NULL)", {} else: - return f"({field} IS NULL)", {} + return f"({field_selector} IS NULL)", {} else: raise NotImplementedError() diff --git a/tests/unit_tests/fixtures/metadata_filtering_data.py b/tests/unit_tests/fixtures/metadata_filtering_data.py index 8df8c01..684eef4 100644 --- a/tests/unit_tests/fixtures/metadata_filtering_data.py +++ b/tests/unit_tests/fixtures/metadata_filtering_data.py @@ -239,6 +239,179 @@ {"inventory_location": {"$exists": False}}, ["WB003"], ), + # JSON metadata filter + ( + {"code_json": "FT004"}, + ["FT004"], + ), + ( + {"name_json": "Smart Fitness Tracker"}, + ["FT004"], + ), + ( + {"is_available_json": True}, + ["WH001", "FT004", "EC002"], + ), + ( + {"code_json": "WH001", "is_available_json": True}, + ["WH001"], + ), + ( + {"available_quantity_json": {"$eq": 10}}, + ["EC002"], + ), + ( + {"available_quantity_json": {"$ne": 0}}, + ["WH001", "FT004", "EC002"], + ), + ( + {"available_quantity_json": {"$gt": 60}}, + ["FT004"], + ), + ( + {"available_quantity_json": {"$gte": 50}}, + ["WH001", "FT004"], + ), + ( + {"available_quantity_json": {"$lt": 5}}, + ["WB003"], + ), + ( + {"available_quantity_json": {"$lte": 10}}, + ["WB003", "EC002"], + ), + ( + {"code_json": {"$eq": "WH001"}}, + ["WH001"], + ), + ( + {"code_json": {"$ne": "WB003"}}, + ["WH001", "FT004", "EC002"], + ), + ( + {"name_json": {"$gt": "Wireless Headphones"}}, + [], + ), + ( + {"name_json": {"$gte": "Wireless Headphones"}}, + ["WH001"], + ), + ( + {"name_json": {"$lt": "Smart Fitness Tracker"}}, + ["EC002"], + ), + ( + {"name_json": {"$lte": "Smart Fitness Tracker"}}, + ["FT004", "EC002"], + ), + ( + {"is_available_json": {"$eq": True}}, + ["WH001", "FT004", "EC002"], + ), + ( + {"is_available_json": {"$ne": True}}, + ["WB003"], + ), + ( + {"price_json": {"$gt": 200.0}}, + ["EC002"], + ), + ( + {"price_json": {"$gte": 149.99}}, + ["WH001", "EC002"], + ), + ( + {"price_json": {"$lt": 50.0}}, + ["WB003"], + ), + ( + {"price_json": {"$lte": 79.95}}, + ["FT004", "WB003"], + ), + ( + {"$or": [{"code_json": "WH001"}, {"code_json": "EC002"}]}, + ["WH001", "EC002"], + ), + ( + {"$or": [{"code_json": "WH001"}, {"available_quantity_json": 10}]}, + ["WH001", "EC002"], + ), + ( + {"$and": [{"code_json": "WH001"}, {"code_json": "EC002"}]}, + [], + ), + ( + {"$not": {"code_json": "WB003"}}, + ["WH001", "FT004", "EC002"], + ), + ( + {"$not": [{"code_json": "WB003"}]}, + ["WH001", "FT004", "EC002"], + ), + ( + {"$not": {"available_quantity_json": 0}}, + ["WH001", "FT004", "EC002"], + ), + ( + {"$not": [{"available_quantity_json": 0}]}, + ["WH001", "FT004", "EC002"], + ), + ( + {"$not": {"is_available_json": True}}, + ["WB003"], + ), + ( + {"$not": [{"is_available_json": True}]}, + ["WB003"], + ), + ( + {"$not": {"price_json": {"$gt": 150.0}}}, + ["WH001", "FT004", "WB003"], + ), + ( + {"$not": [{"price_json": {"$gt": 150.0}}]}, + ["WH001", "FT004", "WB003"], + ), + ( + {"available_quantity_json": {"$between": (40, 60)}}, + ["WH001"], + ), + ( + {"name_json": {"$in": ["Smart Fitness Tracker", "Stainless Steel Water Bottle"]}}, + ["FT004", "WB003"], + ), + ( + {"available_quantity_json": {"$in": [0, 10]}}, + ["WB003", "EC002"], + ), + ( + {"name_json": {"$nin": ["Smart Fitness Tracker", "Stainless Steel Water Bottle"]}}, + ["WH001", "EC002"], + ), + ( + {"available_quantity_json": {"$nin": [50, 0, 10]}}, + ["FT004"], + ), + ( + {"name_json": {"$like": "Wireless%"}}, + ["WH001"], + ), + ( + {"name_json": {"$like": "%less%"}}, + ["WH001", "WB003"], + ), + ( + {"$or": [{"code_json": {"$like": "WH00%"}}, {"code_json": {"$like": "EC00%"}}]}, + ["WH001", "EC002"], + ), + ( + {"tags_json": {"$exists": False}}, + [], + ), + ( + {"inventory_location_json": {"$exists": False}}, + ["WB003"], + ) ] NEGATIVE_TEST_CASES = [ diff --git a/tests/unit_tests/v2/test_async_pg_vectorstore_search.py b/tests/unit_tests/v2/test_async_pg_vectorstore_search.py index 16c70fd..7211659 100644 --- a/tests/unit_tests/v2/test_async_pg_vectorstore_search.py +++ b/tests/unit_tests/v2/test_async_pg_vectorstore_search.py @@ -46,7 +46,12 @@ embeddings = [embeddings_service.embed_query("foo") for i in range(len(texts))] filter_docs = [ - Document(page_content=texts[i], metadata=METADATAS[i]) for i in range(len(texts)) + Document( + page_content=texts[i], + metadata=( + METADATAS[i] | {f"{key}_json": value for key, value in METADATAS[i].items()} + ) + ) for i in range(len(texts)) ] # Documents designed for hybrid search testing hybrid_docs_content = { @@ -194,7 +199,7 @@ async def vs_custom_filter( Column("available_quantity", "INTEGER", nullable=True), ], id_column="langchain_id", - store_metadata=False, + store_metadata=True, ) vs_custom_filter = await AsyncPGVectorStore.create( diff --git a/tests/unit_tests/v2/test_pg_vectorstore_search.py b/tests/unit_tests/v2/test_pg_vectorstore_search.py index 7815a25..0ca690d 100644 --- a/tests/unit_tests/v2/test_pg_vectorstore_search.py +++ b/tests/unit_tests/v2/test_pg_vectorstore_search.py @@ -42,7 +42,12 @@ Document(page_content=texts[i], metadata=metadatas[i]) for i in range(len(texts)) ] filter_docs = [ - Document(page_content=texts[i], metadata=METADATAS[i]) for i in range(len(texts)) + Document( + page_content=texts[i], + metadata=( + METADATAS[i] | {f"{key}_json": value for key, value in METADATAS[i].items()} + ) + ) for i in range(len(texts)) ] embeddings = [embeddings_service.embed_query("foo") for i in range(len(texts))] @@ -141,7 +146,7 @@ async def vs_custom_filter(self, engine: PGEngine) -> AsyncIterator[PGVectorStor Column("available_quantity", "INTEGER", nullable=True), ], id_column="langchain_id", - store_metadata=False, + store_metadata=True, overwrite_existing=True, ) @@ -352,7 +357,7 @@ async def vs_custom_filter_sync( Column("available_quantity", "INTEGER", nullable=True), ], id_column="langchain_id", - store_metadata=False, + store_metadata=True, overwrite_existing=True, )