Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions DEVELOPMENT.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ source .venv/bin/activate

Install package in editable mode.
```shell
poetry install --with dev,test,lint
uv sync --group test
```

Start PostgreSQL/PGVector.
Expand All @@ -22,7 +22,7 @@ docker run --rm -it --name pgvector-container \
-e POSTGRES_USER=langchain \
-e POSTGRES_PASSWORD=langchain \
-e POSTGRES_DB=langchain_test \
-p 6024:5432 pgvector/pgvector:pg16 \
-p 5432:5432 pgvector/pgvector:pg16 \
postgres -c log_statement=all
```

Expand Down
41 changes: 33 additions & 8 deletions langchain_postgres/v2/async_vectorstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from __future__ import annotations

import copy
import datetime
import json
import uuid
from typing import Any, Callable, Iterable, Optional, Sequence
Expand Down Expand Up @@ -54,6 +55,16 @@
.union(SPECIAL_CASED_OPERATORS)
)

# Postgres type name to cast a JSON metadata text value to, looked up by the
# exact Python type of the filter value (via ``type(value)``, so subclasses
# are not matched). Types missing from this map are rejected by the caller.
PYTHON_TO_POSTGRES_TYPE_MAP: dict[type, str] = {
    # Scalar values
    int: "INTEGER",
    float: "FLOAT",
    str: "TEXT",
    bool: "BOOLEAN",
    # Temporal values
    datetime.date: "DATE",
    datetime.datetime: "TIMESTAMP",
    datetime.time: "TIME",
}


class AsyncPGVectorStore(VectorStore):
"""Postgres Vector Store class"""
Expand Down Expand Up @@ -1096,19 +1107,33 @@ def _handle_field_filter(
operator = "$eq"
filter_value = value

field_selector = field
if self.metadata_json_column is not None and field not in self.metadata_columns and field not in (
self.id_column,
self.content_column,
self.embedding_column
):
filter_value_type = type(filter_value[0]) if (isinstance(filter_value, list) or isinstance(filter_value, tuple)) else type(filter_value)
postgres_type = PYTHON_TO_POSTGRES_TYPE_MAP.get(filter_value_type)
if postgres_type is None:
raise ValueError(f"Unsupported type: {filter_value_type}")
field_selector = f"{self.metadata_json_column}->>'{field}'"
if postgres_type != "TEXT" and operator != "$exists":
field_selector = f"({field_selector})::{postgres_type}"

suffix_id = str(uuid.uuid4()).split("-")[0]
if operator in COMPARISONS_TO_NATIVE:
# Then we implement an equality filter
# native is trusted input
native = COMPARISONS_TO_NATIVE[operator]
param_name = f"{field}_{suffix_id}"
return f"{field} {native} :{param_name}", {f"{param_name}": filter_value}
return f"{field_selector} {native} :{param_name}", {f"{param_name}": filter_value}
elif operator == "$between":
# Use AND with two comparisons
low, high = filter_value
low_param_name = f"{field}_low_{suffix_id}"
high_param_name = f"{field}_high_{suffix_id}"
return f"({field} BETWEEN :{low_param_name} AND :{high_param_name})", {
return f"({field_selector} BETWEEN :{low_param_name} AND :{high_param_name})", {
f"{low_param_name}": low,
f"{high_param_name}": high,
}
Expand All @@ -1126,18 +1151,18 @@ def _handle_field_filter(
)
param_name = f"{field}_{operator.replace('$', '')}_{suffix_id}"
if operator == "$in":
return f"{field} = ANY(:{param_name})", {f"{param_name}": filter_value}
return f"{field_selector} = ANY(:{param_name})", {f"{param_name}": filter_value}
else: # i.e. $nin
return f"{field} <> ALL (:{param_name})", {
return f"{field_selector} <> ALL (:{param_name})", {
f"{param_name}": filter_value
}

elif operator in {"$like", "$ilike"}:
param_name = f"{field}_{operator.replace('$', '')}_{suffix_id}"
if operator == "$like":
return f"({field} LIKE :{param_name})", {f"{param_name}": filter_value}
return f"({field_selector} LIKE :{param_name})", {f"{param_name}": filter_value}
else: # i.e. $ilike
return f"({field} ILIKE :{param_name})", {f"{param_name}": filter_value}
return f"({field_selector} ILIKE :{param_name})", {f"{param_name}": filter_value}
elif operator == "$exists":
if not isinstance(filter_value, bool):
raise ValueError(
Expand All @@ -1146,9 +1171,9 @@ def _handle_field_filter(
)
else:
if filter_value:
return f"({field} IS NOT NULL)", {}
return f"({field_selector} IS NOT NULL)", {}
else:
return f"({field} IS NULL)", {}
return f"({field_selector} IS NULL)", {}
else:
raise NotImplementedError()

Expand Down
173 changes: 173 additions & 0 deletions tests/unit_tests/fixtures/metadata_filtering_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,179 @@
{"inventory_location": {"$exists": False}},
["WB003"],
),
# JSON metadata filter
(
{"code_json": "FT004"},
["FT004"],
),
(
{"name_json": "Smart Fitness Tracker"},
["FT004"],
),
(
{"is_available_json": True},
["WH001", "FT004", "EC002"],
),
(
{"code_json": "WH001", "is_available_json": True},
["WH001"],
),
(
{"available_quantity_json": {"$eq": 10}},
["EC002"],
),
(
{"available_quantity_json": {"$ne": 0}},
["WH001", "FT004", "EC002"],
),
(
{"available_quantity_json": {"$gt": 60}},
["FT004"],
),
(
{"available_quantity_json": {"$gte": 50}},
["WH001", "FT004"],
),
(
{"available_quantity_json": {"$lt": 5}},
["WB003"],
),
(
{"available_quantity_json": {"$lte": 10}},
["WB003", "EC002"],
),
(
{"code_json": {"$eq": "WH001"}},
["WH001"],
),
(
{"code_json": {"$ne": "WB003"}},
["WH001", "FT004", "EC002"],
),
(
{"name_json": {"$gt": "Wireless Headphones"}},
[],
),
(
{"name_json": {"$gte": "Wireless Headphones"}},
["WH001"],
),
(
{"name_json": {"$lt": "Smart Fitness Tracker"}},
["EC002"],
),
(
{"name_json": {"$lte": "Smart Fitness Tracker"}},
["FT004", "EC002"],
),
(
{"is_available_json": {"$eq": True}},
["WH001", "FT004", "EC002"],
),
(
{"is_available_json": {"$ne": True}},
["WB003"],
),
(
{"price_json": {"$gt": 200.0}},
["EC002"],
),
(
{"price_json": {"$gte": 149.99}},
["WH001", "EC002"],
),
(
{"price_json": {"$lt": 50.0}},
["WB003"],
),
(
{"price_json": {"$lte": 79.95}},
["FT004", "WB003"],
),
(
{"$or": [{"code_json": "WH001"}, {"code_json": "EC002"}]},
["WH001", "EC002"],
),
(
{"$or": [{"code_json": "WH001"}, {"available_quantity_json": 10}]},
["WH001", "EC002"],
),
(
{"$and": [{"code_json": "WH001"}, {"code_json": "EC002"}]},
[],
),
(
{"$not": {"code_json": "WB003"}},
["WH001", "FT004", "EC002"],
),
(
{"$not": [{"code_json": "WB003"}]},
["WH001", "FT004", "EC002"],
),
(
{"$not": {"available_quantity_json": 0}},
["WH001", "FT004", "EC002"],
),
(
{"$not": [{"available_quantity_json": 0}]},
["WH001", "FT004", "EC002"],
),
(
{"$not": {"is_available_json": True}},
["WB003"],
),
(
{"$not": [{"is_available_json": True}]},
["WB003"],
),
(
{"$not": {"price_json": {"$gt": 150.0}}},
["WH001", "FT004", "WB003"],
),
(
{"$not": [{"price_json": {"$gt": 150.0}}]},
["WH001", "FT004", "WB003"],
),
(
{"available_quantity_json": {"$between": (40, 60)}},
["WH001"],
),
(
{"name_json": {"$in": ["Smart Fitness Tracker", "Stainless Steel Water Bottle"]}},
["FT004", "WB003"],
),
(
{"available_quantity_json": {"$in": [0, 10]}},
["WB003", "EC002"],
),
(
{"name_json": {"$nin": ["Smart Fitness Tracker", "Stainless Steel Water Bottle"]}},
["WH001", "EC002"],
),
(
{"available_quantity_json": {"$nin": [50, 0, 10]}},
["FT004"],
),
(
{"name_json": {"$like": "Wireless%"}},
["WH001"],
),
(
{"name_json": {"$like": "%less%"}},
["WH001", "WB003"],
),
(
{"$or": [{"code_json": {"$like": "WH00%"}}, {"code_json": {"$like": "EC00%"}}]},
["WH001", "EC002"],
),
(
{"tags_json": {"$exists": False}},
[],
),
(
{"inventory_location_json": {"$exists": False}},
["WB003"],
)
]

NEGATIVE_TEST_CASES = [
Expand Down
9 changes: 7 additions & 2 deletions tests/unit_tests/v2/test_async_pg_vectorstore_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,12 @@
embeddings = [embeddings_service.embed_query("foo") for i in range(len(texts))]

filter_docs = [
Document(page_content=texts[i], metadata=METADATAS[i]) for i in range(len(texts))
Document(
page_content=texts[i],
metadata=(
METADATAS[i] | {f"{key}_json": value for key, value in METADATAS[i].items()}
)
) for i in range(len(texts))
]
# Documents designed for hybrid search testing
hybrid_docs_content = {
Expand Down Expand Up @@ -194,7 +199,7 @@ async def vs_custom_filter(
Column("available_quantity", "INTEGER", nullable=True),
],
id_column="langchain_id",
store_metadata=False,
store_metadata=True,
)

vs_custom_filter = await AsyncPGVectorStore.create(
Expand Down
11 changes: 8 additions & 3 deletions tests/unit_tests/v2/test_pg_vectorstore_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,12 @@
Document(page_content=texts[i], metadata=metadatas[i]) for i in range(len(texts))
]
filter_docs = [
Document(page_content=texts[i], metadata=METADATAS[i]) for i in range(len(texts))
Document(
page_content=texts[i],
metadata=(
METADATAS[i] | {f"{key}_json": value for key, value in METADATAS[i].items()}
)
) for i in range(len(texts))
]

embeddings = [embeddings_service.embed_query("foo") for i in range(len(texts))]
Expand Down Expand Up @@ -141,7 +146,7 @@ async def vs_custom_filter(self, engine: PGEngine) -> AsyncIterator[PGVectorStor
Column("available_quantity", "INTEGER", nullable=True),
],
id_column="langchain_id",
store_metadata=False,
store_metadata=True,
overwrite_existing=True,
)

Expand Down Expand Up @@ -352,7 +357,7 @@ async def vs_custom_filter_sync(
Column("available_quantity", "INTEGER", nullable=True),
],
id_column="langchain_id",
store_metadata=False,
store_metadata=True,
overwrite_existing=True,
)

Expand Down