Skip to content

Commit

Permalink
Merge 41ae1f3 into 1609ff8
Browse files Browse the repository at this point in the history
  • Loading branch information
simonwoerpel committed Sep 28, 2023
2 parents 1609ff8 + 41ae1f3 commit dd943fc
Show file tree
Hide file tree
Showing 16 changed files with 454 additions and 188 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## CHANGELOG

### 0.4.0 (2023-09-19)

- Refactor filters to allow comparator lookups for `schema` and `dataset` as well

### 0.3.0 (2023-09-06)

- Add `nomenklatura` based stores and query views built upon that (needs documentation)
Expand Down
4 changes: 1 addition & 3 deletions benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,7 @@ def benchmark(uri: str):
_ = [p for p in store.iterate()]

view = store.query()
q = Query().where(
dataset=DATASET, schema="Event", prop="date", value=2023, operator="gte"
)
q = Query().where(dataset=DATASET, schema="Event", date__gte=2023)
with measure(prefix, "query"):
_ = [p for p in view.entities(q)]

Expand Down
51 changes: 42 additions & 9 deletions ftmq/filters.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import Any, Iterable, TypeVar
from typing import Any, Iterable, TypeVar, Union

from banal import as_bool, ensure_list, is_listish
from followthemoney import model
Expand Down Expand Up @@ -56,13 +56,14 @@ def apply(self, value: str | None) -> bool:
return value < self.value
if self.comparator == "lte":
return value <= self.value
if self.comparator == "like":
return self.value in value
if self.comparator == "ilike":
return self.value.lower() in value.lower()
return False


class BaseFilter:
key: str
value: Value
lookup: Lookup

def __init__(
self,
value: Value,
Expand All @@ -72,15 +73,23 @@ def __init__(
self.comparator = Comparators[comparator or "eq"]
except KeyError:
raise ValidationError(f"Invalid comparator `{comparator}`")
self.value = self.get_casted_value(value)
self.lookup = Lookup(self.comparator, self.value)
self.value: Value = self.get_casted_value(value)
self.lookup: Lookup = Lookup(self.comparator, self.value)

def __hash__(self) -> int:
return hash((self.key, str(self.value)))
return hash((self.key, str(self.lookup), str(self.value)))

def __eq__(self, other: Any) -> bool:
return hash(self) == hash(other)

def __lt__(self, other: Any) -> bool:
# allow ordering (helpful for testing)
return hash(self) < hash(other)

def __gt__(self, other: Any) -> bool:
# allow ordering (helpful for testing)
return hash(self) > hash(other)

def to_dict(self) -> dict[str, Any]:
if self.comparator == Lookup.EQUALS:
key = self.key
Expand Down Expand Up @@ -186,12 +195,36 @@ def apply(self, proxy: CE) -> bool:
return False


Filter = DatasetFilter | SchemaFilter | PropertyFilter | ReverseFilter
class IdFilter(BaseFilter):
    """Filter proxies by their `id` attribute using the configured lookup."""

    # field name used for serialization / column selection
    key = "id"

    def apply(self, proxy: CE) -> bool:
        """Return True if the proxy's id satisfies this filter's lookup."""
        return self.lookup.apply(proxy.id)


class EntityIdFilter(IdFilter):
    """Id filter keyed as `entity_id` for serialization/SQL column selection.

    NOTE(review): the inherited `apply` still compares against `proxy.id`,
    not a separate `entity_id` attribute — confirm this is intentional.
    """

    key = "entity_id"


class CanonicalIdFilter(IdFilter):
    """Id filter keyed as `canonical_id` for serialization/SQL column selection.

    NOTE(review): the inherited `apply` still compares against `proxy.id` —
    confirm this is intentional for canonical-id lookups.
    """

    key = "canonical_id"


# Union of all concrete filter implementations a query can carry.
# NOTE(review): the abstract `IdFilter` base is deliberately not listed,
# only its concrete subclasses — confirm callers never hold a bare IdFilter.
Filter = Union[
    DatasetFilter,
    SchemaFilter,
    PropertyFilter,
    ReverseFilter,
    EntityIdFilter,
    CanonicalIdFilter,
]
# Type variable bound to the filter union, used in generic signatures.
F = TypeVar("F", bound=Filter)

# Name -> filter class registry (presumably consulted when parsing
# query lookup kwargs — verify against the query parsing code).
FILTERS = {
    "dataset": DatasetFilter,
    "schema": SchemaFilter,
    "property": PropertyFilter,
    "reverse": ReverseFilter,
    "entity_id": EntityIdFilter,
    "canonical_id": CanonicalIdFilter,
}
59 changes: 46 additions & 13 deletions ftmq/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@
from nomenklatura.entity import CE

from ftmq.aggregations import Aggregation, Aggregator
from ftmq.enums import Aggregations, Properties
from ftmq.enums import Aggregations, Comparators, Properties
from ftmq.exceptions import ValidationError
from ftmq.filters import (
FILTERS,
DatasetFilter,
F,
IdFilter,
PropertyFilter,
ReverseFilter,
SchemaFilter,
Expand All @@ -25,9 +26,6 @@


class Sort:
values: tuple[str] | None = None
ascending: bool | None = True

def __init__(self, values: Iterable[str], ascending: bool | None = True) -> None:
    """Sort specification: the values to order by and the direction flag."""
    # materialize as a tuple so the sort spec is immutable
    self.values = tuple(values)
    self.ascending = ascending
Expand All @@ -52,21 +50,24 @@ def serialize(self) -> list[str]:


class Query:
filters: set[F] = set()
aggregations: set[Aggregation] = set()
aggregator: Aggregator | None = None
sort: Sort | None = None
slice: Slice | None = None
DEFAULT_SEARCH_PROPS = (
Properties["name"],
Properties["firstName"],
Properties["middleName"],
Properties["lastName"],
)

def __init__(
self,
filters: Iterable[F] | None = None,
search_filters: Iterable[F] | None = None,
aggregations: Iterable[Aggregation] | None = None,
aggregator: Aggregator | None = None,
sort: Sort | None = None,
slice: Slice | None = None,
):
self.filters = set(ensure_list(filters))
self.search_filters = set(ensure_list(search_filters))
self.aggregations = set(ensure_list(aggregations))
self.aggregator = aggregator
self.sort = sort
Expand Down Expand Up @@ -112,10 +113,9 @@ def _chain(self, **kwargs):
new_kwargs[key] = new_value
return self.__class__(**new_kwargs)

@property
def lookups(self) -> dict[str, Any]:
def _get_lookups(self, filters: set[F]) -> dict[str, Any]:
data = {}
for fi in self.filters:
for fi in filters:
for k, v in fi.to_dict().items():
current = data.get(k)
if is_listish(current):
Expand All @@ -124,6 +124,14 @@ def lookups(self) -> dict[str, Any]:
data[k] = v
return data

@property
def lookups(self) -> dict[str, Any]:
    """Serialized dict representation of the regular filters."""
    return self._get_lookups(self.filters)

@property
def search_lookups(self) -> dict[str, Any]:
    """Serialized dict representation of the search filters."""
    return self._get_lookups(self.search_filters)

@property
def limit(self) -> int | None:
if self.slice is None:
Expand All @@ -140,6 +148,10 @@ def offset(self) -> int | None:
def sql(self) -> Sql:
return Sql(self)

@property
def ids(self) -> set[IdFilter]:
    """All id-based filters (id / entity_id / canonical_id) in this query."""
    return {f for f in self.filters if isinstance(f, IdFilter)}

@property
def datasets(self) -> set[DatasetFilter]:
    """All dataset filters in this query."""
    return {f for f in self.filters if isinstance(f, DatasetFilter)}
Expand Down Expand Up @@ -171,6 +183,9 @@ def discard(self, f_cls: F) -> None:

def to_dict(self) -> dict[str, Any]:
data = self.lookups
search_data = self.search_lookups
if search_data:
data["search"] = search_data
if self.sort:
data["order_by"] = self.sort.serialize()
if self.slice:
Expand Down Expand Up @@ -225,6 +240,14 @@ def where(self, **lookup: dict[str, Any]) -> Q:

return self._chain()

def search(self, q: str, props: Iterable[Properties | str] | None = None) -> Q:
    """
    Add a case-insensitive substring (`ilike`) search for `q` against the
    given properties, defaulting to the name-related `DEFAULT_SEARCH_PROPS`.

    Calling `search` again replaces any previously set search filters.
    Returns a chained copy of the query (via `_chain`).
    """
    # reset existing search
    self.search_filters: set[F] = set()
    props = props or self.DEFAULT_SEARCH_PROPS
    for prop in props:
        self.search_filters.add(PropertyFilter(prop, q, Comparators.ilike))
    return self._chain()

def order_by(self, *values: Iterable[str], ascending: bool | None = True) -> Q:
    """Set the sort order (replacing any existing one) and return a chained query."""
    self.sort = Sort(values=values, ascending=ascending)
    return self._chain()
Expand All @@ -237,11 +260,21 @@ def aggregate(self, func: Aggregations, *props: Properties) -> Q:
def get_aggregator(self) -> Aggregator:
    """Build an `Aggregator` from this query's configured aggregations."""
    return Aggregator(aggregations=self.aggregations)

def apply(self, proxy: CE) -> bool:
def apply_filter(self, proxy: CE) -> bool:
    """Return True if `proxy` matches every filter (vacuously True when none)."""
    return not self.filters or all(f.apply(proxy) for f in self.filters)

def apply_search(self, proxy: CE) -> bool:
    """Return True if `proxy` matches any search filter (vacuously True when none)."""
    return not self.search_filters or any(
        f.apply(proxy) for f in self.search_filters
    )

def apply(self, proxy: CE) -> bool:
    """Return True if `proxy` passes both the regular and the search filters."""
    return self.apply_filter(proxy) and self.apply_search(proxy)

def apply_iter(self, proxies: CEGenerator) -> CEGenerator:
"""
apply a `Query` to a generator of proxies and return a generator of filtered proxies
Expand Down
7 changes: 0 additions & 7 deletions ftmq/settings.py

This file was deleted.

38 changes: 33 additions & 5 deletions ftmq/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,24 +44,35 @@ def __init__(self, q: "Q") -> None:
self.table = make_statement_table(self.metadata)

def get_expression(self, column: Column, f: F):
    """
    Build a sqlalchemy comparison expression for filter `f` on `column`.

    Wraps the value in `%` wildcards for both `like` and `ilike` so the SQL
    semantics match the in-memory `Lookup.apply` substring behavior
    (previously only `ilike` was wrapped, making SQL `like` an exact match).
    """
    value = f.value
    if f.comparator in (Comparators.like, Comparators.ilike):
        value = f"%{value}%"
    # map the comparator to the sqlalchemy column operator method name
    op = self.COMPARATORS.get(str(f.comparator), str(f.comparator))
    op = getattr(column, op)
    return op(value)

@cached_property
def clause(self) -> BooleanClauseList:
clauses = []
if self.q.ids:
clauses.append(
or_(
self.get_expression(self.table.c[f.key], f)
for f in sorted(self.q.ids)
)
)
if self.q.datasets:
clauses.append(
or_(
self.get_expression(self.table.c.dataset, f)
for f in self.q.datasets
for f in sorted(self.q.datasets)
)
)
if self.q.schemata:
clauses.append(
or_(
self.get_expression(self.table.c.schema, f) for f in self.q.schemata
self.get_expression(self.table.c.schema, f)
for f in sorted(self.q.schemata)
)
)
if self.q.reversed:
Expand All @@ -70,7 +81,7 @@ def clause(self) -> BooleanClauseList:
self.table.c.prop_type == str(registry.entity),
self.get_expression(self.table.c.value, f),
)
for f in self.q.reversed
for f in sorted(self.q.reversed)
)
rq = select(self.table.c.canonical_id.distinct()).where(
and_(rclause, *clauses)
Expand All @@ -83,14 +94,31 @@ def clause(self) -> BooleanClauseList:
self.table.c.prop == f.key,
self.get_expression(self.table.c.value, f),
)
for f in self.q.properties
for f in sorted(self.q.properties)
)
)
return and_(*clauses)

@cached_property
def search_clause(self) -> BooleanClauseList | None:
    """OR together one prop/value condition per search filter, or None if unset."""
    filters = self.q.search_filters
    if not filters:
        return None
    conditions = (
        and_(
            self.table.c.prop == f.key,
            self.get_expression(self.table.c.value, f),
        )
        for f in filters
    )
    return or_(conditions)

@cached_property
def canonical_ids(self) -> Select:
    """
    Select distinct canonical ids matching the filter clause, narrowed by
    the search clause via an IN-subquery when search filters are present.

    Pagination (limit/offset) is applied here only when no sort is set —
    presumably sorted queries paginate after ordering elsewhere; confirm
    against the full query-building code.
    """
    q = select(self.table.c.canonical_id.distinct()).where(self.clause)
    if self.q.search_filters:
        search_ids = select(self.table.c.canonical_id.distinct()).where(
            self.search_clause
        )
        q = q.where(self.table.c.canonical_id.in_(search_ids))
    if self.q.sort is None:
        q = q.limit(self.q.limit).offset(self.q.offset)
    return q
Expand Down
3 changes: 1 addition & 2 deletions ftmq/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from ftmq.model.coverage import Collector, Coverage
from ftmq.model.dataset import C, Catalog, Dataset
from ftmq.query import Q, Query
from ftmq.settings import STORE_URI
from ftmq.types import CE, CEGenerator, PathLike


Expand Down Expand Up @@ -208,7 +207,7 @@ def from_uri(

@cache
def get_store(
uri: PathLike | None = STORE_URI,
uri: PathLike | None = "memory:///",
catalog: C | None = None,
dataset: Dataset | str | None = None,
resolver: Resolver | str | None = None,
Expand Down
22 changes: 21 additions & 1 deletion ftmq/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from nomenklatura.dataset import DataCatalog, Dataset, DefaultDataset
from nomenklatura.entity import CE, CompositeEntity
from nomenklatura.statement import Statement
from normality import slugify

from ftmq.enums import Comparators
from ftmq.exceptions import ValidationError
Expand Down Expand Up @@ -50,7 +51,7 @@ def make_proxy(data: dict[str, Any], dataset: str | Dataset | None = None) -> CE
datasets = ensure_list(data.pop("datasets", None))
if dataset is not None:
if isinstance(dataset, str):
dataset = dataset = make_dataset(dataset)
dataset = make_dataset(dataset)
datasets.append(dataset.name)
elif datasets:
dataset = datasets[0]
Expand Down Expand Up @@ -103,3 +104,22 @@ def to_numeric(value: str) -> float | int | None:
return to_numeric(value.replace(",", ""))
if re.match(NUMERIC_DE, value):
return to_numeric(value.replace(".", "").replace(",", "."))


def join_slug(
    *parts: str | None,
    prefix: str | None = None,
    sep: str = "-",
    strict: bool = True,
    max_len: int = 255,
) -> str | None:
    """
    Slugify the given parts and join them with `sep`, optionally prepending
    a slugified `prefix`, truncating the result to `max_len` characters and
    stripping any trailing/leading separator left by the truncation.

    Returns None if `strict` is set and any part fails to slugify, or if no
    part yields a usable slug at all.
    """
    sections = [slugify(p, sep=sep) for p in parts]
    if strict and None in sections:
        return None
    texts = [p for p in sections if p is not None]
    if not texts:  # idiomatic emptiness check instead of `not len(texts)`
        return None
    prefix = slugify(prefix, sep=sep)
    if prefix is not None:
        texts = [prefix, *texts]
    return sep.join(texts)[:max_len].strip(sep)
Loading

0 comments on commit dd943fc

Please sign in to comment.