Skip to content

Commit

Permalink
Merge pull request #203 from investigativedata/develop
Browse files Browse the repository at this point in the history
v0.6.1
  • Loading branch information
simonwoerpel committed Mar 14, 2024
2 parents 26fd016 + 0b8dea9 commit d9984d6
Show file tree
Hide file tree
Showing 26 changed files with 1,300 additions and 46,671 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.6.0
current_version = 0.6.1
commit = True
tag = True
message = 🔖 Bump version: {current_version} → {new_version}
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.6.0
0.6.1
2 changes: 1 addition & 1 deletion ftmq/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from ftmq.query import Query

__version__ = "0.6.0"
__version__ = "0.6.1"
__all__ = ["Query"]
20 changes: 5 additions & 15 deletions ftmq/aggregations.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,20 @@
import statistics
from collections import defaultdict
from functools import cache
from typing import Any, Generator, Iterable, TypeAlias

from anystore.util import clean_dict
from banal import ensure_list
from followthemoney.schema import Schema
from followthemoney.types import registry
from pydantic import BaseModel

from ftmq.enums import Aggregations, Fields, Properties
from ftmq.types import CE, CEGenerator
from ftmq.util import to_numeric
from ftmq.util import prop_is_numeric, to_numeric

Value: TypeAlias = int | float | str
Values: TypeAlias = list[Value]


@cache
def get_is_numeric(schema: Schema, prop: str) -> bool:
prop = schema.get(prop)
if prop is not None:
return prop.type == registry.number
return False


class Aggregation(BaseModel):
prop: Properties | Fields
func: Aggregations
Expand Down Expand Up @@ -71,7 +61,7 @@ def get_proxy_values(
yield from proxy.get(prop, quiet=True)

def collect(self, proxy: CE) -> CE:
is_numeric = get_is_numeric(proxy.schema, self.prop)
is_numeric = prop_is_numeric(proxy.schema, self.prop)
for value in self.get_proxy_values(proxy):
if is_numeric:
value = to_numeric(value)
Expand Down Expand Up @@ -118,9 +108,9 @@ def __exit__(self, *args, **kwargs) -> None:
for agg in self.aggregations:
self.result[str(agg.func)][str(agg.prop)] = agg.value
for group in agg.group_props:
self.result["groups"][str(group)][str(agg.func)][
str(agg.prop)
] = agg.groups[group]
self.result["groups"][str(group)][str(agg.func)][str(agg.prop)] = (
agg.groups[group]
)
self.result = clean_dict(self.result)

def apply(self, proxies: CEGenerator) -> CEGenerator:
Expand Down
2 changes: 1 addition & 1 deletion ftmq/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def q(
smart_write_proxies(output_uri, proxies, serialize=True, dataset=store_dataset)
if stats_uri:
stats = stats.export()
stats = orjson.dumps(stats.dict(), option=orjson.OPT_APPEND_NEWLINE)
stats = orjson.dumps(stats.model_dump(), option=orjson.OPT_APPEND_NEWLINE)
smart_write(stats_uri, stats)
if q.aggregator:
result = orjson.dumps(
Expand Down
12 changes: 10 additions & 2 deletions ftmq/model/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
from ftmq.model.coverage import Coverage
from ftmq.model.coverage import DatasetStats
from ftmq.model.dataset import Catalog, Dataset, Maintainer, Publisher, Resource
from ftmq.model.proxy import Entity

__all__ = [Catalog, Coverage, Dataset, Entity, Maintainer, Publisher, Resource]
__all__ = [
"Catalog",
"DatasetStats",
"Dataset",
"Entity",
"Maintainer",
"Publisher",
"Resource",
]
19 changes: 10 additions & 9 deletions ftmq/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,12 @@
)
from ftmq.sql import Sql
from ftmq.types import CEGenerator
from ftmq.util import parse_comparator, parse_unknown_filters
from ftmq.util import (
parse_comparator,
parse_unknown_filters,
prop_is_numeric,
to_numeric,
)

Q = TypeVar("Q", bound="Query")
Slice = TypeVar("Slice", bound=slice)
Expand All @@ -33,9 +38,10 @@ def __init__(self, values: Iterable[str], ascending: bool | None = True) -> None
def apply(self, proxy: CE) -> tuple[str]:
values = tuple()
for v in self.values:
p_values = proxy.get(v, quiet=True)
if p_values is not None:
values = values + (tuple(p_values))
p_values = proxy.get(v, quiet=True) or []
if prop_is_numeric(proxy.schema, v):
p_values = map(to_numeric, p_values)
values = values + (tuple(p_values))
return values

def apply_iter(self, proxies: CEGenerator) -> CEGenerator:
Expand Down Expand Up @@ -301,8 +307,3 @@ def apply_iter(self, proxies: CEGenerator) -> CEGenerator:
self.aggregator = self.get_aggregator()
proxies = self.aggregator.apply(proxies)
yield from proxies

def apply_aggregations(self, proxies: CEGenerator) -> Aggregator:
aggregator = self.get_aggregator()
[x for x in aggregator.apply(proxies)]
return aggregator
3 changes: 2 additions & 1 deletion ftmq/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,10 +164,11 @@ def _sorted_statements(self) -> Select:
value = self.table.c.value
if PropertyTypesMap[prop].value == registry.number:
value = func.cast(self.table.c.value, NUMERIC)
group_func = func.min if self.q.sort.ascending else func.max
inner = (
select(
self.table.c.canonical_id,
func.group_concat(value).label("sortable_value"),
group_func(value).label("sortable_value"),
)
.where(
and_(
Expand Down
4 changes: 2 additions & 2 deletions ftmq/store/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def aggregations(self, query: Q) -> AggregatorResult | None:
key = f"agg-{hash(query)}"
if key in self._cache:
return self._cache[key]
aggregator = query.apply_aggregations(self.entities(query))
res = dict(aggregator.result)
_ = [x for x in self.entities(query)]
res = dict(query.aggregator.result)
self._cache[key] = res
return res
22 changes: 15 additions & 7 deletions ftmq/util.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import re
from collections.abc import Generator, Iterable
from functools import cache, lru_cache
from typing import Any
from typing import Any, Generator

import pycountry
from banal import ensure_list
from followthemoney.schema import Schema
from followthemoney.types import registry
from followthemoney.util import make_entity_id, sanitize_text
from nomenklatura.dataset import Dataset
Expand Down Expand Up @@ -72,7 +72,7 @@ def make_proxy(data: dict[str, Any], dataset: str | Dataset | None = None) -> CE
return proxy


def get_statements(proxy: CE, *datasets: Iterable[str]) -> SGenerator:
def get_statements(proxy: CE, *datasets: str) -> SGenerator:
datasets = datasets or ["default"]
for dataset in datasets:
# FIXME
Expand Down Expand Up @@ -177,7 +177,7 @@ def clean_name(value: Any) -> str | None:


@lru_cache(1024)
def fingerprint(value: Any) -> str | None:
def make_fingerprint(value: Any) -> str | None:
"""
Create a stable but simplified string or None from input that can be used
to generate ids (to mimic `fingerprints.generate` which is unstable for IDs
Expand All @@ -190,10 +190,18 @@ def fingerprint(value: Any) -> str | None:


@lru_cache(1024)
def string_id(value: Any) -> str | None:
def make_string_id(value: Any) -> str | None:
return make_entity_id(clean_name(value))


@lru_cache(1024)
def fingerprint_id(value: Any) -> str | None:
return make_entity_id(fingerprint(value))
def make_fingerprint_id(value: Any) -> str | None:
return make_entity_id(make_fingerprint(value))


@cache
def prop_is_numeric(schema: Schema, prop: str) -> bool:
prop = schema.get(prop)
if prop is not None:
return prop.type == registry.number
return False
Loading

0 comments on commit d9984d6

Please sign in to comment.