Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

v0.3.4 #85

Merged
merged 39 commits into from
Oct 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
10f0f5e
🚧 (model) tweak logo_url and source uri types
simonwoerpel Oct 3, 2023
0663d1b
🐛 (model) fix dataset.prefix initialization
simonwoerpel Oct 3, 2023
fea925e
✨ (store) implement resolver
simonwoerpel Oct 4, 2023
8104b1c
✨ (cli) catalog iterate
simonwoerpel Oct 5, 2023
54a9edc
🚧 (dataset) use own DefaultDataset
simonwoerpel Oct 6, 2023
bc3651f
🐛 gosh
simonwoerpel Oct 6, 2023
6b6d123
🚧 (io) simplify smart_get_store
simonwoerpel Oct 6, 2023
12eab18
🚧 (model) tweak catalog names getter
simonwoerpel Oct 6, 2023
1276c19
🚧 (model) struggle with external nk modelling sync
simonwoerpel Oct 6, 2023
5dc7fb5
✨ (cli) make-dataset from yaml
simonwoerpel Oct 7, 2023
bd30679
✨ (aggregation) add grouping
simonwoerpel Oct 7, 2023
c39b50c
🚧 (aggregations) improve grouping
simonwoerpel Oct 7, 2023
287ce8a
✅ tweak test
simonwoerpel Oct 7, 2023
814458e
✨ (util) clean_dict for serialization
simonwoerpel Oct 7, 2023
4ab5f5c
⚡️ (store) adjust sql agg groups limit via env var
simonwoerpel Oct 7, 2023
a053f01
🚧 (aggregations) rewrite sql grouping agg
simonwoerpel Oct 8, 2023
64da543
✅ (tests) fix test_util
simonwoerpel Oct 8, 2023
afc7799
✅ (tests) fix test_proxy
simonwoerpel Oct 8, 2023
4e77832
✨ (aggregation) add group by years
simonwoerpel Oct 9, 2023
4072fe7
✅ (tests) fix test_query
simonwoerpel Oct 9, 2023
882462b
Bump orjson from 3.9.7 to 3.9.8
dependabot[bot] Oct 10, 2023
22df5e9
Bump mypy from 1.5.1 to 1.6.0
dependabot[bot] Oct 11, 2023
97362df
Merge pull request #90 from investigativedata/dependabot/pip/develop/…
simonwoerpel Oct 12, 2023
fa313d9
Merge pull request #89 from investigativedata/dependabot/pip/develop/…
simonwoerpel Oct 12, 2023
49194e1
Bump pre-commit from 3.4.0 to 3.5.0
dependabot[bot] Oct 13, 2023
eaf083b
Bump sqlalchemy from 2.0.21 to 2.0.22
dependabot[bot] Oct 13, 2023
6f10db0
Bump orjson from 3.9.8 to 3.9.9
dependabot[bot] Oct 13, 2023
63af2f8
🐛 (io) make read / write functions more stable
simonwoerpel Oct 14, 2023
facf16d
✨ (util) add some string helper functions
simonwoerpel Oct 14, 2023
5b9a4e8
🧑‍💻 vscode
simonwoerpel Oct 14, 2023
39ed1cd
Merge pull request #93 from investigativedata:dependabot/pip/develop/…
simonwoerpel Oct 14, 2023
baf8bdf
Merge pull request #92 from investigativedata:dependabot/pip/develop/…
simonwoerpel Oct 14, 2023
8978301
🚧 (model) add aleph categories for dataset
simonwoerpel Oct 16, 2023
16f88a5
Merge pull request #91 from investigativedata/dependabot/pip/develop/…
simonwoerpel Oct 16, 2023
0f6093e
📝 CHANGELOG.md
simonwoerpel Oct 16, 2023
48a516f
✅ (tests) fix test_model
simonwoerpel Oct 16, 2023
1facdea
⬆️ pre-commit
simonwoerpel Oct 16, 2023
fb3a8ff
⬆️ poetry lock
simonwoerpel Oct 16, 2023
a92ad62
🔖 Bump version: 0.3.3 → 0.3.4
simonwoerpel Oct 16, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.3.3
current_version = 0.3.4
commit = True
tag = True
message = 🔖 Bump version: {current_version} → {new_version}
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
# * Run "pre-commit install".
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
rev: v4.5.0
hooks:
- id: check-added-large-files
- id: check-case-conflict
Expand Down
7 changes: 7 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"python.testing.pytestArgs": [
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,12 @@

## CHANGELOG

### 0.3.4 (2023-10-16)

- Add group-based aggregation and aggregation by year
- Add some util functions from downstream dependencies
- Update dependencies

### 0.3.3 (2023-10-03)

- Update dependencies
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.3.3
0.3.4
2 changes: 1 addition & 1 deletion ftmq/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from ftmq.query import Query

__version__ = "0.3.3"
__version__ = "0.3.4"
__all__ = ["Query"]
80 changes: 62 additions & 18 deletions ftmq/aggregations.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,19 @@
import statistics
from collections import defaultdict
from functools import cache
from typing import Any, Iterable, TypeAlias
from typing import Any, Generator, Iterable, TypeAlias

from banal import ensure_list
from followthemoney.schema import Schema
from followthemoney.types import registry
from pydantic import BaseModel

from ftmq.enums import Aggregations, Properties
from ftmq.enums import Aggregations, Fields, Properties
from ftmq.types import CE, CEGenerator
from ftmq.util import to_numeric
from ftmq.util import clean_dict, to_numeric

Value: TypeAlias = int | float | str
Values: TypeAlias = list[Value]


@cache
Expand All @@ -24,34 +25,60 @@ def get_is_numeric(schema: Schema, prop: str) -> bool:


class Aggregation(BaseModel):
prop: Properties
prop: Properties | Fields
func: Aggregations
values: list[int | float] = []
values: Values = []
value: Value | None = None
group_props: list[Properties | Fields] | None = []
grouper: dict[Properties, dict[str, Values]] = defaultdict(
lambda: defaultdict(list)
)
groups: dict[Properties, dict[str, Value]] = defaultdict(dict)

def __hash__(self) -> int:
    """Hash on the aggregated prop, the function and the sorted group props."""
    # sorting makes the hash independent of the order of `group_props`
    group_key = sorted(ensure_list(self.group_props))
    return hash((self.prop, self.func, *group_key))

def __eq__(self, other: Any) -> bool:
    """Consider two objects equal when their hashes are equal."""
    own_hash = hash(self)
    other_hash = hash(other)
    return own_hash == other_hash

def get_value(self, values: Values) -> Value | None:
    """
    Reduce `values` to a single result according to `self.func`.

    - "count": number of distinct values
    - "sum": sum of the values (0 for an empty list)
    - "min" / "max": smallest / largest value
    - "avg": arithmetic mean via `statistics.mean`

    Returns None for an unknown `func`, and for "min", "max" and "avg"
    when `values` is empty.
    """
    if self.func == "count":
        return len(set(values))
    if self.func == "sum":
        return sum(values)
    if not values:
        # min(), max() and statistics.mean() all raise on an empty
        # sequence; an aggregation that collected nothing has no result
        return None
    if self.func == "min":
        return min(values)
    if self.func == "max":
        return max(values)
    if self.func == "avg":
        return statistics.mean(values)
    return None

def get_proxy_values(
    self, proxy: CE, prop: Properties | Fields | None = None
) -> Generator[str, None, None]:
    """
    Yield the values of `proxy` for `prop`, falling back to `self.prop`.

    The special fields `id`, `dataset`, `schema` and `year` are read from
    the proxy itself; anything else is looked up as a regular property.
    """
    target = prop or self.prop
    if target == Fields.id:
        yield proxy.id
        return
    if target == Fields.dataset:
        yield from proxy.datasets
        return
    if target == Fields.schema:
        yield proxy.schema.name
        return
    if target == Fields.year:
        # the first four characters of each date value are taken as the year
        yield from (date[:4] for date in proxy.get_type_values(registry.date))
        return
    yield from proxy.get(target, quiet=True)

def collect(self, proxy: CE) -> CE:
    """
    Record the values of `proxy` for this aggregation and return the proxy.

    Every value of `self.prop` is converted with `to_numeric` when the
    property is numeric, values that are None afterwards are skipped. Each
    kept value is appended to `self.values` and to the bucket of every
    group key the proxy has for each of `self.group_props`.
    """
    is_numeric = get_is_numeric(proxy.schema, self.prop)
    # Group keys depend only on the proxy, so resolve them once instead of
    # once per value. `ensure_list` tolerates `group_props` being None.
    group_keys = [
        (prop, list(self.get_proxy_values(proxy, prop)))
        for prop in ensure_list(self.group_props)
    ]
    for value in self.get_proxy_values(proxy):
        if is_numeric:
            value = to_numeric(value)
        if value is None:
            continue
        self.values.append(value)
        for prop, keys in group_keys:
            for key in keys:
                self.grouper[prop][key].append(value)
    return proxy

def apply(self, proxies: CEGenerator) -> CEGenerator:
Expand All @@ -63,14 +90,19 @@ def __enter__(self) -> "Aggregation":
return self

def __exit__(self, *args, **kwargs) -> None:
    """
    Finalize the aggregation.

    Computes `self.value` from all collected values and one result per
    group key in `self.groups` from the values collected in `self.grouper`.
    """
    self.value = self.get_value(self.values)
    # `group_props` may be None according to its annotation
    for prop in ensure_list(self.group_props):
        for key, values in self.grouper[prop].items():
            self.groups[prop][key] = self.get_value(values)

def dict(self, *args, **kwargs) -> dict[str, Any]:
    """Finalize the aggregation, then serialize via the parent `dict()`."""
    # run the finalizer first so the computed results are part of the dump
    self.__exit__()
    serialized = super().dict(*args, **kwargs)
    return serialized


AggregatorResult: TypeAlias = dict[Aggregations, dict[Properties, Value]]
AggregatorResult: TypeAlias = dict[
Aggregations | dict[str, Aggregations], dict[Properties, Value]
]


class Aggregator(BaseModel):
Expand All @@ -81,8 +113,14 @@ def __enter__(self) -> "Aggregator":
return self

def __exit__(self, *args, **kwargs) -> None:
    """
    Assemble `self.result` from all aggregations.

    Plain results are stored as `result[func][prop]`, grouped results as
    `result["groups"][group][func][prop]`. The assembled mapping is then
    passed through `clean_dict`.
    """
    self.result["groups"] = defaultdict(lambda: defaultdict(dict))
    for agg in self.aggregations:
        func, prop = str(agg.func), str(agg.prop)
        self.result[func][prop] = agg.value
        # `group_props` may be None according to its annotation
        for group in ensure_list(agg.group_props):
            self.result["groups"][str(group)][func][prop] = agg.groups[group]
    self.result = clean_dict(self.result)

def apply(self, proxies: CEGenerator) -> CEGenerator:
for agg in self.aggregations:
Expand All @@ -91,17 +129,23 @@ def apply(self, proxies: CEGenerator) -> CEGenerator:
self.__exit__()

@classmethod
def from_dict(
    cls, data: dict[Aggregations | str, Iterable[Properties]]
) -> "Aggregator":
    """
    Build an aggregator from a mapping of function name to properties.

    The optional "groups" key holds the properties to group by and is
    applied to every aggregation. All other keys are aggregation functions
    whose value is a single property or an iterable of properties.
    """
    # work on a shallow copy so the caller's mapping keeps its "groups" key
    spec = {**data}
    groups = ensure_list(spec.pop("groups", None))
    return cls(
        aggregations=[
            Aggregation(prop=p, func=agg, group_props=groups)
            for agg, props in spec.items()
            for p in ensure_list(props)
        ],
    )

def to_dict(self) -> dict[str, set[str]]:
    """
    Describe the configured aggregations as a mapping.

    Each function name maps to the set of properties it is applied to.
    Under "groups", each group property maps to the same function-to-
    properties structure. The mapping is passed through `clean_dict`
    before it is returned.
    """
    data = defaultdict(set)
    data["groups"] = defaultdict(lambda: defaultdict(set))
    for agg in self.aggregations:
        func, prop = str(agg.func), str(agg.prop)
        data[func].add(prop)
        # `group_props` may be None according to its annotation
        for group in ensure_list(agg.group_props):
            data["groups"][str(group)][func].add(prop)
    return clean_dict(data)
Loading
Loading