Skip to content

Commit

Permalink
Merge pull request #35 from investigativedata/develop
Browse files Browse the repository at this point in the history
v0.2.1
  • Loading branch information
simonwoerpel committed Aug 7, 2023
2 parents e171dd9 + 81ab3a2 commit 6a2d2ab
Show file tree
Hide file tree
Showing 16 changed files with 1,578 additions and 765 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.2.0
current_version = 0.2.1
commit = True
tag = True
message = 🔖 Bump version: {current_version} → {new_version}
Expand Down
6 changes: 5 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@

## CHANGELOG

### 0.2.1 (2023-08-07)

- Align with `nomenklatura` v3

### 0.2.0 (2023-07-31)

- Model [nomenklatura](github.com/opensanctions/nomenklatura) catalog/dataset via pydantic
- Model [nomenklatura](https://github.com/opensanctions/nomenklatura) catalog/dataset via pydantic
- Replace `smart_open` with [fsspec](https://github.com/fsspec/filesystem_spec)
- Add generic IO handling based on `fsspec`
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.2.0
0.2.1
2 changes: 1 addition & 1 deletion ftmq/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.2.0"
__version__ = "0.2.1"
44 changes: 33 additions & 11 deletions ftmq/io.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,37 @@
import contextlib
import logging
import sys
from collections.abc import Iterable
from typing import Any, Literal

import orjson
from banal import ensure_list, is_listish
from followthemoney import model
from fsspec import open
from nomenklatura.dataset import DefaultDataset
from nomenklatura.entity import CE, CompositeEntity
from nomenklatura.statement import Statement
from nomenklatura.util import PathLike

from ftmq.types import CEGenerator, SDict
from ftmq.types import CEGenerator, SDict, SGenerator
from ftmq.util import make_dataset

log = logging.getLogger(__name__)

def load_proxy(data: dict[str, Any]) -> CE:
proxy = CompositeEntity.from_dict(model, data)
proxy.datasets.discard("default")

def make_proxy(data: dict[str, Any], dataset: str | None = None) -> CE:
    """Build a `CompositeEntity` from a plain entity dict.

    Dataset names are collected from the optional ``"datasets"`` key of
    `data` and from the optional `dataset` argument. The entity's dataset is
    chosen in this order: the explicit `dataset` argument, the first name
    found under ``"datasets"``, or `DefaultDataset` when neither is present.

    When at least one dataset name is known, the entity is rebuilt from
    statements generated for each of those names; otherwise the directly
    constructed entity is returned.

    Args:
        data: Entity data; it is not modified by this function.
        dataset: Optional dataset name that takes precedence over names
            listed in `data`.

    Returns:
        The resulting composite entity.
    """
    # Work on a shallow copy so removing "datasets" does not leak back into
    # the caller's dict.
    data = dict(data)
    datasets = ensure_list(data.pop("datasets", None))
    if dataset is not None:
        datasets.append(dataset)
        ds = make_dataset(dataset)
    elif datasets:
        ds = make_dataset(datasets[0])
    else:
        ds = DefaultDataset
    proxy = CompositeEntity(ds, data)
    if datasets:
        statements = get_statements(proxy, *datasets)
        return CompositeEntity.from_statements(ds, statements)
    return proxy


Expand Down Expand Up @@ -73,7 +89,7 @@ def smart_read_proxies(
break
data = orjson.loads(line)
if serialize:
data = load_proxy(data)
data = make_proxy(data)
yield data


Expand Down Expand Up @@ -101,8 +117,14 @@ def apply_datasets(
) -> CEGenerator:
for proxy in proxies:
if datasets:
if replace:
proxy.datasets = set(datasets)
else:
proxy.datasets.update(datasets)
yield proxy
if not replace:
datasets = proxy.datasets | set(datasets)
statements = get_statements(proxy, *datasets)
dataset = make_dataset(list(datasets)[0])
yield CompositeEntity.from_statements(dataset, statements)


def get_statements(proxy: CE, *datasets: str) -> SGenerator:
    """Yield the statements of `proxy` once per given dataset name.

    Args:
        proxy: Entity to turn into statements.
        *datasets: Dataset names passed on to `Statement.from_entity`; when
            none are given, the single name ``"default"`` is used.

    Yields:
        The statements produced by `Statement.from_entity` for each name.
    """
    # Each variadic argument is one dataset name, hence `str` rather than
    # `Iterable[str]` in the annotation.
    for dataset in datasets or ("default",):
        yield from Statement.from_entity(proxy, dataset)
14 changes: 6 additions & 8 deletions ftmq/model/coverage.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,19 @@
from collections import Counter
from datetime import date
from typing import Any

from nomenklatura.dataset.coverage import DataCoverage as NKCoverage
from pydantic import PrivateAttr

from ftmq.enums import Properties
from ftmq.types import CE, CEGenerator, Frequencies, Schemata

from .mixins import NKModel
from ftmq.model.mixins import NKModel
from ftmq.types import CE, CEGenerator, DateLike, Frequencies, Schemata


class Collector:
schemata: Counter = None
countries: set[str] = None
start: set[date] = None
end: set[date] = None
start: set[DateLike] = None
end: set[DateLike] = None

def __init__(self):
self.schemata = Counter()
Expand Down Expand Up @@ -59,8 +57,8 @@ class Coverage(NKModel):
_nk_model = NKCoverage
_collector: Collector | None = PrivateAttr()

start: date | None = None
end: date | None = None
start: DateLike | None = None
end: DateLike | None = None
countries: list[str] | None = []
frequency: Frequencies | None = "unknown"

Expand Down
11 changes: 8 additions & 3 deletions ftmq/types.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import os
from collections.abc import Generator
from datetime import date, datetime
from pathlib import Path
from typing import Any, Generator, Literal, TypeAlias
from typing import Any, Literal, TypeAlias

from nomenklatura.entity import CE
from nomenklatura.statement.statement import S

from .enums import Frequencies, Properties, Schemata
from ftmq.enums import Frequencies, Properties, Schemata

# a string-keyed dict
SDict: TypeAlias = dict[str, Any]
Expand All @@ -14,6 +17,7 @@

# composite entity generator
CEGenerator: TypeAlias = Generator[CE, None, None]
SGenerator: TypeAlias = Generator[S, None, None]

StrGenerator: TypeAlias = Generator[str, None, None]
BytesGenerator: TypeAlias = Generator[bytes, None, None]
Expand All @@ -23,7 +27,7 @@
Frequencies: TypeAlias = Literal[tuple(f.name for f in Frequencies)]

PathLike: TypeAlias = str | os.PathLike[str] | Path

DateLike: TypeAlias = date | datetime

__all__ = [
BytesGenerator,
Expand All @@ -34,6 +38,7 @@
Properties,
Schemata,
SDict,
SGenerator,
StrGenerator,
Value,
]
4 changes: 3 additions & 1 deletion ftmq/util.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
from typing import Generator
from collections.abc import Generator
from functools import cache

from nomenklatura.dataset import DataCatalog, Dataset


@cache
def make_dataset(name: str) -> Dataset:
catalog = DataCatalog(
Dataset, {"datasets": [{"name": name, "title": name.title()}]}
Expand Down
Loading

0 comments on commit 6a2d2ab

Please sign in to comment.