Skip to content

Commit

Permalink
⬆️ ftmq
Browse files Browse the repository at this point in the history
  • Loading branch information
simonwoerpel committed Mar 14, 2024
1 parent c99d59e commit 2fd169d
Show file tree
Hide file tree
Showing 18 changed files with 565 additions and 84 deletions.
34 changes: 20 additions & 14 deletions .github/workflows/datasets.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ on:
jobs:
data:
runs-on: ubuntu-latest
container: ghcr.io/investigativedata/investigraph-eu:main
services:
postgres:
image: postgres:alpine
Expand All @@ -33,14 +32,18 @@ jobs:
--health-retries 5
ports:
- 6379:6379
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
FSSPEC_S3_ENDPOINT_URL: ${{ secrets.FSSPEC_S3_ENDPOINT_URL }}
DEBUG: false
REDIS_URL: redis://redis:6379/0
PREFECT_API_DATABASE_CONNECTION_URL: postgresql+asyncpg://investigraph:investigraph@postgres/investigraph
PREFECT_TASK_RUNNER: dask
container:
image: ghcr.io/investigativedata/investigraph-eu:main
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
FSSPEC_S3_ENDPOINT_URL: ${{ secrets.FSSPEC_S3_ENDPOINT_URL }}
DEBUG: false
REDIS_URL: redis://redis:6379/0
PREFECT_API_DATABASE_CONNECTION_URL: postgresql+asyncpg://investigraph:investigraph@postgres/investigraph
PREFECT_TASK_RUNNER: dask
volumes:
- ./datasets:/datasets
strategy:
matrix:
dataset:
Expand All @@ -54,11 +57,14 @@ jobs:
catalog:
needs: data
runs-on: ubuntu-latest
container: ghcr.io/investigativedata/investigraph-eu:main
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
FSSPEC_S3_ENDPOINT_URL: ${{ secrets.FSSPEC_S3_ENDPOINT_URL }}
container:
image: ghcr.io/investigativedata/investigraph-eu:main
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
FSSPEC_S3_ENDPOINT_URL: ${{ secrets.FSSPEC_S3_ENDPOINT_URL }}
volumes:
- ./datasets:/datasets
steps:
- name: Update the catalog
run: "investigraph build-catalog /datasets/catalog.yml -o s3://data.ftm.store/investigraph.eu.json"
Expand Down
9 changes: 1 addition & 8 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,6 @@ FROM ghcr.io/investigativedata/investigraph:main

USER root
RUN apt install -y curl
RUN pip install lxml html5lib psycopg2-binary
RUN pip install lxml html5lib
RUN pip uninstall -y followthemoney
RUN pip install "followthemoney @ git+https://github.com/investigativedata/followthemoney.git@schema/science-identifiers"

USER 1000

COPY ./catalog.yml /datasets/catalog.yml
COPY ./datasets/eu_transparency_register /datasets/eu_transparency_register
COPY ./datasets/eu_authorities /datasets/eu_authorities
COPY ./datasets/ec_meetings /datasets/ec_meetings
53 changes: 44 additions & 9 deletions catalog.yml
Original file line number Diff line number Diff line change
@@ -1,10 +1,45 @@
datasets:
- include: https://data.ftm.store/eu_transparency_register/index.json
- include: https://data.ftm.store/ec_meetings/index.json
- include: https://data.ftm.store/eu_fts/index.json
- include: https://data.ftm.store/eu_authorities/index.json
- include: https://data.ftm.store/eu_horizon_europe/index.json
- include: https://data.ftm.store/eu_fp7/index.json
- include: https://data.opensanctions.org/datasets/latest/eu_meps/index.json
- include: https://data.opensanctions.org/datasets/latest/eu_cor_members/index.json
- include: https://data.opensanctions.org/datasets/latest/eu_fsf/index.json
- from_uri: https://data.ftm.store/eu_transparency_register/index.json
- from_uri: https://data.ftm.store/ec_meetings/index.json
- from_uri: https://data.ftm.store/eu_fts/index.json
git_repo: https://github.com/investigativedata/investigraph-eu
maintainer:
name: investigativedata.io
url: https://investigativedata.io
logo_url: https://cdn.investigativedata.org/style/logo_square_light.svg
- from_uri: https://data.ftm.store/eu_authorities/index.json
git_repo: https://github.com/investigativedata/investigraph-eu
maintainer:
name: investigativedata.io
url: https://investigativedata.io
logo_url: https://cdn.investigativedata.org/style/logo_square_light.svg
- from_uri: https://data.ftm.store/eu_horizon_europe/index.json
git_repo: https://github.com/investigativedata/investigraph-eu
maintainer:
name: investigativedata.io
url: https://investigativedata.io
logo_url: https://cdn.investigativedata.org/style/logo_square_light.svg
- from_uri: https://data.ftm.store/eu_fp7/index.json
git_repo: https://github.com/investigativedata/investigraph-eu
maintainer:
name: investigativedata.io
url: https://investigativedata.io
logo_url: https://cdn.investigativedata.org/style/logo_square_light.svg
- from_uri: https://data.opensanctions.org/datasets/latest/eu_meps/index.json
git_repo: https://github.com/opensanctions/opensanctions
maintainer:
name: OpenSanctions
url: https://opensanctions.org
logo_url: https://assets.opensanctions.org/images/ura/logo_text.svg
- from_uri: https://data.opensanctions.org/datasets/latest/eu_cor_members/index.json
git_repo: https://github.com/opensanctions/opensanctions
maintainer:
name: OpenSanctions
url: https://opensanctions.org
logo_url: https://assets.opensanctions.org/images/ura/logo_text.svg
- from_uri: https://data.opensanctions.org/datasets/latest/eu_fsf/index.json
git_repo: https://github.com/opensanctions/opensanctions
maintainer:
name: OpenSanctions
url: https://opensanctions.org
logo_url: https://assets.opensanctions.org/images/ura/logo_text.svg
5 changes: 5 additions & 0 deletions datasets/ec_meetings/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,11 @@ publisher:
Commission’s work – both in shaping new policies, and in steering them
through the other EU institutions. It supports the whole Commission.
url: https://commission.europa.eu/about-european-commission/departments-and-executive-agencies/secretariat-general_en
maintainer:
name: investigativedata.io
url: https://investigativedata.io
logo_url: https://cdn.investigativedata.org/style/logo_square_light.svg
git_repo: https://github.com/investigativedata/investigraph-eu

extract:
pandas:
Expand Down
60 changes: 26 additions & 34 deletions datasets/ec_meetings/transform.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,30 @@
from typing import Generator

from ftmq.util import fingerprint as fp
from ftmq.util import make_fingerprint_id as fp
from ftmq.util import make_entity_id
from investigraph.model import Context
from investigraph.types import CE, CEGenerator, Record
from investigraph.util import join_text
from investigraph.util import clean_name, join_text


def make_address(ctx: Context, data: Record) -> CE | None:
proxy = ctx.make_proxy("Address")
location = data.pop("Location")
if not fp(location):
return
proxy.id = ctx.make_id(fp(location), prefix="addr")
proxy.add("full", location)
return proxy
proxy_id = ctx.make_id(fp(location), prefix="addr")
return ctx.make_proxy("Address", proxy_id, full=location)


def make_person(ctx: Context, name: str, role: str, body: CE) -> CE:
proxy = ctx.make_proxy("Person")
proxy.id = ctx.make_slug("person", make_entity_id(body.id, fp(name)))
proxy.add("name", name)
proxy.add("description", role)
return proxy
proxy_id = ctx.make_slug("person", make_entity_id(body.id, fp(name)))
return ctx.make_proxy("Person", proxy_id, name=name, description=role)


def make_organization(ctx: Context, regId: str, name: str | None = None) -> CE:
proxy = ctx.make_proxy("Organization")
proxy.id = ctx.make_slug(regId, prefix="eu-tr")
proxy_id = ctx.make_slug(regId, prefix="eu-tr")
proxy = ctx.make_proxy("Organization", proxy_id, idNumber=regId)
if fp(name):
proxy.add("name", name)
proxy.add("idNumber", regId)
return proxy


Expand All @@ -56,17 +50,14 @@ def make_organizations(ctx: Context, data: Record) -> CEGenerator:
data.pop("Name of interest representative"),
regIds,
):
org = make_organization(ctx, regId, name)
if org.id:
if clean_name(regId):
orgs = True
yield org
yield make_organization(ctx, regId, name)
if not orgs:
# yield only via id
for regId in regIds.split(","):
regId = regId.strip()
org = make_organization(ctx, regId)
if org.id:
yield org
if clean_name(regId):
yield make_organization(ctx, regId)


def make_persons(ctx: Context, data: Record, body: CE) -> CEGenerator:
Expand All @@ -83,12 +74,12 @@ def make_event(
) -> CEGenerator:
date = data.pop("Date of meeting")
participants = [o for o in make_organizations(ctx, data)]
proxy = ctx.make_proxy("Event")
proxy.id = ctx.make_slug(
proxy_id = ctx.make_slug(
"meeting",
date,
make_entity_id(organizer.id, *sorted([p.id for p in participants])),
)
proxy = ctx.make_proxy("Event", proxy_id)
label = join_text(*[p.first("name") for p in participants])
name = f"{date} - {organizer.caption} x {label}"
proxy.add("name", name)
Expand Down Expand Up @@ -119,8 +110,8 @@ def parse_record(ctx: Context, data: Record, body: CE):
yield from involved

for member in involved:
rel = ctx.make_proxy("Membership")
rel.id = ctx.make_slug("membership", make_entity_id(body.id, member.id)) # noqa
rel_id = ctx.make_slug("membership", make_entity_id(body.id, member.id)) # noqa
rel = ctx.make_proxy("Membership", rel_id)
rel.add("organization", body)
rel.add("member", member)
rel.add("role", member.get("description"))
Expand All @@ -129,11 +120,9 @@ def parse_record(ctx: Context, data: Record, body: CE):

def parse_record_ec(ctx: Context, data: Record):
# meetings of EC representatives
body = ctx.make_proxy("PublicBody")
name = data.pop("Name of cabinet")
body.id = ctx.make_slug(fp(name))
body.add("name", name)
body.add("jurisdiction", "eu")
body_id = ctx.make_slug(fp(name))
body = ctx.make_proxy("PublicBody", body_id, name=name, jurisdiction="eu")

yield body
yield from parse_record(ctx, data, body)
Expand All @@ -142,11 +131,14 @@ def parse_record_ec(ctx: Context, data: Record):
def parse_record_dg(ctx: Context, data: Record):
# meetings of EC Directors-General
acronym = data.pop("Name of DG - acronym")
body = ctx.make_proxy("PublicBody")
body.id = ctx.make_slug("dg", acronym)
body.add("name", data.pop("Name of DG - full name"))
body.add("weakAlias", acronym)
body.add("jurisdiction", "eu")
body_id = ctx.make_slug("dg", acronym)
body = ctx.make_proxy(
"PublicBody",
body_id,
name=data.pop("Name of DG - full name"),
weakAlias=acronym,
jurisdiction="eu",
)

yield body
yield from parse_record(ctx, data, body)
Expand Down
79 changes: 79 additions & 0 deletions datasets/eu_farmsubsidies/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Dataset configuration for `eu_farmsubsidies` (datasets/eu_farmsubsidies/config.yml).
# Sections: dataset metadata, published resources, publisher, then the
# seed / extract / transform / load pipeline stages.
# NOTE(review): nesting indentation is not visible in this rendering; the
# comments describe the key hierarchy as it reads top-down — confirm against
# the checked-in file.
name: eu_farmsubsidies
title: EU Farmsubsidies
# `prefix` is "eu-fs"; presumably used to namespace generated entity ids — TODO confirm.
prefix: eu-fs
country: eu
summary: |
Farm subsidies are EU payments to public authorities, companies and farmers
who are active in the agricultural sector and/or contribute to its
maintenance. Subsidies are financed by taxpayers' money, primarily through
the EU, but the Member States also contribute financially in a direct way.
Subsidies are part of the EU's Common Agricultural Policy (CAP), which came
into force in 1962. All the data presented on this website has either been
published directly by EU country governments, or obtained from them by
freedom of information requests.
# Published artifacts of this dataset: a single entities file with the
# mime type application/json+ftm.
resources:
- name: entities.ftm.json
url: https://data.ftm.store/eu_farmsubsidies/entities.ftm.json
mime_type: application/json+ftm
# Source organisation the data originates from.
publisher:
name: farmsubsidy.org
description: |
The aim of farmsubsidy.org is to obtain detailed data relating to payments
and recipients of farm subsidies in every EU member state and make this
data available in a way that is useful to European citizens.
Farmsubsidy.org is run by FragDenStaat, the central contact for all
questions relating to freedom of information in Germany.
url: https://www.farmsubsidy.org

# Seed stage: handler is given as `<file>:<function>` (`seed` in ./extract.py,
# not shown here); the glob pattern matches the gzipped CSV files under
# data.farmsubsidy.org/cleaned/.
seed:
handler: ./extract.py:seed
glob:
- https://data.farmsubsidy.org/cleaned/*.csv.gz


# Extract stage: custom handler `handle` from ./extract.py with `fetch: false`.
# NOTE(review): presumably the handler reads the sources itself instead of the
# pipeline downloading them first — confirm against extract.py.
extract:
handler: ./extract.py:handle
fetch: false

# Transform stage: one query mapping each source row to two entities,
# `recipient` (LegalEntity) and `payment` (Payment).
transform:
queries:
- entities:
# Recipient of the subsidy, identified by the `recipient_id` column.
recipient:
schema: LegalEntity
id_column: recipient_id
properties:
name:
column: recipient_name
address:
column: recipient_address
country:
column: recipient_country
website:
column: recipient_url
# One payment per row, identified by the `pk` column.
payment:
schema: Payment
id_column: pk
properties:
# `purpose` combines scheme_code and scheme, joined with " - ".
purpose:
columns:
- scheme_code
- scheme
join: " - "
# Constant value applied to every payment.
programme:
literal: "The Common Agricultural Policy (CAP)"
description:
column: scheme_description
# `amount`/`currency` come from the *_original columns, while the
# source's plain `amount` column feeds `amountEur`.
amount:
column: amount_original
currency:
column: currency_original
amountEur:
column: amount
# The payment date is taken from the `year` column.
date:
column: year
# Links the payment to the `recipient` entity defined above.
beneficiary:
entity: recipient

# Load stage: S3 targets for the dataset index and the entities file.
# NOTE(review): these URIs use the path `eu_farmsubsidies_2023`, whereas the
# dataset `name` and the `resources` url above use `eu_farmsubsidies` —
# confirm the mismatch is intended.
load:
index_uri: s3://data.ftm.store/eu_farmsubsidies_2023/index.json
entities_uri: s3://data.ftm.store/eu_farmsubsidies_2023/entities.ftm.json
Loading

0 comments on commit 2fd169d

Please sign in to comment.