## Run dependent pipelines

In [16]:
# Run the committee-meeting-attendees pipeline from the /pipelines directory with verbose logging.
# NOTE(review): KNESSET_LOAD_FROM_URL=1 presumably makes the load steps fetch their input over HTTP
# (the log below shows load_resource connecting to storage.googleapis.com) — confirm.
!{'cd /pipelines; KNESSET_LOAD_FROM_URL=1 dpp run --verbose ./people/committee-meeting-attendees'}

[./people/committee-meeting-attendees:T_0] >>> INFO    :1701d501 RUNNING ./people/committee-meeting-attendees
[./people/committee-meeting-attendees:T_0] >>> INFO    :1701d501 Collecting dependencies
[./people/committee-meeting-attendees:T_0] >>> INFO    :1701d501 Running async task
[./people/committee-meeting-attendees:T_0] >>> INFO    :1701d501 Waiting for completion
[./people/committee-meeting-attendees:T_0] >>> INFO    :1701d501 Async task starting
[./people/committee-meeting-attendees:T_0] >>> INFO    :1701d501 Searching for existing caches
[./people/committee-meeting-attendees:T_0] >>> INFO    :1701d501 Building process chain:
[./people/committee-meeting-attendees:T_0] >>> INFO    :- load_resource
[./people/committee-meeting-attendees:T_0] >>> INFO    :- dump.to_path
[./people/committee-meeting-attendees:T_0] >>> INFO    :- (sink)
[./people/committee-meeting-attendees:T_0] >>> INFO    :load_resource: DEBUG   :Starting new HTTP connection (1): storage.googleapis.com:80
[./people/co

## Add a filter step

Uncomment (or add) a filter step in the `committee-meeting-speaker-stats` pipeline, directly before the `committee_meeting_speaker_stats` step:

```
  - run: filter
    cache: true
    parameters:
      resources: kns_committeesession
      in:
      - CommitteeSessionID: 2059313
  - run: committee_meeting_speaker_stats
```

## Download protocol parts files

Upgrade to the latest version of the dataflows library:

In [None]:
# %pip installs into the environment of the running kernel; a bare `!pip`
# runs whichever pip is first on PATH, which may belong to another Python.
%pip install --upgrade dataflows

Restart the kernel if the library was upgraded, so that the new version is the one that gets imported.

Choose some session IDs to download protocol files for:

In [1]:
# Committee session IDs whose protocol files are downloaded below.
session_ids = [
    2059313,
    86485,
    2058899,
    2081445,
    65548,
]

In [5]:
from dataflows import Flow, load

# Collect the rows of the first resource in the kns_committeesession
# datapackage whose CommitteeSessionID is one of the chosen session_ids.
sessions = []
committee_session_rows = Flow(load('/pipelines/data/committees/kns_committeesession/datapackage.json')).results()[0][0]

for session in committee_session_rows:
    if session['CommitteeSessionID'] not in session_ids:
        continue
    sessions.append(session)

In [7]:
import os
import subprocess
import sys

# Pairs of (session attribute holding a relative file path, data sub-directory
# the file lives under — both locally and on the production server).
PROTOCOL_FILE_ATTRS = (
    ('text_parsed_filename', 'meeting_protocols_text'),
    ('parts_parsed_filename', 'meeting_protocols_parts'),
)

for session in sessions:
    for attr, pathpart in PROTOCOL_FILE_ATTRS:
        relpath = session[attr]
        if not relpath:
            # No file of this kind is recorded for the session.
            continue
        url = 'https://production.oknesset.org/pipelines/data/committees/{}/{}'.format(pathpart, relpath)
        filename = '/pipelines/data/committees/{}/{}'.format(pathpart, relpath)
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        # Pass an argument list and skip the shell, so the path and URL are
        # never re-parsed as shell syntax.
        # -f makes curl exit non-zero on HTTP errors; without it check_call
        # succeeds and the server's error page is saved as the protocol file.
        # -S keeps the error message visible despite -s.
        cmd = ['curl', '-sSf', '-o', filename, url]
        print(' '.join(cmd), file=sys.stderr)
        subprocess.check_call(cmd)

curl -s -o /pipelines/data/committees/meeting_protocols_text/files/6/5/65548.txt https://production.oknesset.org/pipelines/data/committees/meeting_protocols_text/files/6/5/65548.txt
curl -s -o /pipelines/data/committees/meeting_protocols_parts/files/6/5/65548.csv https://production.oknesset.org/pipelines/data/committees/meeting_protocols_parts/files/6/5/65548.csv
curl -s -o /pipelines/data/committees/meeting_protocols_text/files/8/6/86485.txt https://production.oknesset.org/pipelines/data/committees/meeting_protocols_text/files/8/6/86485.txt
curl -s -o /pipelines/data/committees/meeting_protocols_parts/files/8/6/86485.csv https://production.oknesset.org/pipelines/data/committees/meeting_protocols_parts/files/8/6/86485.csv
curl -s -o /pipelines/data/committees/meeting_protocols_text/files/2/0/2058899.txt https://production.oknesset.org/pipelines/data/committees/meeting_protocols_text/files/2/0/2058899.txt
curl -s -o /pipelines/data/committees/meeting_protocols_parts/files/2/0/2058899.cs

## Run the pipeline

In [11]:
# Delete the cache_hash entry from the meeting-speaker-stats output directory.
# NOTE(review): presumably this makes dpp re-run the pipeline instead of treating it as up to date — confirm.
!{'rm -rf /pipelines/data/people/committees/meeting-speaker-stats/cache_hash'}

In [12]:
# Run the committee-meeting-speaker-stats pipeline from the /pipelines directory with verbose logging.
!{'cd /pipelines; dpp run --verbose ./people/committee-meeting-speaker-stats'}

[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :b37196e2 RUNNING ./people/committee-meeting-speaker-stats
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :b37196e2 Collecting dependencies
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :b37196e2 Running async task
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :b37196e2 Waiting for completion
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :b37196e2 Async task starting
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :b37196e2 Searching for existing caches
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :Found cache for step 1: filter
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :b37196e2 Building process chain:
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :- cache_loader
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :- committee_meeting_speaker_stats
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :- knesse

## Inspect the output

In [29]:
from dataflows import Flow, load, printer
from collections import defaultdict

# Counters filled in by aggregate_speaker_session_stats:
#   session_parts_stats: CommitteeSessionID -> defaultdict(int) with a 'num_parts' count
#   header_parts_stats:  header value       -> defaultdict(int) with a 'num_parts' count
session_parts_stats = {}
header_parts_stats = {}

def aggregate_speaker_session_stats(rows):
    """Count speaker_stats rows per session and per header, passing rows through.

    For the resource named 'speaker_stats', every row increments 'num_parts'
    in session_parts_stats (keyed by the row's 'CommitteeSessionID') and in
    header_parts_stats (keyed by the row's 'header'). Rows of any other
    resource are not counted.

    Every row of every resource is yielded unchanged. Rows are yielded while
    they are counted, so a one-shot row iterator is still passed downstream
    instead of being drained before the yield.

    :param rows: iterable of row dicts exposing the resource name as rows.res.name
    :return: generator yielding the same rows, in order
    """
    count_parts = rows.res.name == 'speaker_stats'
    for row in rows:
        if count_parts:
            session_stats_row = session_parts_stats.setdefault(row['CommitteeSessionID'], defaultdict(int))
            session_stats_row['num_parts'] += 1
            headers_stats_row = header_parts_stats.setdefault(row['header'], defaultdict(int))
            headers_stats_row['num_parts'] += 1
        yield row

# Stream the meeting-speaker-stats datapackage through the aggregation step;
# process() runs the flow so the counter dicts get filled.
speaker_stats_flow = Flow(
    load('/pipelines/data/people/committees/meeting-speaker-stats/datapackage.json'),
    aggregate_speaker_session_stats,
)
speaker_stats_flow.process()

# One line per session: the session ID followed by its counters.
print('  session_id | stats')
session_row_template = '{:10}   | {}'
for session_id in session_parts_stats:
    parts_stats = session_parts_stats[session_id]
    print(session_row_template.format(session_id, parts_stats))

  session_id | stats
   2058899   | defaultdict(<class 'int'>, {'num_parts': 128})
   2059313   | defaultdict(<class 'int'>, {'num_parts': 435})
     65548   | defaultdict(<class 'int'>, {'num_parts': 44})
     86485   | defaultdict(<class 'int'>, {'num_parts': 29})


## Header stats

In [42]:
def _iter_matching_header_rows(header_items):
    """Yield a {'header': ..., **counters} dict for each non-empty header containing 'יעל תמיר'."""
    for header, headers_stats_row in header_items:
        if not header:
            continue
        if 'יעל תמיר' not in header:
            continue
        matching_row = {'header': header}
        matching_row.update(headers_stats_row)
        yield matching_row

sample_rows = _iter_matching_header_rows(header_parts_stats.items())

# One line per matching header: the stripped header text followed by its counters.
print('           header          | stats')
for row in sample_rows:
    header_text = row.pop('header').strip()
    print('{:25}  |  {}'.format(header_text, row))


           header          | stats
שרת הקליטה יעל תמיר        |  {'num_parts': 25}
שרת העליה והקליטה יעל תמיר  |  {'num_parts': 1}
שר הקליטה והעליה יעל תמיר  |  {'num_parts': 1}
