## Run dependent pipelines

In [16]:
# Run the attendees pipeline from the pipelines directory (one shell invocation, so the cd applies to dpp).
!cd /pipelines; KNESSET_LOAD_FROM_URL=1 dpp run --verbose ./people/committee-meeting-attendees

[./people/committee-meeting-attendees:T_0] >>> INFO    :1701d501 RUNNING ./people/committee-meeting-attendees
[./people/committee-meeting-attendees:T_0] >>> INFO    :1701d501 Collecting dependencies
[./people/committee-meeting-attendees:T_0] >>> INFO    :1701d501 Running async task
[./people/committee-meeting-attendees:T_0] >>> INFO    :1701d501 Waiting for completion
[./people/committee-meeting-attendees:T_0] >>> INFO    :1701d501 Async task starting
[./people/committee-meeting-attendees:T_0] >>> INFO    :1701d501 Searching for existing caches
[./people/committee-meeting-attendees:T_0] >>> INFO    :1701d501 Building process chain:
[./people/committee-meeting-attendees:T_0] >>> INFO    :- load_resource
[./people/committee-meeting-attendees:T_0] >>> INFO    :- dump.to_path
[./people/committee-meeting-attendees:T_0] >>> INFO    :- (sink)
[./people/committee-meeting-attendees:T_0] >>> INFO    :load_resource: DEBUG   :Starting new HTTP connection (1): storage.googleapis.com:80
[./people/co

## Add a filter step

You should uncomment (or add) a filter step in the `committee-meeting-speaker-stats` pipeline:

```
  - run: filter
    cache: true
    parameters:
      resources: kns_committeesession
      in:
      - CommitteeSessionID: 2059313
  - run: committee_meeting_speaker_stats
```

## Download protocol parts files

Upgrade to the latest version of the dataflows library:

In [None]:
# Use the %pip magic so the upgrade is installed into the environment of the
# running kernel rather than whichever `pip` happens to be first on PATH.
%pip install --upgrade dataflows

Restart the kernel if the library was upgraded, so that the new version is the one imported below.

Choose some session IDs to download protocol files for:

In [1]:
# CommitteeSessionID values of the sessions whose protocol files are downloaded below.
session_ids = [
    2059313,
    86485,
    2058899,
]

In [2]:
from dataflows import Flow, load

# Load the rows of the first resource in the committee sessions datapackage
# and keep only the sessions whose ID was selected in `session_ids`.
all_sessions = Flow(load('/pipelines/data/committees/kns_committeesession/datapackage.json')).results()[0][0]
sessions = [row for row in all_sessions if row['CommitteeSessionID'] in session_ids]

In [3]:
import os
import subprocess
import sys

# For every selected session, download its parsed protocol text and parts
# files from the production server into the matching local data directory.
for session in sessions:
    for attr in ['text_parsed_filename', 'parts_parsed_filename']:
        filename = session[attr]
        if filename:
            # The text and parts files live under different sub-directories,
            # both locally and on the remote server.
            pathpart = 'meeting_protocols_text' if attr == 'text_parsed_filename' else 'meeting_protocols_parts'
            url = 'https://production.oknesset.org/pipelines/data/committees/{}/{}'.format(pathpart, filename)
            filename = '/pipelines/data/committees/{}/{}'.format(pathpart, session[attr])
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            # Pass the arguments as a list and do not go through a shell, so a
            # filename containing spaces or shell metacharacters can neither
            # break the command nor inject another one.
            cmd = ['curl', '-s', '-o', filename, url]
            print(' '.join(cmd), file=sys.stderr)
            subprocess.check_call(cmd)

curl -s -o /pipelines/data/committees/meeting_protocols_text/files/8/6/86485.txt https://production.oknesset.org/pipelines/data/committees/meeting_protocols_text/files/8/6/86485.txt
curl -s -o /pipelines/data/committees/meeting_protocols_parts/files/8/6/86485.csv https://production.oknesset.org/pipelines/data/committees/meeting_protocols_parts/files/8/6/86485.csv
curl -s -o /pipelines/data/committees/meeting_protocols_text/files/2/0/2058899.txt https://production.oknesset.org/pipelines/data/committees/meeting_protocols_text/files/2/0/2058899.txt
curl -s -o /pipelines/data/committees/meeting_protocols_parts/files/2/0/2058899.csv https://production.oknesset.org/pipelines/data/committees/meeting_protocols_parts/files/2/0/2058899.csv
curl -s -o /pipelines/data/committees/meeting_protocols_text/files/2/0/2059313.txt https://production.oknesset.org/pipelines/data/committees/meeting_protocols_text/files/2/0/2059313.txt
curl -s -o /pipelines/data/committees/meeting_protocols_parts/files/2/0/20

## Run the pipeline

In [28]:
# Remove the stored cache hash of the speaker-stats pipeline before re-running it.
!rm -rf /pipelines/data/people/committees/meeting-speaker-stats/cache_hash

In [34]:
# Run the speaker-stats pipeline from the pipelines directory (one shell invocation, so the cd applies to dpp).
!cd /pipelines; dpp run --verbose ./people/committee-meeting-speaker-stats

[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :4ecc5fb1 RUNNING ./people/committee-meeting-speaker-stats
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :4ecc5fb1 Collecting dependencies
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :4ecc5fb1 Running async task
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :4ecc5fb1 Waiting for completion
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :4ecc5fb1 Async task starting
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :4ecc5fb1 Searching for existing caches
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :Found cache for step 1: filter
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :4ecc5fb1 Building process chain:
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :- cache_loader
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :- committee_meeting_speaker_stats
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :- knesse

## Inspect the output

In [18]:
from dataflows import Flow, load, printer
from collections import defaultdict

# Accumulators filled as a side effect of running the flow below:
# one totals mapping per CommitteeSessionID and one per part header.
session_parts_stats = {}
header_parts_stats = {}


def _add_part(stats_by_key, key, row):
    """Add the counts of a single row to the totals stored under ``key``."""
    totals = stats_by_key.setdefault(key, defaultdict(int))
    totals['num_parts'] += 1
    totals['num_words'] += row['body_num_words']
    totals['total_length'] += row['body_length']


def aggregate_speaker_session_stats(rows):
    """Row processor that totals speaker stats per session and per header.

    Rows of the resource named ``speaker_stats`` are added to the module-level
    ``session_parts_stats`` (keyed by ``CommitteeSessionID``) and
    ``header_parts_stats`` (keyed by ``header``) dictionaries. Rows of any
    other resource are not aggregated.

    Every row is yielded unchanged. Each ``speaker_stats`` row is yielded as
    soon as it has been aggregated: ``rows`` may be a one-shot iterator, in
    which case iterating it a second time after the aggregation loop would
    yield nothing and downstream steps would see an empty resource.
    """
    if rows.res.name != 'speaker_stats':
        yield from rows
        return
    for row in rows:
        _add_part(session_parts_stats, row['CommitteeSessionID'], row)
        _add_part(header_parts_stats, row['header'], row)
        yield row

# Run the aggregation over the speaker-stats datapackage. Only the side effect
# of filling the stats dictionaries matters here, so the result is discarded.
stats_flow = Flow(
    load('/pipelines/data/people/committees/meeting-speaker-stats/datapackage.json'),
    aggregate_speaker_session_stats,
)
stats_flow.process()

# One output line per session with its accumulated totals.
print('  session_id | stats')
for session_id in session_parts_stats:
    print('{:10}   | {}'.format(session_id, dict(session_parts_stats[session_id])))

  session_id | stats
   2058899   | {'num_parts': 128, 'num_words': Decimal('6909'), 'total_length': Decimal('38045')}
   2059313   | {'num_parts': 435, 'num_words': Decimal('18724'), 'total_length': Decimal('104018')}
     86485   | {'num_parts': 29, 'num_words': Decimal('772'), 'total_length': Decimal('4829')}


## Header stats

In [19]:
# Show the accumulated totals of every non-empty header that contains the
# sample name, one line per distinct header.
print('           header          | stats')
for header, stats in header_parts_stats.items():
    if not header or 'יעל תמיר' not in header:
        continue
    print('{:25}  |  {}'.format(header.strip(), dict(stats)))


           header          | stats
שרת הקליטה יעל תמיר        |  {'num_parts': 25, 'num_words': Decimal('1833'), 'total_length': Decimal('10004')}
שרת העליה והקליטה יעל תמיר  |  {'num_parts': 1, 'num_words': Decimal('72'), 'total_length': Decimal('393')}
שר הקליטה והעליה יעל תמיר  |  {'num_parts': 1, 'num_words': Decimal('80'), 'total_length': Decimal('450')}


## Speaker categorization

In [35]:
# Load the rows of the datapackage's second resource and print the ones that
# belong to a single committee session.
speaker_rows = Flow(
    load('/pipelines/data/people/committees/meeting-speaker-stats/datapackage.json'),
).results()[0][1]
for row in speaker_rows:
    if row['CommitteeSessionID'] != 2059313:
        continue
    print(row)

{'CommitteeSessionID': Decimal('2059313'), 'parts_crc32c': 'iafLBQ==', 'part_index': Decimal('0'), 'header': None, 'body_length': Decimal('280'), 'body_num_words': Decimal('30'), 'part_category': None}
{'CommitteeSessionID': Decimal('2059313'), 'parts_crc32c': 'iafLBQ==', 'part_index': Decimal('1'), 'header': 'סדר היום', 'body_length': Decimal('343'), 'body_num_words': Decimal('57'), 'part_category': None}
{'CommitteeSessionID': Decimal('2059313'), 'parts_crc32c': 'iafLBQ==', 'part_index': Decimal('10'), 'header': 'יעקב ליצמן', 'body_length': Decimal('47'), 'body_num_words': Decimal('10'), 'part_category': 'chairperson'}
{'CommitteeSessionID': Decimal('2059313'), 'parts_crc32c': 'iafLBQ==', 'part_index': Decimal('100'), 'header': 'יעל אנדורן', 'body_length': Decimal('71'), 'body_num_words': Decimal('13'), 'part_category': None}
{'CommitteeSessionID': Decimal('2059313'), 'parts_crc32c': 'iafLBQ==', 'part_index': Decimal('101'), 'header': 'שמואל אולפינר', 'body_length': Decimal('426'), '