## Run dependent pipelines

In [1]:
# Run the committee-meeting-attendees pipeline from /pipelines, verbosely,
# with KNESSET_LOAD_FROM_URL=1 set in the command's environment.
!{'cd /pipelines; KNESSET_LOAD_FROM_URL=1 dpp run --verbose ./people/committee-meeting-attendees'}

[./people/committee-meeting-attendees:T_0] >>> INFO    :ca56ab66 RUNNING ./people/committee-meeting-attendees
[./people/committee-meeting-attendees:T_0] >>> INFO    :ca56ab66 Collecting dependencies
[./people/committee-meeting-attendees:T_0] >>> INFO    :ca56ab66 Running async task
[./people/committee-meeting-attendees:T_0] >>> INFO    :ca56ab66 Waiting for completion
[./people/committee-meeting-attendees:T_0] >>> INFO    :ca56ab66 Async task starting
[./people/committee-meeting-attendees:T_0] >>> INFO    :ca56ab66 Searching for existing caches
[./people/committee-meeting-attendees:T_0] >>> INFO    :ca56ab66 Building process chain:
[./people/committee-meeting-attendees:T_0] >>> INFO    :- load_resource
[./people/committee-meeting-attendees:T_0] >>> INFO    :- dump.to_path
[./people/committee-meeting-attendees:T_0] >>> INFO    :- (sink)
[./people/committee-meeting-attendees:T_0] >>> INFO    :load_resource: DEBUG   :Starting new HTTP connection (1): storage.googleapis.com:80
[./people/co

## Add a filter step

You should uncomment / add a filter step to the committee-meeting-speaker-stats pipeline:

```
  - run: filter
    cache: true
    parameters:
      resources: kns_committeesession
      in:
      - CommitteeSessionID: 2059313
      - CommitteeSessionID: 86485
      - CommitteeSessionID: 2058899
  - run: committee_meeting_speaker_stats
```

## Download protocol parts files

Upgrade to the latest dataflows library

In [None]:
# Upgrade the installed dataflows package to the newest available release (no version pin).
!{'pip install --upgrade dataflows'}

Install knesset-data-pipelines dependencies

In [None]:
# Editable (-e) install of the package located in the parent directory of the
# current working directory — presumably the knesset-data-pipelines project root (confirm).
!{'pip install -e ..'}

Restart the kernel if any upgrades were done

Choose some session IDs to download protocol files for:

In [1]:
# CommitteeSessionID values of the sessions to work with in the following cells.
session_ids = [
    2059313,
    86485,
    2058899,
]

In [2]:
from dataflows import Flow, load

# Keep only the rows of the first resource in the kns_committeesession
# datapackage whose CommitteeSessionID is one of the chosen session_ids.
sessions = [
    row
    for row in Flow(load('/pipelines/data/committees/kns_committeesession/datapackage.json')).results()[0][0]
    if row['CommitteeSessionID'] in session_ids
]

In [3]:
import os
import subprocess
import sys

# Download the parsed text and parsed parts protocol files of every selected
# session from the production server into the local pipelines data directory.
for session in sessions:
    # (session attribute holding the relative file path, data sub-directory it lives in)
    for attr, pathpart in (('text_parsed_filename', 'meeting_protocols_text'),
                           ('parts_parsed_filename', 'meeting_protocols_parts')):
        relative_path = session[attr]
        if not relative_path:
            # The session has no file of this kind - nothing to download.
            continue
        url = 'https://production.oknesset.org/pipelines/data/committees/{}/{}'.format(pathpart, relative_path)
        filename = '/pipelines/data/committees/{}/{}'.format(pathpart, relative_path)
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        # Pass an argument list (no shell) so characters in the path or URL are
        # never interpreted by a shell. --fail makes curl exit non-zero on HTTP
        # errors, so check_call raises instead of an error page being saved as
        # the protocol file.
        cmd = ['curl', '-s', '--fail', '-o', filename, url]
        print(' '.join(cmd), file=sys.stderr)
        subprocess.check_call(cmd)

curl -s -o /pipelines/data/committees/meeting_protocols_text/files/8/6/86485.txt https://production.oknesset.org/pipelines/data/committees/meeting_protocols_text/files/8/6/86485.txt
curl -s -o /pipelines/data/committees/meeting_protocols_parts/files/8/6/86485.csv https://production.oknesset.org/pipelines/data/committees/meeting_protocols_parts/files/8/6/86485.csv
curl -s -o /pipelines/data/committees/meeting_protocols_text/files/2/0/2058899.txt https://production.oknesset.org/pipelines/data/committees/meeting_protocols_text/files/2/0/2058899.txt
curl -s -o /pipelines/data/committees/meeting_protocols_parts/files/2/0/2058899.csv https://production.oknesset.org/pipelines/data/committees/meeting_protocols_parts/files/2/0/2058899.csv
curl -s -o /pipelines/data/committees/meeting_protocols_text/files/2/0/2059313.txt https://production.oknesset.org/pipelines/data/committees/meeting_protocols_text/files/2/0/2059313.txt
curl -s -o /pipelines/data/committees/meeting_protocols_parts/files/2/0/20

## Run the pipeline

In [16]:
# Delete the cache_hash stored in the meeting-speaker-stats output directory.
# NOTE(review): presumably this keeps dpp from treating the pipeline as already up to date — confirm.
!{'rm -rf /pipelines/data/people/committees/meeting-speaker-stats/cache_hash'}

In [18]:
# Run the committee-meeting-speaker-stats pipeline from /pipelines with verbose logging.
!{'cd /pipelines; dpp run --verbose ./people/committee-meeting-speaker-stats'}

[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :09d7a2bb RUNNING ./people/committee-meeting-speaker-stats
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :09d7a2bb Collecting dependencies
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :09d7a2bb Running async task
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :09d7a2bb Waiting for completion
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :09d7a2bb Async task starting
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :09d7a2bb Searching for existing caches
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :Found cache for step 1: filter
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :09d7a2bb Building process chain:
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :- cache_loader
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :- committee_meeting_speaker_stats
[./people/committee-meeting-speaker-stats:T_0] >>> INFO    :- knesse

## Inspect the output

In [19]:
from dataflows import Flow, load, printer
from collections import defaultdict

# Counters filled in while the flow below is processed:
#   session_parts_stats: CommitteeSessionID -> {'num_parts', 'num_words', 'total_length'}
#   header_parts_stats:  part header        -> {'num_parts', 'num_words', 'total_length'}
session_parts_stats = {}
header_parts_stats = {}

def _add_part_stats(stats_by_key, key, row):
    """Add one speaker_stats row to the counters stored under `key` in `stats_by_key`."""
    stats = stats_by_key.setdefault(key, defaultdict(int))
    stats['num_parts'] += 1
    stats['num_words'] += row['body_num_words']
    stats['total_length'] += row['body_length']

def aggregate_speaker_session_stats(rows):
    """Flow step that tallies part statistics per session and per header.

    Rows of the resource named 'speaker_stats' are added to the module-level
    `session_parts_stats` and `header_parts_stats` dictionaries. Every row of
    every resource is passed on unchanged.

    :param rows: iterable of row dicts exposing the resource as `rows.res`
    :return: generator yielding the same rows, in the same order
    """
    if rows.res.name != 'speaker_stats':
        yield from rows
        return
    for row in rows:
        _add_part_stats(session_parts_stats, row['CommitteeSessionID'], row)
        _add_part_stats(header_parts_stats, row['header'], row)
        # Yield while iterating: `rows` may be a single-pass iterator, so a
        # `yield from rows` after this loop would pass nothing downstream.
        yield row

# Stream the meeting-speaker-stats datapackage through the aggregation step.
# The flow is run only to fill the module-level stats dictionaries.
stats_flow = Flow(
    load('/pipelines/data/people/committees/meeting-speaker-stats/datapackage.json'),
    aggregate_speaker_session_stats,
)
stats_flow.process()

# One line per session: the ID padded to 10 columns, then its counters as a plain dict.
print('  session_id | stats')
row_template = '{:10}   | {}'
for session_id, parts_stats in session_parts_stats.items():
    print(row_template.format(session_id, dict(parts_stats)))

  session_id | stats
   2058899   | {'num_parts': 128, 'num_words': Decimal('6909'), 'total_length': Decimal('38045')}
   2059313   | {'num_parts': 435, 'num_words': Decimal('18724'), 'total_length': Decimal('104018')}
     86485   | {'num_parts': 29, 'num_words': Decimal('772'), 'total_length': Decimal('4829')}


## Header stats

In [20]:
# Lazily pair every non-empty header containing the sample speaker's name
# with a plain-dict copy of that header's counters.
sample_rows = (
    (header, dict(headers_stats_row))
    for header, headers_stats_row in header_parts_stats.items()
    if header and 'יעל תמיר' in header
)

print('           header          | stats')
for header, stats in sample_rows:
    # The header is stripped of surrounding whitespace and padded to 25 columns.
    print('{:25}  |  {}'.format(header.strip(), stats))


           header          | stats
שרת הקליטה יעל תמיר        |  {'num_parts': 25, 'num_words': Decimal('1833'), 'total_length': Decimal('10004')}
שרת העליה והקליטה יעל תמיר  |  {'num_parts': 1, 'num_words': Decimal('72'), 'total_length': Decimal('393')}
שר הקליטה והעליה יעל תמיר  |  {'num_parts': 1, 'num_words': Decimal('80'), 'total_length': Decimal('450')}


## Speaker categorization

In [22]:
from dataflows import Flow, load, printer, filter_rows


# Render up to 9999 rows of the meeting-speaker-stats datapackage as HTML tables.
categorization_flow = Flow(
    load('/pipelines/data/people/committees/meeting-speaker-stats/datapackage.json'),
    # Uncomment to restrict the output to a single session:
    # filter_rows(lambda row: row['CommitteeSessionID'] == 2059313),
    printer(tablefmt='html', num_rows=9999),
)
# The cell's displayed value is element 1 of what process() returns.
categorization_flow.process()[1]

#,CommitteeSessionID (integer),Number (integer),KnessetNum (integer),TypeID (integer),TypeDesc (string),CommitteeID (integer),Location (string),SessionUrl (string),BroadcastUrl (string),StartDate (datetime),FinishDate (datetime),Note (string),LastUpdatedDate (datetime),download_crc32c (string),download_filename (string),download_filesize (integer),parts_crc32c (string),parts_filesize (integer),parts_parsed_filename (string),text_crc32c (string),text_filesize (integer),text_parsed_filename (string),topics (array),committee_name (string),mks (array),invitees (array),legal_advisors (array),manager (array),financial_advisors (array),attended_mk_individual_ids (array)
1,86485,334,16,161,פתוחה,23,"חדר הוועדה, באגף הוועדות (קדמה), קומה 3, חדר 3740",http://main.knesset.gov.il/Activity/committees/Pages/AllCommitteesAgenda.aspx?Tab=3&ItemID=86485,,2004-12-27 11:30:00,,,2012-09-19 15:27:34,8zHRDQ==,files/23/7/1/71955.DOC,38912,BXS4LQ==,9393,files/8/6/86485.csv,NxzKWQ==,9323,files/8/6/86485.txt,"['חוק רשות התעופה האזרחית, התשס""ה-2005']",הכלכלה,"['שלום שמחון - היו""ר', 'מיכאל גורלובסקי', 'אבשלום וילן']","[{'name': 'עו""ד מלי סיטון', 'role': 'יועצת משפטית, משרד התחבורה'}, {'name': 'צחי חבושה', 'role': 'יו ...","['איתי עצמון - מתמחה', 'אתי בנדלר']",['לאה ורון'],[],"[123, 203, 733]"
2,2058899,51,15,161,פתוחה,2,"חדר הוועדה, באגף הוועדות (קדמה), קומה 3, חדר 3750",http://main.knesset.gov.il/Activity/committees/Pages/AllCommitteesAgenda.aspx?Tab=3&ItemID=2058899,,1999-11-25 00:00:00,1999-11-25 12:15:00,תקציב המשרד לקליטת העלייה לשנת 2002.,2018-10-10 11:03:06,L9/m4w==,files/23/4/5/450720.DOC,51456,xn4lhw==,71220,files/2/0/2058899.csv,UjcTNA==,71956,files/2/0/2058899.txt,,הכספים,"['שלום שמחון - מ""מ היו""ר', 'משה גפני', 'ישראל כץ', 'יעקב ליצמן', 'נחום לנגנטל', 'מאיר פרוש', 'יוסף י ...","[{'name': 'שרת הקליטה - יעל תמיר'}, {'name': 'בוריס מפצר - מנכ""ל המשרד לקליטת עליה'}, {'name': 'ד ...",[],['איוור קירשנר'],['סמדר אלחנני'],"[35, 227, 69, 103, 696, 123, 216]"
3,2059313,462,15,161,פתוחה,2,"חדר הוועדה, באגף הוועדות (קדמה), קומה 3, חדר 3750",http://main.knesset.gov.il/Activity/committees/Pages/AllCommitteesAgenda.aspx?Tab=3&ItemID=2059313,,2002-11-19 00:00:00,2002-11-19 00:00:00,"הצעת חוק ההסדרים במשק המדינה (תיקוני חקיקה להשגת יעדי התקציב והמדיניות הכלכלית לשנת הכספים 2003, התש ...",2018-10-10 11:03:06,+lu4+A==,files/23/4/3/430592.DOC,91162,iafLBQ==,194574,files/2/0/2059313.csv,jD1Riw==,195031,files/2/0/2059313.txt,,הכספים,"['יעקב ליצמן - היו""ר', 'אבשלום וילן', 'עופר חוגי', 'אמנון כהן', 'רחמים מלול', 'משולם נהרי']","[{'name': 'חה""כ צבי הנדל'}, {'name': 'חה""כ עמיר פרץ'}, {'name': 'יעקב ניזרי-סמנכ""ל שירות התעסוקה, מש ...","['שגית אפיק', 'אנה שניידר', 'ליאורה סידי (מתמחה)']",['טמיר כהן'],[],"[105, 203, 46, 207, 210, 216, 219, 222]"


#,CommitteeSessionID (number),parts_crc32c (string),part_index (number),header (string),body_length (number),body_num_words (number),part_categories (string),name_role (string)
1,2058899,xn4lhw==,0,,263,26,,
2,2058899,xn4lhw==,1,נכחו,128,23,,
3,2058899,xn4lhw==,10,יוסף יצחק פריצקי,33,8,mk,
4,2058899,xn4lhw==,100,דני טופז,140,27,,דני טופז -
5,2058899,xn4lhw==,101,יורי שטרן,44,7,,
6,2058899,xn4lhw==,102,דני טופז,96,17,,דני טופז -
7,2058899,xn4lhw==,103,שרת הקליטה יעל תמיר,749,135,,שרת הקליטה - יעל תמיר -
8,2058899,xn4lhw==,104,יורי שטרן,34,6,,
9,2058899,xn4lhw==,105,שרת העליה והקליטה יעל תמיר,393,72,,שרת הקליטה - יעל תמיר -
10,2058899,xn4lhw==,106,משה גפני,4,1,mk,


{}