Install supported dataflows library (and restart the kernel if an upgrade was done)

In [None]:
!{'pip install --upgrade pip && pip install dataflows==0.0.58'}

## Load the basic committee sessions data and store in local cache

In [72]:
from dataflows import Flow, load, printer, checkpoint

# Named checkpoint placed at the end of the flow; later cells reuse this same
# object as their first step.
protocols_speakers_categorization_checkpoint = checkpoint('protocol speakers categorization - 15')

# Source datapackages, in the order they are loaded into the flow.
source_datapackages = [
    'https://storage.googleapis.com/knesset-data-pipelines/data/committees/kns_committeesession/datapackage.json',
    'https://storage.googleapis.com/knesset-data-pipelines/data/committees/kns_cmtsessionitem/datapackage.json',
    'https://storage.googleapis.com/knesset-data-pipelines/data/knesset/kns_itemtype/datapackage.json',
]

Flow(
    *[load(datapackage_url) for datapackage_url in source_datapackages],
    protocols_speakers_categorization_checkpoint,
).process()

saving checkpoint to: .checkpoints/protocol speakers categorization - 15
checkpoint saved: protocol speakers categorization - 15


(<datapackage.package.Package at 0x7fb6fbaa3780>, {})

## Load sessions and item types to memory

In [64]:
from collections import defaultdict

# Counters for the load steps in this cell; unseen keys start at 0.
stats = defaultdict(int)

# In-memory lookups filled by the row processors below:
# session rows keyed by CommitteeSessionID, item type rows keyed by ItemTypeID.
sessions_by_id = {}
item_types_by_id = {}

def load_sessions_by_id(rows):
    """Row processor: index committee session rows by their CommitteeSessionID.

    Rows of the 'kns_committeesession' resource are stored in the module-level
    `sessions_by_id` dict and counted in `stats['num sessions']`. Every row of
    every resource is passed on unchanged.

    Each row is yielded as soon as it has been recorded. Looping over `rows`
    first and yielding from it afterwards would pass nothing downstream,
    because `rows` can only be iterated once.

    :param rows: iterable of row dicts exposing the resource as `rows.res`
    :return: generator yielding the same rows, in the same order
    """
    is_sessions_resource = rows.res.name == 'kns_committeesession'
    for row in rows:
        if is_sessions_resource:
            sessions_by_id[row['CommitteeSessionID']] = row
            stats['num sessions'] += 1
        yield row

def relate_session_items_to_sessions(rows):
    """Row processor: attach session item rows to their committee session.

    For rows of the 'kns_cmtsessionitem' resource, the session is looked up in
    the module-level `sessions_by_id` by CommitteeSessionID and the row is
    appended to that session's 'items' list. Items whose session is not found
    are only counted. Every row of every resource is passed on unchanged.

    Each row is yielded as soon as it has been handled; `rows` can only be
    iterated once, so yielding from it after a full loop would pass nothing
    downstream.

    :param rows: iterable of row dicts exposing the resource as `rows.res`
    :return: generator yielding the same rows, in the same order
    """
    is_items_resource = rows.res.name == 'kns_cmtsessionitem'
    for row in rows:
        if is_items_resource:
            stats['num session items'] += 1
            session = sessions_by_id.get(row['CommitteeSessionID'])
            if session:
                session.setdefault('items', []).append(row)
            else:
                stats['num session items without related session'] += 1
        yield row

def load_item_types(rows):
    """Row processor: index item type rows by their ItemTypeID.

    Rows of the 'kns_itemtype' resource are stored in the module-level
    `item_types_by_id` dict and counted in `stats['num item types']`. Every row
    of every resource is passed on unchanged.

    Each row is yielded as soon as it has been recorded; `rows` can only be
    iterated once, so yielding from it after a full loop would pass nothing
    downstream.

    :param rows: iterable of row dicts exposing the resource as `rows.res`
    :return: generator yielding the same rows, in the same order
    """
    is_item_types_resource = rows.res.name == 'kns_itemtype'
    for row in rows:
        if is_item_types_resource:
            item_types_by_id[row['ItemTypeID']] = row
            stats['num item types'] += 1
        yield row

# Replay the checkpointed resources through the three row processors.
# relate_session_items_to_sessions looks sessions up in sessions_by_id, so it
# relies on the session rows having been seen before the session item rows
# (presumably guaranteed by the order in which the resources were loaded -- verify).
Flow(
    protocols_speakers_categorization_checkpoint,
    load_sessions_by_id,
    relate_session_items_to_sessions,
    load_item_types
).process()

# Summary of how many rows of each kind were loaded.
print(dict(stats))

using checkpoint data from .checkpoints/protocol speakers categorization - 14
{'num sessions': 74405, 'num session items': 47595, 'num item types': 10}


In [65]:
# Show the loaded item types; the next section matches on their 'TableName' values.
print(item_types_by_id)

{1: {'Desc': 'שאילתה', 'ItemTypeID': 1, 'TableName': 'KNS_Query'}, 2: {'Desc': 'הצעת חוק', 'ItemTypeID': 2, 'TableName': 'KNS_Bill'}, 3: {'Desc': 'הצעת אי אמון', 'ItemTypeID': 3, 'TableName': None}, 4: {'Desc': 'הצעה לסדר היום', 'ItemTypeID': 4, 'TableName': 'KNS_Agenda'}, 5: {'Desc': 'ישיבת מליאה', 'ItemTypeID': 5, 'TableName': 'KNS_PlenumSession'}, 12: {'Desc': 'ישיבת ועדה', 'ItemTypeID': 12, 'TableName': 'KNS_CommitteSession'}, 6000: {'Desc': 'פעולה על פי חוק', 'ItemTypeID': 6000, 'TableName': 'KNS_Law'}, 6001: {'Desc': 'תיקון טעות', 'ItemTypeID': 6001, 'TableName': 'KNS_Law'}, 6002: {'Desc': 'חוק בן חיצוני', 'ItemTypeID': 6002, 'TableName': 'KNS_Law'}, 6003: {'Desc': 'חוק אב', 'ItemTypeID': 6003, 'TableName': 'KNS_IsraelLaw'}}


## Relate sessions to bills/laws based on item types

In [66]:
stats = defaultdict(int)

for session in sessions_by_id.values():
    stats['num sessions'] += 1
    # Both flags start out False and are only ever switched on below.
    session.update(related_to_bill=False, related_to_law=False)
    for session_item in session.get('items', []):
        type_id = session_item['ItemTypeID']
        known_type = item_types_by_id.get(type_id)
        if not known_type:
            # Only ids 11 and 15 are expected to be missing from the item types table.
            assert type_id in [11,15], type_id
            stats['unknown item type id 11/15'] += 1
            continue
        source_table = known_type['TableName']
        if source_table == 'KNS_Bill':
            session['related_to_bill'] = True
        elif source_table in ['KNS_Law', 'KNS_IsraelLaw']:
            session['related_to_law'] = True
    # Count each session in exactly one of the three "related to" categories (or none).
    with_bill, with_law = session['related_to_bill'], session['related_to_law']
    if with_bill or with_law:
        if with_bill and with_law:
            category = 'related to bill and law'
        elif with_bill:
            category = 'related to bill'
        else:
            category = 'related to law'
        stats[category] += 1

print(dict(stats))

{'num sessions': 74405, 'related to bill': 11446, 'unknown item type id 11/15': 26509, 'related to law': 1195, 'related to bill and law': 28}


## Mark sessions which have parts / text protocols

In [67]:
stats = defaultdict(int)

# (flag stored on the session, filesize field it is derived from), in counting order.
protocol_flags = (
    ('has_downloaded_protocol', 'download_filesize'),
    ('has_parts_protocol', 'parts_filesize'),
    ('has_text_protocol', 'text_filesize'),
)

for session in sessions_by_id.values():
    stats['num sessions'] += 1
    for flag, size_field in protocol_flags:
        filesize = session[size_field]
        # A protocol counts as present only when its filesize is set and above 100.
        # bool() makes the stored flag a real True/False rather than None or 0
        # when the filesize is missing.
        session[flag] = bool(filesize and filesize > 100)
        if session[flag]:
            stats[flag] += 1

print(dict(stats))

{'num sessions': 74405, 'has_downloaded_protocol': 46677, 'has_parts_protocol': 46662, 'has_text_protocol': 46022}


## Load protocol attendees data

In [68]:
# Keep the previous checkpoint as the flow's input and rebind the shared name to a
# new checkpoint that also contains the meeting attendees data.
old_protocols_speakers_categorization_checkpoint, protocols_speakers_categorization_checkpoint = (
    protocols_speakers_categorization_checkpoint,
    checkpoint('protocol speakers categorization - 25'),
)

attendees_flow_steps = [
    old_protocols_speakers_categorization_checkpoint,
    load('https://storage.googleapis.com/knesset-data-pipelines/data/people/committees/meeting-attendees/datapackage.json'),
    protocols_speakers_categorization_checkpoint,
]

Flow(*attendees_flow_steps).process()

saving checkpoint to: .checkpoints/protocol speakers categorization - 25
using checkpoint data from .checkpoints/protocol speakers categorization - 14
checkpoint saved: protocol speakers categorization - 25


(<datapackage.package.Package at 0x7fb6fbd49438>, {})

## Update sessions data in memory with loaded attendees data

In [71]:
stats = defaultdict(int)

def load_attendees():
    """Build a row processor that copies attendee fields onto the in-memory sessions.

    The returned processor treats a resource as the attendees resource when its
    schema has a field named 'financial_advisors'. For each row of such a
    resource the session is looked up in the module-level `sessions_by_id` by
    CommitteeSessionID and updated with the attendee fields; rows whose session
    is not found are counted in `stats['missing related session']`.

    Every row of every resource is passed on unchanged, each one yielded as soon
    as it has been handled: `rows` can only be iterated once, so yielding from it
    after a full loop would pass nothing downstream.

    :return: a function taking `rows` and yielding the same rows
    """
    attendee_fields = (
        'attended_mk_individual_ids',
        'invitees',
        'legal_advisors',
        'manager',
        'mks',
        'financial_advisors',
    )

    def _process(rows):
        schema_fields = rows.res.descriptor['schema']['fields']
        # Prints each resource's schema once, when its first row is requested.
        print(schema_fields)
        is_attendees_resource = 'financial_advisors' in [f['name'] for f in schema_fields]
        for row in rows:
            if is_attendees_resource:
                stats['num sessions'] += 1
                session = sessions_by_id.get(row['CommitteeSessionID'])
                if session:
                    session.update({name: row[name] for name in attendee_fields})
                else:
                    stats['missing related session'] += 1
            yield row

    return _process

# Replay the current checkpoint through the attendees processor so the
# in-memory sessions pick up the attendee fields.
Flow(
    protocols_speakers_categorization_checkpoint,
    load_attendees()
).process()

using checkpoint data from .checkpoints/protocol speakers categorization - 25
[{'description': 'מספר השורה בטבלה זו', 'name': 'CommitteeSessionID', 'type': 'integer'}, {'description': 'מספר הישיבה', 'name': 'Number', 'type': 'integer'}, {'description': 'מספר הכנסת', 'name': 'KnessetNum', 'type': 'integer'}, {'description': 'קוד סוג הישיבה', 'name': 'TypeID', 'type': 'integer'}, {'description': 'תיאור סוג הישיבה (פתוחה, חסויה, סיור)', 'name': 'TypeDesc', 'type': 'string'}, {'description': 'קוד הוועדה', 'name': 'CommitteeID', 'type': 'integer'}, {'description': 'מיקום הישיבה', 'name': 'Location', 'type': 'string'}, {'description': 'קישור לישיבה באתר הכנסת', 'name': 'SessionUrl', 'type': 'string'}, {'description': 'קישור לשידור הישיבה באתר הכנסת', 'name': 'BroadcastUrl', 'type': 'string'}, {'description': 'תאריך התחלה', 'format': '%Y-%m-%d %H:%M:%S', 'name': 'StartDate', 'type': 'datetime'}, {'description': 'תאריך סיום', 'format': '%Y-%m-%d %H:%M:%S', 'name': 'FinishDate', 'type': 'dateti

(<datapackage.package.Package at 0x7fb6fbcddc88>, {})

In [70]:
# Counters collected by the attendees processor (empty if its flow has not run since stats was reset).
print(dict(stats))

{}


## Mark sessions which have different attendee types

In [11]:
stats = defaultdict(int)

# Attendee fields that each get a matching 'has <field>' flag on the session.
attendee_kinds = ('mks', 'manager', 'legal_advisors', 'invitees', 'attended_mk_individual_ids')

for session in sessions_by_id.values():
    stats['num sessions'] += 1
    for attendee_kind in attendee_kinds:
        flag = 'has {}'.format(attendee_kind)
        session[flag] = False
        attendees = session.get(attendee_kind)
        # The flag is switched on only for a non-empty value.
        if attendees and len(attendees) > 0:
            session[flag] = True
            stats[flag] += 1

print(dict(stats))

{'num sessions': 74405, 'has mks': 35614, 'has invitees': 36011, 'has attended_mk_individual_ids': 32735, 'has legal_advisors': 13069, 'has manager': 17905}


## Download parts protocols (to local cache) and count number of parts per session

In [44]:
import requests
import os
import csv
import sys

csv.field_size_limit(sys.maxsize)

# The loop stops as soon as this many sessions have been counted, so the session
# that reaches the limit is counted but not processed. Set to None to process all.
MAX_SESSIONS = 56877
# Cached part files of this size (in bytes) or smaller are treated as having no parts.
MIN_PARTS_FILESIZE = 10

stats = defaultdict(int)

for session in sessions_by_id.values():
    session['num_protocol_parts'] = 0
    stats['num_sessions'] += 1

    if MAX_SESSIONS is not None and stats['num_sessions'] >= MAX_SESSIONS:
        break

    # Only sessions of Knessets 11 through 20 are considered.
    if not 11 <= int(session['KnessetNum']) <= 20:
        continue
    stats['num_sessions_knesset_11-20'] += 1
    if not session['has_parts_protocol']:
        continue

    url = 'https://storage.googleapis.com/knesset-data-pipelines/data/committees/meeting_protocols_parts/{}'.format(session['parts_parsed_filename'])
    filename = 'data/protocol_parts/{}'.format(session['parts_parsed_filename'])
    if not os.path.exists(filename):
        content = None
        try:
            res = requests.get(url, timeout=30)
            if res.status_code == 200:
                content = res.content
            else:
                stats['http_{}_errors'.format(res.status_code)] += 1
        except Exception:
            # Best effort: a failed request is counted and the session is skipped.
            stats['exceptions'] += 1
        # Only successful downloads are written to the local cache. A failed
        # attempt leaves no file behind, so it is retried on the next run
        # instead of being remembered as an empty protocol.
        if content is not None:
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            with open(filename, 'wb') as f:
                f.write(content)

    if os.path.exists(filename) and os.path.getsize(filename) > MIN_PARTS_FILESIZE:
        stats['downloaded_parts'] += 1
        # Load the cached file and take the rows of its first resource.
        parts = Flow(load(filename)).results()[0][0]
        session['num_protocol_parts'] = len(parts)
        stats['total_len_parts'] += len(parts)

# Average number of parts per protocol with parts; 0 when none were available.
downloaded_parts = stats.get('downloaded_parts', 0)
stats['average_len_parts'] = stats['total_len_parts'] // downloaded_parts if downloaded_parts else 0

print(dict(stats))

{'num_sessions': 56877, 'num_sessions_knesset_11-20': 37534, 'downloaded_parts': 31099, 'total_len_parts': 4941472, 'average_len_parts': 158}


## Generate aggregate data per knesset

In [45]:
from dataflows import dump_to_path

# Session flags exported as booleans per session and counted per Knesset.
# Commented-out entries are flags that exist on the sessions but are left out
# of the output.
bool_fields = [
    # 'has_downloaded_protocol',
    # 'has_parts_protocol',
    'has_text_protocol',
    'has mks',
    'has manager',
    'has legal_advisors',
    # 'has invitees',
    # 'has attended_mk_individual_ids',
    'related_to_bill',
    'related_to_law',
]

def dump_sessions():
    """Yield one flat output row per session in `sessions_by_id`.

    Each row holds the session's descriptive fields, every flag listed in
    `bool_fields` coerced to a bool, and the session's number of protocol parts.

    'num_protocol_parts' falls back to 0 when a session has no such key. The
    download loop can stop before reaching every session, and
    dump_sessions_agg_by_knesset reads the same key with `.get`.

    :return: generator of row dicts
    """
    descriptive_fields = ('KnessetNum', 'StartDate', 'TypeDesc', 'committee_name')
    for session in sessions_by_id.values():
        row = {field: session[field] for field in descriptive_fields}
        row.update((field, bool(session[field])) for field in bool_fields)
        row['num_protocol_parts'] = session.get('num_protocol_parts', 0)
        yield row

def dump_sessions_agg_by_knesset():
    """Yield one aggregate row per Knesset, computed from `sessions_by_id`.

    Per Knesset the row contains:
      * 'total_sessions' - number of sessions;
      * one counter per name in `bool_fields` - sessions where that flag is truthy;
      * 'num_protocol_parts' - sum of the sessions' part counts;
      * 'parts <= 2', 'parts <= 5', 'parts <= 10', 'parts > 10' - sessions with a
        non-zero part count, bucketed into (0, 2], (2, 5], (5, 10] and above 10;
      * 'average_protocol_parts' - total parts divided by total sessions, truncated.

    All counters are created with 0 when a Knesset is first seen, so every
    yielded row has the same keys even for a Knesset with no protocol parts.

    :return: generator of row dicts, 'KnessetNum' first
    """
    part_buckets = (
        ('parts <= 2', lambda n: n <= 2),
        ('parts <= 5', lambda n: 2 < n <= 5),
        ('parts <= 10', lambda n: 5 < n <= 10),
        ('parts > 10', lambda n: n > 10),
    )
    knessets = {}
    for session in sessions_by_id.values():
        knesset_num = session['KnessetNum']
        knesset = knessets.get(knesset_num)
        if knesset is None:
            knesset = knessets[knesset_num] = {
                'total_sessions': 0,
                **{field: 0 for field in bool_fields},
                'num_protocol_parts': 0,
                **{label: 0 for label, _ in part_buckets},
            }
        knesset['total_sessions'] += 1
        for field in bool_fields:
            if session[field]:
                knesset[field] += 1
        # Sessions with a missing or zero part count do not enter any bucket.
        num_parts = session.get('num_protocol_parts')
        if num_parts:
            knesset['num_protocol_parts'] += num_parts
            for label, in_bucket in part_buckets:
                if in_bucket(num_parts):
                    knesset[label] += 1
    for knesset_num, knesset in knessets.items():
        knesset['average_protocol_parts'] = int(knesset['num_protocol_parts'] / knesset['total_sessions'])
        yield {
            "KnessetNum": knesset_num,
            **knesset
        }
        
# Write the per-session rows and the per-Knesset aggregate rows to a local
# datapackage directory. The report cell reads the aggregate back from
# 'res_2.csv' in that directory (presumably the second generator's resource -- verify).
Flow(
    dump_sessions(),
    dump_sessions_agg_by_knesset(),
    dump_to_path('data/protocol_speakers_categorization')
).process()

(<datapackage.package.Package at 0x7fb6f5f0add8>,
 {'count_of_rows': 74426,
  'bytes': 7790155,
  'hash': '9e0a7029cefa18159d941ef2068a0e27',
  'dataset_name': None})

## Generate report

In [46]:
from dataflows import sort_rows, filter_rows

# Render the per-Knesset aggregate as an HTML table, restricted to
# Knessets 11 through 20 and sorted by Knesset number. The sort key pads
# KnessetNum to two characters with zeros, which orders it as text.
Flow(
    load('data/protocol_speakers_categorization/res_2.csv'),
    filter_rows(lambda row: 11 <= int(row['KnessetNum']) <= 20),
    sort_rows('{KnessetNum:0>2}'),
    printer(tablefmt='html', num_rows=999)
).process()

#,KnessetNum (integer),total_sessions (integer),has_text_protocol (integer),has mks (integer),has manager (integer),has legal_advisors (integer),related_to_bill (integer),related_to_law (integer),num_protocol_parts (integer),parts <= 2 (string),parts <= 5 (string),parts <= 10 (string),parts > 10 (string),average_protocol_parts (integer)
1,11,3092,3008,1135,0,0,11,0,119628,387,405,301,1994,38
2,12,3104,3017,1342,0,0,1,0,181231,461,420,157,2053,58
3,13,3709,3584,1304,23,0,0,0,289033,406,366,162,2746,77
4,14,3068,2926,1409,101,0,0,0,295747,362,210,88,2348,96
5,15,4694,4139,3459,84,6,5,0,721018,400,104,126,3804,153
6,16,7251,4121,2743,136,132,1884,0,565074,135,38,1173,2767,77
7,17,6904,5634,5437,1693,1288,2286,0,1009113,2,21,102,5508,146
8,18,8669,7313,7100,4642,3407,2706,0,1707841,1,13,101,7196,197
9,19,4130,3404,3402,3233,2383,1270,0,1064940,0,0,20,3383,257
10,20,10263,8806,8215,7925,5787,3208,1208,2499044,439,121,199,8051,243


(<datapackage.package.Package at 0x7fb6de1919b0>, {})