# Protocol speakers categorization

In [1]:
import datetime
# Read the clock once so the tag and the printed banner describe the same
# instant (two separate now() calls could straddle midnight and disagree).
_now = datetime.datetime.now()
# Timestamp tag for this run, formatted as 'YYYY-MM-DD HH MM'.
update_tag = _now.strftime('%Y-%m-%d %H %M')
print('\n\n\n###############################\nLAST UPDATE: {}\n###############################\n\n\n'.format(_now.strftime('%d/%m/%Y')))




###############################
LAST UPDATE: 27/10/2019
###############################





## Load the basic committee sessions data to memory

In [2]:
from dataflows import Flow, load

# Datapackages to load, in this order: committee sessions, session items,
# item types. The following cell relies on this order when it picks the
# resources out of the results by position.
datapackage_urls = (
    'https://production.oknesset.org/pipelines/data/committees/kns_committeesession/datapackage.json',
    'https://production.oknesset.org/pipelines/data/committees/kns_cmtsessionitem/datapackage.json',
    'https://production.oknesset.org/pipelines/data/knesset/kns_itemtype/datapackage.json',
)

# One load step per datapackage; results() runs the flow and keeps the rows in memory.
committee_sessions_data = Flow(*[load(url) for url in datapackage_urls]).results()

In [6]:
# The first element of the flow results holds one row list per loaded
# resource, in load order: sessions, session items, item types.
loaded_resources = committee_sessions_data[0]
kns_committeesession_rows = loaded_resources[0]
kns_cmtsessionitem_rows = loaded_resources[1]
kns_itemtype_rows = loaded_resources[2]

In [7]:
from IPython.display import HTML, display
import tabulate
# Show the first two rows of every loaded resource as an HTML table.
preview_tables = [
    ('kns_committeesession', kns_committeesession_rows),
    ('kns_cmtsessionitem_rows', kns_cmtsessionitem_rows),
    ('kns_itemtype_rows', kns_itemtype_rows),
]
for title, rows in preview_tables:
    sample_html = tabulate.tabulate(rows[:2], tablefmt='html', headers='keys')
    display(HTML('<h2>{}</h2><br/>{}'.format(title, sample_html)))

CommitteeSessionID,Number,KnessetNum,TypeID,TypeDesc,CommitteeID,Location,SessionUrl,BroadcastUrl,StartDate,FinishDate,Note,LastUpdatedDate,download_crc32c,download_filename,download_filesize,parts_crc32c,parts_filesize,parts_parsed_filename,text_crc32c,text_filesize,text_parsed_filename,topics,committee_name
64990,,15,161,פתוחה,25,"חדר הוועדה, באגף קדמה, קומה 1, חדר 1720",http://main.knesset.gov.il/Activity/committees/Pages/AllCommitteesAgenda.aspx?Tab=3&ItemID=64990,,2002-06-12 09:00:00,,,2011-04-12 05:28:59,,,,,,,,,,"['חוק הבחירות לכנסת (תיקון מס\' 52), התשס""ד-2004']","החוקה, חוק ומשפט"
470756,,18,161,פתוחה,661,"חדר הוועדה, באגף הוועדות (קדמה), קומה 2, חדר 2740",http://main.knesset.gov.il/Activity/committees/Pages/AllCommitteesAgenda.aspx?Tab=3&ItemID=470756,,2012-06-06 09:30:00,,,2012-09-19 15:27:32,,,,,,,,,,"['דו""ח מבקר המדינה על ההתמודדות עם המשט הטורקי ']",לענייני ביקורת המדינה


CmtSessionItemID,ItemID,CommitteeSessionID,Ordinal,StatusID,Name,ItemTypeID,LastUpdatedDate
29859,74042,66076,,,מסגרות להשמה חוץ ביתית לילדים בסיכון - סקירה,11,2012-09-20 22:23:47
29860,73194,66077,10.0,,"הפרסום בעיתון מעריב על שעות העבודה הרבות של נהגי ""אגד"" - דיון מהיר בהצעתה של ח""כ אורית נוקד",11,2012-09-20 22:23:47


ItemTypeID,Desc,TableName
1,שאילתה,KNS_Query
2,הצעת חוק,KNS_Bill


In [8]:
from collections import defaultdict

# Counters for the summary table rendered at the end of this cell.
stats = defaultdict(int)

# Index sessions by CommitteeSessionID. The index stores the row dicts
# themselves, and the items loop below attaches an 'items' list to them.
# Any 'items' list left over from a previous run of this cell is dropped
# first, so re-running the cell does not append every item a second time.
sessions_by_id = {}
for row in kns_committeesession_rows:
    row.pop('items', None)
    sessions_by_id[row['CommitteeSessionID']] = row
    stats['num sessions'] += 1

# Attach every session item to its parent session; items whose session is
# not in the index are only counted.
for row in kns_cmtsessionitem_rows:
    stats['num session items'] += 1
    session_id = row['CommitteeSessionID']
    session = sessions_by_id.get(session_id)
    if session:
        session.setdefault('items', []).append(row)
    else:
        stats['num session items without related session'] += 1

# Index item types by ItemTypeID.
item_types_by_id = {}
for row in kns_itemtype_rows:
    item_types_by_id[row['ItemTypeID']] = row
    stats['num item types'] += 1

display(HTML('{}'.format(tabulate.tabulate([dict(stats)], tablefmt='html', headers='keys'))))

num sessions,num session items,num item types
74414,47608,10


In [9]:
# Render the complete item-type lookup table (one row per ItemTypeID).
item_types_table_html = tabulate.tabulate(item_types_by_id.values(), tablefmt='html', headers='keys')
display(HTML('{}'.format(item_types_table_html)))

ItemTypeID,Desc,TableName
1,שאילתה,KNS_Query
2,הצעת חוק,KNS_Bill
3,הצעת אי אמון,
4,הצעה לסדר היום,KNS_Agenda
5,ישיבת מליאה,KNS_PlenumSession
12,ישיבת ועדה,KNS_CommitteSession
6000,פעולה על פי חוק,KNS_Law
6001,תיקון טעות,KNS_Law
6002,חוק בן חיצוני,KNS_Law
6003,חוק אב,KNS_IsraelLaw


## Relate sessions to bills/laws based on item types

In [10]:
stats = defaultdict(int)

# Stats label for each (related_to_bill, related_to_law) combination.
# Sessions related to neither are not counted under any label.
relation_labels = {
    (True, True): 'related to bill and law',
    (True, False): 'related to bill',
    (False, True): 'related to law',
}

for session in sessions_by_id.values():
    stats['num sessions'] += 1
    session['related_to_bill'] = False
    session['related_to_law'] = False
    for item in session.get('items', []):
        item_type_id = item['ItemTypeID']
        item_type = item_types_by_id.get(item_type_id)
        if not item_type:
            # Only IDs 11 and 15 are expected to be missing from the item
            # types table; anything else stops the run.
            assert item_type_id in [11,15], item_type_id
            stats['unknown item type id 11/15'] += 1
            continue
        # The item type's TableName decides whether the item is a law or a bill.
        table_name = item_type['TableName']
        if table_name == 'KNS_Law' or table_name == 'KNS_IsraelLaw':
            session['related_to_law'] = True
        elif table_name == 'KNS_Bill':
            session['related_to_bill'] = True
    relation_label = relation_labels.get((session['related_to_bill'], session['related_to_law']))
    if relation_label is not None:
        stats[relation_label] += 1

display(HTML('{}'.format(tabulate.tabulate([dict(stats)], tablefmt='html', headers='keys'))))

num sessions,related to bill,unknown item type id 11/15,related to law,related to bill and law
74414,11448,26516,1198,28


## Mark sessions which have parts / text protocols

In [11]:
stats = defaultdict(int)

# Session flag -> the file-size field it is derived from.
protocol_size_fields = (
    ('has_downloaded_protocol', 'download_filesize'),
    ('has_parts_protocol', 'parts_filesize'),
    ('has_text_protocol', 'text_filesize'),
)

for session in sessions_by_id.values():
    stats['num sessions'] += 1
    for flag, size_field in protocol_size_fields:
        size = session[size_field]
        # A protocol counts as present only when its recorded size is above
        # 100 (presumably bytes - TODO confirm). bool() keeps the stored flag
        # a real True/False even when the size is missing (None) or 0.
        session[flag] = bool(size) and size > 100
        if session[flag]:
            stats[flag] += 1

display(HTML('{}'.format(tabulate.tabulate([dict(stats)], tablefmt='html', headers='keys'))))

num sessions,has_downloaded_protocol,has_parts_protocol,has_text_protocol
74414,46677,46651,46646


## Load protocol attendees data

In [12]:
stats = defaultdict(int)

def load_attendees(rows):
    """Pass attendee rows through unchanged while copying them onto sessions.

    Each row is yielded first; the matching entry in ``sessions_by_id``
    (looked up by ``CommitteeSessionID``) is updated once the consumer asks
    for the next row. Rows with no matching session are only counted in
    ``stats``.
    """
    attendee_fields = (
        'attended_mk_individual_ids',
        'invitees',
        'legal_advisors',
        'manager',
        'mks',
        'financial_advisors',
    )
    for row in rows:
        yield row
        stats['num sessions'] += 1
        session = sessions_by_id.get(row['CommitteeSessionID'])
        if not session:
            stats['invalid session, missing related session in sessions_by_id'] += 1
            continue
        stats['valid session with attendees data'] += 1
        # Build the full mapping before updating, so a row with a missing
        # field raises before the session is touched.
        session.update({field: row[field] for field in attendee_fields})

# Stream the meeting-attendees datapackage through load_attendees, which
# copies the attendee fields onto the in-memory sessions as rows pass by.
attendees_flow = Flow(
    load('https://production.oknesset.org/pipelines/data/people/committees/meeting-attendees/datapackage.json'),
    load_attendees,
)
attendees_flow.process()

stats_table_html = tabulate.tabulate([dict(stats)], tablefmt='html', headers='keys')
display(HTML('{}'.format(stats_table_html)))

num sessions,valid session with attendees data
74414,74414


## Mark sessions which have different attendee types

In [20]:
stats = defaultdict(int)

# For each attendee field, set a 'has <field>' flag on every session and
# count the sessions where that field is non-empty.
for session in sessions_by_id.values():
    stats['num sessions'] += 1
    for invitees_type in ['mks', 'manager', 'legal_advisors', 'invitees', 'attended_mk_individual_ids']:
        flag_key = 'has {}'.format(invitees_type)
        # A missing key, None, or an empty collection all count as "no attendees".
        has_attendees = bool(session.get(invitees_type))
        session[flag_key] = has_attendees
        if has_attendees:
            stats[flag_key] += 1

display(HTML('{}'.format(tabulate.tabulate([dict(stats)], tablefmt='html', headers='keys'))))

num sessions,has mks,has manager,has legal_advisors,has invitees,has attended_mk_individual_ids
74414,38653,33593,23548,38277,35737


## Aggregate speaker stats per session

In [4]:
# Rows of the redash query result; the loop below reads CommitteeSessionID,
# num_parts and total_body_length from each row.
speaker_stats_rows = Flow(
    load('https://app.redash.io/hasadna/api/queries/276124/results.csv')
).results()[0][0]

for row in speaker_stats_rows:
    session = sessions_by_id.get(row['CommitteeSessionID'])
    if not session:
        # Stats for a session that is not in the in-memory index are ignored.
        continue
    session['number of protocol parts'] = row['num_parts']
    session['total protocol part bodies length'] = row['total_body_length']

(<datapackage.package.Package at 0x7f56cf93d748>, {})

## Generate aggregate data per knesset

In [28]:
from dataflows import dump_to_path

# Per-session flag fields included in the export: dump_sessions() casts each
# of them to bool, and the per-Knesset aggregation counts the sessions where
# the flag is truthy. Commented-out entries are flags that are computed on
# the sessions but left out of the export.
bool_fields = [
    # 'has_downloaded_protocol',
    # 'has_parts_protocol',
    'has_text_protocol',
    'has mks',
    'has manager',
    'has legal_advisors',
    # 'has invitees',
    # 'has attended_mk_individual_ids',
    'related_to_bill',
    'related_to_law',
]

def dump_sessions():
    """Yield one flat export row per session in ``sessions_by_id``.

    Each row holds the descriptive session columns, then every field listed
    in ``bool_fields`` cast to bool, then the two protocol-part statistics
    (0 when the session has none recorded).
    """
    descriptive_fields = ('KnessetNum', 'StartDate', 'TypeDesc', 'committee_name')
    for session in sessions_by_id.values():
        export_row = {}
        for name in descriptive_fields:
            export_row[name] = session[name]
        for name in bool_fields:
            export_row[name] = bool(session[name])
        export_row['number of protocol parts'] = session.get('number of protocol parts', 0)
        export_row['total protocol part bodies length'] = session.get('total protocol part bodies length', 0)
        yield export_row

# Aggregate the per-session data into one counters dict per Knesset number.
knessets = {}
for session in sessions_by_id.values():
    knesset = knessets.get(session['KnessetNum'])
    if knesset is None:
        # Create every counter up front, in a fixed order, so each Knesset
        # row has the same columns even when none of its sessions has
        # protocol parts.
        knesset = knessets[session['KnessetNum']] = {
            'total_sessions': 0,
            **{field: 0 for field in bool_fields},
            'number of protocol parts': 0,
            'parts <= 2': 0,
            'parts <= 5': 0,
            'parts <= 10': 0,
            'parts > 10': 0,
            'total protocol part bodies length': 0,
        }
    knesset['total_sessions'] += 1
    # Count the sessions for which each exported flag is truthy.
    for field in bool_fields:
        if session[field]:
            knesset[field] += 1
    num_parts = session.get('number of protocol parts')
    if num_parts:
        knesset['number of protocol parts'] += num_parts
        # The buckets are mutually exclusive: 'parts <= 5' means 3-5 parts and
        # 'parts <= 10' means 6-10 parts. Sessions with no parts recorded
        # fall into no bucket.
        if num_parts <= 2:
            bucket = 'parts <= 2'
        elif num_parts <= 5:
            bucket = 'parts <= 5'
        elif num_parts <= 10:
            bucket = 'parts <= 10'
        else:
            bucket = 'parts > 10'
        knesset[bucket] += 1
    body_length = session.get('total protocol part bodies length')
    if body_length:
        knesset['total protocol part bodies length'] += body_length

# Averages are taken over ALL sessions of the Knesset (including sessions
# with no protocol parts) and truncated to whole numbers.
for knesset in knessets.values():
    knesset['average_protocol_parts'] = int(knesset['number of protocol parts'] / knesset['total_sessions'])
    knesset['average_body_length'] = int(knesset['total protocol part bodies length'] / knesset['total_sessions'])
        
# Second data source for the flow: one row per Knesset, with the Knesset
# number placed in front of its aggregate counters.
knesset_rows = (
    {"KnessetNum": knesset_num, **knesset}
    for knesset_num, knesset in knessets.items()
)

# Feed the per-session rows and the per-Knesset rows into the flow and
# write the result under data/protocol_speakers_categorization.
Flow(
    dump_sessions(),
    knesset_rows,
    dump_to_path('data/protocol_speakers_categorization'),
).process()

(<datapackage.package.Package at 0x7f56b2af6470>,
 {'count_of_rows': 74436,
  'bytes': 8090943,
  'hash': '2d3b6e8d6cc9813abad201adb8354787',
  'dataset_name': None})