# Protocol speakers categorization

In [1]:
import datetime
# Minute-resolution timestamp of this run (not referenced again in the cells visible here).
update_tag = datetime.datetime.now().strftime('%Y-%m-%d %H %M')

# Print a banner with today's date so the rendered notebook shows when it was last executed.
last_update_date = datetime.datetime.now().strftime('%d/%m/%Y')
print('\n\n\n###############################\nLAST UPDATE: {}\n###############################\n\n\n'.format(last_update_date))




###############################
LAST UPDATE: 30/11/2019
###############################





## Load the basic committee sessions data to memory

In [2]:
from dataflows import Flow, load

# Datapackages to load, in order: sessions, session items, item types.
# The following cell indexes the loaded resources by this order.
committee_sessions_sources = (
    'https://production.oknesset.org/pipelines/data/committees/kns_committeesession/datapackage.json',
    'https://production.oknesset.org/pipelines/data/committees/kns_cmtsessionitem/datapackage.json',
    'https://production.oknesset.org/pipelines/data/knesset/kns_itemtype/datapackage.json',
)

committee_sessions_data = Flow(
    *[load(source_url) for source_url in committee_sessions_sources]
).results()

In [3]:
# committee_sessions_data[0] holds the rows of each loaded resource, in load order.
(
    kns_committeesession_rows,
    kns_cmtsessionitem_rows,
    kns_itemtype_rows,
) = committee_sessions_data[0][:3]

In [4]:
from IPython.display import HTML, display
import tabulate
# Preview the first two rows of every loaded table as an HTML table under its own heading.
table_previews = [
    ('kns_committeesession', kns_committeesession_rows),
    ('kns_cmtsessionitem_rows', kns_cmtsessionitem_rows),
    ('kns_itemtype_rows', kns_itemtype_rows),
]
for title, rows in table_previews:
    preview_html = tabulate.tabulate(rows[:2], tablefmt='html', headers='keys')
    display(HTML('<h2>{}</h2><br/>{}'.format(title, preview_html)))

CommitteeSessionID,Number,KnessetNum,TypeID,TypeDesc,CommitteeID,Location,SessionUrl,BroadcastUrl,StartDate,FinishDate,Note,LastUpdatedDate,download_crc32c,download_filename,download_filesize,parts_crc32c,parts_filesize,parts_parsed_filename,text_crc32c,text_filesize,text_parsed_filename,topics,committee_name
64990,,15,161,פתוחה,25,"חדר הוועדה, באגף קדמה, קומה 1, חדר 1720",http://main.knesset.gov.il/Activity/committees/Pages/AllCommitteesAgenda.aspx?Tab=3&ItemID=64990,,2002-06-12 09:00:00,,,2011-04-12 05:28:59,,,,,,,,,,"['חוק הבחירות לכנסת (תיקון מס\' 52), התשס""ד-2004']","ועדת החוקה, חוק ומשפט"
470756,,18,161,פתוחה,661,"חדר הוועדה, באגף הוועדות (קדמה), קומה 2, חדר 2740",http://main.knesset.gov.il/Activity/committees/Pages/AllCommitteesAgenda.aspx?Tab=3&ItemID=470756,,2012-06-06 09:30:00,,,2012-09-19 15:27:32,,,,,,,,,,"['דו""ח מבקר המדינה על ההתמודדות עם המשט הטורקי ']",לענייני ביקורת המדינה


CmtSessionItemID,ItemID,CommitteeSessionID,Ordinal,StatusID,Name,ItemTypeID,LastUpdatedDate
29859,74042,66076,,,מסגרות להשמה חוץ ביתית לילדים בסיכון - סקירה,11,2012-09-20 22:23:47
29860,73194,66077,10.0,,"הפרסום בעיתון מעריב על שעות העבודה הרבות של נהגי ""אגד"" - דיון מהיר בהצעתה של ח""כ אורית נוקד",11,2012-09-20 22:23:47


ItemTypeID,Desc,TableName
1,שאילתה,KNS_Query
2,הצעת חוק,KNS_Bill


In [5]:
from collections import defaultdict

# Running counters for the summary table rendered at the end of this cell.
stats = defaultdict(int)

# Lookup tables keyed by primary id.
sessions_by_id = {}
item_types_by_id = {}

# Index every committee session row by its CommitteeSessionID.
for session_row in kns_committeesession_rows:
    sessions_by_id[session_row['CommitteeSessionID']] = session_row
    stats['num sessions'] += 1

# Attach each session item to its parent session under an 'items' list;
# items whose session is not in the index are only counted.
for item_row in kns_cmtsessionitem_rows:
    stats['num session items'] += 1
    parent_session = sessions_by_id.get(item_row['CommitteeSessionID'])
    if not parent_session:
        stats['num session items without related session'] += 1
        continue
    parent_session.setdefault('items', []).append(item_row)

# Index the item type rows by ItemTypeID.
for item_type_row in kns_itemtype_rows:
    item_types_by_id[item_type_row['ItemTypeID']] = item_type_row
    stats['num item types'] += 1

stats_html = tabulate.tabulate([dict(stats)], tablefmt='html', headers='keys')
display(HTML('{}'.format(stats_html)))

num sessions,num session items,num item types
74441,47705,10


In [6]:
# Render the complete item type lookup table.
item_types_html = tabulate.tabulate(item_types_by_id.values(), tablefmt='html', headers='keys')
display(HTML('{}'.format(item_types_html)))

ItemTypeID,Desc,TableName
1,שאילתה,KNS_Query
2,הצעת חוק,KNS_Bill
3,הצעת אי אמון,
4,הצעה לסדר היום,KNS_Agenda
5,ישיבת מליאה,KNS_PlenumSession
12,ישיבת ועדה,KNS_CommitteSession
6000,פעולה על פי חוק,KNS_Law
6001,תיקון טעות,KNS_Law
6002,חוק בן חיצוני,KNS_Law
6003,חוק אב,KNS_IsraelLaw


## Relate sessions to bills/laws based on item types

In [7]:
stats = defaultdict(int)

# Item type table names that mark a session item as law-related.
law_table_names = ('KNS_Law', 'KNS_IsraelLaw')

# Stats counter to bump for each (related_to_bill, related_to_law) combination;
# sessions related to neither are not counted.
relation_stat_keys = {
    (True, True): 'related to bill and law',
    (True, False): 'related to bill',
    (False, True): 'related to law',
}

for session in sessions_by_id.values():
    stats['num sessions'] += 1
    session['related_to_bill'] = False
    session['related_to_law'] = False
    for item in session.get('items', []):
        item_type_id = item['ItemTypeID']
        item_type = item_types_by_id.get(item_type_id)
        if not item_type:
            # Only ids 11 and 15 are expected to be absent from the item types table;
            # any other unknown id aborts the cell.
            assert item_type_id in [11,15], item_type_id
            stats['unknown item type id 11/15'] += 1
            continue
        table_name = item_type['TableName']
        if table_name == 'KNS_Bill':
            session['related_to_bill'] = True
        elif table_name in law_table_names:
            session['related_to_law'] = True
    relation_stat_key = relation_stat_keys.get(
        (bool(session['related_to_bill']), bool(session['related_to_law']))
    )
    if relation_stat_key:
        stats[relation_stat_key] += 1

stats_html = tabulate.tabulate([dict(stats)], tablefmt='html', headers='keys')
display(HTML('{}'.format(stats_html)))

num sessions,related to bill,unknown item type id 11/15,related to law,related to bill and law
74441,11505,26542,1204,28


## Mark sessions which have parts / text protocols

In [8]:
stats = defaultdict(int)

# A protocol counts as present only when its recorded filesize is greater than this value.
MIN_PROTOCOL_FILESIZE = 100

# (flag stored on the session, session field holding the matching filesize)
protocol_flag_fields = (
    ('has_downloaded_protocol', 'download_filesize'),
    ('has_parts_protocol', 'parts_filesize'),
    ('has_text_protocol', 'text_filesize'),
)

for session in sessions_by_id.values():
    stats['num sessions'] += 1
    for flag_name, filesize_field in protocol_flag_fields:
        filesize = session[filesize_field]
        # bool() keeps the flag a real boolean: the bare `x and x > N` form would store
        # None (or 0) on the session whenever the filesize is missing.
        session[flag_name] = bool(filesize) and filesize > MIN_PROTOCOL_FILESIZE
        if session[flag_name]:
            stats[flag_name] += 1

display(HTML('{}'.format(tabulate.tabulate([dict(stats)], tablefmt='html', headers='keys'))))

num sessions,has_downloaded_protocol,has_parts_protocol,has_text_protocol
74441,46690,46664,46659


## Load protocol attendees data

In [9]:
stats = defaultdict(int)

def load_attendees(rows):
    """Pass attendee rows through unchanged while copying their fields onto the sessions.

    Every row is yielded as-is. When the row's CommitteeSessionID is present in
    `sessions_by_id`, the attendee fields listed below are copied onto that session
    dict; otherwise the row is only counted in `stats` as having no related session.
    """
    attendee_fields = (
        'attended_mk_individual_ids',
        'invitees',
        'legal_advisors',
        'manager',
        'mks',
        'financial_advisors',
    )
    for attendees_row in rows:
        yield attendees_row
        stats['num sessions'] += 1
        matching_session = sessions_by_id.get(attendees_row['CommitteeSessionID'])
        if not matching_session:
            stats['invalid session, missing related session in sessions_by_id'] += 1
            continue
        stats['valid session with attendees data'] += 1
        matching_session.update({field: attendees_row[field] for field in attendee_fields})

# Stream the meeting attendees datapackage through load_attendees, which updates
# sessions_by_id and stats as a side effect, then show the resulting counters.
attendees_flow = Flow(
    load('https://production.oknesset.org/pipelines/data/people/committees/meeting-attendees/datapackage.json'),
    load_attendees,
)
attendees_flow.process()

stats_html = tabulate.tabulate([dict(stats)], tablefmt='html', headers='keys')
display(HTML('{}'.format(stats_html)))

num sessions,valid session with attendees data
74441,74441


## Mark sessions which have different attendee types

In [10]:
stats = defaultdict(int)

# Session fields for which a 'has <field>' boolean flag is stored on every session.
attendee_type_fields = ('mks', 'manager', 'legal_advisors', 'invitees', 'attended_mk_individual_ids')

for session in sessions_by_id.values():
    stats['num sessions'] += 1
    for attendee_field in attendee_type_fields:
        flag_key = 'has {}'.format(attendee_field)
        attendees = session.get(attendee_field)
        # The flag is True only for a present, non-empty value.
        has_attendees = bool(attendees and len(attendees) > 0)
        session[flag_key] = has_attendees
        if has_attendees:
            stats[flag_key] += 1

stats_html = tabulate.tabulate([dict(stats)], tablefmt='html', headers='keys')
display(HTML('{}'.format(stats_html)))

num sessions,has mks,has manager,has legal_advisors,has invitees,has attended_mk_individual_ids
74441,38667,33607,23560,38292,35753


## Load bill/law data

In [11]:
# Datapackages to load; the unpacking below relies on this exact order.
bill_and_law_sources = (
    'https://production.oknesset.org/pipelines/data/bills/kns_bill/datapackage.json',
    'https://production.oknesset.org/pipelines/data/laws/kns_law_binding/datapackage.json',
    'https://production.oknesset.org/pipelines/data/laws/kns_law/datapackage.json',
    'https://production.oknesset.org/pipelines/data/laws/kns_israel_law/datapackage.json',
)

(
    kns_bill_rows,
    kns_law_binding_rows,
    kns_law_rows,
    kns_israel_law_rows,
) = Flow(*[load(source_url) for source_url in bill_and_law_sources]).results()[0]

In [12]:
# Index bill rows by BillID (a later row with the same id replaces an earlier one).
bills = {bill_row['BillID']: bill_row for bill_row in kns_bill_rows}

In [13]:
# Index law rows by LawID (a later row with the same id replaces an earlier one).
laws = {law_row['LawID']: law_row for law_row in kns_law_rows}

In [14]:
# Index Israel law rows by IsraelLawID (a later row with the same id replaces an earlier one).
israel_laws = {israel_law_row['IsraelLawID']: israel_law_row for israel_law_row in kns_israel_law_rows}

In [15]:
# Collect every law binding under its law's 'law_bindings' list;
# bindings whose LawID is not in `laws` are skipped silently.
for binding_row in kns_law_binding_rows:
    bound_law = laws.get(binding_row['LawID'])
    if not bound_law:
        continue
    bound_law.setdefault('law_bindings', []).append(binding_row)

In [16]:
# Load the Israel law bindings table and keep the rows of its first resource.
israel_law_binding_flow = Flow(
    load('https://production.oknesset.org/pipelines/data/laws/kns_israel_law_binding/datapackage.json')
)
kns_israel_law_binding_rows = israel_law_binding_flow.results()[0][0]

In [17]:
# Attach each Israel law binding to its law and count how many could / could not be matched.
match = 0
no_match = 0
for binding_row in kns_israel_law_binding_rows:
    bound_israel_law = israel_laws.get(binding_row['IsraelLawID'])
    if not bound_israel_law:
        no_match += 1
        continue
    match += 1
    bound_israel_law.setdefault('israel_law_bindings', []).append(binding_row)
print('match: {}, no match: {}'.format(match, no_match))

match: 353, no match: 0


## Identify session bill/law type

In [18]:
# Counters for law-typed session items whose ItemID has no row in the law tables.
mismatch_laws = 0
mismatch_israel_laws = 0

for session in sessions_by_id.values():
    # 'law_names' is initialised here but nothing in this cell adds to it.
    session.update(bill_types=set(), bill_names=set(), law_names=set())
    for item in session.get('items', []):
        item_type_id = item['ItemTypeID']
        if item_type_id == 2:
            # NOTE(review): direct lookup - a bill ItemID missing from `bills` raises KeyError,
            # unlike the law branches below which only count the mismatch.
            bill = bills[item['ItemID']]
            session['bill_types'].add(bill['SubTypeDesc'])
            session['bill_names'].add(bill['Name'])
        elif item_type_id in (6000, 6001, 6002):
            law = laws.get(int(item['ItemID']))
            if not law:
                mismatch_laws += 1
            else:
                # NOTE(review): a successful match aborts the cell; this looks like an
                # exploration probe rather than final behaviour - confirm the intent.
                raise Exception(law)
        elif item_type_id == 6003:
            israel_law = israel_laws.get(int(item['ItemID']))
            if not israel_law:
                mismatch_israel_laws += 1
            else:
                # NOTE(review): same as above - a successful match raises.
                raise Exception(israel_law)

print('mismatch laws: {}, mismatch israel laws: {}'.format(mismatch_laws, mismatch_israel_laws))

mismatch laws: 1550, mismatch israel laws: 0


## Generate detailed speaker parts datapackage

In [34]:
%%bash
# Stop at the first failing command so a partial download is not silently used downstream.
set -euo pipefail

BASE_URL=https://production.oknesset.org/pipelines/data/people/committees/meeting-speaker-stats
TARGET_DIR=data/people_committees_meeting_speaker_stats

mkdir -p "$TARGET_DIR"

# --fail makes curl exit non-zero on an HTTP error instead of saving the error page as data.
for filename in datapackage.json kns_committeesession.csv speaker_stats.csv; do
    curl --fail -o "$TARGET_DIR/$filename" "$BASE_URL/$filename"
done

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  5778  100  5778    0     0  11913      0 --:--:-- --:--:-- --:--:-- 11888
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  117M  100  117M    0     0  2744k      0  0:00:43  0:00:43 --:--:-- 4379k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  483M  100  483M    0     0  2383k      0  0:03:27  0:03:27 --:--:-- 2091k


## Prepare the data

In [53]:
from dataflows import dump_to_path, printer
from datapackage import Package

# True processes only a sample of the rows;
# change to False to work on the full data (takes a loooooong time).
IS_SPEAKER_STATS_SAMPLE_DATA = True

# The sample run is capped at 10000 rows and written to a separate '_sample' directory;
# the full run uses a cap large enough to be effectively unlimited.
SPEAKER_STATS_DATAPACKAGE_PATH = 'data/people_committees_meeting_speaker_stats'
if IS_SPEAKER_STATS_SAMPLE_DATA:
    MAX_NUM_ROWS = 10000
    SPEAKER_STATS_DATAPACKAGE_PATH += '_sample'
else:
    MAX_NUM_ROWS = 9999999999999999

stats = defaultdict(int)

def _get_sample(package):
    """Package processor that keeps only the first MAX_NUM_ROWS rows of each kept resource.

    Removes the 'kns_committeesession' resource from the package descriptor, yields the
    updated descriptor, then walks the incoming resources: the first one is consumed
    and discarded, every later one is yielded truncated to MAX_NUM_ROWS rows.

    NOTE(review): assumes the first incoming resource is the removed
    'kns_committeesession' one - confirm against the datapackage's resource order.
    """
    def _first_rows(resource):
        # Stop reading once the limit is reached; a plain `if rownum < MAX_NUM_ROWS`
        # filter would keep scanning the rest of the resource only to discard it.
        for rownum, row in enumerate(resource):
            if rownum >= MAX_NUM_ROWS:
                break
            yield row

    package.pkg.remove_resource('kns_committeesession')
    yield package.pkg
    for resourcenum, resource in enumerate(package):
        if resourcenum == 0:
            # Consume the dropped resource without emitting any of its rows.
            for _ in resource:
                pass
        else:
            yield _first_rows(resource)

# Load the downloaded datapackage, reduce it with _get_sample, write the result to
# SPEAKER_STATS_DATAPACKAGE_PATH and print a preview of it.
speaker_stats_sample_flow = Flow(
    load('data/people_committees_meeting_speaker_stats/datapackage.json'),
    _get_sample,
    dump_to_path(SPEAKER_STATS_DATAPACKAGE_PATH),
    printer(),
)
speaker_stats_sample_flow.process()

speaker_stats:
#        CommitteeSessionID  parts_crc32c      part_index  header                     body_length    body_num_words  part_category
                   (number)  (string)            (number)  (string)                      (number)          (number)  (string)
-----  --------------------  --------------  ------------  -----------------------  -------------  ----------------  ---------------
1                    100017  EIzyDg==                   0                                     214                25
2                    100017  EIzyDg==                   1  סדר היום                           129                20
3                    100017  EIzyDg==                  10  דוד טל                             191                34  mk
4                    100017  EIzyDg==                 100  אמנון כהן                           39                 9  chairperson
5                    100017  EIzyDg==                 101  שאול שניידר                        156                3

(<datapackage.package.Package at 0x7f32668eb2b0>,
 {'count_of_rows': 10000,
  'bytes': 515149,
  'hash': '9644f20669fca6bab8bf0da353accacb',
  'dataset_name': None})

In [58]:
from dataflows import dump_to_path, printer, set_type, sort_rows
import csv
import os

stats = defaultdict(int)

def _add_detailed_speaker_parts_fields_to_schema(package):
    package.pkg.resources[0].descriptor['schema']['fields'] += [
        {
            'name': 'KnessetNum',
            'type': 'number'
        },
        {
            'name': 'StartDate',
            'type': 'date'
        },
        {
            'name': 'CommitteeID',
            'type': 'number'
        },
        {
            'name': 'committee_name',
            'type': 'string'
        },
        {
            'name': 'bill_types',
            'type': 'string'
        },
        {
            'name': 'bill_names',
            'type': 'string'
        },
        {
            'name': 'topics',
            'type': 'string'
        },
        {
            'name': 'Note',
            'type': 'string'
        }
    ]
    yield package.pkg
    yield from package

def _process_speaker_stats_rows(rows):
    """Rows processor: accumulate part totals in `stats` and add session details to each row.

    For every part row the global `stats` counters are updated, then the row receives
    KnessetNum, StartDate, CommitteeID, committee_name, bill_types, bill_names, topics
    and Note from the matching entry of `sessions_by_id`. Rows without a matching
    session are reported on stdout and receive empty placeholder values.
    Every row is yielded after it has been updated.
    """
    def _join_set(values):
        # Sets have no stable iteration order, so sort for reproducible output;
        # None members are skipped because str.join() cannot handle them.
        if not values:
            return ''
        return ','.join(sorted(value for value in values if value is not None))

    # Placeholder values for rows whose session is unknown.
    missing_session_fields = {
        'KnessetNum': 0,
        'StartDate': None,
        'CommitteeID': 0,
        'committee_name': '',
        'bill_types': '',
        'bill_names': '',
        'topics': '',
        'Note': '',
    }

    for part_row_num, part_row in enumerate(rows):
        stats['last_row_num'] = part_row_num
        stats['total_parts'] += 1
        stats['total_body_length'] += part_row['body_length'] if part_row['body_length'] else 0
        stats['total_headers_length'] += len(part_row['header']) if part_row['header'] else 0
        # Guarded like body_length above: a missing word count must not abort the run.
        stats['total_body_words'] += part_row['body_num_words'] if part_row['body_num_words'] else 0
        session = sessions_by_id.get(part_row['CommitteeSessionID'])
        if session:
            part_row['KnessetNum'] = session['KnessetNum']
            part_row['StartDate'] = session['StartDate']
            part_row['CommitteeID'] = session['CommitteeID']
            part_row['committee_name'] = session['committee_name']
            part_row['bill_types'] = _join_set(session['bill_types'])
            part_row['bill_names'] = _join_set(session['bill_names'])
            # topics keeps its original order, so it is joined as-is.
            part_row['topics'] = ','.join(session['topics']) if session['topics'] else ''
            part_row['Note'] = session['Note']
        else:
            print('missing session: ' + str(part_row['CommitteeSessionID']))
            part_row.update(missing_session_fields)
        yield part_row

def _dump_session_rows(session_id, part_rows):
    """Write the part rows of one session to a CSV file.

    The file is written to
    data/detailed_speaker_stats/<KnessetNum>/<committee_name>/<session_id>.csv,
    with a header row taken from the keys of the first part row.

    Nothing is written when `session_id` is falsy, when `part_rows` is empty or None,
    or when the session is not in `sessions_by_id` (the latter is reported on stdout).

    NOTE(review): the per-session file name is a choice made here - the previous body
    wrote placeholder rows to 'eggs.csv' and ignored `part_rows`; confirm the layout.
    """
    if not session_id:
        return
    # Look the session up by the id passed in; the previous body read an undefined
    # `part_row` name here.
    session = sessions_by_id.get(session_id)
    if not session:
        print('missing session: ' + str(session_id))
        return
    if not part_rows:
        return
    dirname = 'data/detailed_speaker_stats/{}/{}/'.format(session['KnessetNum'], session['committee_name'])
    # exist_ok: several sessions share the same Knesset/committee directory.
    os.makedirs(dirname, exist_ok=True)
    filename = os.path.join(dirname, '{}.csv'.format(session_id))
    # Explicit UTF-8 so non-ASCII text does not depend on the platform's default encoding.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=list(part_rows[0].keys()))
        writer.writeheader()
        writer.writerows(part_rows)
        
def _dump_detailed_speaker_parts(rows):
    """Rows processor: group consecutive rows by CommitteeSessionID and dump each group.

    Rows are collected while their CommitteeSessionID stays the same. When the id
    changes, and once more after the last row, the collected rows are handed to
    `_dump_session_rows`. Every row is yielded unchanged so the flow keeps streaming.

    NOTE(review): assumes all rows of a session are contiguous in the input; a session
    split into several runs would be dumped once per run - confirm the input ordering.
    """
    current_session_id = None
    current_session_rows = []

    for row in rows:
        session_id = row['CommitteeSessionID']
        if session_id != current_session_id:
            if current_session_rows:
                _dump_session_rows(current_session_id, current_session_rows)
            current_session_id = session_id
            current_session_rows = []
        current_session_rows.append(row)
        yield row

    # Flush the final group, which is never followed by an id change.
    if current_session_rows:
        _dump_session_rows(current_session_id, current_session_rows)
        
        
# Stream the (sample or full) speaker stats datapackage through the per-session dumper.
# NOTE(review): _add_detailed_speaker_parts_fields_to_schema and _process_speaker_stats_rows
# are defined in this cell but are not part of this flow - confirm whether that is intended.
detailed_speaker_parts_source = SPEAKER_STATS_DATAPACKAGE_PATH + '/datapackage.json'
Flow(load(detailed_speaker_parts_source), _dump_detailed_speaker_parts).process()

# display(HTML('{}'.format(tabulate.tabulate([dict(stats)], tablefmt='html', headers='keys'))))

(<datapackage.package.Package at 0x7f32664c2588>,
 {'count_of_rows': 10000,
  'bytes': 515149,
  'hash': '9644f20669fca6bab8bf0da353accacb',
  'dataset_name': None})

## Generate Detailed speakers data

## Generate aggregate data per knesset

In [129]:
from dataflows import dump_to_path

# Per-Knesset aggregates, keyed by KnessetNum.
knessets = {}

# Boolean session flags to count per Knesset (number of sessions where the flag is truthy).
# Other flags stored on the sessions by earlier cells ('has_downloaded_protocol',
# 'has_parts_protocol', 'has mks', 'has manager', 'has invitees',
# 'has attended_mk_individual_ids') can be added to this list.
bool_fields = [
    'has_text_protocol',
    'related_to_bill',
    'related_to_law',
]

for session in sessions_by_id.values():
    knesset_num = session['KnessetNum']
    if knesset_num not in knessets:
        knessets[knesset_num] = {'KnessetNum': knesset_num}
    knesset = knessets[knesset_num]
    knesset["total_sessions"] = knesset.get("total_sessions", 0) + 1
    for field in bool_fields:
        # Every counter exists (starting at 0) even when no session has the flag set.
        knesset[field] = knesset.get(field, 0) + (1 if session[field] else 0)

knessets_html = tabulate.tabulate(list(map(dict, knessets.values())), tablefmt='html', headers='keys')
display(HTML('{}'.format(knessets_html)))

KnessetNum,total_sessions,has_text_protocol,related_to_bill,related_to_law
15,4694,4441,5,0
18,8669,7316,2706,0
16,7251,4119,1884,0
17,6904,5632,2286,0
19,4130,3404,1270,0
1,917,0,50,0
2,1736,0,16,0
3,1682,0,0,0
4,757,0,3,0
5,1974,0,0,0


In [151]:
# Count the bill names of all sessions per Knesset and derive the average per session
# that has protocol text.

for knesset in knessets.values():
    knesset['total bill names'] = 0

for session in sessions_by_id.values():
    knessets[session['KnessetNum']]['total bill names'] += len(session.get('bill_names', set()))

average_num_bill_names = 'average number of bill names per meeting (which has protocol text)'

for knesset in knessets.values():
    sessions_with_text = knesset['has_text_protocol']
    # A Knesset without any text protocol gets an average of 0 instead of dividing by zero.
    if sessions_with_text > 0:
        knesset[average_num_bill_names] = knesset['total bill names'] / sessions_with_text
    else:
        knesset[average_num_bill_names] = 0

# Only these Knessets are shown; each of them must exist in `knessets`.
displayed_knesset_nums = [13,14,15,16,17,18,19,20,21]
knesset_rows = [knessets[displayed_num] for displayed_num in displayed_knesset_nums]
knesset_rows_html = tabulate.tabulate(knesset_rows, tablefmt='html', headers='keys')
display(HTML('{}'.format(knesset_rows_html)))

KnessetNum,total_sessions,has_text_protocol,related_to_bill,related_to_law,average number of bill names per meeting (which has protocol text),total bill names
13,3709,3673,0,0,0.0,0
14,3068,3009,0,0,0.0,0
15,4694,4441,5,0,0.00135105,6
16,7251,4119,1884,0,0.61277,2524
17,6904,5632,2286,0,0.533558,3005
18,8669,7316,2706,0,0.435484,3186
19,4130,3404,1270,0,0.469448,1598
20,10263,8811,3208,1208,0.466235,4108
21,112,79,19,15,0.240506,19


In [159]:
# aggregate protocol parts data per knesset
#
# NOTE(review): this cell reads session['parts'], which none of the cells visible here
# populates - confirm where it is set before relying on a fresh Restart & Run All.

for knesset in knessets.values():
    knesset['number of protocol parts'] = 0
    knesset['total protocol part bodies length'] = 0
    knesset['num sessions with parts'] = 0
    knesset['num_parts_with_part_category'] = 0

for session in sessions_by_id.values():
    knesset = knessets[session['KnessetNum']]
    parts = session.get('parts', [])
    # Only sessions with more than two parts are counted as having parts.
    if len(parts) > 2:
        knesset['num sessions with parts'] += 1
    for part in parts:
        knesset['number of protocol parts'] += 1
        # A missing body length counts as 0, as in _process_speaker_stats_rows.
        knesset['total protocol part bodies length'] += part['body_length'] or 0
        if part['part_category']:
            knesset['num_parts_with_part_category'] += 1

# Averages are taken over all sessions of the Knesset and truncated to whole numbers.
for knesset in knessets.values():
    knesset['average_protocol_parts'] = int(knesset['number of protocol parts'] / knesset['total_sessions'])
    knesset['average_body_length'] = int(knesset['total protocol part bodies length'] / knesset['total_sessions'])
    knesset['average_num_parts_with_part_category'] = int(knesset['num_parts_with_part_category'] / knesset['total_sessions'])

knesset_rows = [
    {
        **{k: v for k, v in knessets[knesset_num].items() if k in [
            'KnessetNum',
            'total_sessions',
            'related_to_bill',
            'average number of bill names per meeting (which has protocol text)',
            'number of protocol parts',
        ]},
        'average number of protocol parts per meeting': knessets[knesset_num]['average_protocol_parts'],
        'average number of protocol parts per meeting which has a part category': knessets[knesset_num]['average_num_parts_with_part_category'],
        'average body length per meeting (not including header)': knessets[knesset_num]['average_body_length'],
    } for knesset_num in [13,14,15,16,17,18,19,20,21]
]
display(HTML('{}'.format(tabulate.tabulate(knesset_rows, tablefmt='html', headers='keys'))))

KnessetNum,total_sessions,related_to_bill,average number of bill names per meeting (which has protocol text),number of protocol parts,average number of protocol parts per meeting,average number of protocol parts per meeting which has a part category,average body length per meeting (not including header)
13,3709,0,0.0,336063,90,0,41462
14,3068,0,0.0,333070,108,0,36690
15,4694,5,0.00135105,841970,179,0,34390
16,7251,1884,0.61277,1035437,142,0,27193
17,6904,2286,0.533558,1269979,183,0,35910
18,8669,2706,0.435484,1196528,138,0,33748
19,4130,1270,0.469448,77891,18,0,40616
20,10263,3208,0.466235,3510975,342,0,50529
21,112,19,0.240506,22302,199,0,24429
