# Example flow for processing and aggregating stats about committee meeting attendees and protocol parts

See the [DataFlows documentation](https://github.com/datahq/dataflows) for more details regarding the Flow object and processing functions.

Feel free to modify and commit changes which demonstrate additional functionality or relevant data.

## Constants

In [56]:
# Limit processing of protocol parts for development
PROCESS_PARTS_LIMIT = 2

# Enable caching of protocol parts data (not efficient, should only be used for local development with sensible PROCESS_PARTS_LIMIT)
PROCESS_PARTS_CACHE = True

# Filter the meetings to be processed, these kwargs are passed along to DataFlows filter_rows processor for meetings resource
MEETINGS_FILTER_ROWS_KWARGS = {'equals': [{'KnessetNum': 20}]}

# Don'e use local data - loads everything from knesset data remote storage
# When set to False - also enables caching, so you won't download from remote storage on 2nd run.
USE_DATA = False

## Loading lexicons

In [67]:
import os

def read_topic_to_set(topic_name):
    lines = open(os.path.join(dir_name, topic_name + ".txt"), 'r').readlines()
    return set([line.strip().replace("\ufeff", "") for line in lines])

dir_name = "../topics/lexicons"        

files = os.listdir(dir_name)

topics = [file.split('.')[0] for file in files]

lexicons = {}
for topic_name in topics:
    lexicons[topic_name] = read_topic_to_list(topic_name)
    
lexicons

{'Diplomacy': ['שגריר',
  'שגרירה',
  'שגרירים',
  'שגרירות',
  'שגרירויות',
  'דיפלומט',
  'דיפלומטית',
  'דיפלומטים',
  'אום',
  'או"ם',
  'קונסוליה',
  'קונסוליות',
  'משלחת',
  'משלחות',
  'משרד החוץ',
  'BDS',
  'ארצות הברית',
  'צרפת',
  'גרמניה',
  'הולנד',
  'הודו'],
 'Ecologics': ['אקולוגיה',
  'מחזור',
  'מיחזור',
  'למחזר',
  'איכות הסביבה',
  'זיהום',
  'מזהם',
  'מזהמת',
  'מזהמים',
  'מזהמות',
  'ממחזר',
  'ממחזרת',
  'ממחזרים',
  'ממחזרות'],
 'Economics': ['ריבית',
  'אינפלציה',
  'דיפלציה',
  'דפלציה',
  'כלכלה',
  'כלכלית',
  'משכורות',
  'משכורת',
  'תל"ג',
  'מניות',
  'מניה',
  'תלג',
  'תל"ג',
  'תמג',
  'תמ"ג',
  'תוצר לאומי גולמי',
  'תוצר מקומי גולמי',
  'בנק',
  'בנקים',
  'שכר',
  'הלוואה',
  'הלוואות',
  'משכנתא',
  'משכנתאות',
  'אוברדראפט',
  'נדלן',
  'נדל"ן',
  'פנסיה',
  'פנסיות',
  'אוצר',
  'אשראי',
  'קפיטליזם',
  'סוציאליזם'],
 'Education': ['מורים',
  'מורה',
  'תלמיד',
  'תלמידה',
  'תלמידים',
  'תלמידות',
  'כיתה',
  'כיתות',
  'שיעור',
  'שיעורים

## Load source data

In [57]:
from dataflows import filter_rows, cache
from datapackage_pipelines_knesset.common_flow import load_knesset_data, load_member_names

# Loads a dict containing mapping between knesset member id and the member name
member_names = load_member_names(use_data=USE_DATA)

# define flow steps for loading the source committee meetings data
# the actual loading is done later in the Flow
load_steps = (
    load_knesset_data('people/committees/meeting-attendees/datapackage.json', USE_DATA),
    filter_rows(**MEETINGS_FILTER_ROWS_KWARGS)
)

if not USE_DATA:
    # when loading from URL - enable caching which will skip loading on 2nd run
    load_steps = (cache(*load_steps, cache_path='.cache/people-committee-meeting-attendees-knesset-20'),)

loading from url: https://storage.googleapis.com/knesset-data-pipelines/data/members/mk_individual/datapackage.json
using cache data from .cache/members-mk-individual-names
loading from url: https://storage.googleapis.com/knesset-data-pipelines/data/people/committees/meeting-attendees/datapackage.json


## Inspect the datapackages which will be loaded

Last command's output log should contain urls to datapackage.json files, open them and check the table schema to see the resource metadata and available fields which you can use in the processing functions.

Check the [frictionlessdata docs](https://frictionlessdata.io/docs/) for more details about the datapackage file format.

## Extract topics from lines

In [108]:
from collections import defaultdict
from dataflows import Flow

first = True
running_index = 0

meeting_data_global = {}
topics_df = None

stats = defaultdict(int)
member_attended_meetings = defaultdict(int)
rows = []

def initialize_meeting_data_global(meeting_row):
    for topic_name in lexicons:
        meeting_data_global[topic_name + "_score"] = 0
        meeting_data_global['KnessetNum'] = meeting_row['KnessetNum']
        #meeting_data_global['topics'] = meeting_row['topics']
        #meeting_data_global['mks'] = meeting_row['mks']
        meeting_data_global['KnessetNum'] = meeting_row['KnessetNum']
    
    
def word_permutations(word):
    clean_word = word.strip()
    permutations = [clean_word]
    if len(word) > 1 and word.startswith('ה') or word.startswith('ב') or word.startswith('ל'):
        permutations.append(word[1:])
    return permutations


def in_lexicon(word, lexicon):
    for p in word_permutations(word):
        if p in lexicon:
            return True
    return False

            
def lexicon_count(lexicon, words):
    count = 0
    for word in words:
        if in_lexicon(word, lexicon):
            count += 1
    return count
        
    
def process_meeting_protocol_part(row):
    global meeting_data_global
    stats['processed parts'] += 1
    words = row['body'].split() if row['body'] is not None else []
    for topic_name, lexicon in lexicons.items():
        meeting_data_global[topic_name + "_score"] += lexicon_count(lexicon, words)  
            

def process_meeting(row):
    global topics_df
    global meeting_data_global
    global running_index
    stats['total meetings'] += 1
    if row['attended_mk_individual_ids']:
        for mk_id in row['attended_mk_individual_ids']:
            member_attended_meetings[mk_id] += 1
    parts_filename = row['parts_parsed_filename']
    if parts_filename:
        initialize_meeting_data_global(row)
        if topics_df is None:
            print(meeting_data_global)
            topics_df = pd.DataFrame(data=meeting_data_global, index=[running_index])
        else:
            topics_df.append(pd.DataFrame(data=meeting_data_global, index=[running_index]))
        if PROCESS_PARTS_LIMIT and stats['processed parts'] < PROCESS_PARTS_LIMIT:
            steps = (load_knesset_data('committees/meeting_protocols_parts/' + parts_filename, USE_DATA),)
            if not USE_DATA and PROCESS_PARTS_CACHE:
                steps = (cache(*steps, cache_path='.cache/committee-meeting-protocol-parts/' + parts_filename),)
            steps += (process_meeting_protocol_part,)
            Flow(*steps).process()

process_steps = (process_meeting,)

In [109]:
from dataflows import Flow, dump_to_path

Flow(*load_steps, *process_steps, dump_to_path('data/committee-meeting-attendees-parts')).process()

using cache data from .cache/people-committee-meeting-attendees-knesset-20
{'Diplomacy_score': 0, 'KnessetNum': 20, 'Ecologics_score': 0, 'Economics_score': 0, 'Education_score': 0, 'Health_score': 0, 'Security_score': 0}
loading from url: https://storage.googleapis.com/knesset-data-pipelines/data/committees/meeting_protocols_parts/files/5/6/562716.csv
using cache data from .cache/committee-meeting-protocol-parts/files/5/6/562716.csv


(<datapackage.package.Package at 0x7f30475db9e8>,
 {'count_of_rows': 10256,
  'bytes': 30129277,
  'hash': '9bb63d3b4c724c88df1416113d0fb80c',
  'dataset_name': None})

In [107]:
dictionary = {'Diplomacy_score': 0, 'KnessetNum': 20, 'Ecologics_score': 0, 'Economics_score': 0, 'Education_score': 0, 'Health_score': 0, 'Security_score': 0}
pd.DataFrame(data=dictionary, index=[1])

Unnamed: 0,Diplomacy_score,KnessetNum,Ecologics_score,Economics_score,Education_score,Health_score,Security_score
1,0,20,0,0,0,0,0


In [110]:
print(topics_df)

   Diplomacy_score  KnessetNum  Ecologics_score  Economics_score  \
0                0          20                0                0   

   Education_score  Health_score  Security_score  
0                0             0               0  


In [65]:
few_rows = rows[:10]
few_rows

[{'header': None,
  'body': 'הכנסת העשרים\n\nנוסח לא מתוקן\n\nמושב ראשון\n\nפרוטוקול מס\' 1\n\nמישיבת ועדת הכספים\n\nיום רביעי, י"ב בניסן התשע"ה (01 באפריל 2015), שעה 14:10'},
 {'header': 'סדר היום', 'body': 'בחירת יושב-ראש לוועדת הכספים הזמנית'},
 {'header': 'נכחו', 'body': None},
 {'header': 'חברי הוועדה:',
  'body': 'זאב אלקין – יו"ר הוועדה המסדרת\n\nניסן סלומינסקי – היו"ר\n\nאלי אלאלוף\n\nמיקי לוי\n\nאיוב קרא\n\nעאידה תומא-סלימאן'},
 {'header': 'מוזמנים:', 'body': 'אתי בן יוסף - מנהלת ועדת הכנסת, כנסת ישראל'},
 {'header': 'ייעוץ משפטי', 'body': 'אייל לב ארי'},
 {'header': 'מנהל הוועדה', 'body': 'טמיר כהן'},
 {'header': 'רישום פרלמנטרי',
  'body': 'יעקב סימן טוב\n\nבחירת יושב-ראש לוועדת הכספים הזמנית'},
 {'header': 'זאב אלקין',
  'body': 'אני כאן בשם הוועדה המסדרת כדי שבהתאם להוראות סעיף 106א(1) לתקנון הכנסת; כידוע, הוועדה צריכה לבחור, הוועדה הזאת, ועדת הכספים הזמנית, אחד מבין חבריה ליושב ראש, בעקבות המלצה של ועדת הכנסת. לעניין הוועדות הזמניות, הוועדה המסדרת היא זאת שמשמשת כמחליפה ש

In [None]:
session_score = 

In [59]:
from dataflows import Flow, dump_to_path, protocols_to_dataframe

parts_processed = 0
rows = []

def process_meeting_protocol_part(row):
    rows.append(row)
    parts_processed += 1
    #stats['processed parts'] += 1
    #if row['body'] and 'אנחנו ככנסת צריכים להיות ערוכים' in row['body']:
    #    stats['meetings contain text: we as knesset need to be prepared'] += 1
        
def process_meeting(row):
    parts_filename = row['parts_parsed_filename']
    if parts_filename:
        if PROCESS_PARTS_LIMIT and parts_processed < PROCESS_PARTS_LIMIT:
            steps = (load_knesset_data('committees/meeting_protocols_parts/' + parts_filename, USE_DATA),)
            if not USE_DATA and PROCESS_PARTS_CACHE:
                steps = (cache(*steps, cache_path='.cache/committee-meeting-protocol-parts/' + parts_filename),)
            steps += (protocols_to_dataframe,)
            Flow(*steps).process()
            

# Parse meeting metadata
process_steps = (process_meeting,)
Flow(*load_steps, *process_steps, dump_to_path('data/committee-meeting-attendees-parts')).process()

ImportError: cannot import name 'protocols_to_dataframe'

In [54]:
rows[0]['body']

KeyError: 'body'

In [None]:
rows = []

def process_meeting_protocol_part(row):
    stats['processed parts'] += 1
    if row['body'] and 'אנחנו ככנסת צריכים להיות ערוכים' in row['body']:
        stats['meetings contain text: we as knesset need to be prepared'] += 1

def process_meeting(row):
    stats['total meetings'] += 1
    if row['attended_mk_individual_ids']:
        for mk_id in row['attended_mk_individual_ids']:
            member_attended_meetings[mk_id] += 1
    parts_filename = row['parts_parsed_filename']
    if parts_filename:
        if PROCESS_PARTS_LIMIT and stats['processed parts'] < PROCESS_PARTS_LIMIT:
            steps = (load_knesset_data('committees/meeting_protocols_parts/' + parts_filename, USE_DATA),)
            if not USE_DATA and PROCESS_PARTS_CACHE:
                steps = (cache(*steps, cache_path='.cache/committee-meeting-protocol-parts/' + parts_filename),)
            steps += (process_meeting_protocol_part,)

In [48]:
from collections import defaultdict
from dataflows import Flow

parts_processed = 0

rows = []
stats = {}

def process_meeting_protocol_part(row):
    stats['processed parts'] += 1
    if row['body'] and 'אנחנו ככנסת צריכים להיות ערוכים' in row['body']:
        stats['meetings contain text: we as knesset need to be prepared'] += 1

def process_meeting(row):
    stats['total meetings'] += 1
    if row['attended_mk_individual_ids']:
        for mk_id in row['attended_mk_individual_ids']:
            member_attended_meetings[mk_id] += 1
    parts_filename = row['parts_parsed_filename']
    if parts_filename:
        if PROCESS_PARTS_LIMIT and stats['processed parts'] < PROCESS_PARTS_LIMIT:
            steps = (load_knesset_data('committees/meeting_protocols_parts/' + parts_filename, USE_DATA),)
            if not USE_DATA and PROCESS_PARTS_CACHE:
                steps = (cache(*steps, cache_path='.cache/committee-meeting-protocol-parts/' + parts_filename),)
            steps += (process_meeting_protocol_part,)
            Flow(*steps).process()
            

#def process_meeting_protocol_part(row):
#    stats['processed parts'] += 1
#    if row['body'] and 'אנחנו ככנסת צריכים להיות ערוכים' in row['body']:
#        stats['meetings contain text: we as knesset need to be prepared'] += 1

def add_row(row):
    rows.append(row)

def process_meeting(row):
    parts_filename = row['parts_parsed_filename']
    if parts_filename:     
        if parts_processed < PROCESS_PARTS_LIMIT:
            steps = (load_knesset_data('committees/meeting_protocols_parts/' + parts_filename, USE_DATA),)
            if not USE_DATA and PROCESS_PARTS_CACHE:
                steps = (cache(*steps, cache_path='.cache/committee-meeting-protocol-parts/' + parts_filename),)
            steps += (add_row,)
            Flow(*steps).process()
            parts_processed += 1
            
'''           
def extract_meeting_metadata(row):
    row_data = {
        'KnessetNum': row['KnessetNum'],
        'StartDate': row['StartDate'],
        
    }
    print(row)

#process_steps = (process_meeting,)
'''

"\n            \ndef extract_meeting_metadata(row):\n    row_data = {\n        'KnessetNum': row['KnessetNum'],\n        'StartDate': row['StartDate'],\n        \n    }\n    print(row)\n\n#process_steps = (process_meeting,)\n"

## Run the flow

In [49]:
from dataflows import Flow, dump_to_path

# Parse meeting metadata
process_steps = (add_row,)
Flow(*load_steps, *process_steps, dump_to_path('data/committee-meeting-attendees-parts')).process()

using cache data from .cache/people-committee-meeting-attendees-knesset-20


(<datapackage.package.Package at 0x7f304857ad30>,
 {'count_of_rows': 10256,
  'bytes': 30129277,
  'hash': '9bb63d3b4c724c88df1416113d0fb80c',
  'dataset_name': None})

In [50]:
few_rows = rows[:10]
few_rows

[{'CommitteeSessionID': 562716,
  'Number': 1,
  'KnessetNum': 20,
  'TypeID': 161,
  'TypeDesc': 'פתוחה',
  'CommitteeID': 922,
  'Location': 'חדר הוועדה, באגף הוועדות (קדמה), קומה 3, חדר 3750',
  'SessionUrl': 'http://main.knesset.gov.il/Activity/committees/Pages/AllCommitteesAgenda.aspx?Tab=3&ItemID=562716',
  'BroadcastUrl': 'http://main.knesset.gov.il/Activity/committees/Pages/AllCommitteesBroadcast.aspx?TopicID=7433',
  'StartDate': datetime.datetime(2015, 4, 1, 14, 0),
  'FinishDate': None,
  'Note': None,
  'LastUpdatedDate': datetime.datetime(2015, 4, 19, 9, 4, 43),
  'download_crc32c': '/1BF2g==',
  'download_filename': 'files/23/2/8/286490.DOC',
  'download_filesize': 71168,
  'parts_crc32c': 'W/pKBA==',
  'parts_filesize': 11218,
  'parts_parsed_filename': 'files/5/6/562716.csv',
  'text_crc32c': 't643Ig==',
  'text_filesize': 11202,
  'text_parsed_filename': 'files/5/6/562716.txt',
  'topics': ['בחירת יושב-ראש לוועדת הכספים הזמנית'],
  'committee_name': 'ועדת הכספים',
  'm

In [41]:
import pandas as pd
df = pd.DataFrame(data=few_rows)[['KnessetNum', 'parts_parsed_filename', 'text_parsed_filename', 'topics', 'mks']]

    parts_filename = row['parts_parsed_filename']
    if parts_filename:
        if PROCESS_PARTS_LIMIT and stats['processed parts'] < PROCESS_PARTS_LIMIT:
            steps = (load_knesset_data('committees/meeting_protocols_parts/' + parts_filename, USE_DATA),)
            if not USE_DATA and PROCESS_PARTS_CACHE:
                steps = (cache(*steps, cache_path='.cache/committee-meeting-protocol-parts/' + parts_filename),)
            steps += (process_meeting_protocol_part,)
            Flow(*steps).process()

## Aggregate and print stats

In [5]:
from collections import deque
import yaml

top_attended_member_names = [member_names[mk_id] for mk_id, num_attended in
                             deque(sorted(member_attended_meetings.items(), key=lambda kv: kv[1]), maxlen=5)]
print('\n')
print('-- top attended members --')
print(top_attended_member_names)
print('\n')
print('-- stats --')
print(yaml.dump(dict(stats), default_flow_style=False, allow_unicode=True))



-- top attended members --
['איתן ברושי', 'מיכאל לוי', 'דוב חנין', 'משה גפני', 'אורי מקלב']


-- stats --
processed parts: 624
total meetings: 9402



## Get output data

Output data is available in the left sidebar under data directory, you can check the datapackage.json and created csv file to explore the data and schema.