# Example flow for processing and aggregating stats about committee meeting attendees and protocol parts

See the [DataFlows documentation](https://github.com/datahq/dataflows) for more details regarding the Flow object and processing functions.

Feel free to modify and commit changes which demonstrate additional functionality or relevant data.

## Constants

In [42]:
# Knesset number whose committee meetings are queried.
KNESSET_NUM = 20

# Keyword arguments handed to the DataFlows filter_rows processor to select
# which rows of the meetings resource get processed.
MEETINGS_FILTER_ROWS_KWARGS = {
    'equals': [
        {'KnessetNum': KNESSET_NUM},
    ],
}

# Development cap on the number of protocol parts to process; -1 means no limit.
PROCESS_PARTS_LIMIT = 100

# Cache the protocol parts data. Not efficient: meant only for local
# development together with a sensible PROCESS_PARTS_LIMIT.
PROCESS_PARTS_CACHE = True

# Don't use local data - load everything from the knesset data remote storage.
# When set to False, caching is enabled as well, so a 2nd run does not
# download from remote storage again.
USE_DATA = False
USE_CACHE = not USE_DATA

## Loading lexicons

In [43]:
import os

def read_topic_to_set(topic_name):
    """Read the lexicon file of a topic into a set of entries.

    The file ``<dir_name>/<topic_name>.txt`` is read line by line; every line
    is stripped of surrounding whitespace and of any byte-order mark.

    :param topic_name: lexicon file name without the ``.txt`` extension
    :return: set of the non-empty lexicon entries
    """
    path = os.path.join(dir_name, topic_name + ".txt")
    # Explicit encoding: the lexicons hold non-ASCII text, so the result must
    # not depend on the platform default. The context manager closes the file.
    with open(path, 'r', encoding='utf-8') as lexicon_file:
        entries = {line.strip().replace("\ufeff", "") for line in lexicon_file}
    # A blank line would otherwise add "" as an entry, which can never be a
    # meaningful match.
    entries.discard("")
    return entries

# Directory holding one "<topic>.txt" lexicon file per topic.
dir_name = "../topics/lexicons"

files = os.listdir(dir_name)

# Only "*.txt" entries are lexicons. Anything else in the directory (hidden
# files, checkpoint folders, ...) is ignored instead of being turned into a
# bogus topic name. The topic is the file name without its ".txt" extension;
# sorting keeps the topic order stable between runs.
topics = sorted(os.path.splitext(file)[0] for file in files if file.endswith(".txt"))

# topic name -> set of lexicon entries
lexicons = {}
for topic_name in topics:
    lexicons[topic_name] = read_topic_to_set(topic_name)

## Load source data

In [44]:
from dataflows import filter_rows, cache
from datapackage_pipelines_knesset.common_flow import load_knesset_data, load_member_names

# Mapping between knesset member id and the member name.
member_names = load_member_names(use_data=USE_DATA)

# Flow steps for the source committee meetings data: a loader followed by the
# configured row filter. Nothing is loaded here; the steps run later in the Flow.
_meetings_source = load_knesset_data('people/committees/meeting-attendees/datapackage.json', USE_DATA)
_meetings_filter = filter_rows(**MEETINGS_FILTER_ROWS_KWARGS)

if USE_DATA:
    load_steps = (_meetings_source, _meetings_filter)
else:
    # Loading from URL: wrap both steps in a cache step, which skips the
    # loading on the 2nd run.
    path = '.cache/people-committee-meeting-attendees-knesset-{}'.format(KNESSET_NUM)
    load_steps = (cache(_meetings_source, _meetings_filter, cache_path=path),)

loading from url: https://storage.googleapis.com/knesset-data-pipelines/data/members/mk_individual/datapackage.json
using cache data from .cache/members-mk-individual-names
loading from url: https://storage.googleapis.com/knesset-data-pipelines/data/people/committees/meeting-attendees/datapackage.json


## Inspect the datapackages which will be loaded

The output log of the previous cell should contain URLs to the `datapackage.json` files. Open them and check the table schema to see the resource metadata and the available fields, which you can use in the processing functions.

Check the [frictionlessdata docs](https://frictionlessdata.io/docs/) for more details about the datapackage file format.

## Extract topics from lines

In [45]:
from collections import defaultdict
from dataflows import Flow
import pandas as pd
import numpy as np

# --- shared state used by the processing functions below ---

# Not referenced by the functions in this cell.
first = True
# Incremented once for every meeting whose protocol parts were processed.
running_index = 0

# Per-meeting data of the meeting currently being processed, and the table
# that accumulates it; both are created lazily by the processing functions.
meeting_data_global, topics_df = None, None

# Counters: general stats by name, and attended meetings per member id.
stats, member_attended_meetings = defaultdict(int), defaultdict(int)

# Every protocol part row / meeting row seen so far.
rows, m_rows = [], []

def initialize_meeting_data_global(meeting_row):
    """Reset ``meeting_data_global`` for a new meeting and seed its topic scores.

    The global is set to a dict of single-element lists (one DataFrame row)
    holding the meeting metadata plus one ``<topic>_score`` entry per lexicon.
    Each score starts at three times the number of lexicon matches found in
    the meeting's declared topics, counting single words, 2-word and 3-word
    phrases; it starts at 0 when the meeting has no topics.

    :param meeting_row: meeting row; the keys ``KnessetNum``, ``StartDate``,
        ``CommitteeSessionID``, ``Number``, ``mks``, ``topics`` and
        ``CommitteeID`` are read
    """
    global meeting_data_global

    meeting_topics = meeting_row['topics']
    topics_exists = meeting_topics is not None
    topics = ";".join(meeting_topics) if topics_exists else ""

    # str() lets the same split work whether StartDate is a date object or a string.
    date_parts = str(meeting_row['StartDate']).split("-")

    meeting_data_global = {
        'KnessetNum': [meeting_row['KnessetNum']],
        'Year': [date_parts[0]],
        # Guarded: a missing StartDate has no "-" separated month part.
        'Month': [date_parts[1] if len(date_parts) > 1 else ""],
        'CommitteeSessionID': [meeting_row['CommitteeSessionID']],
        'Number': [meeting_row['Number']],
        'Mks': [';'.join(meeting_row['mks'])] if meeting_row['mks'] is not None else [""],
        'Topics': [topics],
        'StartDate': [meeting_row['StartDate']],
        'CommitteeID': [meeting_row['CommitteeID']]
    }

    if not topics_exists:
        for topic_name in lexicons:
            meeting_data_global[topic_name + "_score"] = [0]
        return

    # Tokenize every topic on its own. Splitting the ";"-joined string would
    # glue the last word of one topic to the first word of the next
    # ("a;b"), so neither could match a lexicon entry.
    topic_words = []
    topic_words_size_2 = []
    topic_words_size_3 = []
    for topic in meeting_topics:
        words = topic.split()
        topic_words += words
        # A list of k words has k-1 bigrams and k-2 trigrams.
        topic_words_size_2 += [" ".join(words[i:i+2]) for i in range(len(words) - 1)]
        topic_words_size_3 += [" ".join(words[i:i+3]) for i in range(len(words) - 2)]

    for topic_name, lexicon in lexicons.items():
        count = (lexicon_count(lexicon, topic_words)
                 + lexicon_count(lexicon, topic_words_size_2)
                 + lexicon_count(lexicon, topic_words_size_3))
        # Matches in the declared topics are weighted 3x.
        meeting_data_global[topic_name + "_score"] = [count*3]
    
def word_permutations(word):
    """Return the forms of ``word`` that should be looked up in a lexicon.

    The result always contains the whitespace-stripped word. When that word is
    longer than one character and starts with one of the prefix letters
    'ה', 'ב' or 'ל', the word without its first letter is added as well.

    :param word: word or phrase to look up
    :return: list of candidate strings, the stripped word first
    """
    clean_word = word.strip()
    permutations = [clean_word]
    # The length guard applies to every prefix letter, so a bare one-letter
    # word never yields an empty candidate.
    if len(clean_word) > 1 and clean_word.startswith(('ה', 'ב', 'ל')):
        permutations.append(clean_word[1:])
    return permutations


def in_lexicon(word, lexicon):
    """Tell whether any lookup form of ``word`` is an entry of ``lexicon``.

    :param word: word or phrase to check
    :param lexicon: container of lexicon entries
    :return: True if at least one form from word_permutations is in the lexicon
    """
    return any(candidate in lexicon for candidate in word_permutations(word))

            
def lexicon_count(lexicon, words):
    """Count how many items of ``words`` match ``lexicon``.

    Repeated words are counted every time they occur.

    :param lexicon: container of lexicon entries
    :param words: iterable of words or phrases
    :return: number of items for which in_lexicon is true
    """
    return sum(1 for candidate in words if in_lexicon(candidate, lexicon))


def process_meeting_protocol_part(row):
    """Add the lexicon matches of one protocol part to the current meeting's scores.

    The part's ``header`` and ``body`` texts (when present and not None) are
    split into words; single words, 2-word phrases and 3-word phrases are each
    counted against every lexicon and added to the matching ``<topic>_score``
    entry of ``meeting_data_global``. The row is also appended to ``rows`` and
    the 'processed parts' stat is incremented.

    :param row: protocol part row
    """
    rows.append(row)
    stats['processed parts'] += 1

    words = []
    for field in ('header', 'body'):
        if field in row and row[field] is not None:
            words += row[field].split()

    # A list of k words has k-1 bigrams and k-2 trigrams.
    words_size_2 = [" ".join(words[i:i+2]) for i in range(len(words) - 1)]
    words_size_3 = [" ".join(words[i:i+3]) for i in range(len(words) - 2)]

    for topic_name, lexicon in lexicons.items():
        meeting_data_global[topic_name + "_score"][0] += (
            lexicon_count(lexicon, words)
            + lexicon_count(lexicon, words_size_2)
            + lexicon_count(lexicon, words_size_3)
        )
            

def process_meeting(row):
    """Record one meeting and, within the configured limit, score its protocol parts.

    Always appends the row to ``m_rows``, bumps the 'total meetings' stat and
    counts the meeting for every attending member id. When the meeting has a
    parsed parts file and the parts limit allows it, the per-meeting data is
    re-initialized and a nested Flow feeds every protocol part to
    process_meeting_protocol_part.

    :param row: meeting row
    """
    global running_index

    m_rows.append(row)
    stats['total meetings'] += 1

    # `or ()` skips the loop when the attendee list is missing or empty.
    for mk_id in row['attended_mk_individual_ids'] or ():
        member_attended_meetings[mk_id] += 1

    parts_filename = row['parts_parsed_filename']
    if not parts_filename:
        return

    unlimited = PROCESS_PARTS_LIMIT == -1
    if not unlimited and not (PROCESS_PARTS_LIMIT and stats['processed parts'] < PROCESS_PARTS_LIMIT):
        return

    initialize_meeting_data_global(row)

    parts_step = load_knesset_data('committees/meeting_protocols_parts/' + parts_filename, USE_DATA)
    if not USE_DATA and PROCESS_PARTS_CACHE:
        parts_step = cache(parts_step, cache_path='.cache/committee-meeting-protocol-parts/' + parts_filename)

    Flow(parts_step, process_meeting_protocol_part).process()
    running_index += 1

def add_meeting_data_to_table(row):
    """Append the data collected for the current meeting to ``topics_df``.

    ``meeting_data_global`` is only refreshed for meetings whose protocol
    parts were processed. For any other meeting it is either still None or
    still holds the previous meeting's data; in both cases nothing is added,
    so no meeting is written to the table twice.

    :param row: meeting row; ``CommitteeSessionID`` is read to check that the
        collected data belongs to this meeting
    """
    global topics_df

    if meeting_data_global is None:
        return
    if meeting_data_global['CommitteeSessionID'][0] != row['CommitteeSessionID']:
        # Stale data left over from an earlier meeting.
        return

    meeting_df = pd.DataFrame(meeting_data_global)
    if topics_df is None:
        topics_df = meeting_df
    else:
        # pd.concat instead of DataFrame.append, which pandas 2.0 removed.
        topics_df = pd.concat([topics_df, meeting_df], ignore_index=True)


# Processing steps that run, in this order, after the load steps:
# first collect the meeting's data, then append it to the topics table.
process_steps = (
    process_meeting,
    add_meeting_data_to_table,
)

In [None]:
from dataflows import Flow, dump_to_path

# Run the whole pipeline: load steps, then processing steps, then dump the
# resulting rows to the given output path.
Flow(*load_steps, *process_steps, dump_to_path('data/committee-meeting-attendees-parts')).process()

In [None]:
# Display the table accumulated by add_meeting_data_to_table.
topics_df

In [None]:
# Save the per-meeting topic scores; the knesset number is part of the file name.
topics_df.to_csv("topics_df_knesset_{}.csv".format(KNESSET_NUM))

In [None]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so
# columns added to df_summary also appear on topics_df.
df_summary = topics_df

In [None]:
# For every meeting, store the name of the score column with the highest value.
# The scores are read from df_summary itself (no DataFrame named `df` exists).
df_summary['BestTopic'] = df_summary[['Diplomacy_score', 'Ecologics_score', 'Economics_score','Education_score', 'Health_score', 'Security_score']].idxmax(axis=1)

In [None]:
# Save the table including the BestTopic column.
df_summary.to_csv("topics_df_knesset_{}_summarized.csv".format(KNESSET_NUM))

In [None]:
#g = df[['Year', 'BestTopic']].groupby(['Year', 'BestTopic']).count()

In [None]:
#g

In [None]:
#with open("topics_output.csv", 'w') as f:
#    f.write(topics_df.to_csv())

## Aggregate and print stats

In [None]:
# Disabled stats report. Every line is commented out, including the
# continuation line of the list comprehension, so the cell runs as a no-op.
# Uncomment the whole cell to print the 5 most-attending members and the stats.
#from collections import deque
#import yaml

#top_attended_member_names = [member_names[mk_id] for mk_id, num_attended in
#                             deque(sorted(member_attended_meetings.items(), key=lambda kv: kv[1]), maxlen=5)]
#print('\n')
#print('-- top attended members --')
#print(top_attended_member_names)
#print('\n')
#print('-- stats --')
#print(yaml.dump(dict(stats), default_flow_style=False, allow_unicode=True))

## Get output data

The output data is available in the left sidebar under the `data` directory. Check the `datapackage.json` and the created CSV file to explore the data and its schema.