# Install Packages

In [1]:
%matplotlib inline

In [2]:
from __future__ import print_function
from collections import defaultdict
from datetime import datetime
import functools
import io
import itertools
import json
import os
import pandas
import pickle
from pprint import PrettyPrinter
import pymongo
import re
from scipy.sparse import dok_matrix

# Data Check

In [3]:
conn=pymongo.MongoClient('mongodb', 27017)

conn.database_names()

['admin', 'iati', 'local']

In [4]:
db = conn.iati

activities=db.activities

db.collection_names()

['organizations',
 'activities',
 'activities_metadata',
 'transactions',
 'organizations_metadata']

In [5]:
print(activities.count())

764159


# Cache Data

Bring all the activities out of MongoDB and into memory so that we can process them much faster. Iterating over the MongoDB collection takes minutes. Iterating over an in-memory version containing only the fields we're interested in takes seconds.

However, we need a lot of memory to fit everything, so we'll only include a few of the fields which might be useful when constructing the graph.

In [6]:
fields = set([
    '@w210-key',
    'reporting-org',
    'participating-org',
    'recipient-country',
    'transaction'
])

In [7]:
all_activities = []
activities_count = activities.count()

print(datetime.now(), 'Started processing')

for num, activity in enumerate(activities.find()):
    if num % 50000 == 0:
        print(datetime.now(), 'Processed', num, 'of', activities_count)

    activity_copy = { key: value for key, value in activity.items() if key in fields }

    all_activities.append(activity_copy)

print(datetime.now(), 'Finished processing')

2017-12-31 18:55:01.675431 Started processing
2017-12-31 18:55:01.688327 Processed 0 of 764159
2017-12-31 18:55:07.639210 Processed 50000 of 764159
2017-12-31 18:55:15.104585 Processed 100000 of 764159
2017-12-31 18:55:21.930865 Processed 150000 of 764159
2017-12-31 18:55:30.015696 Processed 200000 of 764159
2017-12-31 18:55:38.334383 Processed 250000 of 764159
2017-12-31 18:55:44.661114 Processed 300000 of 764159
2017-12-31 18:55:52.034615 Processed 350000 of 764159
2017-12-31 18:55:58.219051 Processed 400000 of 764159
2017-12-31 18:56:09.044564 Processed 450000 of 764159
2017-12-31 18:56:13.563898 Processed 500000 of 764159
2017-12-31 18:56:20.703983 Processed 550000 of 764159
2017-12-31 18:56:29.133042 Processed 600000 of 764159
2017-12-31 18:56:38.101520 Processed 650000 of 764159
2017-12-31 18:56:41.864497 Processed 700000 of 764159
2017-12-31 18:56:51.083654 Processed 750000 of 764159
2017-12-31 18:56:51.999599 Finished processing


# Create Adjacency List Files

In [8]:
def clean_text(text):
    return re.sub('\s+', ' ', text).strip()

In [9]:
def get_text(element, attribute):

    if element is None:
        return None

    if attribute not in element:
        return None

    try:
        value = element[attribute]
    except Exception as e:
        print(element, attribute)
        raise e

    if type(value) == dict:
        if '#text' in value:
            return value['#text']

        return None

    if type(value) != list:
        return value

    return [
        item['#text'] if type(item) == dict and '#text' in item else
            None if type(item) == dict else item
        for item in value
    ]

In [10]:
def get_node_list(parent, field_keys):
    value = parent

    for key in field_keys:

        # If we have a dictionary, we simply access the attribute

        if type(value) == dict:
            if key not in value:
                return []

            value = value[key]
            continue

        # If we have something that is neither a dict nor a list, we
        # cannot navigate further down the JSON object, so we were
        # unable to find what we needed.

        if type(value) != list:
            return []

        # If we have a list, then we'll check the key in each element
        # of the list.

        value = [ item[key] for item in value if item is not None and key in item ]

    if value is None:
        return []

    if type(value) != list:
        value = [value]

    return value

In [11]:
def get_edge_list(
    activity, field_keys, left_element, left_child, right_element, right_child,
    left_reporting_org_fallback, right_reporting_org_fallback):

    node_list = get_node_list(activity, field_keys)

    return_value = []

    left_fallback = None

    if left_reporting_org_fallback is not None:
        left = get_node_list(activity, ['reporting-org'])
        left_fallback = get_text(left[0], left_reporting_org_fallback)

    right_fallback = None

    if right_reporting_org_fallback is not None:
        right = get_node_list(activity, ['reporting-org'])
        right_fallback = get_text(right[0], right_reporting_org_fallback)

    for node in node_list:
        if left_element not in node or right_element not in node:
            continue

        left = get_node_list(node, [left_element])
        right = get_node_list(node, [right_element])

        for left_node, right_node in itertools.product(left, right):
            left_list = get_text(left_node, left_child)
            right_list = get_text(right_node, right_child)

            if left_list is None:
                left_list = left_fallback

            if right_list is None:
                right_list = right_fallback

            if left_list is None or right_list is None:
                continue

            if type(left_list) != list:
                left_list = [left_list]

            if type(right_list) != list:
                right_list = [right_list]

            return_value += [
                (activity['@w210-key'], clean_text(left_value), clean_text(right_value))
                    for left_value, right_value in itertools.product(left_list, right_list)
                        if left_value is not None and right_value is not None
            ]

    return return_value

In [12]:
def get_edges(
    field_path, left_element, left_child, right_element, right_child,
    left_reporting_org_fallback = None, right_reporting_org_fallback = None):

    if field_path is None:
        field_keys = []
    else:
        field_keys = field_path.split('.')

    return_values = []

    for activity in all_activities:
        new_values = get_edge_list(
            activity, field_keys, left_element, left_child, right_element, right_child,
            left_reporting_org_fallback, right_reporting_org_fallback)

        return_values += new_values

    return return_values

## Graph from Root Elements

In [13]:
root_edges_ref = get_edges(
    None, 'reporting-org', '@ref', 'participating-org', '@ref')

In [14]:
len(root_edges_ref)

2133338

In [15]:
with open('graph_root_ref.txt', 'wb') as f:
    pickle.dump(root_edges_ref, f)

In [16]:
root_edges_narrative = get_edges(
    None, 'reporting-org', 'narrative', 'participating-org', 'narrative')

In [17]:
len(root_edges_narrative)

2442966

In [18]:
with open('graph_root_narrative.txt', 'wb') as f:
    pickle.dump(root_edges_ref, f)

## Graph from Transaction Elements

In [19]:
transaction_edges_ref = get_edges(
    'transaction', 'provider-org', '@ref', 'receiver-org', '@ref', '@ref', None)

In [20]:
len(transaction_edges_ref)

658076

In [21]:
with open('graph_transaction_ref.txt', 'wb') as f:
    pickle.dump(transaction_edges_ref, f)

In [22]:
transaction_edges_narrative = get_edges(
    'transaction', 'provider-org', 'narrative', 'receiver-org', 'narrative', 'narrative', None)

In [23]:
len(transaction_edges_narrative)

1041716

In [24]:
with open('graph_transaction_narrative.txt', 'wb') as f:
    pickle.dump(root_edges_ref, f)

## Graph from Recipient Country

In [25]:
recipient_edges_ref = get_edges(
    None, 'recipient-country', '@code', 'reporting-org', '@ref')

In [26]:
len(recipient_edges_ref)

615047

In [27]:
with open('graph_country_ref.txt', 'wb') as f:
    pickle.dump(recipient_edges_ref, f)

# Create Graph Files from Adjacency Lists

We'll want to convert these `.txt` files into actual graphs.

In [28]:
def get_node_id(node_ids, name):

    # Increment the counter if we haven't seen it

    if name not in node_ids:
        node_ids[name] = len(node_ids)

    return node_ids[name]

In [29]:
def load_graph(file_name):
    node_ids = {}

    get_graph_node_id = functools.partial(get_node_id, node_ids)

    # Iterate once in order to initialize the node ID dictionary

    print(datetime.now(), 'Identifying nodes in', file_name)

    with open(file_name, 'rb') as graph_file:
        edges = pickle.load(graph_file)

        for activity_file, source_name, target_name in edges:
            source_id = get_graph_node_id(source_name)
            target_id = get_graph_node_id(target_name)

    node_count = len(node_ids)

    graph_matrix = dok_matrix((node_count, node_count))

    print(datetime.now(), 'Building sparse matrix for', file_name)

    # Iterate again in order to populate the sparse matrix

    with open(file_name, 'rb') as graph_file:
        edges = pickle.load(graph_file)

        for activity_file, source_name, target_name in edges:
            source_id = get_graph_node_id(source_name)
            target_id = get_graph_node_id(target_name)

            graph_matrix[source_id, target_id] += 1

    print(datetime.now(), 'Finished processing', file_name)

    return node_ids, graph_matrix

# Find Dangling Nodes

In [30]:
def get_dangling_nodes(check_axis, graph_ids, graph):

    # Create a reverse lookup table

    graph_names = { value: key for key, value in graph_ids.items() }

    # Sum by the axis and identify the non-zero entries

    sums = graph.sum(axis = check_axis)
    sums = sums.reshape((sums.shape[1 - check_axis], 1))

    return [graph_names[index] for index, value in enumerate(sums) if value == 0]

In [31]:
get_source_nodes = functools.partial(get_dangling_nodes, 0)
get_sink_nodes = functools.partial(get_dangling_nodes, 1)

# Run against All Graphs

In [32]:
for file_name in os.listdir('.'):
    if len(file_name) > 4 and file_name[-4:] == '.ids':
        os.remove(file_name)
    elif len(file_name) > 6 and file_name[-6:] == '.graph':
        os.remove(file_name)

In [33]:
graph_stats = []
graph_data = []

for file_name in os.listdir('.'):
    if file_name[0:5] != 'graph' or file_name[-4:] != '.txt':
        continue

    # Load the file from cache if we've already done the computation once

    if os.path.isfile(file_name + '.ids') and os.path.isfile(file_name + '.graph'):
        print(datetime.now(), 'Loading cached graph for', file_name)

        with open(file_name + '.ids', 'rb') as id_file:
            graph_ids = pickle.load(id_file)

        with open(file_name + '.graph', 'rb') as graph_file:
            graph = pickle.load(graph_file)

    # Otherwise, perform the computation and save the resulting computations

    else:
        print(datetime.now(), 'Loading graph for', file_name)

        graph_ids, graph = load_graph(file_name)

        with open(file_name + '.ids', 'wb') as id_file:
            pickle.dump(graph_ids, id_file)

        with open(file_name + '.graph', 'wb') as graph_file:
            pickle.dump(graph, graph_file)

    source_nodes = get_source_nodes(graph_ids, graph)
    sink_nodes = get_sink_nodes(graph_ids, graph)

    graph_stats.append({
        'graph file': file_name,
        'total nodes': len(graph_ids),
        'has both edges': len(graph_ids) - len(source_nodes) - len(sink_nodes),
        'has only outgoing edges': len(source_nodes),
        'has only incoming edges': len(sink_nodes)
    })

    graph_data.append({
        'file': file_name,
        'graph': graph,
        'all_nodes': graph_ids,
        'source_nodes': source_nodes,
        'sink_nodes': sink_nodes
    })

2017-12-31 19:01:38.420353 Loading graph for graph_root_narrative.txt
2017-12-31 19:01:38.420487 Identifying nodes in graph_root_narrative.txt
2017-12-31 19:01:39.549823 Building sparse matrix for graph_root_narrative.txt
2017-12-31 19:02:08.120678 Finished processing graph_root_narrative.txt
2017-12-31 19:02:08.341382 Loading graph for graph_root_ref.txt
2017-12-31 19:02:08.341453 Identifying nodes in graph_root_ref.txt
2017-12-31 19:02:09.492791 Building sparse matrix for graph_root_ref.txt
2017-12-31 19:02:38.130612 Finished processing graph_root_ref.txt
2017-12-31 19:02:38.353757 Loading graph for graph_transaction_narrative.txt
2017-12-31 19:02:38.353828 Identifying nodes in graph_transaction_narrative.txt
2017-12-31 19:02:39.422285 Building sparse matrix for graph_transaction_narrative.txt
2017-12-31 19:03:05.828895 Finished processing graph_transaction_narrative.txt
2017-12-31 19:03:06.058151 Loading graph for graph_country_ref.txt
2017-12-31 19:03:06.058221 Identifying nodes in

In [34]:
pandas.DataFrame(graph_stats)

Unnamed: 0,graph file,has both edges,has only incoming edges,has only outgoing edges,total nodes
0,graph_root_narrative.txt,293,11451,67,11811
1,graph_root_ref.txt,293,11451,67,11811
2,graph_transaction_narrative.txt,293,11451,67,11811
3,graph_country_ref.txt,1,363,432,796
4,graph_transaction_ref.txt,341,2762,1979,5082
