# Import Packages

In [1]:
# Render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting call is visible in this notebook — confirm this magic is still needed.
%matplotlib inline

In [2]:
from __future__ import print_function
from datetime import datetime
import pickle
import pymongo
import six

# Data Check

If running from a Docker container, make sure to run `aws configure` manually to set up your credentials; otherwise the notebook won't be able to upload to or download from the S3 bucket.

In [3]:
# Name of the S3 bucket that the backup/upload step at the end of this notebook copies to.
s3_bucket = 'mdang.w210'

In [4]:
# Connect to the MongoDB server at host "mongodb", port 27017.
conn = pymongo.MongoClient(host='mongodb', port=27017)

# List the databases on the server as a quick connectivity check.
conn.database_names()

['admin', 'iati', 'local']

In [5]:
# Handle to the "iati" database and to each collection used further down.
db = conn['iati']

activities = db['activities']
activities_metadata = db['activities_metadata']
transactions = db['transactions']
organizations = db['organizations']
organizations_metadata = db['organizations_metadata']

# Show which collections exist in the database.
db.collection_names()

['organizations',
 'activities',
 'activities_metadata',
 'transactions',
 'organizations_metadata']

In [6]:
# Document counts: activities_metadata first, then activities.
print('%d %d' % (activities_metadata.count(), activities.count()))

4542 764159


In [7]:
# Document count of the transactions collection.
print('%d' % transactions.count())

3602802


In [8]:
# Document counts: organizations_metadata first, then organizations.
print('%d %d' % (organizations_metadata.count(), organizations.count()))

275 442


# Add Activities

In [9]:
from parse_activity import add_activity_node, publisher_nodes, organization_nodes, activity_nodes, activity_relationships

## Load Activities and Organizations

In [10]:
# Feed every activity document to add_activity_node, logging progress every
# 10,000 documents so a long run can be monitored.
activities_count = activities.count()

# Counted independently of activities_count so the final line reports what the
# cursor actually yielded, even if the collection changed after the count was taken.
num_processed = 0

for activity in activities.find():
    if num_processed % 10000 == 0:
        print('%s Processed %d of %d' % (datetime.now(), num_processed, activities_count))

    add_activity_node(activity)
    num_processed += 1

# Final summary: real number of documents handled vs. the count taken up front.
print('%s Processed %d of %d' % (datetime.now(), num_processed, activities_count))

2017-12-31 03:42:38.353372 Processed 0 of 764159
2017-12-31 03:43:52.586585 Processed 10000 of 764159
2017-12-31 03:44:06.791817 Processed 20000 of 764159
2017-12-31 03:44:56.346781 Processed 30000 of 764159
2017-12-31 03:45:08.113786 Processed 40000 of 764159
2017-12-31 03:45:30.769070 Processed 50000 of 764159
2017-12-31 03:46:17.180293 Processed 60000 of 764159
2017-12-31 03:48:18.252265 Processed 70000 of 764159
2017-12-31 03:48:46.165216 Processed 80000 of 764159
2017-12-31 03:49:55.098177 Processed 90000 of 764159
2017-12-31 03:50:38.598963 Processed 100000 of 764159
2017-12-31 03:54:10.129745 Processed 110000 of 764159
2017-12-31 03:56:48.849281 Processed 120000 of 764159
2017-12-31 03:59:15.853565 Processed 130000 of 764159
2017-12-31 04:00:07.113563 Processed 140000 of 764159
2017-12-31 04:01:42.543901 Processed 150000 of 764159
2017-12-31 04:02:33.202726 Processed 160000 of 764159
2017-12-31 04:03:56.811284 Processed 170000 of 764159
2017-12-31 04:05:02.101482 Processed 18000

## Check Results

In [11]:
# Number of entries in publisher_nodes (imported from parse_activity).
len(publisher_nodes)

378

In [12]:
# Number of entries in organization_nodes (imported from parse_activity).
len(organization_nodes)

5796

In [13]:
# Number of entries in activity_nodes (imported from parse_activity).
# In the recorded run this is lower than the activities collection count printed earlier.
len(activity_nodes)

725963

In [14]:
# Number of entries in activity_relationships (imported from parse_activity).
# In the recorded run this equals len(activity_nodes).
len(activity_relationships)

725963

## Confirm the Keys in Our Nodes

In [15]:
# Union of the attribute names used by all activity nodes.
node_keys = set()

# Identifiers of entries that do not expose .keys(). They are recorded and
# reported instead of silently ending the scan, which would leave node_keys
# incomplete without any indication.
malformed_node_ids = []

for key, node in activity_nodes.items():
    try:
        node_keys.update(node.keys())
    except AttributeError:
        # Only the "not a mapping" case is tolerated; anything else
        # (including KeyboardInterrupt) propagates normally.
        malformed_node_ids.append(key)

if malformed_node_ids:
    print('Skipped %d activity nodes without keys(); first: %r'
          % (len(malformed_node_ids), malformed_node_ids[0]))

In [16]:
# Display the set of keys gathered from activity_nodes in the previous cell.
node_keys

{'description',
 'description_raw',
 'iati-identifier',
 'location',
 'policy-marker',
 'recipient-country',
 'sector'}

## Store as Pickled Files

In [17]:
# Serialize publisher_nodes to a binary pickle file in the working directory.
with open('graph_publisher_nodes.pickle', 'wb') as pickle_file:
    pickle.dump(publisher_nodes, pickle_file)

In [18]:
# Serialize organization_nodes to a binary pickle file in the working directory.
with open('graph_organization_nodes.pickle', 'wb') as pickle_file:
    pickle.dump(organization_nodes, pickle_file)

In [19]:
# Serialize activity_nodes to a binary pickle file in the working directory.
with open('graph_activity_nodes.pickle', 'wb') as pickle_file:
    pickle.dump(activity_nodes, pickle_file)

In [20]:
# Serialize activity_relationships to a binary pickle file in the working directory.
with open('graph_activity_relationships.pickle', 'wb') as pickle_file:
    pickle.dump(activity_relationships, pickle_file)

# Backup Files

Because commands starting with `!` run in a forked process, the commands below may fail if the kernel is already holding a lot of memory. To work around that, you may want to restart the kernel to free up memory and then run only the cells from this point on.

In [21]:
# Remove any archive left over from a previous run so the gzip step below
# does not stop on an already-existing graph_pickle.tar.gz.
!rm -f graph_pickle.tar.gz
# Bundle every graph_*.pickle file in the working directory into one uncompressed tarball.
!tar -cf graph_pickle.tar graph_*.pickle
# Compress the tarball in place: graph_pickle.tar is replaced by graph_pickle.tar.gz.
!gzip graph_pickle.tar

In [22]:
# Same value as the assignment near the top of the notebook; presumably repeated here
# so the backup cells can be run on their own after a kernel restart.
s3_bucket = 'mdang.w210'

In [23]:
# Upload the archive to the root of the bucket named by s3_bucket.
# NOTE(review): --acl public-read makes the uploaded object readable by anyone — confirm this is intended.
!aws s3 cp graph_pickle.tar.gz s3://{s3_bucket}/ --acl public-read

upload: ./graph_pickle.tar.gz to s3://mdang.w210/graph_pickle.tar.gz
