# Import Packages

In [1]:
%matplotlib inline

In [2]:
from __future__ import print_function
from datetime import datetime
import io
import json
import os
import pickle

# Reload the Metadata

If you are running this notebook from a Docker container, run `aws configure` manually first to set up your credentials; otherwise the AWS CLI will not be able to upload to or download from the S3 bucket.

In [3]:
# Name of the S3 bucket that the cached data archives and metadata files are downloaded from
s3_bucket = 'mdang.w210'

In [4]:
# If we don't have any of the cached data files, download them from S3

if not os.path.isdir('iati'):
    !aws s3 cp s3://{s3_bucket}/iati.tar.gz .
    !tar -zxf iati.tar.gz

if not os.path.isfile('activities_metadata.json'):
    !aws s3 cp s3://{s3_bucket}/activities_metadata.json.gz .
    !gunzip activities_metadata.json.gz

if not os.path.isdir('iati-orgs'):
    !aws s3 cp s3://{s3_bucket}/iati-orgs.tar.gz .
    !tar -zxf iati-orgs.tar.gz

if not os.path.isfile('organization_metadata.json'):
    !aws s3 cp s3://{s3_bucket}/organization_metadata.json.gz .
    !gunzip organization_metadata.json.gz

## Activities

In [5]:
from mongodb_iati import activities, activities_metadata, transactions, activity_failures, activity_xml_failures, import_activity_document

# Reset the failure collections so that re-running this cell does not carry
# over failures recorded by an earlier run.
activity_failures.clear()
activity_xml_failures.clear()

# NOTE(review): check_errors looks failures up by folder + '/' + name + '.xml',
# so the literal '/' separator is kept here instead of os.path.join.
file_names = [
    '%s/%s' % ('iati', file_name)
        for file_name in os.listdir('iati')
            if file_name.endswith('.xml')
]

total_files = len(file_names)

for num_finished, file_name in enumerate(file_names):
    # Progress line every 100 files; num_finished is the number of files
    # completed *before* the current one.
    if num_finished % 100 == 0:
        failure_count = len(activity_failures) + len(activity_xml_failures)
        print('%s Processed %d of %d (%d errors)' % (datetime.now(), num_finished, total_files, failure_count))

    with open(file_name) as f:
        import_activity_document(f)

# Final summary: report the real number of processed files (the loop index is
# one less than the count, and is undefined when there are no files at all)
# together with the final error total.
failure_count = len(activity_failures) + len(activity_xml_failures)
print('%s Processed %d of %d (%d errors)' % (datetime.now(), total_files, total_files, failure_count))

2017-12-30 09:23:42.207777 Processed 0 of 4587 (0 errors)
2017-12-30 09:24:00.083647 Processed 100 of 4587 (1 errors)
2017-12-30 09:24:13.791458 Processed 200 of 4587 (1 errors)
2017-12-30 09:24:40.926808 Processed 300 of 4587 (1 errors)
2017-12-30 09:24:58.281917 Processed 400 of 4587 (1 errors)
2017-12-30 09:25:21.872715 Processed 500 of 4587 (1 errors)
2017-12-30 09:25:45.367334 Processed 600 of 4587 (1 errors)
2017-12-30 09:26:00.551300 Processed 700 of 4587 (1 errors)
2017-12-30 09:26:14.954294 Processed 800 of 4587 (1 errors)
2017-12-30 09:26:35.157452 Processed 900 of 4587 (1 errors)
2017-12-30 09:26:53.473205 Processed 1000 of 4587 (2 errors)
2017-12-30 09:27:07.228759 Processed 1100 of 4587 (2 errors)
2017-12-30 09:27:21.662504 Processed 1200 of 4587 (4 errors)
2017-12-30 09:27:40.077492 Processed 1300 of 4587 (6 errors)
2017-12-30 09:28:03.293521 Processed 1400 of 4587 (6 errors)
2017-12-30 09:28:14.643923 Processed 1500 of 4587 (6 errors)
2017-12-30 09:28:38.971616 Processed

In [6]:
# Print the number of documents held by each activity-related collection, one per line
for collection in (activities, transactions, activities_metadata):
    print(collection.count())

764159
3602802
4542


In [7]:
# Display one document from the activities collection as a sample of its structure
activities.find_one()

{'@default-currency': 'USD',
 '@generated-datetime': '2017-11-30T10:00:00Z',
 '@hierarchy': '1',
 '@last-updated-datetime': '2017-11-30T10:00:00Z',
 '@w210-key': 'unitedstates-bh',
 '@xml:lang': 'en',
 '_id': ObjectId('5a475b1e6f8487020fc0baae'),
 'activity-date': [{'@iso-date': '2013-09-30',
   '@type': '2',
   'narrative': None},
  {'@iso-date': '2016-09-30', '@type': '4', 'narrative': None}],
 'activity-scope': {'@code': '4'},
 'activity-status': {'@code': '2'},
 'collaboration-type': {'@code': '1'},
 'conditions': {'@attached': '0'},
 'contact-info': {'@type': '1',
  'email': None,
  'mailing-address': {'narrative': '950 Pennsylvania Avenue, NW Washington, DC 20530-0001'},
  'organisation': {'narrative': 'Department of Justice'},
  'person-name': {'narrative': None},
  'telephone': '800-877-8339',
  'website': 'https://www.justice.gov/'},
 'default-aid-type': {'@code': 'C01'},
 'default-finance-type': {'@code': '110'},
 'default-flow-type': {'@code': '10'},
 'default-tied-status': 

In [8]:
# Display one document from the activities_metadata collection as a sample of its structure
activities_metadata.find_one()

{'@w210-key': 'unitedstates-bh',
 '_id': ObjectId('5a475b1e6f8487020fc0baad'),
 'author': None,
 'author_email': 'ForeignAssistanceWeb@state.gov',
 'creator_user_id': 'bb22bdb2-6839-4392-9874-3a06de82f318',
 'extras': [{'key': 'activity_count', 'value': '9'},
  {'key': 'country', 'value': 'BH'},
  {'key': 'data_updated', 'value': '2017-11-30 10:00:00'},
  {'key': 'filetype', 'value': 'activity'},
  {'key': 'iati_version', 'value': '2.02'},
  {'key': 'language', 'value': 'en'},
  {'key': 'secondary_publisher', 'value': 'NULL'},
  {'key': 'publisher_source_type', 'value': 'primary_source'},
  {'key': 'publisher_organization_type', 'value': '10'},
  {'key': 'publisher_country', 'value': 'US'},
  {'key': 'publisher_iati_id', 'value': 'US-USAGOV'}],
 'groups': [],
 'iati-activities': {'@generated-datetime': '2017-11-30T10:00:00Z',
  '@version': '2.02',
  '@xmlns:usg': 'http://www.foreignassistance.gov/web/IATI/usg-extension'},
 'id': '05b5784e-558f-4c23-8aac-5b04e95ad3e4',
 'isopen': True,


In [9]:
# Display one document from the transactions collection as a sample of its structure
transactions.find_one()

{'@generated-datetime': '2017-11-30T10:00:00Z',
 '@humanitarian': '0',
 '@w210-activity': 'US-GOV-15-BH-37F111',
 '@w210-key': 'unitedstates-bh',
 '_id': ObjectId('5a475b1e6f8487020fc0bab7'),
 'description': {'narrative': 'Fiscal Year 2015 total disbursements in support of international training progra'},
 'sector': [{'@code': '43010',
   '@vocabulary': '1',
   'narrative': 'Multisector aid'},
  {'@code': '9000',
   '@vocabulary': '99',
   'narrative': 'Multi-sector - Unspecified'}],
 'transaction-date': {'@iso-date': '2015-09-30'},
 'transaction-type': {'@code': '3'},
 'usg:treasury-account': {'usg:fiscal-funding-year': {'@begin': '2012',
   '@end': '2013'},
  'usg:main-account': {'#text': 'Nonproliferation, Anti-Terrorism, Demining and Related Programs, International Security Assistance, State',
   '@code': '1075'},
  'usg:regular-account': {'@code': '11'}},
 'value': {'#text': '77575.00', '@value-date': '2015-09-30'}}

## Organizations

In [10]:
from mongodb_iati import organizations, organizations_metadata, import_organization_document, organization_failures, organization_xml_failures

# Reset the failure collections so that re-running this cell does not carry
# over failures recorded by an earlier run (mirrors the activities import).
organization_failures.clear()
organization_xml_failures.clear()

# NOTE(review): check_errors looks failures up by folder + '/' + name + '.xml',
# so the literal '/' separator is kept here instead of os.path.join.
file_names = [
    '%s/%s' % ('iati-orgs', file_name)
        for file_name in os.listdir('iati-orgs')
            if file_name.endswith('.xml')
]

total_files = len(file_names)

for num_finished, file_name in enumerate(file_names):
    # Progress line every 100 files; num_finished is the number of files
    # completed *before* the current one.
    if num_finished % 100 == 0:
        failure_count = len(organization_failures) + len(organization_xml_failures)
        # The error total was computed but never shown; report it the same
        # way the activities import does.
        print('%s Processed %d of %d (%d errors)' % (datetime.now(), num_finished, total_files, failure_count))

    with open(file_name) as f:
        import_organization_document(f)

# Final summary with the real number of processed files and the final error total
failure_count = len(organization_failures) + len(organization_xml_failures)
print('%s Processed %d of %d (%d errors)' % (datetime.now(), total_files, total_files, failure_count))

2017-12-30 09:38:37.332703 Processed 0 of 275
2017-12-30 09:38:38.470181 Processed 100 of 275
2017-12-30 09:38:38.759892 Processed 200 of 275


In [11]:
# Print the sizes of both organization collections and the number of XML failures, one per line
print(
    organizations.count(),
    organizations_metadata.count(),
    len(organization_xml_failures),
    sep='\n'
)

442
275
0


In [12]:
# Display one document from the organizations collection as a sample of its structure
organizations.find_one()

{'@default-currency': 'EUR',
 '@generated-datetime': '2017-05-01T08:17:05+00:00',
 '@last-updated-datetime': '2017-05-01T08:17:05+00:00',
 '@w210-key': 'bothends-org',
 '@xml:lang': 'en',
 '_id': ObjectId('5a475e9d6f8487020f036ede'),
 'name': {'narrative': {'#text': 'Both ENDS', '@xml:lang': 'en'}},
 'organisation-identifier': 'NL-KVK-41210098',
 'reporting-org': {'@ref': 'NL-KVK-41210098',
  '@type': '22',
  'narrative': {'#text': 'Both ENDS', '@xml:lang': 'en'}}}

In [13]:
# Display one document from the organizations_metadata collection as a sample of its structure
organizations_metadata.find_one()

{'@w210-key': 'bothends-org',
 '_id': ObjectId('5a475e9d6f8487020f036edd'),
 'author': None,
 'author_email': 'lm@bothends.org',
 'creator_user_id': '832a72df-3d28-4175-b6ae-133231394fd2',
 'extras': [{'key': 'data_updated', 'value': '2017-05-01 08:17:05'},
  {'key': 'filetype', 'value': 'organisation'},
  {'key': 'iati_version', 'value': '2.02'},
  {'key': 'publisher_source_type', 'value': 'primary_source'},
  {'key': 'publisher_organization_type', 'value': '22'},
  {'key': 'publisher_country', 'value': 'NL'},
  {'key': 'publisher_iati_id', 'value': 'NL-KVK-41210098'}],
 'groups': [],
 'iati-organisations': {'@generated-datetime': '2017-05-01T08:17:05+00:00',
  '@version': '2.02'},
 'id': '07f307a3-dad0-45ec-8421-ba0df69b54d6',
 'isopen': True,
 'license_id': 'cc-by-sa',
 'license_title': 'Creative Commons Attribution Share-Alike',
 'license_url': 'http://www.opendefinition.org/licenses/cc-by-sa',
 'maintainer': None,
 'maintainer_email': None,
 'metadata_created': '2017-05-01T08:17:0

In [14]:
# Save activity_xml_failures to disk in pickle format
with open('activity_xml_failures', 'wb') as pickle_file:
    pickle.dump(activity_xml_failures, pickle_file)

In [15]:
# Save activity_failures to disk in pickle format
with open('activity_failures', 'wb') as pickle_file:
    pickle.dump(activity_failures, pickle_file)

In [16]:
# Save organization_xml_failures to disk in pickle format
with open('organization_xml_failures', 'wb') as pickle_file:
    pickle.dump(organization_xml_failures, pickle_file)

In [17]:
# Save organization_failures to disk in pickle format.
# The file named 'organization_failures' must hold organization_failures itself,
# not a second copy of organization_xml_failures.
with open('organization_failures', 'wb') as f:
    pickle.dump(organization_failures, f)

# Investigating/Fixing Errors

In [18]:
def check_errors(folder, metadata, failures):
    """Print every recorded failure together with the URLs of its source data.

    Only metadata entries with a truthy 'isopen' value are considered. For
    each of them the expected local file name is built as
    ``folder + '/' + name + '.xml'``; when that file name is a key of
    ``failures``, the failure message and the entry's resource URLs are
    printed.

    Args:
        folder: Directory prefix used to build the file names (e.g. 'iati').
        metadata: Iterable of metadata dicts. Every considered entry must
            provide 'name' and 'resources' (a list of dicts with a 'url' key).
        failures: Mapping of file name -> failure description.

    Returns:
        None. The report is written to stdout.
    """
    metadata_resources = {}

    for metadata_item in metadata:
        # Skip entries that are missing the flag or are not marked as open
        if 'isopen' not in metadata_item or not metadata_item['isopen']:
            continue

        # A set removes duplicate URLs listed more than once for the same entry
        metadata_resources[metadata_item['name']] = {
            resource_item['url'] for resource_item in metadata_item['resources']
        }

    for short_name, resource_urls in metadata_resources.items():
        iati_filename = folder + '/' + short_name + '.xml'

        if iati_filename not in failures:
            continue

        print('Processing %s failed: %s' % (short_name, failures[iati_filename]))
        # Sort so the URL order is the same on every run (set order is not)
        print('\n'.join(sorted(resource_urls)))

## Activities

In [19]:
# Read the activity metadata JSON file (UTF-8) into memory
with io.open('activities_metadata.json', mode='r', encoding='utf8') as metadata_file:
    activity_metadata_json = json.load(metadata_file)

In [20]:
# Print how many activity XML failures were recorded, then report them
# against the activity metadata via check_errors
print(len(activity_xml_failures))
check_errors('iati', activity_metadata_json, activity_xml_failures)

19
Processing ec-near-ba failed: unclosed token: line 24, column 1090057
http://ec.europa.eu/europeaid/files/iati/XI-IATI-EC_NEAR_C_BA.xml
Processing ec-near-qsa failed: unclosed token: line 62, column 27812
http://ec.europa.eu/europeaid/files/iati/XI-IATI-EC_NEAR_C_QSA.xml
Processing ec-near-89 failed: unclosed token: line 62, column 27812
http://ec.europa.eu/europeaid/files/iati/XI-IATI-EC_NEAR_C_QSA.xml
Processing ec-near-rs failed: unclosed token: line 188, column 5342
http://ec.europa.eu/europeaid/files/iati/XI-IATI-EC_NEAR_C_RS.xml
Processing ec-devco-qna failed: unclosed token: line 2, column 3547600
http://ec.europa.eu/europeaid/files/iati/XI-IATI-EC_DEVCO_C_QNA.xml
Processing ec-devco-498 failed: unclosed token: line 2, column 3547600
http://ec.europa.eu/europeaid/files/iati/XI-IATI-EC_DEVCO_C_QNA.xml
Processing ec-devco-td failed: unclosed token: line 745, column 23563
http://ec.europa.eu/europeaid/files/iati/XI-IATI-EC_DEVCO_C_TD.xml
Processing ec-devco-cd failed: unclosed t

In [21]:
# Print how many activity failures were recorded, then report them
# against the activity metadata via check_errors
print(len(activity_failures))
check_errors('iati', activity_metadata_json, activity_failures)

0


## Organizations

In [22]:
# Read the organization metadata JSON file (UTF-8) into memory
with io.open('organization_metadata.json', mode='r', encoding='utf8') as metadata_file:
    organization_metadata_json = json.load(metadata_file)

In [23]:
# Print how many organization XML failures were recorded, then report them
# against the organization metadata via check_errors
print(len(organization_xml_failures))
check_errors('iati-orgs', organization_metadata_json, organization_xml_failures)

0


In [24]:
# Print how many organization failures were recorded, then report them
# against the organization metadata via check_errors
print(len(organization_failures))
check_errors('iati-orgs', organization_metadata_json, organization_failures)

0
