In [27]:
import json
from collections import Counter, defaultdict
from lxml import etree
import pprint

# Read the codelist definitions that drive the coverage analysis. The file is
# closed again as soon as the JSON has been parsed.
with open('all_codelists.json') as codelists_file:
    all_codelists = json.load(codelists_file)


In [28]:
# Parse by file name rather than through a text-mode file handle: the parser
# then reads the raw bytes itself and can honour the encoding declared in the
# XML, instead of receiving text that (on Python 3) open() has already decoded
# with the platform's default encoding.
# Note: despite its name, `root` holds the object returned by etree.parse(),
# i.e. the parsed tree, which is what the XPath queries run against.
root = etree.parse('combined.xml')

# Every <iati-activity> element found anywhere in the document.
iati_activities = root.xpath('//iati-activity')

In [35]:
def _codelist_display_names(codelist_values):
    """Map every codelist code to the label used for it in the output.

    Entries with a truthy ``name`` are labelled ``"<name> (<code>)"``; entries
    without one are labelled with the bare code.
    """
    display_names = {}
    for entry in codelist_values:
        code = entry['code']
        name = entry.get('name')
        display_names[code] = "{} ({})".format(name, code) if name else code
    return display_names


def _enclosing_activity(xpath_value):
    """Return the nearest ``iati-activity`` element at or above the node that
    carries ``xpath_value``, or ``None`` when there is no such ancestor.

    Assumes the XPath result exposes ``getparent()`` (as lxml's string results
    for attribute/text selections do) -- TODO confirm for every codelist path.
    """
    element = xpath_value.getparent()
    while element is not None and element.tag != 'iati-activity':
        element = element.getparent()
    return element


def _count_unique_activities(xpath_values, display_names):
    """Count, per value, the number of distinct activities it occurs in.

    Returns a ``(known, unknown)`` pair of dicts. ``known`` is keyed by the
    display label of values present in ``display_names``; ``unknown`` is keyed
    by the raw value for everything else. Values that are not located inside
    an ``iati-activity`` element are ignored.
    """
    known_activities = defaultdict(set)
    unknown_activities = defaultdict(set)
    for xpath_value in xpath_values:
        activity = _enclosing_activity(xpath_value)
        if activity is None:
            continue
        if xpath_value in display_names:
            known_activities[display_names[xpath_value]].add(activity)
        else:
            unknown_activities[xpath_value].add(activity)
    return (
        {label: len(activities) for label, activities in known_activities.items()},
        {raw: len(activities) for raw, activities in unknown_activities.items()},
    )


# Coverage report, keyed by the XPath of each codelist-controlled value.
# NOTE(review): because the key is the path alone, a later codelist that uses
# the same path replaces the earlier entry -- confirm paths are unique.
results = {}

for codelist in all_codelists:
    path = codelist['path']
    # Organisation-file paths are skipped; only activity data is analysed here.
    if path.startswith('//iati-organisation'):
        continue

    coverage = results[path] = {'codelist': codelist['codelist']}
    codelist_name_map = _codelist_display_names(codelist['data']['data'])

    all_values = root.xpath(path)
    counter = Counter(all_values)

    # Occurrence totals, split into values that are on the codelist (reported
    # under their display label) and values that are not (reported as found).
    coverage['results_total'] = {
        codelist_name_map[found]: occurrences
        for found, occurrences in counter.items()
        if found in codelist_name_map
    }
    coverage['results_unknown_total'] = {
        found: occurrences
        for found, occurrences in counter.items()
        if found not in codelist_name_map
    }

    # Codes defined by the codelist that never occur in the data. The count is
    # taken from this same list so the two reported figures always agree.
    unused_codes = sorted(set(codelist_name_map) - set(counter))
    coverage['total_codelist_values'] = len(codelist_name_map)
    coverage['total_codelist_values_unused'] = len(unused_codes)
    coverage['codelist_values_unused'] = [codelist_name_map[code] for code in unused_codes]

    # Same split as above, but counting distinct activities per value rather
    # than raw occurrences.
    known_per_activity, unknown_per_activity = _count_unique_activities(
        all_values, codelist_name_map
    )
    coverage['results_unique_activity'] = known_per_activity
    coverage['results_unknown_unique_activity'] = unknown_per_activity



In [38]:
# Persist the coverage report as indented JSON; any existing
# codelist_coverage.json is truncated and overwritten.
with open('codelist_coverage.json', 'w+') as coverage_file:
    json.dump(results, coverage_file, indent=2)