In [1]:
%matplotlib inline

In [2]:
from __future__ import print_function
import pickle
import re
import requests
from sklearn.feature_extraction.text import CountVectorizer

# Load Codelists

In [3]:
# Download the Sector codelist JSON and index its entries by their 'code'.
sector_url = 'http://iatistandard.org/202/codelists/downloads/clv3/json/en/Sector.json'
# Without a timeout requests waits indefinitely on an unresponsive server;
# raise_for_status() turns an HTTP error into an explicit exception instead of
# a confusing JSON-decoding or KeyError failure on the next line.
sector_response = requests.get(sector_url, timeout=30)
sector_response.raise_for_status()
sectors_json = sector_response.json()['data']
sectors = {
    item['code']: item for item in sectors_json
}

In [4]:
sectors

{'11110': {'category': '111',
  'code': '11110',
  'description': 'Education sector policy, planning and programmes; aid to education ministries, administration and management systems; institution capacity building and advice; school management and governance; curriculum and materials development; unspecified education activities.',
  'name': 'Education policy and administrative management'},
 '11120': {'category': '111',
  'code': '11120',
  'description': 'Educational buildings, equipment, materials; subsidiary services to education (boarding facilities, staff housing); language training; colloquia, seminars, lectures, etc.',
  'name': 'Education facilities and training'},
 '11130': {'category': '111',
  'code': '11130',
  'description': 'Teacher education (where the level of education is unspecified); in-service and pre-service training; materials development.',
  'name': 'Teacher training'},
 '11182': {'category': '111',
  'code': '11182',
  'description': 'Research and studies on 

In [5]:
# Download the SectorCategory codelist JSON and index its entries by 'code'.
sector_categories_url = 'http://iatistandard.org/202/codelists/downloads/clv3/json/en/SectorCategory.json'
# Without a timeout requests waits indefinitely on an unresponsive server;
# raise_for_status() turns an HTTP error into an explicit exception instead of
# a confusing JSON-decoding or KeyError failure on the next line.
sector_categories_response = requests.get(sector_categories_url, timeout=30)
sector_categories_response.raise_for_status()
sector_categories_json = sector_categories_response.json()['data']
sector_categories = {
    item['code']: item for item in sector_categories_json
}

In [6]:
sector_categories

{'111': {'code': '111',
  'description': 'Education sector policy, planning and programmes; aid to education ministries, administration and management systems; institution capacity building and advice; school management and governance; curriculum and materials development; unspecified education activities.',
  'name': 'Education, level unspecified'},
 '112': {'code': '112',
  'description': 'Formal and non-formal primary education for children; all elementary and first cycle systematic instruction; provision of learning materials.',
  'name': 'Basic education'},
 '113': {'code': '113',
  'description': 'Second cycle systematic instruction at both junior and senior levels.',
  'name': 'Secondary education'},
 '114': {'code': '114',
  'description': 'Degree and diploma programmes at universities, colleges and polytechnics; scholarships.',
  'name': 'Post-secondary education'},
 '121': {'code': '121',
  'description': 'Health sector policy, planning and programmes; aid to health ministrie

In [7]:
# Download the PolicyMarker codelist JSON and index its entries by 'code'.
policy_marker_url = 'http://iatistandard.org/202/codelists/downloads/clv3/json/en/PolicyMarker.json'
# Without a timeout requests waits indefinitely on an unresponsive server;
# raise_for_status() turns an HTTP error into an explicit exception instead of
# a confusing JSON-decoding or KeyError failure on the next line.
policy_markers_response = requests.get(policy_marker_url, timeout=30)
policy_markers_response.raise_for_status()
policy_markers_json = policy_markers_response.json()['data']
policy_markers = {
    item['code']: item for item in policy_markers_json
}

In [8]:
policy_markers

{'1': {'code': '1', 'name': 'Gender Equality'},
 '2': {'code': '2', 'name': 'Aid to Environment'},
 '3': {'code': '3', 'name': 'Participatory Development/Good Governance'},
 '4': {'code': '4', 'name': 'Trade Development'},
 '5': {'code': '5',
  'name': 'Aid Targeting the Objectives of the Convention on Biological Diversity'},
 '6': {'code': '6',
  'name': 'Aid Targeting the Objectives of the Framework Convention on Climate Change - Mitigation'},
 '7': {'code': '7',
  'name': 'Aid Targeting the Objectives of the Framework Convention on Climate Change - Adaptation'},
 '8': {'code': '8',
  'name': 'Aid Targeting the Objectives of the Convention to Combat Desertification'},
 '9': {'code': '9',
  'name': 'Reproductive, Maternal, Newborn and Child Health (RMNCH)'}}

# Create Term Vectors

For each name and description, we tokenize the text and store the distinct words it uses as a set, keeping those sets in a dictionary keyed by attribute. This will let us compute the cosine similarity of search terms against the terms that are actually used in sector codes and policy markers.

In [9]:
def add_attribute(item, item_attribute, search_item, search_attribute):
    """Copy one text attribute of ``item`` into ``search_item`` in searchable form.

    Two keys are written to ``search_item`` (which is modified in place):

    * ``search_attribute + '_raw'`` -- the text, lower-cased and stripped of
      surrounding whitespace.
    * ``search_attribute`` -- the set of distinct words in that text, where a
      word is a maximal run of the letters ``a``-``z``.

    If ``item_attribute`` is absent from ``item`` or its value is ``None``,
    the raw text is stored as ``''`` and the word set is empty.

    Args:
        item: mapping holding the source value (e.g. one codelist entry).
        item_attribute: key to read from ``item``.
        search_item: dict that receives the two output keys.
        search_attribute: base name for the output keys.

    Returns:
        None.
    """
    raw_key = search_attribute + '_raw'
    text = item[item_attribute] if item_attribute in item else None

    # A missing key and an explicit None are handled the same way.
    if text is None:
        search_item[raw_key] = ''
        search_item[search_attribute] = set()
        return

    text = text.lower().strip()
    search_item[raw_key] = text

    # Splitting on runs of non-letters leaves empty strings at the edges (and
    # for empty text); filtering them out yields an empty set in that case.
    search_item[search_attribute] = {
        word for word in re.split('[^a-z]+', text) if word
    }

In [10]:
# Build the searchable sector index: one entry per sector code, holding the raw
# and tokenized name/description of the sector and of its parent category.
searchable_sectors = {}

for code, sector in sectors.items():
    category_id = sector['category']

    sector_data = {}

    add_attribute(sector, 'name', sector_data, 'sector_name')
    add_attribute(sector, 'description', sector_data, 'sector_description')

    category = sector_categories.get(category_id)
    if category is None:
        print('Missing category:', category_id)
        # Fall back to an empty category so add_attribute still writes the
        # category_* keys (empty string / empty set). Every entry then has the
        # same schema and consumers need no special case for missing keys.
        category = {}

    add_attribute(category, 'name', sector_data, 'category_name')
    add_attribute(category, 'description', sector_data, 'category_description')

    searchable_sectors[code] = sector_data

In [11]:
searchable_sectors

{'11110': {'category_description': {'activities',
   'administration',
   'advice',
   'aid',
   'and',
   'building',
   'capacity',
   'curriculum',
   'development',
   'education',
   'governance',
   'institution',
   'management',
   'materials',
   'ministries',
   'planning',
   'policy',
   'programmes',
   'school',
   'sector',
   'systems',
   'to',
   'unspecified'},
  'category_description_raw': 'education sector policy, planning and programmes; aid to education ministries, administration and management systems; institution capacity building and advice; school management and governance; curriculum and materials development; unspecified education activities.',
  'category_name': {'education', 'level', 'unspecified'},
  'category_name_raw': 'education, level unspecified',
  'sector_description': {'activities',
   'administration',
   'advice',
   'aid',
   'and',
   'building',
   'capacity',
   'curriculum',
   'development',
   'education',
   'governance',
   'institutio

In [12]:
# Build the searchable policy-marker index: for every marker code, store the
# raw and tokenized form of the marker's name.
searchable_policy_markers = {}

for marker_code in policy_markers:
    marker_terms = {}
    add_attribute(policy_markers[marker_code], 'name', marker_terms, 'name')
    searchable_policy_markers[marker_code] = marker_terms

In [13]:
searchable_policy_markers

{'1': {'name': {'equality', 'gender'}, 'name_raw': 'gender equality'},
 '2': {'name': {'aid', 'environment', 'to'}, 'name_raw': 'aid to environment'},
 '3': {'name': {'development', 'good', 'governance', 'participatory'},
  'name_raw': 'participatory development/good governance'},
 '4': {'name': {'development', 'trade'}, 'name_raw': 'trade development'},
 '5': {'name': {'aid',
   'biological',
   'convention',
   'diversity',
   'objectives',
   'of',
   'on',
   'targeting',
   'the'},
  'name_raw': 'aid targeting the objectives of the convention on biological diversity'},
 '6': {'name': {'aid',
   'change',
   'climate',
   'convention',
   'framework',
   'mitigation',
   'objectives',
   'of',
   'on',
   'targeting',
   'the'},
  'name_raw': 'aid targeting the objectives of the framework convention on climate change - mitigation'},
 '7': {'name': {'adaptation',
   'aid',
   'change',
   'climate',
   'convention',
   'framework',
   'objectives',
   'of',
   'on',
   'targeting',


# Save the Data

In [14]:
# Serialize the searchable sector index to a pickle file in the working directory.
with open('codelist_sectors.pickle', 'wb') as sectors_file:
    pickle.dump(searchable_sectors, sectors_file)

In [15]:
# Serialize the searchable policy-marker index to a pickle file in the working directory.
with open('codelist_policy_markers.pickle', 'wb') as policy_markers_file:
    pickle.dump(searchable_policy_markers, policy_markers_file)

In [16]:
!tar -cf codelist.tar codelist_*
!gzip codelist.tar
!aws s3 cp codelist.tar.gz s3://mdang.w210/ --acl public-read

upload: ./codelist.tar.gz to s3://mdang.w210/codelist.tar.gz
