# Get development indicators from World Bank Databank

10 October 2019

API documentation can be accessed here: 

- https://datahelpdesk.worldbank.org/knowledgebase/articles/889392-about-the-indicators-api-documentation
- https://datahelpdesk.worldbank.org/knowledgebase/articles/898590-country-api-queries
- https://datahelpdesk.worldbank.org/knowledgebase/articles/898599-indicator-api-queries


- To request all countries: `http://api.worldbank.org/v2/country?format=json`
- general format for country call: `http://api.worldbank.org/v2/country/<ISO_CODE>?format=json`
- To request all indicators: `http://api.worldbank.org/v2/indicator`
- general format for indicator call: `http://api.worldbank.org/v2/indicators/<INDICATOR_CODE>?format=json`


In [1]:
import requests
import pandas as pd

In [2]:
def page_through(baseurl):
    results = []
    init_call = requests.get(baseurl)
    header = init_call.json()[0]
    init_response = init_call.json()[1]
    pagecount = 1
    
    pages = header['pages']
    total_obs = header['total']
    
    results.extend(init_response)
    
    while pagecount <= pages:
        next_page = pagecount + 1
        next_url = baseurl + '&page=' + str(next_page)
        
        next_call = requests.get(next_url)
        next_response = next_call.json()[1]
        
        results.extend(next_response)
        pagecount += 1
    
    if len(results) == total_obs:
        return results
    else:
        print("Something went wrong")

## Get all WB countries

https://datahelpdesk.worldbank.org/knowledgebase/articles/898590-country-api-queries

In [3]:
allcountries_url = "http://api.worldbank.org/v2/country?format=json"
allcountries = page_through(allcountries_url)
allcountries[0]

{'id': 'ABW',
 'iso2Code': 'AW',
 'name': 'Aruba',
 'region': {'id': 'LCN',
  'iso2code': 'ZJ',
  'value': 'Latin America & Caribbean '},
 'adminregion': {'id': '', 'iso2code': '', 'value': ''},
 'incomeLevel': {'id': 'HIC', 'iso2code': 'XD', 'value': 'High income'},
 'lendingType': {'id': 'LNX', 'iso2code': 'XX', 'value': 'Not classified'},
 'capitalCity': 'Oranjestad',
 'longitude': '-70.0167',
 'latitude': '12.5167'}

In [5]:
allcountries_todf = []

for c in allcountries:
    country = {'id': c['id'], 'iso2code': c['iso2Code'], 'name': c['name'], 
               'region_id': c['region']['id'], 'region_iso2code': c['region']['iso2code'], 'region_name': c['region']['value'], 
               'adminregion_id': c['adminregion']['id'], 'adminregion_iso2code': c['adminregion']['iso2code'], 'adminregion_name': c['adminregion']['value'], 
               'incomelevel_id': c['incomeLevel']['id'], 'incomelevel_iso2code': c['incomeLevel']['iso2code'], 'incomelevel_value': c['incomeLevel']['value'],
               'lendingtype_id': c['lendingType']['id'], 'lendingtype_iso2code': c['lendingType']['iso2code'], 'lendingtype_value': c['lendingType']['value'],
               'capitalcity': c['capitalCity'], 'longitude': c['longitude'], 'latitude': c['latitude']}
    allcountries_todf.append(country)

allcountries_df = pd.DataFrame(allcountries_todf)
allcountries_df.head()

Unnamed: 0,adminregion_id,adminregion_iso2code,adminregion_name,capitalcity,id,incomelevel_id,incomelevel_iso2code,incomelevel_value,iso2code,latitude,lendingtype_id,lendingtype_iso2code,lendingtype_value,longitude,name,region_id,region_iso2code,region_name
0,,,,Oranjestad,ABW,HIC,XD,High income,AW,12.5167,LNX,XX,Not classified,-70.0167,Aruba,LCN,ZJ,Latin America & Caribbean
1,SAS,8S,South Asia,Kabul,AFG,LIC,XM,Low income,AF,34.5228,IDX,XI,IDA,69.1761,Afghanistan,SAS,8S,South Asia
2,,,,,AFR,,,Aggregates,A9,,,,Aggregates,,Africa,,,Aggregates
3,SSA,ZF,Sub-Saharan Africa (excluding high income),Luanda,AGO,LMC,XN,Lower middle income,AO,-8.81155,IBD,XF,IBRD,13.242,Angola,SSF,ZG,Sub-Saharan Africa
4,ECA,7E,Europe & Central Asia (excluding high income),Tirane,ALB,UMC,XT,Upper middle income,AL,41.3317,IBD,XF,IBRD,19.8172,Albania,ECS,Z7,Europe & Central Asia


In [6]:
allcountries_df.columns

Index(['adminregion_id', 'adminregion_iso2code', 'adminregion_name',
       'capitalcity', 'id', 'incomelevel_id', 'incomelevel_iso2code',
       'incomelevel_value', 'iso2code', 'latitude', 'lendingtype_id',
       'lendingtype_iso2code', 'lendingtype_value', 'longitude', 'name',
       'region_id', 'region_iso2code', 'region_name'],
      dtype='object')

In [13]:
countries_df = allcountries_df[['id', 'iso2code', 'name', 'capitalcity', 'latitude', 'longitude', 
                                   'region_name', 'adminregion_name', 'incomelevel_value', 'lendingtype_value']]

region_df = allcountries_df[['region_id', 'region_iso2code', 'region_name']].drop_duplicates()
adminregion_df = allcountries_df[['adminregion_id', 'adminregion_iso2code', 'adminregion_name']].drop_duplicates()
incomelevel_df = allcountries_df[['incomelevel_id', 'incomelevel_iso2code', 'incomelevel_value']].drop_duplicates()
lendingtype_df = allcountries_df[['lendingtype_id', 'lendingtype_iso2code', 'lendingtype_value']].drop_duplicates()

In [16]:
countries_df.to_csv('../Data/WorldBank/countries_list.csv', index=False)
region_df.to_csv('../Data/WorldBank/region_codes.csv', index=False)
adminregion_df.to_csv('../Data/WorldBank/adminregion_codes.csv', index=False)
incomelevel_df.to_csv('../Data/WorldBank/incomelevel_codes.csv', index=False)
lendingtype_df.to_csv('../Data/WorldBank/lendinglevel_codes.csv', index=False)

## Get all WB indicators

https://datahelpdesk.worldbank.org/knowledgebase/articles/898599-indicator-api-queries

In [17]:
allindicators_url = "http://api.worldbank.org/v2/indicator?format=json"
allindicators = page_through(allindicators_url)
allindicators[0]

{'id': '1.0.HCount.1.90usd',
 'name': 'Poverty Headcount ($1.90 a day)',
 'unit': '',
 'source': {'id': '37', 'value': 'LAC Equity Lab'},
 'sourceNote': 'The poverty headcount index measures the proportion of the population with daily per capita income (in 2011 PPP) below the poverty line.',
 'sourceOrganization': 'LAC Equity Lab tabulations of SEDLAC (CEDLAS and the World Bank).',
 'topics': [{'id': '11', 'value': 'Poverty '}]}

In [18]:
len(allindicators)

17326

In [88]:
allindicators_todf = []
indicator_topics = []
source_list = []
topic_list = []

for i in allindicators:
    source = i['source']
    source_list.append(source)
    
    topics = i['topics']
    if len(topics):
        topic_list.extend(topics)
    
    indicator = {'id': i['id'], 'name': i['name'], 'source': source['value'], 
                 'source_note': i['sourceNote'], 'source_organization': i['sourceOrganization']}
    
    
    if len(topics):
        for t in topics:
            try:
                indicator_topic = {'indicator_id': i['id'], 'topic_id': t['id'], 'topic_value': t['value']}
            except KeyError:
                indicator_topic = {'indicator_id': i['id'], 'topic_id': '', 'topic_value': ''}
            
            indicator_topics.append(indicator_topic)
    
    allindicators_todf.append(indicator)

allindicators_df = pd.DataFrame(allindicators_todf).drop_duplicates()
allindicators_df.head()

Unnamed: 0,id,name,source,source_note,source_organization
0,1.0.HCount.1.90usd,Poverty Headcount ($1.90 a day),LAC Equity Lab,The poverty headcount index measures the propo...,LAC Equity Lab tabulations of SEDLAC (CEDLAS a...
1,1.0.HCount.2.5usd,Poverty Headcount ($2.50 a day),LAC Equity Lab,The poverty headcount index measures the propo...,LAC Equity Lab tabulations of SEDLAC (CEDLAS a...
2,1.0.HCount.Mid10to50,Middle Class ($10-50 a day) Headcount,LAC Equity Lab,The poverty headcount index measures the propo...,LAC Equity Lab tabulations of SEDLAC (CEDLAS a...
3,1.0.HCount.Ofcl,Official Moderate Poverty Rate-National,LAC Equity Lab,The poverty headcount index measures the propo...,LAC Equity Lab tabulations of data from Nation...
4,1.0.HCount.Poor4uds,Poverty Headcount ($4 a day),LAC Equity Lab,The poverty headcount index measures the propo...,LAC Equity Lab tabulations of SEDLAC (CEDLAS a...


In [89]:
source_list_df = pd.DataFrame(source_list).drop_duplicates()
source_list_df['id'] = source_list_df['id'].astype(int)
source_list_df = source_list_df.sort_values(by=['id']).set_index('id')

In [90]:
topic_list_df = pd.DataFrame(topic_list).drop_duplicates().dropna()
topic_list_df['id'] = topic_list_df['id'].astype(int)
topic_list_df = topic_list_df.sort_values(by=['id']).set_index('id')

In [91]:
indicator_topics_df = pd.DataFrame(indicator_topics)

In [92]:
wdi = allindicators_df[allindicators_df['source'] == 'World Development Indicators']

In [93]:
wdi_wtopics = wdi.merge(indicator_topics_df, left_on='id', right_on='indicator_id')
wdi_wtopics = wdi_wtopics.drop(columns=['indicator_id', 'topic_id'])

In [94]:
wdi_wtopics['topic_value'].value_counts()

Health                               252
Economy & Growth                     247
Private Sector                       165
Gender                               163
Social Protection & Labor            156
Education                            154
Trade                                146
Environment                          135
Public Sector                         97
Climate Change                        76
Aid Effectiveness                     70
Financial Sector                      64
External Debt                         61
Energy & Mining                       50
Infrastructure                        47
Agriculture & Rural Development       44
Social Development                    34
Poverty                               25
Urban Development                     21
Science & Technology                  13
Millenium development goals            5
                                       2
Name: topic_value, dtype: int64

In [96]:
allindicators_df.to_csv('../Data/WorldBank/indicators_list.csv', index=False)
source_list_df.to_csv('../Data/WorldBank/indicator-sources_codes.csv', index=False)
topic_list_df.to_csv('../Data/WorldBank/indicator-topics_codes.csv', index=False)
wdi_wtopics.to_csv('../Data/WorldBank/indicators-wdi-wtopics.csv', index=False)

## Get indicator time-series data for relevant countries

https://datahelpdesk.worldbank.org/knowledgebase/articles/898581-api-basic-call-structures

### Make list of WDI indicator ids

In [137]:
wdi_list = list(wdi['id'])

### Make list of relevant countries

Note: I did confirm that the worldbank country ID is the same as the ISO alpha3 code

In [136]:
country_lists_df = pd.read_csv("../Data/CShapes/country_shapes.csv", usecols=['iso_alpha2', 'iso_alpha3', 'iso_name']).drop_duplicates().dropna()
relevant_countries = list(country_lists_df['iso_alpha3'])

### Request indicator data per country

The API appears to be having problems, and will randomly return only an error heading. Need to modify the page_through function to account for this first.

In [185]:
def page_through2(baseurl):
    # initiate page count and dummy number of pages
    pagecount = 1
    pages = 1
    # to store results from request
    results = []
    # set initial url for first page
    url = baseurl
    try:
        while pagecount <= pages:
            # attempt first call
            init_call = requests.get(url)
            # compensate for error in call
            if init_call.status_code != 200:
                continue
            elif len(init_call.json()) < 2:
                continue
            else:
                # get results if valid call
                header = init_call.json()[0]
                init_response = init_call.json()[1]
                # track pages and number of observations so knows when to stop
                pages = header['pages']
                total_obs = header['total']
                # add data to results (50 per page)
                results.extend(init_response)

                # increment page number to get next page
                pagecount += 1
                url = baseurl + '&page=' + str(pagecount)
    except requests.exceptions:
        return baseurl

    if len(results) == total_obs:
        return results
    else:
        print("Something went wrong")

In [190]:
time_series_results = []

bufferurls = []
for c in relevant_countries:
    for i in wdi_list:
        indicatorsbycountry_baseurl = "http://api.worldbank.org/v2/country/" + c + "/indicator/" + i + "?format=json"

        raw = page_through2(indicatorsbycountry_baseurl)
        if isinstance(raw, list):
            for r in raw:
                row = {'country': c, 'indicator': i, 'year': r['date'], 'value': r['value'], 
                       'unit': r['unit'], 'obs_status': r['obs_status'], 'decimal': r['decimal']}
                time_series_results.append(row)
        else:
            bufferurls.append(indicatorsbycountry_baseurl)

bufferurls2 = []          
for url in bufferurls:
    raw = page_through2(url)
    if isinstance(raw, list):
        for r in raw:
            row = {'country': c, 'indicator': i, 'year': r['date'], 'value': r['value'], 
                    'unit': r['unit'], 'obs_status': r['obs_status'], 'decimal': r['decimal']}
            time_series_results.append(row)
    else:
        bufferurls2.append(url)

ConnectionError: HTTPConnectionPool(host='api.worldbank.org', port=80): Max retries exceeded with url: /v2/country/GUY/indicator/IC.CRD.INFO.XQ?format=json&page=2 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x11f441860>: Failed to establish a new connection: [Errno 60] Operation timed out',))

In [147]:
time_series_df = pd.DataFrame(time_series_results)
time_series_df

Unnamed: 0,country,decimal,indicator,obs_status,unit,value,year
0,GUY,0,AG.AGR.TRAC.NO,,,,2018
1,GUY,0,AG.AGR.TRAC.NO,,,,2017
2,GUY,0,AG.AGR.TRAC.NO,,,,2016
3,GUY,0,AG.AGR.TRAC.NO,,,,2015
4,GUY,0,AG.AGR.TRAC.NO,,,,2014
5,GUY,0,AG.AGR.TRAC.NO,,,,2013
6,GUY,0,AG.AGR.TRAC.NO,,,,2012
7,GUY,0,AG.AGR.TRAC.NO,,,,2011
8,GUY,0,AG.AGR.TRAC.NO,,,,2010
9,GUY,0,AG.AGR.TRAC.NO,,,,2009
