# Get Time Series data for WDI via SDMX API

Documentation: 

- https://datahelpdesk.worldbank.org/knowledgebase/articles/1886701-sdmx-api-queries
- https://sdmx.org/?page_id=2555/

Basic URL:

http://api.worldbank.org/v2/sdmx/rest/data/WDI/../?startperiod=&endPeriod=



In [1]:
import requests
import time
import pandas as pd

## get inputs

Use all WDI indicators, gathered from previous query to WB

Use ISO alpha 3 codes from CShapes file, which maps CoW, UCDP/PRIO, and ISO codes for countries. Exclude any codes that are not present in the WB, determined from previous query to WB.

In [2]:
wdi_df = pd.read_csv('../Data/WorldBank/Raw_API/indicators-wdi-wtopics.csv')
i_list = list(wdi_df['id'].unique())

countries_df = pd.read_csv("../Data/CShapes/Raw/country_shapes.csv", usecols=['iso_alpha2', 'iso_alpha3', 'iso_name']).drop_duplicates().dropna()
c_list = list(countries_df['iso_alpha3'])

wb_countries = pd.read_csv('../Data/WorldBank/Raw_API/countries_list.csv')
wb_c_list = list(wb_countries['id'].unique())

In [3]:
invalid_countries = []
for c in c_list:
    if c not in wb_c_list:
        invalid_countries.append(c)
print(invalid_countries)

['DDR', 'CSK', 'YUG', 'SUN', 'YMD', 'VDR', 'SCG']


In [4]:
c_list_mod = [x for x in c_list if x not in invalid_countries]
c_list_mod = list(set(c_list_mod))
len(c_list_mod)

193

In [5]:
len(i_list)

1387

## function to page through results

Needs to account for errors - if the call was unsuccessful, if the WB returns an error code, etc.
If successful, returns the data. If unsuccessful, returns the call (to investigate why).

In [6]:
def page_through(baseurl):
    # keep track in case of errors
    tries = 0
    # initiate page count and dummy number of pages
    pagecount = 1
    pages = 1
    # to store results from request
    results = []
    # set initial url for first page
    url = baseurl
    while pagecount <= pages:
        # in case something is fundamentally wrong with query
        if tries > 5:
            return data_call
            break
        else:
            # attempt data request
            data_call = requests.get(url)
            # compensate for error in call
            if data_call.status_code != 200:
                tries += 1
                time.sleep(5)
                continue
            elif len(data_call.json()) < 2:
                tries += 1
                time.sleep(5)
                continue
            else:
                # reset error tracking vars
                tries = 0
                # get results if valid call
                header = data_call.json()[0]
                response = data_call.json()[1]
                # track pages and number of observations so knows when to stop
                pages = header['pages']
                total_obs = header['total']
                # add data to results (10,000 per page)
                results.extend(response)

                # increment page number to get next page
                pagecount += 1
                url = baseurl + '&page=' + str(pagecount)
                time.sleep(5)

    if len(results) == total_obs:
        return results
    else:
        print("Something went wrong")

## function to create url for queries

Each URL will include all countries, and 20 indicators.

In [7]:
def cycle_through(countries, indicators):
    
    country_list = ";".join(countries)
    current_indicator = 0
    total_indicators = len(indicators)
    
    url_list = []
    
    while current_indicator <= total_indicators:
        
        if current_indicator < total_indicators-20:
            indicator_list = ";".join(indicators[current_indicator:current_indicator+20])
        else:
            indicator_list = ";".join(indicators[current_indicator:])
        
        url = "http://api.worldbank.org/v2/country/" + country_list + "/indicator/" + indicator_list + "?source=2" + "&format=json&per_page=10000"
        url_list.append(url)
        current_indicator += 20
    
    return url_list

## create urls

In [8]:
url_list = cycle_through(countries=c_list_mod, indicators=i_list)
len(url_list)

70

## request data and transform into a dataframe

In [9]:
raw_data = []
skipped_urls = []
for url in url_list:
    chunk = page_through(url)
    if type(chunk) != list:
        err_result = {url: chunk}
        skipped_urls.append(err_result)
    else:
        raw_data.extend(chunk)

In [10]:
len(raw_data)

15978240

In [11]:
len(skipped_urls)

0

In [12]:
raw_data[0]

{'indicator': {'id': 'AG.AGR.TRAC.NO',
  'value': 'Agricultural machinery, tractors'},
 'country': {'id': 'AF', 'value': 'Afghanistan'},
 'countryiso3code': 'AFG',
 'date': '2019',
 'value': None,
 'scale': '',
 'unit': '',
 'obs_status': '',
 'decimal': 0}

In [13]:
time_series_results = []
for r in raw_data:
    row = {'country': r['countryiso3code'], 'indicator': r['indicator']['id'], 'year': r['date'], 'value': r['value'], 
                       'unit': r['unit'], 'obs_status': r['obs_status'], 'decimal': r['decimal'], 'scale': ''}
    if 'scale' in r:
        row['scale'] = r['scale']
    
    time_series_results.append(row)

In [14]:
time_series_df = pd.DataFrame(time_series_results)
time_series_df

Unnamed: 0,country,decimal,indicator,obs_status,scale,unit,value,year
0,AFG,0,AG.AGR.TRAC.NO,,,,,2019
1,AFG,0,AG.AGR.TRAC.NO,,,,,2018
2,AFG,0,AG.AGR.TRAC.NO,,,,,2017
3,AFG,0,AG.AGR.TRAC.NO,,,,,2016
4,AFG,0,AG.AGR.TRAC.NO,,,,,2015
5,AFG,0,AG.AGR.TRAC.NO,,,,,2014
6,AFG,0,AG.AGR.TRAC.NO,,,,,2013
7,AFG,0,AG.AGR.TRAC.NO,,,,,2012
8,AFG,0,AG.AGR.TRAC.NO,,,,,2011
9,AFG,0,AG.AGR.TRAC.NO,,,,,2010


## Inspect and clean up the dataframe

In [15]:
time_series_df.columns

Index(['country', 'decimal', 'indicator', 'obs_status', 'scale', 'unit',
       'value', 'year'],
      dtype='object')

In [16]:
time_series_df['scale'].unique()

array([''], dtype=object)

In [17]:
time_series_df['unit'].unique()

array([''], dtype=object)

In [18]:
time_series_df['obs_status'].unique()

array([''], dtype=object)

In [19]:
time_series_df['decimal'].unique()

array([0, 1, 2])

In [20]:
time_series_df['year'].unique()

array(['2019', '2018', '2017', '2016', '2015', '2014', '2013', '2012',
       '2011', '2010', '2009', '2008', '2007', '2006', '2005', '2004',
       '2003', '2002', '2001', '2000', '1999', '1998', '1997', '1996',
       '1995', '1994', '1993', '1992', '1991', '1990', '1989', '1988',
       '1987', '1986', '1985', '1984', '1983', '1982', '1981', '1980',
       '1979', '1978', '1977', '1976', '1975', '1974', '1973', '1972',
       '1971', '1970', '1969', '1968', '1967', '1966', '1965', '1964',
       '1963', '1962', '1961', '1960'], dtype=object)

In [21]:
len(time_series_df['year'].unique())

60

In [22]:
time_series_df['country'].unique()

array(['AFG', 'AGO', 'ALB', 'AND', 'ARE', 'ARG', 'ARM', 'ATG', 'AUS',
       'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BFA', 'BGD', 'BGR', 'BHR',
       'BHS', 'BIH', 'BLR', 'BLZ', 'BOL', 'BRA', 'BRB', 'BRN', 'BTN',
       'BWA', 'CAF', 'CAN', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR', 'COD',
       'COG', 'COL', 'COM', 'CPV', 'CRI', 'CUB', 'CYP', 'CZE', 'DEU',
       'DJI', 'DMA', 'DNK', 'DOM', 'DZA', 'ECU', 'EGY', 'ERI', 'ESP',
       'EST', 'ETH', 'FIN', 'FJI', 'FRA', 'FSM', 'GAB', 'GBR', 'GEO',
       'GHA', 'GIN', 'GMB', 'GNB', 'GNQ', 'GRC', 'GRD', 'GTM', 'GUY',
       'HND', 'HRV', 'HTI', 'HUN', 'IDN', 'IND', 'IRL', 'IRN', 'IRQ',
       'ISL', 'ISR', 'ITA', 'JAM', 'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ',
       'KHM', 'KIR', 'KNA', 'KOR', 'KWT', 'LAO', 'LBN', 'LBR', 'LBY',
       'LCA', 'LIE', 'LKA', 'LSO', 'LTU', 'LUX', 'LVA', 'MAR', 'MCO',
       'MDA', 'MDG', 'MDV', 'MEX', 'MHL', 'MKD', 'MLI', 'MLT', 'MMR',
       'MNE', 'MNG', 'MOZ', 'MRT', 'MUS', 'MWI', 'MYS', 'NER', 'NGA',
       'NIC', 'NLD',

In [23]:
len(time_series_df['country'].unique())

192

In [24]:
len(time_series_df['indicator'].unique())

1387

In [25]:
set(c_list_mod) - set(time_series_df['country'].unique())

{'TWN'}

In [26]:
len(set(c_list_mod))

193

NOTE: DEU, VNM are repeated twice in original list - need to investigate why.

NOTE: no results returned for TWN, but did exist in WB country list. 

see: https://datahelpdesk.worldbank.org/knowledgebase/articles/114933-where-are-your-data-on-taiwan
and: https://datahelpdesk.worldbank.org/knowledgebase/articles/378834-how-does-the-world-bank-classify-countries

In [27]:
time_series_df = time_series_df.drop(columns = ['scale', 'unit', 'obs_status'])
time_series_df

Unnamed: 0,country,decimal,indicator,value,year
0,AFG,0,AG.AGR.TRAC.NO,,2019
1,AFG,0,AG.AGR.TRAC.NO,,2018
2,AFG,0,AG.AGR.TRAC.NO,,2017
3,AFG,0,AG.AGR.TRAC.NO,,2016
4,AFG,0,AG.AGR.TRAC.NO,,2015
5,AFG,0,AG.AGR.TRAC.NO,,2014
6,AFG,0,AG.AGR.TRAC.NO,,2013
7,AFG,0,AG.AGR.TRAC.NO,,2012
8,AFG,0,AG.AGR.TRAC.NO,,2011
9,AFG,0,AG.AGR.TRAC.NO,,2010


In [28]:
time_series_df.duplicated(subset=['country', 'indicator', 'year']).sum()

0

## Export to csv

In [29]:
time_series_df.to_csv("../Data/WorldBank/wdi_timeseries_full.csv", index=False)