In [1]:
import json
import urllib.request
import pandas as pd
import copy

In [2]:
"""idd_base_url = 'http://stats.oecd.org/sdmx-json/data/IDD'
dimension_filter = 'all' # grab all data
time_filter = 'all?startTime=2001&endTime=2014'
optional_filters = '&detail=Full' # pick up the measure unit's..."""

"idd_base_url = 'http://stats.oecd.org/sdmx-json/data/IDD'\ndimension_filter = 'all' # grab all data\ntime_filter = 'all?startTime=2001&endTime=2014'\noptional_filters = '&detail=Full' # pick up the measure unit's..."

In [13]:
def fetch_oecd_data(idd_base_url = 'http://stats.oecd.org/sdmx-json/data/IDD',
                    dimension_filter = 'all', # grab all data
                    time_filter = 'all?startTime=2001&endTime=2014',
                    # pick up the measure unit's...
                    optional_filters = '&detail=Full'): 
    """Download an SDMX-JSON dataset and flatten its series into a DataFrame.

    The JSON comes with a 'structure' section which explains how to
    interpret the series keys (e.g. 1:2:3:4:5). The 'structure' is parsed
    first to get the dimension names (and their order); each observation
    value, which arrives as an array, is then reduced to its first element.

    Parameters
    ----------
    idd_base_url : str
        Base URL of the dataset endpoint.
    dimension_filter : str
        Path segment selecting the dimensions.
    time_filter : str
        Path segment (plus query string) selecting the time range.
    optional_filters : str
        Extra query-string parameters, appended verbatim.

    Returns
    -------
    tuple
        ``(idd_dataframe, metadata_stats_oecd_idd, observation_attribute_map)``:
        one row per (series, time period) holding the positional codes as
        strings, the raw ``structure.dimensions`` object, and the raw
        ``structure.attributes.series`` list. ``idd_dataframe`` is an empty
        DataFrame when the response contains no series.

    Raises
    ------
    KeyError
        If the response lacks the expected keys, or if no dimension is
        named 'location' (the per-series merge key).
    """
    data_query = idd_base_url + '/' + dimension_filter + '/' + time_filter + optional_filters
    # Use the response as a context manager so the connection is always closed.
    with urllib.request.urlopen(data_query) as response:
        stats_oecd_idd = json.loads(response.read().decode('utf-8'))

    metadata_stats_oecd_idd = stats_oecd_idd['structure']['dimensions']
    # Diagnostic output, kept so the notebook's recorded output stays valid.
    print(stats_oecd_idd['structure'].keys())

    col_names = [dim['id'].lower() for dim in metadata_stats_oecd_idd['series']]

    # The series attributes come as a positionally ordered array,
    # e.g. [1, 20, 3, None], where element i indexes into the values of
    # the i-th attribute definition:
    #   1    => observation_attribute_map[0]['values'][1]
    #   20   => observation_attribute_map[1]['values'][20]
    #   3    => observation_attribute_map[2]['values'][3]
    #   None => no value; it is stored as the string 'None' and will not
    #           match any metadata row later on.
    observation_attribute_map = stats_oecd_idd['structure']['attributes']['series']
    attribute_column_names = [attribute_map['id'].lower() for attribute_map in observation_attribute_map]

    # Collect one frame per series and concatenate once at the end:
    # DataFrame.append no longer exists in pandas >= 2.0, and appending
    # inside the loop re-copied every accumulated row on each iteration.
    series_frames = []
    for key, value in stats_oecd_idd['dataSets'][0]['series'].items():
        # Example (key, value):
        # ('19:8:0:0:0',
        #  {'attributes': [0, 20, 0, None],
        #   'observations': {'0': [43696.0, None], '6': [45934.0, None]}})
        # So, extract info from key, and value['attributes'].
        data = pd.DataFrame(dict(zip(col_names, [[x] for x in key.split(":")])))
        # metadata should be in string form, since it's categorical
        data_attributes = [str(x) for x in value['attributes']]
        data = pd.concat([data,
                          pd.DataFrame(dict(zip(attribute_column_names, data_attributes)), index=[0])],
                         axis = 1)

        # Keep only the first element of each observation array.
        observations = pd.DataFrame(
            [(time, measure[0]) for time, measure in value['observations'].items()],
            columns=['time_period', 'observation'])

        # Broadcast the single-row series description onto every observation
        # by merging on the (constant) location code. The outer join keeps a
        # row for the series even when it has no observations.
        observations['location'] = data['location'].iloc[0]
        data = pd.merge(right=data,
                        left=observations,
                        on='location',
                        how='outer')

        series_frames.append(data)

    if series_frames:
        # ignore_index=False keeps the per-series row labels, as before.
        idd_dataframe = pd.concat(series_frames, ignore_index=False)
    else:
        # pd.concat raises on an empty list; return an empty frame instead.
        idd_dataframe = pd.DataFrame()
    return idd_dataframe, metadata_stats_oecd_idd, observation_attribute_map

In [14]:
def append_metadata_to_oecd_stats(idd_dataframe, metadata_stats_oecd_idd, observation_attribute_map):
    """
    Attach human-readable metadata columns to the flattened stats frame.

    Every dimension / attribute definition carries a 'values' list whose
    position corresponds to the positional code stored in the frame.
    Each definition is turned into a small lookup table keyed by that
    position (as a string) and left-merged onto the frame, using the
    lower-cased definition id as the merge column.

    The lookup tables are kept in a dictionary whose keys are the merge
    columns, so the final step is a simple series of merges.
    """
    lookup_tables = {}
    definition_groups = (metadata_stats_oecd_idd['series'], observation_attribute_map)
    for group in definition_groups:
        for definition in group:
            table = pd.DataFrame(definition['values'])
            column = definition['id'].lower()

            # An empty table has no columns to rename; the assignment then
            # raises ValueError, which is deliberately ignored.
            try:
                table.columns = [column + '_code', column + '_name']
            except ValueError:
                pass

            # The row position, as a string, is the merge key.
            table[column] = table.index.astype(str)
            lookup_tables[column] = table

    # The time dimension lives under 'observation' and is keyed by its role.
    time_definition = metadata_stats_oecd_idd['observation'][0]
    time_column = time_definition['role'].lower()
    time_table = pd.DataFrame(time_definition['values'])
    time_table.columns = ['year', time_column]
    # Overwrite the second column with the positional merge key.
    time_table[time_column] = time_table.index.astype(str)
    lookup_tables[time_column] = time_table

    enriched = idd_dataframe
    for merge_column, table in lookup_tables.items():
        # Merging an empty lookup table would add nothing.
        if not table.empty:
            enriched = pd.merge(left=enriched,
                                right=table,
                                how='left',
                                on=merge_column)
    return enriched

In [15]:
def fetch_and_format_oecd_data(idd_base_url = 'http://stats.oecd.org/sdmx-json/data/IDD',
                               dimension_filter = 'all', # grab all data
                               time_filter = 'all?startTime=2001&endTime=2014',
                               optional_filters = '&detail=Full'):
    """Fetch the dataset and return it with the metadata columns merged in.

    Convenience wrapper: forwards its arguments to ``fetch_oecd_data`` and
    passes the three returned objects on to ``append_metadata_to_oecd_stats``.
    """
    observations, dimension_metadata, attribute_metadata = fetch_oecd_data(
        idd_base_url, dimension_filter, time_filter, optional_filters)
    return append_metadata_to_oecd_stats(observations, dimension_metadata, attribute_metadata)

In [None]:
# Run the whole pipeline with the default query; this performs a live HTTP request.
X = fetch_and_format_oecd_data()

In [16]:
# Inspect which columns the merged frame ended up with.
X.columns

dict_keys(['description', 'annotations', 'dimensions', 'name', 'attributes', 'links'])


Index(['time_period', 'observation', 'location', 'age', 'definition',
       'measure', 'methodo', 'powercode', 'referenceperiod', 'time_format',
       'unit', 'time_format_code', 'time_format_name', 'year', 'location_code',
       'location_name', 'unit_code', 'unit_name', 'powercode_code',
       'powercode_name', 'definition_code', 'definition_name', 'age_code',
       'age_name', 'measure_code', 'measure_name', 'methodo_code',
       'methodo_name'],
      dtype='object')

In [27]:
# Persist the merged frame as CSV (header row, no index column).
# NOTE(review): the destination is an absolute, user-specific path, so this
# cell is not portable to other machines — consider a path relative to a
# configurable data directory.
X.to_csv("/Users/user/repos/middle_middle/oecd_idd.csv",index=False,header=True)