In [1]:
import re
import json
import requests
import unicodedata
import pandas as pd

from time import sleep

def slugify(value, allow_unicode=False):
    """
    Taken from https://github.com/django/django/blob/master/django/utils/text.py
    Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated
    dashes to single dashes. Remove characters that aren't alphanumerics,
    underscores, or hyphens. Convert to lowercase. Also strip leading and
    trailing whitespace, dashes, and underscores.
    """
    value = str(value)
    if allow_unicode:
        value = unicodedata.normalize('NFKC', value)
    else:
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    value = re.sub(r'[^\w\s-]', '', value.lower())
    return re.sub(r'[-\s]+', '-', value).strip('-_')

def prepare_unit_part(readable_json):
    """
    Converts the unit's json's contents to a dataframe obiect, changing 
    names of the columns to fit the new context and drops an unused column.
    """
    df_part = pd.json_normalize(readable_json,
                                record_path=['results'])
    df_part.rename(columns={'id'       : 'ID_unit',
                            'parentId' :  'ID_unitPatent'},
                    inplace=True)
    df_part.drop('hasDescription', axis=1, inplace=True)
    df_part.set_index(['ID_unitPatent', 'ID_unit'], inplace=True)
    return df_part


### Iterators made for partial downloading
All the data is paginated and has a limit of 100 entries per page (so we pass the pagesize=100 parameter by default). It makes downloading the whole variable by one request not possible. Thus a need to change pages by passing the "page=" parameter.

FrameIter iterator class has to:

1. Hold parameters such as:
    - declared on construction:
        - url: Address of an API request
        - id: ID of the variable
        - sleeptime: Length of a break between sent requests (in seconds)
        - verbose: Flag of a verbose mode
        - prepare_df: Function converting json_file, obtained from the request, into a dataframe object. 
    - and declared on the construction of an iterator:
        - page: Page number
        - previous_url: Address of the previous request
        - first_url: Address of the first request
2. Update the next request's page via the __update_page() method.
3. During the call of an iterator: Download one page of the data, convert it using prepare_df and update the page / cancel the iteration on the last page. Each call returns a dataframe object.

In [14]:
class FrameIter:
    
    def __init__(self, var_id, sleeptime = 0, verbose = False):
        self.url = ''
        self.id = var_id
        self.sleeptime = sleeptime
        self.verbose = verbose
        # I declare a void function as a parameter. Each child class will have that parameter filled with its own function.
        # I wanted __next()__ constructor to be inherited from FrameIter, and each child class to use an unique prepare_df function within it.
        
        def prepare_df(json_file): 
            pass
        self.prepare_df = prepare_df
        
    def __update_page(self):
        previous = self.page
        self.page +=1
        self.url = self.url.replace(f'&page={previous}', f'&page={self.page}')
    
    def __iter__(self):
        self.page = 0
        self.previous_url = ''
        self.first_url = self.url
        return self
        
    def __next__(self):
        
        if self.verbose:
            print(f'Waiting for {self.sleeptime}s')
        sleep(self.sleeptime)
        if self.verbose:
            print(f'Requiring {self.id} data, page {self.page}.')
            
        try: 
            json_page = requests.get(self.url)
        except Exception as exc:
            print(exc)
        readable = json.loads(json_page.text)
            
        if self.previous_url != readable.get('links', {'last' : self.first_url})['last']:                
            df = self.prepare_df(readable)
            self.previous_url = self.url
            self.__update_page()
            return df
        else:
            
            if self.verbose:
                print(f'Variable {self.id} has been downloaded. ')
                
            raise StopIteration
            
            
class VariableFrameIter(FrameIter):
    
    def __init__(self, var_id, sleeptime = 0, verbose = False):
        self.id = var_id
        self.sleeptime = sleeptime
        self.verbose = verbose
        
        if self.verbose:
            print(f'Requiring {var_id} info.')
            
        jsone_page = requests.get(f'https://bdl.stat.gov.pl/api/v1/variables/{var_id}?format=json')
        json_file = json.loads(jsone_page.text)
        
        self.lvl = json_file.get('level', -1)
        if self.lvl >= 0:  
            self.lvl = 5 if self.lvl > 5 else self.lvl # Agregates up to the level nearest to powiat (5) if it's lower.
            self.parent_id = json_file.get('subjectId')
            self.name = slugify((json_file.get('n1') + '-' + json_file.get('n2','')).replace('*', 'S'), allow_unicode=True)
            self.url = f'https://bdl.stat.gov.pl/api/v1/data/by-variable/{var_id}?format=json&unit-level={self.lvl}&page-size=100&page=0'
        else:
            raise Exception('The variable does not exist.', f'ID provided: {self.id}')
        
        def prepare_df(json_file):
            result_levels = ['POLSKA','Macroregion', 'ID_wojewodztwa','ID_region','ID_subregion','ID_powiatu','ID_gminy','ID_stat_unit']
            df = pd.json_normalize(json_file,
                                   record_path=['results','values'],
                                   meta=[['results','id']])
            df.rename(columns={'results.id' :  result_levels[self.lvl],
                               'val'        :  slugify(self.parent_id, allow_unicode=True) + '-' + self.name},
                      inplace=True)
            df.drop('attrId', axis=1, inplace=True)
            df.set_index([result_levels[self.lvl],'year'], inplace=True)
            return df
        self.prepare_df = prepare_df
    
    def get_name(self):
        return self.name
    
    def get_id(self):
        return self.id
    
    def get_lvl(self):
        return self.lvl
    
    def get_parent_id(self):
        return self.parent_id
    
class UnitsFrameIter(FrameIter):
    
    def __init__(self):
        self.page = 0
        self.url = 'https://bdl.stat.gov.pl/api/v1/units?format=json&page-size=100&page=0'
        
        def prepare_df(json_file): 
            df = pd.json_normalize(json_file, record_path=['results'])
            df.rename(columns={'id'       : 'ID_unit',
                               'parentId' : 'ID_unitPatent'},
                      inplace=True)
            df.drop('hasDescription', axis=1, inplace=True)
            df.set_index(['ID_unitPatent', 'ID_unit'], inplace=True)
            return df

        self.prepare_df = prepare_df

In [19]:
PIDs = {3350 : [399257],
        2013 : [80823, 80827, 80831, 80835, 80839,
                80822, 80826, 80830, 80834, 80838,
                80821, 80825, 80829, 80833, 80837,
                1539748],
        2759 : [148074, 148128],
        2497 : [64428, 64429],
        2504 : [64535, 64536, 64538, 64540],
        2861 : [155055],
        3603 : [498861, 634994],
        3571 : [472364, 472366, 472368, 472370],
        2420 : [60533, 454131],
        1767 : [7859, 7860, 7861, 458935],
        2617 : [75956, 75954, 75957, 75955],
        2424 : [60555, 60554],
        2813 : [152391],
        2670 : [79214],
        3441 : [452355, 452356],
        3303 : [395921],
        2596 : [73854, 73855, 73849, 73852, 73851],
        3501 : [458426]}

### Downloading the data of all levels
Along with:
- A cathegorisation into folders of specified level (Region, subregion and powiat)
- Breaks of 10 seconds between requests (without breaking limits of 100 requests per 15min and 1000 in 12h for BDL's API)
- Writing out the steps (the verbose mode prints out a page too much)

In [21]:
result_levels = ['POLSKA', 'Macroregion', 'Wojewodztwo','Region','Subregion','Powiat','Gmina','Stat_unit']
for _, IDs in PIDs.items():
    for ID in IDs:
        var_iter = VariableFrameIter(ID, 10, True)
        
        var_name = var_iter.get_name()
        var_parent = var_iter.get_parent_id()
        var_level = var_iter.get_lvl()
        filename = result_levels[var_level] + '/' + var_parent + '-' + var_name
        
        df = pd.DataFrame()
        df = pd.concat([df_part for df_part in var_iter])
        df.to_csv(f'data/{filename}.csv')

Requiring 399257 info.
Waiting for 10s
Requiring 399257 data, page 0.
Waiting for 10s
Requiring 399257 data, page 1.
Waiting for 10s
Requiring 399257 data, page 2.
Waiting for 10s
Requiring 399257 data, page 3.
Waiting for 10s
Requiring 399257 data, page 4.
Variable 399257 has been downloaded. 
Requiring 80823 info.
Waiting for 10s
Requiring 80823 data, page 0.
Waiting for 10s
Requiring 80823 data, page 1.
Waiting for 10s
Requiring 80823 data, page 2.
Waiting for 10s
Requiring 80823 data, page 3.
Waiting for 10s
Requiring 80823 data, page 4.
Variable 80823 has been downloaded. 
Requiring 80827 info.
Waiting for 10s
Requiring 80827 data, page 0.
Waiting for 10s
Requiring 80827 data, page 1.
Waiting for 10s
Requiring 80827 data, page 2.
Waiting for 10s
Requiring 80827 data, page 3.
Waiting for 10s
Requiring 80827 data, page 4.
Variable 80827 has been downloaded. 
Requiring 80831 info.
Waiting for 10s
Requiring 80831 data, page 0.
Waiting for 10s
Requiring 80831 data, page 1.
Waiting for 

### Downloading the hierarchy of units

In [5]:
units_iter = UnitsFrameIter()

df = pd.DataFrame()
df = pd.concat([df_part for df_part in units_iter])
df.to_csv(f'data/jednostki.csv')

### Downloading data of level lower than powiat

In [15]:
var_iter_1539748 = VariableFrameIter(1539748, 5, True)

var_name = var_iter_1539748.get_name()
var_parent = var_iter_1539748.get_parent_id()
filename = var_parent + '-' + var_name

df = pd.DataFrame()
df = pd.concat([df_part for df_part in var_iter_1539748])
df.to_csv(f'data/{filename}.csv')

Requiring 1539748 info.
Waiting for 5s
Requiring 1539748 data, page 0.
Waiting for 5s
Requiring 1539748 data, page 1.
Waiting for 5s
Requiring 1539748 data, page 2.
Waiting for 5s
Requiring 1539748 data, page 3.
Waiting for 5s
Requiring 1539748 data, page 4.
Variable 1539748 has been downloaded. 


### Downloading data of level equal to powiat

In [3]:
var_iter_399257 = VariableFrameIter(399257)

var_name = var_iter_399257.get_name()
var_parent = var_iter_399257.get_parent_id()
filename = var_parent + '-' + var_name

df = pd.DataFrame()
df = pd.concat([df_part for df_part in var_iter_399257])
df.to_csv(f'data/{filename}.csv')

### Downloading data of level higher than powiat

In [4]:
var_iter_64535 = VariableFrameIter(64535)

var_name = var_iter_64535.get_name()
var_parent = var_iter_64535.get_parent_id()
filename = var_parent + '-' + var_name

df = pd.DataFrame()
df = pd.concat([df_part for df_part in var_iter_64535])
df.to_csv(f'data/{filename}.csv')