# Eurostat bioenergy balance (February 2021 edition)

Extract bioenergy-related data from an archive of XLSB files, one for each EU country, each of which contains one sheet per year (1990-2019).

Data from Eurostat's [energy balances](https://ec.europa.eu/eurostat/web/energy/data/energy-balances) February 2021 edition.

Walk through the Excel files (one spreadsheet per country) and parse the selected variables and fuels from each year's sheet in that spreadsheet. Only the 2002-2019 sheets are read.

Somewhere on Eurostat there might be a better source for this data, but I did not find it.

In [1]:
import os
import zipfile
import requests
import pandas as pd
import numpy as np
import pyxlsb

In [2]:
def parse_values_for_country(file, country, variables, fuels, years=range(2002, 2020)):
    """Sum selected balance values for one country across its yearly sheets.

    All requested year sheets are fetched with a single ``pd.read_excel`` call
    (``sheet_name`` given as a list), so the workbook is not re-read for every
    sheet.

    Parameters
    ----------
    file : path (or other object accepted by ``pd.read_excel``) of the workbook.
    country : label stored as the first element of every result key.
    variables : iterable of row labels to look up in each sheet.
    fuels : iterable of ``(fuel, start, end)`` tuples; the values in the
        label slice ``start:end`` of the variable's row are summed and stored
        under the name ``fuel``.
    years : iterable of int, optional
        Years to read; each sheet is addressed as ``str(year)``.
        Defaults to 2002-2019.

    Returns
    -------
    dict
        Maps ``(country, year, fuel, variable_key)`` to the summed value, where
        ``variable_key`` is the variable lower-cased with spaces replaced by
        underscores.
    """
    years = list(years)
    sheets = pd.read_excel(
        file,
        engine='pyxlsb',
        sheet_name=[str(year) for year in years],
        skiprows=[0, 1, 2, 3],
        index_col=1,
        na_values=':',
    )

    country_data = {}
    for year in years:
        df = sheets[str(year)]
        for variable in variables:
            variable_key = variable.lower().replace(' ', '_')
            for fuel, start, end in fuels:
                values = df.loc[variable, start:end]
                try:
                    total = values.sum()
                except TypeError:
                    # The slice holds non-numeric cells: coerce them to NaN
                    # before summing.
                    total = pd.to_numeric(values, errors='coerce').sum()
                country_data[(country, year, fuel, variable_key)] = total

    return country_data

In [3]:
def walk_through_excel_files(directory, variables, fuels):
    """Collect parsed values from every country workbook in ``directory``.

    File names containing '!' or '.pdf' are skipped. For every other entry the
    text before the first '-' in the file name is used as the country label,
    and the file is handed to ``parse_values_for_country``.

    Returns a single dict merging the per-file results.
    """
    collected = {}

    for entry in os.listdir(directory):
        # skip readme files
        if '!' in entry or '.pdf' in entry:
            continue
        country_code = entry.split('-')[0]
        workbook_path = os.path.join(directory, entry)
        collected.update(
            parse_values_for_country(workbook_path, country_code, variables, fuels)
        )
    return collected

In [4]:
# Balance rows to extract: the bioenergy flows plus a few others for context
variables = [
    'Primary production',
    'Imports',
    'Exports',
    'Gross inland consumption',
]

# Each entry is (output fuel name, first column label, last column label).
# The columns from the first to the last label are summed under the output
# name; identical labels select a single column.
fuels = [
    ('total', 'Total', 'Total'),
    ('renewables', 'Renewables and biofuels', 'Renewables and biofuels'),
    ('bioenergy', 'Bioenergy', 'Bioenergy'),
    ('solid_biomass', 'Primary solid biofuels', 'Primary solid biofuels'),
    ('biofuels', 'Pure biogasoline', 'Other liquid biofuels'),
    ('biogas', 'Biogases', 'Biogases'),
    ('ren_mun_waste', 'Renewable municipal waste', 'Renewable municipal waste'),
]

In [5]:
url = 'https://ec.europa.eu/eurostat/documents/38154/4956218/Energy-balance-sheets-February-2021-edition.zip/4b1d6665-f303-be7d-a7e5-1e0da16ec0d9?t=1612709565471'

# Download the archive. A timeout keeps the cell from hanging on a stalled
# connection, and raise_for_status() stops here on an HTTP error instead of
# saving the error response body as if it were the zip file.
r = requests.get(url, timeout=60)
r.raise_for_status()

with open('eurostat_balances_2021.zip', 'wb') as f:
    f.write(r.content)

In [6]:
# Unpack every member of the downloaded archive into the balances/ directory
with zipfile.ZipFile('eurostat_balances_2021.zip') as balances_zip:
    balances_zip.extractall('balances/')

In [7]:
# Slow step (about two minutes in the recorded output below): every year sheet
# of every country workbook is parsed.
# TODO: there must be a better (faster) way

%time data_dict = walk_through_excel_files('balances/', variables, fuels)

CPU times: user 1min 52s, sys: 1.37 s, total: 1min 53s
Wall time: 1min 54s


In [8]:
# Turn the tuple-keyed dict into a long table with one row per
# (country, year, fuel, variable) key.
# Approach from https://stackoverflow.com/questions/44012099/creating-a-dataframe-from-a-dict-where-keys-are-tuples
df1 = (
    pd.Series(data_dict)
    .rename_axis(['country', 'year', 'fuel', 'variable'])
    .reset_index(name='value')
)

In [9]:
# Preview the first three rows of the long-format table
df1.head(3)

Unnamed: 0,country,year,fuel,variable,value
0,HU,2002,total,primary_production,11191.268
1,HU,2002,renewables,primary_production,877.233
2,HU,2002,bioenergy,primary_production,772.81


In [10]:
# Pivot the 'variable' level into columns; rows stay keyed by (country, year, fuel)
df2 = (
    df1
    .set_index(['country', 'year', 'fuel', 'variable'])
    .unstack(level='variable')
)

In [11]:
# Preview the first three rows of the pivoted table
df2.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,value,value,value,value
Unnamed: 0_level_1,Unnamed: 1_level_1,variable,exports,gross_inland_consumption,imports,primary_production
country,year,fuel,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
AT,2002,bioenergy,203.428,2913.585,196.6,2920.414
AT,2002,biofuels,0.0,20.347,0.0,20.347
AT,2002,biogas,0.0,21.95,0.0,21.95


In [12]:
# Drop the outer 'value' column level so the columns are just the variable names.
# Guarded so that re-running this cell on already-flattened columns is a no-op
# instead of raising.
if isinstance(df2.columns, pd.MultiIndex):
    df2.columns = df2.columns.droplevel(0).values

In [13]:
# Summarise index range, column dtypes and non-null counts
df2.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4536 entries, ('AT', 2002, 'bioenergy') to ('XK', 2019, 'total')
Data columns (total 4 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   exports                   4536 non-null   float64
 1   gross_inland_consumption  4536 non-null   float64
 2   imports                   4536 non-null   float64
 3   primary_production        4536 non-null   float64
dtypes: float64(4)
memory usage: 157.6+ KB


In [14]:
# Order rows by (country, year, fuel), ascending
df2 = df2.sort_index()

In [15]:
# Net imports (imports minus exports) as a share of gross inland consumption
df2['dependency'] = (
    df2['imports']
    .sub(df2['exports'])
    .div(df2['gross_inland_consumption'])
)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,exports,gross_inland_consumption,imports,primary_production,dependency
country,year,fuel,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AT,2002,bioenergy,203.428,2913.585,196.600,2920.414,-0.002344
AT,2002,biofuels,0.000,20.347,0.000,20.347,0.000000
AT,2002,biogas,0.000,21.950,0.000,21.950,0.000000
AT,2002,ren_mun_waste,0.000,47.053,0.000,47.053,0.000000
AT,2002,renewables,172.781,6482.194,164.493,6490.482,-0.001279
...,...,...,...,...,...,...,...
XK,2019,biogas,0.000,0.000,0.000,0.000,
XK,2019,ren_mun_waste,0.000,0.000,0.000,0.000,
XK,2019,renewables,0.042,402.360,55.816,346.586,0.138617
XK,2019,solid_biomass,0.042,374.978,55.816,319.205,0.148739


In [16]:
# Save the table to CSV, writing numbers with a decimal comma.
# NOTE(review): the field separator is left at its default while decimal=','
# is used — confirm downstream readers expect this combination.
df2.to_csv('balances_bioenergy_2002_2019_ktoe.csv', decimal=',')

In [17]:
# Independent copy of df2 to hold the converted values
df3 = df2.copy(deep=True)

In [18]:
# Conversion factor applied to the energy columns (TJ per ktoe)
tj_ktoe = 41.868

# Convert from df2 rather than from df3 itself, so that re-running this cell
# cannot apply the factor a second time to already-converted values.
df3 = df2.loc[:, 'exports': 'primary_production'] * tj_ktoe

# Keep the share based on the original data in ktoe
df3['dependency'] = df2['dependency']
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,exports,gross_inland_consumption,imports,primary_production,dependency
country,year,fuel,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AT,2002,bioenergy,8517.123504,121985.976780,8231.248800,122271.893352,-0.002344
AT,2002,biofuels,0.000000,851.888196,0.000000,851.888196,0.000000
AT,2002,biogas,0.000000,919.002600,0.000000,919.002600,0.000000
AT,2002,ren_mun_waste,0.000000,1970.015004,0.000000,1970.015004,0.000000
AT,2002,renewables,7233.994908,271396.498392,6886.992924,271743.500376,-0.001279
...,...,...,...,...,...,...,...
XK,2019,biogas,0.000000,0.000000,0.000000,0.000000,
XK,2019,ren_mun_waste,0.000000,0.000000,0.000000,0.000000,
XK,2019,renewables,1.758456,16846.008480,2336.904288,14510.862648,0.138617
XK,2019,solid_biomass,1.758456,15699.578904,2336.904288,13364.474940,0.148739


In [19]:
# Save the converted table to CSV, writing numbers with a decimal comma.
# NOTE(review): the field separator is left at its default while decimal=','
# is used — confirm downstream readers expect this combination.
df3.to_csv('balances_bioenergy_2002_2019_tj.csv', decimal=',')

In [20]:
# Some minimal testing: spot-check a few values in df2.
# idx is a short alias used to build the (country, year, fuel) row keys below.
idx = pd.IndexSlice

In [21]:
# Show the 'exports' value for the ('CZ', 2018, 'bioenergy') row
df2.loc[idx['CZ', 2018, 'bioenergy'], ['exports']]

exports    549.453
Name: (CZ, 2018, bioenergy), dtype: float64

In [22]:
# The single selected 'exports' value must match the value shown above
cz_2018_bioenergy_exports = df2.loc[idx['CZ', 2018, 'bioenergy'], ['exports']]
assert cz_2018_bioenergy_exports.item() == 549.453

In [23]:
# Show the 'primary_production' value for the ('CZ', 2009, 'bioenergy') row
df2.loc[idx['CZ', 2009, 'bioenergy'], ['primary_production']]

primary_production    2761.8
Name: (CZ, 2009, bioenergy), dtype: float64

In [24]:
# The single selected 'primary_production' value must match the value shown above
cz_2009_bioenergy_production = df2.loc[idx['CZ', 2009, 'bioenergy'], ['primary_production']]
assert cz_2009_bioenergy_production.item() == 2761.8

In [25]:
# Full row for ('CZ', 2009, 'bioenergy'), used as the actual value in the
# Series comparison below
result_cz_2009_bioenergy = df2.loc[('CZ', 2009, 'bioenergy')]
result_cz_2009_bioenergy

exports                      318.821000
gross_inland_consumption    2568.609000
imports                      123.617000
primary_production          2761.800000
dependency                    -0.075996
Name: (CZ, 2009, bioenergy), dtype: float64

In [26]:
# Expected values for the ('CZ', 2009, 'bioenergy') row, typed in by hand
cz_2009_bioenergy = pd.Series(
    [318.821, 2568.609, 123.617, 2761.8, -0.075996],
    index=[
        'exports',
        'gross_inland_consumption',
        'imports',
        'primary_production',
        'dependency',
    ],
)

In [27]:
# Give the expected Series the same name as the row it is compared against,
# then display it. The display expression comes last because a bare expression
# is only rendered when it is the final line of a cell.
cz_2009_bioenergy.name = ('CZ', 2009, 'bioenergy')
cz_2009_bioenergy

In [28]:
# Compare the hand-typed expected Series with the row taken from df2
pd.testing.assert_series_equal(
    left=cz_2009_bioenergy,
    right=result_cz_2009_bioenergy,
)