In [None]:
import os
import datetime
import glob
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# Directory that is searched for the ENTSO-E csv files; replace the
# placeholder with a real path before running the notebook.
DOWNLOAD_DIR = '<your-path-here>'

In [None]:
def read_single_csv_entso_e(file):
    """Read one ENTSO-E csv export into a DataFrame.

    The file is parsed as tab-separated, UTF-16 encoded text and the
    "DateTime" column is converted to timestamps.
    """
    csv_options = {
        "sep": '\t',
        "encoding": 'utf-16',
        "parse_dates": ["DateTime"],
    }
    return pd.read_csv(file, **csv_options)


def load_complete_entso_e_data(directory):
    """Load every csv file in ``directory`` into one sorted DataFrame.

    Parameters
    ----------
    directory : str or path-like
        Folder that is searched (non-recursively) for ``*.csv`` files.

    Returns
    -------
    pandas.DataFrame
        All files concatenated with a fresh integer index, then sorted by
        "AreaName" and "DateTime".

    Raises
    ------
    ValueError
        If ``directory`` contains no csv file.
    """
    pattern = Path(directory) / '*.csv'
    # glob yields files in arbitrary order; sort so repeated runs read the
    # files in the same sequence.
    files = sorted(glob.glob(str(pattern)))

    if not files:
        # pd.concat would otherwise fail with an opaque
        # "No objects to concatenate"; name the directory so a wrong path
        # is easy to spot.
        raise ValueError(f"No csv files found in directory '{directory}'.")

    print(f'Concatenating {len(files)} csv files...')

    each_csv_file = [read_single_csv_entso_e(file) for file in files]
    data = pd.concat(each_csv_file, ignore_index=True)

    data = data.sort_values(by=["AreaName", "DateTime"])

    print("Loading done.")

    return data


# Load all csv files found in DOWNLOAD_DIR into a single frame.
power_demand = load_complete_entso_e_data(DOWNLOAD_DIR)

What is in there?

In [None]:
# Bare expression: the notebook renders the frame as this cell's output.
power_demand

Hm... that's a lot of information. Let's reduce it...

In [None]:
# List the column labels of the loaded frame.
power_demand.columns

Which countries and regions are there?

In [None]:
# Distinct values of the "AreaName" column.
power_demand["AreaName"].unique()

Phew... a lot! Is Austria there?

In [None]:
# Test against .values: `in` applied to a Series itself checks the index
# labels, not the stored data.
"Austria" in power_demand["AreaName"].values

Let's get Austrian data then and...

In [None]:
def get_country_data(data, country):
    """Return the rows of ``data`` whose "AreaName" equals ``country``.

    The result is a new frame indexed by the "DateTime" column; ``data``
    itself is left unchanged.
    """
    is_requested_area = data["AreaName"] == country
    return data.loc[is_requested_area].set_index("DateTime")


# Rows for Austria only, indexed by timestamp.
power_demand_at = get_country_data(power_demand, "Austria")

## Exercise 1

Let's do some summary statistics. Calculate mean, standard deviation, min, max and the 25%, 50% and 75% quantile of the distribution of the load column. Hint: There may be a single pandas function that does it for you...

In a second step, do the same for Germany. Does the *10 rule hold? (everything in Germany is ten times as big as in Austria)

Now let's plot Austrian data.

In [None]:
def plot_data(data, xlabel="Time"):
    """Plot the "TotalLoadValue" column of ``data`` against its index.

    ``xlabel`` is used as the x-axis label; the y-axis label is fixed.
    """
    load_series = data["TotalLoadValue"]
    load_series.plot()
    plt.ylabel("Electricity load (MW)")
    plt.xlabel(xlabel)


# Plot the Austrian load at the dataset's original time resolution.
plot_data(power_demand_at)

Hm... there may be outliers. But how complete is Austrian data?

In [None]:
def check_data_completeness(data):
    """Print how many "TotalLoadValue" entries are missing and how many are zero."""
    load = data["TotalLoadValue"]
    number_of_nas = load.isna().sum()
    number_of_0s = load.eq(0).sum()

    print(f'The data contains {number_of_nas} NAs and {number_of_0s} zeros')


# Report the number of missing and zero load values for Austria.
check_data_completeness(power_demand_at)

How complete is e.g. North Macedonian data in comparison?

In [None]:
# Extract the rows for North Macedonia and report their NA/zero counts.
power_demand_nm = get_country_data(power_demand, "North Macedonia")
check_data_completeness(power_demand_nm)

## Exercise 2

What is the time resolution of the dataset? There are several ways of finding it, try to find at least two.

Let's aggregate the data to hourly data. 

In [None]:
# Average all readings that fall into the same hour.
# Only numeric columns are kept: text columns such as "AreaName" cannot be
# averaged, and pandas >= 2.0 raises a TypeError instead of silently
# dropping them.
power_demand_at_hourly = (
    power_demand_at
    .select_dtypes(include="number")
    .resample('1h')
    .mean()
)

In [None]:
# Display the hourly-averaged frame.
power_demand_at_hourly

In [None]:
# Plot the hourly-averaged Austrian load.
plot_data(power_demand_at_hourly)

Let's look into the data more closely and find some regularities...

In [None]:
# Mean hourly load per calendar year, labelled at each year end.
# An explicit offset object replaces the string alias '1y': the lowercase
# and 'Y' year-end aliases are deprecated in recent pandas releases, while
# the offset class works across versions.
power_demand_at_yearly = power_demand_at_hourly.resample(pd.offsets.YearEnd()).mean()

In [None]:
# Display the yearly means.
power_demand_at_yearly

In [None]:
# Plot the yearly means.
plot_data(power_demand_at_yearly)

Hm... 2014 and 2020 are incomplete. We should select the correct period therefore...

In [None]:
# Restrict the yearly means to the 2015-2019 label range (both ends inclusive).
full_years = slice('2015-01-01', '2019-12-31')
power_demand_at_yearly_full_years = power_demand_at_yearly.loc[full_years]

plot_data(power_demand_at_yearly_full_years)

Hm... hard to tell if there is a trend. Time-series too short. Let's neglect it for the moment.

In [None]:
# Mean hourly load per calendar month, restricted to 2015-2019.
# An explicit MonthEnd offset replaces the string alias '1m', which is easily
# misread as "one minute" and is a deprecated month-end alias in recent
# pandas releases. Chaining keeps the cell idempotent (no self-reassignment).
power_demand_at_monthly = (
    power_demand_at_hourly
    .resample(pd.offsets.MonthEnd())
    .mean()
    .loc['2015-01-01':'2019-12-31']
)

plot_data(power_demand_at_monthly)

hm... seems to be seasonal, right? But let's also select the correct period to show full years only.

To understand it better, we could simply take the monthly average...

In [None]:
# Average load per month of the year (1-12), pooled over all years.
# numeric_only=True skips text columns such as "AreaName"; without it
# pandas >= 2.0 raises a TypeError when asked to average them.
# NOTE(review): this uses the unrestricted Austrian series, so any partially
# covered years also contribute to the averages - confirm this is intended.
power_demand_at_monthly_mean = power_demand_at.groupby(
    power_demand_at.index.month).mean(numeric_only=True)

plot_data(power_demand_at_monthly_mean, "Month of Year")

Hm... power demand seems to be seasonal.


## Exercise 3

There are at least two other time scales over which data shows seasonality. Can you find them and plot them? Hint: ```power_demand_at_hourly.index.weekday``` may be very useful here.