# Dataset creation

In [1]:
from datetime import timedelta
from meteostat import Hourly
import pandas as pd
import requests

In [2]:
def time_deltas(df: pd.DataFrame, gap: str = '135min') -> pd.DataFrame:
    """Report the spacing between consecutive timestamps and list gaps of one size.

    Prints the 20 most frequent differences between consecutive ``Datetime``
    values, then one line per row whose difference to its predecessor equals
    ``gap`` (start of the gap, end of the gap and its length).

    Args:
        df: Frame with a ``Datetime`` column. The input frame is not modified.
        gap: Difference to list individually, in any form accepted by
            ``pd.Timedelta``. Defaults to 135 minutes.

    Returns:
        A copy of ``df`` sorted by ``Datetime`` with a fresh 0..n-1 index and
        an added ``delta`` column holding the difference to the previous row.
    """
    ordered = df.sort_values('Datetime').reset_index(drop=True)
    ordered['delta'] = ordered['Datetime'].diff()
    print(ordered['delta'].value_counts().head(20))

    gaps = ordered[ordered['delta'] == pd.Timedelta(gap)]
    for i, row in gaps.iterrows():
        # Row 0 has a NaT delta and never matches, so i - 1 is always a valid label.
        start = ordered.loc[i - 1, 'Datetime']
        end = row['Datetime']
        print(f"{start} - {end} ({row['delta']})")

    return ordered

In [3]:
def load_power_plant_data(path: str = 'data/power_plant.csv') -> pd.DataFrame:
    """Read the semicolon-separated power-plant CSV and return it with UTC timestamps.

    The ``Datetime`` column is parsed as naive wall-clock time, localized to
    Europe/Berlin and then converted to UTC.

    Args:
        path: Location of the CSV file.

    Returns:
        The loaded frame with a timezone-aware (UTC) ``Datetime`` column.
    """
    frame = pd.read_csv(path, sep=';')

    naive = pd.to_datetime(frame['Datetime'], format='%Y-%m-%d %H:%M:00')
    # DST handling: the repeated autumn hour is inferred from row order, and
    # the skipped spring hour is shifted forward to the next valid time.
    berlin = naive.dt.tz_localize('Europe/Berlin', ambiguous='infer', nonexistent='shift_forward')

    frame['Datetime'] = berlin.dt.tz_convert('UTC')
    return frame

# Power-plant data with UTC timestamps; the cells below add features to this
# frame and left-join the other sources onto it.
df = load_power_plant_data()

In [4]:
def fetch_holiday_data(years: list[int], region: str = 'de-be') -> pd.DataFrame:
    """Download German public holidays for the given years from digidates.de.

    One request is made per year. Each response is read as a JSON object and
    only its keys are kept (presumably date strings; the cached file is parsed
    with ``%Y-%m-%d`` elsewhere in this notebook).

    Args:
        years: Years to query, one API call each.
        region: Region code passed to the API's ``region`` query parameter.

    Returns:
        A frame with a single ``Holiday`` column, one row per key returned by
        the API, in request order.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.
    """
    holiday_dates = []
    for year in years:
        url = f"https://digidates.de/api/v1/germanpublicholidays?year={year}&region={region}"
        # Without a timeout a stalled connection would hang the notebook forever.
        response = requests.get(url, timeout=30)
        # Fail loudly instead of storing the keys of an error payload as holidays.
        response.raise_for_status()
        # Iterating the decoded JSON object yields its keys.
        holiday_dates.extend(response.json())

    return pd.DataFrame(data={"Holiday": holiday_dates})

# years = df['Datetime'].dt.strftime("%Y").unique()
# df_holidays = fetch_holiday_data(years=years)
# df_holidays.to_csv('data/holidays.csv')

# Load the cached holiday list and turn the date strings into datetime.date objects.
df_holidays = pd.read_csv('data/holidays.csv').assign(
    Holiday=lambda frame: pd.to_datetime(frame['Holiday'], format='%Y-%m-%d').dt.date
)

In [6]:
def fetch_weather_data(start: pd.Timestamp, end: pd.Timestamp, station_id: str) -> pd.DataFrame:
    """Fetch hourly Meteostat observations and upsample them to a 15-minute grid.

    The index returned by Meteostat is localized as UTC, the columns are
    renamed to readable names, and the series is resampled to 15 minutes.
    Numeric measurements are filled by linear interpolation. The weather
    condition code is a category, so it is carried forward from the last
    observation rather than interpolated.

    Args:
        start: Start of the requested period (the commented call in this
            notebook passes a timezone-naive timestamp).
        end: End of the requested period.
        station_id: Meteostat station identifier.

    Returns:
        A frame with a UTC ``Datetime`` column on a 15-minute grid plus the
        renamed weather columns.
    """
    hourly = Hourly(station_id, start, end).fetch()
    hourly.index = hourly.index.tz_localize('UTC')
    hourly = hourly.reset_index()

    hourly = hourly.rename(columns={
        'time': 'Datetime',
        'temp': 'Temperature',
        'dwpt': 'Dew Point',
        'rhum': 'Relative Humidity',
        'prcp': 'Precipitation',
        'snow': 'Snow Depth',
        'wdir': 'Wind Direction',
        'wspd': 'Average Wind Speed',
        'wpgt': 'Peak Wind Speed',
        'pres': 'Average Sea-Level Air Pressure',
        'tsun': 'Sunshine Duration',
        'coco': 'Weather Condition Code'
    })

    indexed = hourly.set_index('Datetime').sort_index()
    df_weather = indexed.resample('15min').interpolate(method='linear')

    # Linear interpolation between two condition codes invents codes that do
    # not exist (e.g. 3 -> 7 yields 4, 5, 6), so hold the last observed code.
    code_column = 'Weather Condition Code'
    if code_column in indexed.columns:
        df_weather[code_column] = indexed[code_column].resample('15min').asfreq().ffill()

    # NOTE(review): wind direction is circular and precipitation / sunshine
    # duration look like per-hour totals; linear interpolation is kept for
    # them here, confirm that this is acceptable for the model.
    return df_weather.reset_index()

# start = df['Datetime'].min().tz_localize(None)
# end = df['Datetime'].max().tz_localize(None) + timedelta(hours=1)
# df_weather = fetch_weather_data(start=start, end=end, station_id='10582')
# df_weather.to_csv('data/weather.csv', index=False)

# Load the cached weather data and parse its timestamp column.
df_weather = pd.read_csv('data/weather.csv')
df_weather = df_weather.assign(Datetime=pd.to_datetime(df_weather['Datetime']))

In [7]:
# time-based features
# df['Datetime'] is stored in UTC, but the holiday list holds local calendar
# dates and weekends / hours of the day follow the local clock as well.
# Deriving the features from UTC would mislabel the rows around local midnight
# (e.g. the first hour of a holiday would count as the day before), so all
# calendar features are taken from Europe/Berlin wall-clock time.
local_datetime = df['Datetime'].dt.tz_convert('Europe/Berlin')
df['Holiday'] = local_datetime.dt.date.isin(df_holidays['Holiday'])
df['Hour'] = local_datetime.dt.hour
df['DayOfWeek'] = local_datetime.dt.dayofweek
df['Month'] = local_datetime.dt.month
df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)  # 5 = Saturday, 6 = Sunday

In [9]:
def load_market_data(path: str = 'data/day_ahead_prices.csv') -> pd.DataFrame:
    """Read the semicolon-separated day-ahead price CSV and return it with UTC timestamps.

    Args:
        path: Location of the CSV file.

    Returns:
        The loaded frame; its ``Datetime`` column is parsed as Europe/Berlin
        wall-clock time and converted to timezone-aware UTC.
    """
    prices = pd.read_csv(path, sep=';')
    wall_clock = pd.to_datetime(prices['Datetime'], format='%Y-%m-%d %H:%M:00')
    # Ambiguous autumn timestamps are resolved from row order; nonexistent
    # spring timestamps are moved forward to the next valid time.
    localized = wall_clock.dt.tz_localize(
        'Europe/Berlin',
        ambiguous='infer',
        nonexistent='shift_forward',
    )
    prices['Datetime'] = localized.dt.tz_convert('UTC')
    return prices

# Day-ahead market prices with UTC timestamps, joined onto df on 'Datetime' below.
df_market = load_market_data()

In [None]:
# save dataset
# Left joins keep every row of df; timestamps without a match in the weather
# or market data get missing values in the joined columns.
df = (df
        .merge(df_weather, on=['Datetime'], how='left')
        .merge(df_market, on=['Datetime'], how='left'))
# df.to_csv('data/dataset.csv', sep=';', index=False)