In [None]:
from datetime import datetime, timedelta
from workalendar.europe import Netherlands
import gc
import random
import pandas as pd
import sys
import numpy as np

raw_path = '../data/raw/'
processed_path = '../data/processed/'
sys.path.append(raw_path)

In [10]:
merged_data = pd.read_feather(processed_path + "df_merged_unfiltered.feather")

In [13]:
merged_data = merged_data.head(100) # Filter for a smaller df
merged_data

Unnamed: 0,RND_ID,2023-01-01 00:00,2023-01-01 00:15,2023-01-01 00:30,2023-01-01 00:45,2023-01-01 01:00,2023-01-01 01:15,2023-01-01 01:30,2023-01-01 01:45,2023-01-01 02:00,...,2023-12-31 22:00,2023-12-31 22:15,2023-12-31 22:30,2023-12-31 22:45,2023-12-31 23:00,2023-12-31 23:15,2023-12-31 23:30,2023-12-31 23:45,Baseload_profile,Connection category
0,8423,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,010,AC4A
1,6756,36.0,32.0,32.0,36.0,32.0,36.0,32.0,36.0,32.0,...,34.09,33.31,30.57,29.65,32.06,29.13,28.25,27.85,E3B,AC4B
2,1077,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,...,1.50,2.00,1.50,1.50,2.00,1.50,2.00,1.50,008,AC4A
3,8061,16.0,8.0,12.0,8.0,12.0,16.0,12.0,12.0,12.0,...,8.80,11.00,10.19,9.19,8.80,9.00,11.00,12.40,001,AC4B
4,10575,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,...,3.54,1.15,1.10,1.10,1.10,1.10,1.10,1.12,008,AC4B
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,11466,16.0,16.0,16.0,12.0,16.0,16.0,16.0,12.0,16.0,...,7.79,8.09,8.09,8.09,6.90,6.59,6.90,6.90,018,AC4A
96,810,12.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,...,2.39,2.39,2.39,2.39,2.10,2.39,2.39,2.39,E3C,AC4B
97,12125,16.0,16.0,16.0,16.0,20.0,16.0,20.0,20.0,16.0,...,14.00,13.35,14.24,19.64,13.32,13.03,14.19,14.96,008,AC4B
98,2901,12.0,8.0,4.0,4.0,4.0,4.0,4.0,0.0,4.0,...,2.70,2.70,2.44,2.35,2.39,2.24,2.75,2.89,E3C,AC4A


In [14]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from workalendar.europe import Netherlands

def is_workday(date):
    """
    Check if the date is a workday (Monday to Friday).
    """
    return date.weekday() < 5

def is_working_hours(time):
    """
    Check if the time is within working hours (9:00 to 17:00).
    """
    return 9 <= time.hour < 17

def is_not_holiday(date, holidays):
    """
    Check if the given date is not a holiday.
    """
    return date not in holidays

def get_dutch_holidays(year):
    """
    Get Dutch holidays for a specified year.
    """
    cal = Netherlands()
    return {date: name for date, name in cal.holidays(year)}

def generate_time_slots(start_date, end_date, interval='15min'):
    """
    Generate time slots between two dates at a specified interval.
    """
    return pd.date_range(start=start_date, end=end_date, freq=interval)

# Initialize Dutch calendar
def get_low_high_tariff():
    cal = Netherlands()
    # Get holidays for a specific year
    year = 2023
    nl_holidays_2023 = cal.holidays(year)

    # Define start and end date
    start_date = datetime(2023, 1, 1, 0, 0)
    end_date = datetime(2023, 12, 31, 23, 45)

    # Initialize list to store timestamps
    timestamps = []

    # Generate timestamps with 15-minute intervals
    current_date = start_date
    while current_date <= end_date:
        timestamps.append(current_date)
        current_date += timedelta(minutes=15)

    # Generate the binary sequence
    high_tariff = []
    for timestamp in timestamps:
        if is_workday(timestamp) and is_working_hours(timestamp.time()) and is_not_holiday(timestamp, nl_holidays_2023):
            high_tariff.append(1)
        else:
            high_tariff.append(0)
    return high_tariff, timestamps

# Get the high/low tariff and timestamps
high_tariff, timestamps = get_low_high_tariff()

# Convert timestamps to match the format in the dataset
timestamp_str = [ts.strftime('%Y-%m-%d %H:%M') for ts in timestamps]

# Create a DataFrame with the new features
tariff_df = pd.DataFrame({
    'Timestamp': timestamp_str,
    'High_Tariff': high_tariff
})
tariff_df['Timestamp'] = pd.to_datetime(tariff_df['Timestamp'])

In [16]:
# Extract date-time columns and RND_ID, Baseload_profile, Connection category columns
date_time_cols = [col for col in merged_data.columns if '2023' in col]
id_cols = [col for col in merged_data.columns if col not in date_time_cols]

# Convert date-time columns to long format
merged_data = merged_data.melt(id_vars=id_cols, value_vars=date_time_cols, var_name='Timestamp', value_name='Consumption')

# Convert 'Timestamp' to string to ensure format consistency
merged_data['Timestamp'] = merged_data['Timestamp'].astype(str)

In [17]:
# Convert 'Timestamp' to datetime
merged_data['Timestamp'] = pd.to_datetime(merged_data['Timestamp'])

# Merge tariff information with the main data
merged_data = pd.merge(merged_data, tariff_df, on='Timestamp', how='left')

In [18]:
# Add time-based features
merged_data['hour'] = merged_data['Timestamp'].dt.hour
merged_data['day_of_week'] = merged_data['Timestamp'].dt.dayofweek
merged_data['month'] = merged_data['Timestamp'].dt.month
merged_data['is_weekend'] = merged_data['day_of_week'] >= 5

# Compute monthly total and average consumption
monthly_total = merged_data.groupby(['RND_ID', 'month'])['Consumption'].sum().reset_index()
monthly_avg = merged_data.groupby(['RND_ID', 'month'])['Consumption'].mean().reset_index()

# Merge monthly total and average back to the main data
merged_data = pd.merge(merged_data, monthly_total, on=['RND_ID', 'month'], suffixes=('', '_monthly_total'))
merged_data = pd.merge(merged_data, monthly_avg, on=['RND_ID', 'month'], suffixes=('', '_monthly_avg'))

In [19]:
merged_data

Unnamed: 0,RND_ID,Baseload_profile,Connection category,Timestamp,Consumption,High_Tariff,hour,day_of_week,month,is_weekend,Consumption_monthly_total,Consumption_monthly_avg
0,8423,010,AC4A,2023-01-01 00:00:00,0.00,0,0,6,1,True,23104.00,7.763441
1,8423,010,AC4A,2023-01-01 00:15:00,0.00,0,0,6,1,True,23104.00,7.763441
2,8423,010,AC4A,2023-01-01 00:30:00,0.00,0,0,6,1,True,23104.00,7.763441
3,8423,010,AC4A,2023-01-01 00:45:00,0.00,0,0,6,1,True,23104.00,7.763441
4,8423,010,AC4A,2023-01-01 01:00:00,4.00,0,1,6,1,True,23104.00,7.763441
...,...,...,...,...,...,...,...,...,...,...,...,...
3503995,4109,E3A,AC4B,2023-12-31 22:45:00,7.51,0,22,6,12,True,21328.84,7.166949
3503996,4109,E3A,AC4B,2023-12-31 23:00:00,7.20,0,23,6,12,True,21328.84,7.166949
3503997,4109,E3A,AC4B,2023-12-31 23:15:00,7.28,0,23,6,12,True,21328.84,7.166949
3503998,4109,E3A,AC4B,2023-12-31 23:30:00,7.48,0,23,6,12,True,21328.84,7.166949
