In [1]:
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime, timedelta
from workalendar.europe import Netherlands
import gc
import random
import pandas as pd
import sys
import numpy as np

In [None]:
# Convert the wide consumption table into long form (one row per reading)
def reshape_data(df):
    """Melt a wide per-customer table into long form and keep positive readings.

    Every column other than ``RND_ID``, ``BASELOAD_PROFILE`` and
    ``CONNECTION_CATEGORY`` is treated as a reading column whose header is
    parsed into a timestamp.

    Args:
        df: Wide DataFrame holding the three identifier columns plus one
            column per timestamp.

    Returns:
        A long DataFrame with the identifier columns, a datetime
        ``Timestamp`` column and a ``Consumption`` column, restricted to
        rows whose consumption is strictly greater than zero. The row labels
        produced by the melt are kept, so the index has gaps wherever rows
        were dropped.
    """
    id_columns = ['RND_ID', 'BASELOAD_PROFILE', 'CONNECTION_CATEGORY']
    long_df = df.melt(
        id_vars=id_columns,
        var_name='Timestamp',
        value_name='Consumption',
    )
    long_df['Timestamp'] = pd.to_datetime(long_df['Timestamp'])

    # NaN > 0 evaluates to False, so missing readings are dropped together
    # with zero and negative ones.
    is_positive = long_df['Consumption'] > 0
    return long_df[is_positive]

# Function to add time-based features and calculate monthly statistics
def add_time_features(df):
    """Add calendar features and per-customer monthly consumption statistics.

    The caller's DataFrame is not modified: the calendar columns are added
    to a new frame, which also avoids pandas' SettingWithCopyWarning when
    ``df`` is a filtered slice of another frame.

    Args:
        df: Long-format DataFrame with an ``RND_ID`` column, a datetime
            ``Timestamp`` column and a numeric ``Consumption`` column.

    Returns:
        A new DataFrame (fresh 0..n-1 index produced by the merges) with
        these columns added:

        * ``hour`` - hour of day of the timestamp.
        * ``day_of_week`` - Monday=0 ... Sunday=6.
        * ``month`` - calendar month number (1-12).
        * ``is_weekend`` - True for Saturday/Sunday.
        * ``Consumption_monthly_total`` - sum of ``Consumption`` per
          (``RND_ID``, ``month``).
        * ``Consumption_monthly_avg`` - mean of ``Consumption`` per
          (``RND_ID``, ``month``).
    """
    timestamps = df['Timestamp'].dt
    day_of_week = timestamps.dayofweek

    # assign() returns a new frame, leaving the input untouched.
    df = df.assign(
        hour=timestamps.hour,
        day_of_week=day_of_week,
        month=timestamps.month,
        is_weekend=day_of_week >= 5,
    )

    # NOTE: 'month' carries no year, so the same calendar month of different
    # years falls into one group.
    keys = ['RND_ID', 'month']
    consumption_by_month = df.groupby(keys)['Consumption']
    monthly_total = consumption_by_month.sum().reset_index()
    monthly_avg = consumption_by_month.mean().reset_index()

    # The right-hand 'Consumption' column clashes with the left-hand one, so
    # the suffix turns it into Consumption_monthly_total / _monthly_avg.
    df = pd.merge(df, monthly_total, on=keys, suffixes=('', '_monthly_total'))
    df = pd.merge(df, monthly_avg, on=keys, suffixes=('', '_monthly_avg'))
    return df

# Min-max scale the consumption columns
def normalize_data(df):
    """Rescale the consumption columns with scikit-learn's ``MinMaxScaler``.

    The scaler is fitted on every row of ``df`` at once, so each column is
    scaled by its own minimum and maximum over the whole frame (not per
    customer). ``df`` is modified in place and also returned.

    Args:
        df: DataFrame containing ``Consumption``,
            ``Consumption_monthly_total`` and ``Consumption_monthly_avg``.

    Returns:
        The same DataFrame object with the three columns replaced by their
        scaled values.
    """
    scaled_columns = [
        'Consumption',
        'Consumption_monthly_total',
        'Consumption_monthly_avg',
    ]
    scaled_values = MinMaxScaler().fit_transform(df[scaled_columns])
    df[scaled_columns] = scaled_values
    return df

# Per-customer rolling statistics over consecutive rows
def rolling_window(df, window_size=4):
    """Add a per-customer rolling mean of ``Consumption``.

    The window runs over the rows of each ``RND_ID`` group in the order they
    appear in ``df`` (no sorting is done here). Rows that do not yet have
    ``window_size`` readings behind them get NaN. ``df`` is modified in place
    and also returned.

    Args:
        df: DataFrame with ``RND_ID`` and ``Consumption`` columns.
        window_size: Number of consecutive rows in each window.

    Returns:
        The same DataFrame object with a ``Consumption_rolling_mean`` column.
    """
    consumption_per_customer = df.groupby('RND_ID')['Consumption']
    rolled_mean = consumption_per_customer.rolling(window=window_size).mean()

    # The grouped result is indexed by (RND_ID, original row label); dropping
    # the RND_ID level lets the assignment align on df's own row labels.
    df['Consumption_rolling_mean'] = rolled_mean.reset_index(level=0, drop=True)

    # A rolling median or standard deviation can be produced the same way by
    # swapping .mean() for .median() / .std().
    return df

# Function to combine all steps into a pipeline for the first N customers
def data_pipeline(df, num_customers=100):
    """Run the full preprocessing pipeline on a subset of customers.

    Steps, in order: reshape to long form, keep the first ``num_customers``
    customers, add time features and monthly statistics, min-max scale the
    consumption columns, add the rolling mean.

    Args:
        df: Wide DataFrame as accepted by ``reshape_data``.
        num_customers: How many distinct ``RND_ID`` values to keep, taken in
            order of first appearance in the reshaped data. Customers with
            no rows left after reshaping are therefore not counted.

    Returns:
        The processed long-format DataFrame.
    """
    df = reshape_data(df)
    selected_customers = df['RND_ID'].unique()[:num_customers]

    # .copy() hands the later steps a frame they own. Without it their column
    # assignments would target a filtered slice, which raises pandas'
    # SettingWithCopyWarning.
    df = df[df['RND_ID'].isin(selected_customers)].copy()

    df = add_time_features(df)
    df = normalize_data(df)
    # The rolling mean is computed on the already-scaled consumption values.
    df = rolling_window(df)
    return df

# Example usage: load the wide consumption table from CSV, run the full
# pipeline for the first 100 customers and print the first 10 processed rows.
df = pd.read_csv('merged_dfs.csv')  # Replace with your actual data file
processed_data = data_pipeline(df, num_customers=100)
print(processed_data.head(10))