In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mpld3

In [2]:
telemetry_df=pd.read_feather("telemetry_large_consumers_DCW.feather")

## Transform the Wide Format to Long Format

In [3]:
telemetry_df

Unnamed: 0,RND_ID,2023-01-01 00:00,2023-01-01 00:15,2023-01-01 00:30,2023-01-01 00:45,2023-01-01 01:00,2023-01-01 01:15,2023-01-01 01:30,2023-01-01 01:45,2023-01-01 02:00,...,2023-12-31 21:30,2023-12-31 21:45,2023-12-31 22:00,2023-12-31 22:15,2023-12-31 22:30,2023-12-31 22:45,2023-12-31 23:00,2023-12-31 23:15,2023-12-31 23:30,2023-12-31 23:45
0,8423,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
1,6756,36.0,32.0,32.0,36.0,32.0,36.0,32.0,36.0,32.0,...,35.89,35.43,34.09,33.31,30.57,29.65,32.06,29.13,28.25,27.85
2,1077,0.0,4.0,0.0,4.0,0.0,4.0,0.0,4.0,0.0,...,2.00,1.50,1.50,2.00,1.50,1.50,2.00,1.50,2.00,1.50
3,8061,16.0,8.0,12.0,8.0,12.0,16.0,12.0,12.0,12.0,...,8.40,8.80,8.80,11.00,10.19,9.19,8.80,9.00,11.00,12.40
4,10575,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,...,6.67,6.66,3.54,1.15,1.10,1.10,1.10,1.10,1.10,1.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17097,12403,4.0,8.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,6.50,5.00,5.50,5.50,5.50,4.50,4.00,4.00,3.50,4.00
17098,3076,16.0,16.0,16.0,16.0,12.0,16.0,12.0,16.0,12.0,...,0.73,0.44,0.44,0.46,0.44,0.44,0.46,0.44,0.46,0.46
17099,3076,16.0,16.0,16.0,16.0,12.0,16.0,12.0,16.0,12.0,...,0.73,0.44,0.44,0.46,0.44,0.44,0.46,0.44,0.46,0.46
17100,5060,4.0,4.0,8.0,4.0,4.0,4.0,4.0,8.0,4.0,...,5.86,5.87,5.98,5.91,5.71,6.41,5.70,7.20,5.62,5.94


In [10]:
# Check for missing values
missing_data = telemetry_df.isnull().sum()
missing_data


RND_ID               0
2023-01-01 00:00     3
2023-01-01 00:15     3
2023-01-01 00:30     3
2023-01-01 00:45     3
                    ..
2023-12-31 22:45    13
2023-12-31 23:00    34
2023-12-31 23:15    34
2023-12-31 23:30    34
2023-12-31 23:45    34
Length: 35041, dtype: int64

In [4]:
df_long = pd.melt(telemetry_df, id_vars=['RND_ID'], var_name='datetime', value_name='load')

In [6]:
df_long.to_csv("telemetry_long_df.csv")

In [7]:
# Convert the datetime column to datetime format
df_long['datetime'] = pd.to_datetime(df_long['datetime'])

In [8]:
# Set the index for easier time series manipulation
df_long.set_index(['RND_ID', 'datetime'], inplace=True)

In [9]:
df_long.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 599254080 entries, ('8423', Timestamp('2023-01-01 00:00:00')) to ('6693', Timestamp('2023-12-31 23:45:00'))
Data columns (total 1 columns):
 #   Column  Dtype  
---  ------  -----  
 0   load    float64
dtypes: float64(1)
memory usage: 7.8+ GB


# Create lags for 15min, 30min, 45min, and 1 hour

In [None]:
# Create lagged features
for lag in [1, 2,3,4]:  # Create lags for 15min, 30min, 45min, and 1 hour
    df_long[f'load_lag_{lag}'] = df_long.groupby('RND_ID')['load'].shift(lag)

# Drop NaN values that result from lagging
df_long = df_long.dropna()

# Reset index to have a clean DataFrame
df_long = df_long.reset_index()

In [11]:
df_long

Unnamed: 0_level_0,Unnamed: 1_level_0,load,load_lag_1,load_lag_2,load_lag_3,load_lag_4
RND_ID,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4080,2023-01-01 00:30:00,24.00,20.00,20.00,16.00,16.00
4080,2023-01-01 00:30:00,24.00,24.00,20.00,20.00,16.00
5864,2023-01-01 00:30:00,0.00,0.00,0.00,0.00,0.00
5864,2023-01-01 00:30:00,0.00,0.00,0.00,0.00,0.00
1163,2023-01-01 00:30:00,12.00,16.00,16.00,12.00,12.00
...,...,...,...,...,...,...
12403,2023-12-31 23:45:00,4.00,3.50,4.00,4.00,4.50
3076,2023-12-31 23:45:00,0.46,0.46,0.46,0.44,0.44
3076,2023-12-31 23:45:00,0.46,0.46,0.46,0.46,0.44
5060,2023-12-31 23:45:00,5.94,5.62,7.20,5.70,6.41


# Create historical statistics features (mean, median, std)

In [1]:
# Create historical statistics features (mean, median, std)
window_size = 4  # Example window size for rolling statistics (1 hour)
df_long['load_mean'] = df_long.groupby('RND_ID')['load'].transform(lambda x: x.rolling(window=window_size).mean())
df_long['load_median'] = df_long.groupby('RND_ID')['load'].transform(lambda x: x.rolling(window=window_size).median())
df_long['load_std'] = df_long.groupby('RND_ID')['load'].transform(lambda x: x.rolling(window=window_size).std())

NameError: name 'df_long' is not defined

In [None]:
# Create Fourier series terms for capturing periodic patterns
def create_fourier_terms(df, period):
    t = np.arange(len(df))
    df[f'sin_{period}'] = np.sin(2 * np.pi * t / period)
    df[f'cos_{period}'] = np.cos(2 * np.pi * t / period)
    return df

periods = [96, 672]  # Example periods for daily (96 intervals) and weekly (672 intervals) patterns
for period in periods:
    df_long = df_long.groupby('RND_ID').apply(create_fourier_terms, period=period).reset_index(drop=True)

# Drop NaN values that result from lagging and rolling calculations
df_long = df_long.dropna()

# Reset index to have a clean DataFrame
df_long = df_long.reset_index()

# The resulting DataFrame df_long now contains the lagged features, historical statistics, and Fourier series terms
print(df_long.head())