#### Library

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Set pandas options to display all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import os

#### Data Loading

In [2]:
# Folder containing CSV files
folder_path = "/home/kevin/Downloads/BESS/Jobs/Inferencing/data/raw"

df_list = []
# Loop through each file in the folder
for file in os.listdir(folder_path):
    if file.endswith(".csv"):
        file_path = os.path.join(folder_path, file)
        df_name = os.path.splitext(file)[0].split('20')[0]  # Remove the .csv extension for naming
        globals()[df_name] = pd.read_csv(file_path)  # Create a variable dynamically
        print(f"Loaded DataFrame: {df_name}, Shape: {globals()[df_name].shape}")
        df_list.append(df_name)

# Sort the list of DataFrames
df_list.sort()


Loaded DataFrame: wind_generation_, Shape: (24, 3)
Loaded DataFrame: windspeed_edmonton_, Shape: (24, 3)
Loaded DataFrame: price_, Shape: (48, 4)
Loaded DataFrame: temperature_fortmc_, Shape: (24, 3)
Loaded DataFrame: temperature_edmonton_, Shape: (24, 3)
Loaded DataFrame: windspeed_fortmc_, Shape: (24, 3)
Loaded DataFrame: temperature_calgary_, Shape: (24, 3)
Loaded DataFrame: windspeed_calgary_, Shape: (24, 3)
Loaded DataFrame: AIL_, Shape: (48, 3)
Loaded DataFrame: solar_generation_, Shape: (24, 3)


In [3]:
for df_name in df_list:
    print(f"\nDataFrame: {df_name}")
    display(globals()[df_name].head())


DataFrame: AIL_


Unnamed: 0.1,Unnamed: 0,begin_datetime_mpt,alberta_internal_load
0,16,2025-03-19 16:00,10756
1,17,2025-03-19 17:00,10818
2,18,2025-03-19 18:00,10688
3,19,2025-03-19 19:00,10744
4,20,2025-03-19 20:00,10899



DataFrame: price_


Unnamed: 0.1,Unnamed: 0,begin_datetime_mpt,forecast_pool_price,rolling_30day_avg_price
0,16,2025-03-19 16:00:00,33.97,28.32
1,17,2025-03-19 17:00:00,33.08,28.31
2,18,2025-03-19 18:00:00,55.44,28.33
3,19,2025-03-19 19:00:00,59.55,28.36
4,20,2025-03-19 20:00:00,26.38,28.35



DataFrame: solar_generation_


Unnamed: 0.1,Unnamed: 0,datetime_,solar_generation
0,0,2025-03-20 16:00,727.12
1,1,2025-03-20 17:00,575.42
2,2,2025-03-20 18:00,259.93
3,3,2025-03-20 19:00,28.76
4,4,2025-03-20 20:00,0.0



DataFrame: temperature_calgary_


Unnamed: 0.1,Unnamed: 0,Timestamp,Temperature (°C)
0,16,2025-03-20 16:00:00,8.1
1,17,2025-03-20 17:00:00,7.4
2,18,2025-03-20 18:00:00,4.9
3,19,2025-03-20 19:00:00,4.6
4,20,2025-03-20 20:00:00,3.5



DataFrame: temperature_edmonton_


Unnamed: 0.1,Unnamed: 0,Timestamp,Temperature (°C)
0,16,2025-03-20 16:00:00,2.4
1,17,2025-03-20 17:00:00,2.5
2,18,2025-03-20 18:00:00,2.2
3,19,2025-03-20 19:00:00,1.3
4,20,2025-03-20 20:00:00,0.4



DataFrame: temperature_fortmc_


Unnamed: 0.1,Unnamed: 0,Timestamp,Temperature (°C)
0,16,2025-03-20 16:00:00,-6.4
1,17,2025-03-20 17:00:00,-5.7
2,18,2025-03-20 18:00:00,-5.5
3,19,2025-03-20 19:00:00,-5.9
4,20,2025-03-20 20:00:00,-7.1



DataFrame: wind_generation_


Unnamed: 0.1,Unnamed: 0,datetime_,wind_generation
0,0,2025-03-20 16:00,2073.92
1,1,2025-03-20 17:00,2085.39
2,2,2025-03-20 18:00,1900.94
3,3,2025-03-20 19:00,1803.97
4,4,2025-03-20 20:00,1785.97



DataFrame: windspeed_calgary_


Unnamed: 0.1,Unnamed: 0,Timestamp,Wind Speed (km/h)
0,16,2025-03-20 16:00:00,26.9
1,17,2025-03-20 17:00:00,25.3
2,18,2025-03-20 18:00:00,28.5
3,19,2025-03-20 19:00:00,24.3
4,20,2025-03-20 20:00:00,11.8



DataFrame: windspeed_edmonton_


Unnamed: 0.1,Unnamed: 0,Timestamp,Wind Speed (km/h)
0,16,2025-03-20 16:00:00,3.8
1,17,2025-03-20 17:00:00,5.4
2,18,2025-03-20 18:00:00,5.4
3,19,2025-03-20 19:00:00,3.7
4,20,2025-03-20 20:00:00,2.0



DataFrame: windspeed_fortmc_


Unnamed: 0.1,Unnamed: 0,Timestamp,Wind Speed (km/h)
0,16,2025-03-20 16:00:00,10.5
1,17,2025-03-20 17:00:00,10.2
2,18,2025-03-20 18:00:00,9.8
3,19,2025-03-20 19:00:00,9.7
4,20,2025-03-20 20:00:00,9.4


#### Column Name Adjustment

In [4]:
for df_name in df_list:
    print(f"\nDataFrame: {df_name}")
    globals()[df_name].drop(['Unnamed: 0'], axis=1, inplace=True)
    display(globals()[df_name].head())


DataFrame: AIL_


Unnamed: 0,begin_datetime_mpt,alberta_internal_load
0,2025-03-19 16:00,10756
1,2025-03-19 17:00,10818
2,2025-03-19 18:00,10688
3,2025-03-19 19:00,10744
4,2025-03-19 20:00,10899



DataFrame: price_


Unnamed: 0,begin_datetime_mpt,forecast_pool_price,rolling_30day_avg_price
0,2025-03-19 16:00:00,33.97,28.32
1,2025-03-19 17:00:00,33.08,28.31
2,2025-03-19 18:00:00,55.44,28.33
3,2025-03-19 19:00:00,59.55,28.36
4,2025-03-19 20:00:00,26.38,28.35



DataFrame: solar_generation_


Unnamed: 0,datetime_,solar_generation
0,2025-03-20 16:00,727.12
1,2025-03-20 17:00,575.42
2,2025-03-20 18:00,259.93
3,2025-03-20 19:00,28.76
4,2025-03-20 20:00,0.0



DataFrame: temperature_calgary_


Unnamed: 0,Timestamp,Temperature (°C)
0,2025-03-20 16:00:00,8.1
1,2025-03-20 17:00:00,7.4
2,2025-03-20 18:00:00,4.9
3,2025-03-20 19:00:00,4.6
4,2025-03-20 20:00:00,3.5



DataFrame: temperature_edmonton_


Unnamed: 0,Timestamp,Temperature (°C)
0,2025-03-20 16:00:00,2.4
1,2025-03-20 17:00:00,2.5
2,2025-03-20 18:00:00,2.2
3,2025-03-20 19:00:00,1.3
4,2025-03-20 20:00:00,0.4



DataFrame: temperature_fortmc_


Unnamed: 0,Timestamp,Temperature (°C)
0,2025-03-20 16:00:00,-6.4
1,2025-03-20 17:00:00,-5.7
2,2025-03-20 18:00:00,-5.5
3,2025-03-20 19:00:00,-5.9
4,2025-03-20 20:00:00,-7.1



DataFrame: wind_generation_


Unnamed: 0,datetime_,wind_generation
0,2025-03-20 16:00,2073.92
1,2025-03-20 17:00,2085.39
2,2025-03-20 18:00,1900.94
3,2025-03-20 19:00,1803.97
4,2025-03-20 20:00,1785.97



DataFrame: windspeed_calgary_


Unnamed: 0,Timestamp,Wind Speed (km/h)
0,2025-03-20 16:00:00,26.9
1,2025-03-20 17:00:00,25.3
2,2025-03-20 18:00:00,28.5
3,2025-03-20 19:00:00,24.3
4,2025-03-20 20:00:00,11.8



DataFrame: windspeed_edmonton_


Unnamed: 0,Timestamp,Wind Speed (km/h)
0,2025-03-20 16:00:00,3.8
1,2025-03-20 17:00:00,5.4
2,2025-03-20 18:00:00,5.4
3,2025-03-20 19:00:00,3.7
4,2025-03-20 20:00:00,2.0



DataFrame: windspeed_fortmc_


Unnamed: 0,Timestamp,Wind Speed (km/h)
0,2025-03-20 16:00:00,10.5
1,2025-03-20 17:00:00,10.2
2,2025-03-20 18:00:00,9.8
3,2025-03-20 19:00:00,9.7
4,2025-03-20 20:00:00,9.4


In [5]:
# Rename columns
price_.rename(columns={'begin_datetime_mpt': 'datetime_'}, inplace=True)

AIL_.rename(columns={'begin_datetime_mpt': 'datetime_'}, inplace=True)

solar_generation_.rename(columns={'Date (MPT)': 'datetime_', 'Volume':'solar_generation'}, inplace=True)

wind_generation_.rename(columns={'Date (MPT)': 'datetime_', 'Volume':'wind_generation'}, inplace=True)

temperature_calgary_.rename(columns={'Timestamp_mst': 'datetime_', 'Temperature (°C)':'temp_calgary'}, inplace=True)

temperature_edmonton_.rename(columns={'Timestamp_mst': 'datetime_', 'Temperature (°C)':'temp_edmonton'}, inplace=True)

temperature_fortmc_.rename(columns={'Timestamp_mst': 'datetime_', 'Temperature (°C)':'temp_fortmc'}, inplace=True)

windspeed_calgary_.rename(columns={'Timestamp_mst': 'datetime_', 'Wind Speed (km/h)':'ws_calgary'}, inplace=True)

windspeed_edmonton_.rename(columns={'Timestamp_mst': 'datetime_', 'Wind Speed (km/h)':'ws_edmonton'}, inplace=True)

windspeed_fortmc_.rename(columns={'Timestamp_mst': 'datetime_', 'Wind Speed (km/h)':'ws_fortmc'}, inplace=True)

In [6]:
for df_name in df_list:
    print(f"\nDataFrame: {df_name}")
    display(globals()[df_name].head())


DataFrame: AIL_


Unnamed: 0,datetime_,alberta_internal_load
0,2025-03-19 16:00,10756
1,2025-03-19 17:00,10818
2,2025-03-19 18:00,10688
3,2025-03-19 19:00,10744
4,2025-03-19 20:00,10899



DataFrame: price_


Unnamed: 0,datetime_,forecast_pool_price,rolling_30day_avg_price
0,2025-03-19 16:00:00,33.97,28.32
1,2025-03-19 17:00:00,33.08,28.31
2,2025-03-19 18:00:00,55.44,28.33
3,2025-03-19 19:00:00,59.55,28.36
4,2025-03-19 20:00:00,26.38,28.35



DataFrame: solar_generation_


Unnamed: 0,datetime_,solar_generation
0,2025-03-20 16:00,727.12
1,2025-03-20 17:00,575.42
2,2025-03-20 18:00,259.93
3,2025-03-20 19:00,28.76
4,2025-03-20 20:00,0.0



DataFrame: temperature_calgary_


Unnamed: 0,Timestamp,temp_calgary
0,2025-03-20 16:00:00,8.1
1,2025-03-20 17:00:00,7.4
2,2025-03-20 18:00:00,4.9
3,2025-03-20 19:00:00,4.6
4,2025-03-20 20:00:00,3.5



DataFrame: temperature_edmonton_


Unnamed: 0,Timestamp,temp_edmonton
0,2025-03-20 16:00:00,2.4
1,2025-03-20 17:00:00,2.5
2,2025-03-20 18:00:00,2.2
3,2025-03-20 19:00:00,1.3
4,2025-03-20 20:00:00,0.4



DataFrame: temperature_fortmc_


Unnamed: 0,Timestamp,temp_fortmc
0,2025-03-20 16:00:00,-6.4
1,2025-03-20 17:00:00,-5.7
2,2025-03-20 18:00:00,-5.5
3,2025-03-20 19:00:00,-5.9
4,2025-03-20 20:00:00,-7.1



DataFrame: wind_generation_


Unnamed: 0,datetime_,wind_generation
0,2025-03-20 16:00,2073.92
1,2025-03-20 17:00,2085.39
2,2025-03-20 18:00,1900.94
3,2025-03-20 19:00,1803.97
4,2025-03-20 20:00,1785.97



DataFrame: windspeed_calgary_


Unnamed: 0,Timestamp,ws_calgary
0,2025-03-20 16:00:00,26.9
1,2025-03-20 17:00:00,25.3
2,2025-03-20 18:00:00,28.5
3,2025-03-20 19:00:00,24.3
4,2025-03-20 20:00:00,11.8



DataFrame: windspeed_edmonton_


Unnamed: 0,Timestamp,ws_edmonton
0,2025-03-20 16:00:00,3.8
1,2025-03-20 17:00:00,5.4
2,2025-03-20 18:00:00,5.4
3,2025-03-20 19:00:00,3.7
4,2025-03-20 20:00:00,2.0



DataFrame: windspeed_fortmc_


Unnamed: 0,Timestamp,ws_fortmc
0,2025-03-20 16:00:00,10.5
1,2025-03-20 17:00:00,10.2
2,2025-03-20 18:00:00,9.8
3,2025-03-20 19:00:00,9.7
4,2025-03-20 20:00:00,9.4


In [7]:
# convert columns to datetime
for df_name in df_list:
    print(f"\nDataFrame: {df_name}")
    try:
        globals()[df_name]['datetime_'] = pd.to_datetime(globals()[df_name]['datetime_'])
        display(globals()[df_name].head())
    except:
        globals()[df_name]['datetime_'] = pd.to_datetime(globals()[df_name]['Timestamp'])
        globals()[df_name].drop(['Timestamp'], axis=1, inplace=True)
        display(globals()[df_name].head())


DataFrame: AIL_


Unnamed: 0,datetime_,alberta_internal_load
0,2025-03-19 16:00:00,10756
1,2025-03-19 17:00:00,10818
2,2025-03-19 18:00:00,10688
3,2025-03-19 19:00:00,10744
4,2025-03-19 20:00:00,10899



DataFrame: price_




Unnamed: 0,datetime_,forecast_pool_price,rolling_30day_avg_price
0,2025-03-19 16:00:00,33.97,28.32
1,2025-03-19 17:00:00,33.08,28.31
2,2025-03-19 18:00:00,55.44,28.33
3,2025-03-19 19:00:00,59.55,28.36
4,2025-03-19 20:00:00,26.38,28.35



DataFrame: solar_generation_


Unnamed: 0,datetime_,solar_generation
0,2025-03-20 16:00:00,727.12
1,2025-03-20 17:00:00,575.42
2,2025-03-20 18:00:00,259.93
3,2025-03-20 19:00:00,28.76
4,2025-03-20 20:00:00,0.0



DataFrame: temperature_calgary_


Unnamed: 0,temp_calgary,datetime_
0,8.1,2025-03-20 16:00:00
1,7.4,2025-03-20 17:00:00
2,4.9,2025-03-20 18:00:00
3,4.6,2025-03-20 19:00:00
4,3.5,2025-03-20 20:00:00



DataFrame: temperature_edmonton_


Unnamed: 0,temp_edmonton,datetime_
0,2.4,2025-03-20 16:00:00
1,2.5,2025-03-20 17:00:00
2,2.2,2025-03-20 18:00:00
3,1.3,2025-03-20 19:00:00
4,0.4,2025-03-20 20:00:00



DataFrame: temperature_fortmc_


Unnamed: 0,temp_fortmc,datetime_
0,-6.4,2025-03-20 16:00:00
1,-5.7,2025-03-20 17:00:00
2,-5.5,2025-03-20 18:00:00
3,-5.9,2025-03-20 19:00:00
4,-7.1,2025-03-20 20:00:00



DataFrame: wind_generation_


Unnamed: 0,datetime_,wind_generation
0,2025-03-20 16:00:00,2073.92
1,2025-03-20 17:00:00,2085.39
2,2025-03-20 18:00:00,1900.94
3,2025-03-20 19:00:00,1803.97
4,2025-03-20 20:00:00,1785.97



DataFrame: windspeed_calgary_


Unnamed: 0,ws_calgary,datetime_
0,26.9,2025-03-20 16:00:00
1,25.3,2025-03-20 17:00:00
2,28.5,2025-03-20 18:00:00
3,24.3,2025-03-20 19:00:00
4,11.8,2025-03-20 20:00:00



DataFrame: windspeed_edmonton_


Unnamed: 0,ws_edmonton,datetime_
0,3.8,2025-03-20 16:00:00
1,5.4,2025-03-20 17:00:00
2,5.4,2025-03-20 18:00:00
3,3.7,2025-03-20 19:00:00
4,2.0,2025-03-20 20:00:00



DataFrame: windspeed_fortmc_


Unnamed: 0,ws_fortmc,datetime_
0,10.5,2025-03-20 16:00:00
1,10.2,2025-03-20 17:00:00
2,9.8,2025-03-20 18:00:00
3,9.7,2025-03-20 19:00:00
4,9.4,2025-03-20 20:00:00


In [8]:
# Initialize the merged dataframe with the first dataframe in the list
merged_df = globals()[df_list[0]]

# Loop through the remaining dataframes and merge them one by one
for df_name in df_list[1:]:
    merged_df = merged_df.merge(globals()[df_name], on='datetime_', how='outer')

# Display the merged dataframe
display(merged_df.head())

Unnamed: 0,datetime_,alberta_internal_load,forecast_pool_price,rolling_30day_avg_price,solar_generation,temp_calgary,temp_edmonton,temp_fortmc,wind_generation,ws_calgary,ws_edmonton,ws_fortmc
0,2025-03-19 16:00:00,10756,33.97,28.32,,,,,,,,
1,2025-03-19 17:00:00,10818,33.08,28.31,,,,,,,,
2,2025-03-19 18:00:00,10688,55.44,28.33,,,,,,,,
3,2025-03-19 19:00:00,10744,59.55,28.36,,,,,,,,
4,2025-03-19 20:00:00,10899,26.38,28.35,,,,,,,,


In [9]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48 entries, 0 to 47
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   datetime_                48 non-null     datetime64[ns]
 1   alberta_internal_load    48 non-null     int64         
 2   forecast_pool_price      47 non-null     float64       
 3   rolling_30day_avg_price  47 non-null     float64       
 4   solar_generation         24 non-null     float64       
 5   temp_calgary             24 non-null     float64       
 6   temp_edmonton            24 non-null     float64       
 7   temp_fortmc              24 non-null     float64       
 8   wind_generation          24 non-null     float64       
 9   ws_calgary               24 non-null     float64       
 10  ws_edmonton              24 non-null     float64       
 11  ws_fortmc                24 non-null     float64       
dtypes: datetime64[ns](1), float64(10), int

In [10]:
merged_df.shape

(48, 12)

In [11]:
merged_df.isnull().sum()

datetime_                   0
alberta_internal_load       0
forecast_pool_price         1
rolling_30day_avg_price     1
solar_generation           24
temp_calgary               24
temp_edmonton              24
temp_fortmc                24
wind_generation            24
ws_calgary                 24
ws_edmonton                24
ws_fortmc                  24
dtype: int64

#### Feature Engineering

##### time based variables

In [12]:
merged_df['hour_of_day'] = merged_df['datetime_'].dt.hour
merged_df['day_of_week'] = merged_df['datetime_'].dt.dayofweek
merged_df['day_of_month'] = merged_df['datetime_'].dt.day
# Calculate the week of the month (from 1 to 5)
merged_df['week_of_month'] = merged_df['datetime_'].apply(lambda x: (x.day - 1) // 7 + 1)
merged_df['month'] = merged_df['datetime_'].dt.month
merged_df['year'] = merged_df['datetime_'].dt.year

# Display the updated dataframe
display(merged_df.head())

Unnamed: 0,datetime_,alberta_internal_load,forecast_pool_price,rolling_30day_avg_price,solar_generation,temp_calgary,temp_edmonton,temp_fortmc,wind_generation,ws_calgary,ws_edmonton,ws_fortmc,hour_of_day,day_of_week,day_of_month,week_of_month,month,year
0,2025-03-19 16:00:00,10756,33.97,28.32,,,,,,,,,16,2,19,3,3,2025
1,2025-03-19 17:00:00,10818,33.08,28.31,,,,,,,,,17,2,19,3,3,2025
2,2025-03-19 18:00:00,10688,55.44,28.33,,,,,,,,,18,2,19,3,3,2025
3,2025-03-19 19:00:00,10744,59.55,28.36,,,,,,,,,19,2,19,3,3,2025
4,2025-03-19 20:00:00,10899,26.38,28.35,,,,,,,,,20,2,19,3,3,2025


##### seasonality

In [13]:
# Define a function to determine the season
def get_season(month):
    if month in [5, 6, 7, 8, 9]:
        return 0 # Summer
    else:
        return 1 # Winter

# Apply the function to create a new column 'season'
merged_df['is_winter'] = merged_df['month'].apply(get_season)

# Display the updated dataframe
display(merged_df.head())

Unnamed: 0,datetime_,alberta_internal_load,forecast_pool_price,rolling_30day_avg_price,solar_generation,temp_calgary,temp_edmonton,temp_fortmc,wind_generation,ws_calgary,ws_edmonton,ws_fortmc,hour_of_day,day_of_week,day_of_month,week_of_month,month,year,is_winter
0,2025-03-19 16:00:00,10756,33.97,28.32,,,,,,,,,16,2,19,3,3,2025,1
1,2025-03-19 17:00:00,10818,33.08,28.31,,,,,,,,,17,2,19,3,3,2025,1
2,2025-03-19 18:00:00,10688,55.44,28.33,,,,,,,,,18,2,19,3,3,2025,1
3,2025-03-19 19:00:00,10744,59.55,28.36,,,,,,,,,19,2,19,3,3,2025,1
4,2025-03-19 20:00:00,10899,26.38,28.35,,,,,,,,,20,2,19,3,3,2025,1


##### pool_price_lagged_values_till_24_hours

In [14]:
def generate_lagged_features(df, column, num_lags):
    for lag in range(1, num_lags + 1):
        df[f'pool_price_lag_{lag}'] = df[column].shift(lag)
    return df

# Generate 24-hour lagged values for the pool_price column
merged_df = generate_lagged_features(merged_df, 'forecast_pool_price', 24)

# Display the updated dataframe
display(merged_df.head())

Unnamed: 0,datetime_,alberta_internal_load,forecast_pool_price,rolling_30day_avg_price,solar_generation,temp_calgary,temp_edmonton,temp_fortmc,wind_generation,ws_calgary,ws_edmonton,ws_fortmc,hour_of_day,day_of_week,day_of_month,week_of_month,month,year,is_winter,pool_price_lag_1,pool_price_lag_2,pool_price_lag_3,pool_price_lag_4,pool_price_lag_5,pool_price_lag_6,pool_price_lag_7,pool_price_lag_8,pool_price_lag_9,pool_price_lag_10,pool_price_lag_11,pool_price_lag_12,pool_price_lag_13,pool_price_lag_14,pool_price_lag_15,pool_price_lag_16,pool_price_lag_17,pool_price_lag_18,pool_price_lag_19,pool_price_lag_20,pool_price_lag_21,pool_price_lag_22,pool_price_lag_23,pool_price_lag_24
0,2025-03-19 16:00:00,10756,33.97,28.32,,,,,,,,,16,2,19,3,3,2025,1,,,,,,,,,,,,,,,,,,,,,,,,
1,2025-03-19 17:00:00,10818,33.08,28.31,,,,,,,,,17,2,19,3,3,2025,1,33.97,,,,,,,,,,,,,,,,,,,,,,,
2,2025-03-19 18:00:00,10688,55.44,28.33,,,,,,,,,18,2,19,3,3,2025,1,33.08,33.97,,,,,,,,,,,,,,,,,,,,,,
3,2025-03-19 19:00:00,10744,59.55,28.36,,,,,,,,,19,2,19,3,3,2025,1,55.44,33.08,33.97,,,,,,,,,,,,,,,,,,,,,
4,2025-03-19 20:00:00,10899,26.38,28.35,,,,,,,,,20,2,19,3,3,2025,1,59.55,55.44,33.08,33.97,,,,,,,,,,,,,,,,,,,,


##### AIL_lagged_values_till_24_hours

In [16]:
def generate_lagged_features(df, column, num_lags):
    for lag in range(1, num_lags + 1):
        df[f'{column}_lag_{lag}'] = df[column].shift(lag)
    return df

# Generate 24-hour lagged values for the AIL column
merged_df = generate_lagged_features(merged_df, 'alberta_internal_load', 24)

# Display the updated dataframe
display(merged_df.head())

Unnamed: 0,datetime_,alberta_internal_load,forecast_pool_price,rolling_30day_avg_price,solar_generation,temp_calgary,temp_edmonton,temp_fortmc,wind_generation,ws_calgary,ws_edmonton,ws_fortmc,hour_of_day,day_of_week,day_of_month,week_of_month,month,year,is_winter,pool_price_lag_1,pool_price_lag_2,pool_price_lag_3,pool_price_lag_4,pool_price_lag_5,pool_price_lag_6,pool_price_lag_7,pool_price_lag_8,pool_price_lag_9,pool_price_lag_10,pool_price_lag_11,pool_price_lag_12,pool_price_lag_13,pool_price_lag_14,pool_price_lag_15,pool_price_lag_16,pool_price_lag_17,pool_price_lag_18,pool_price_lag_19,pool_price_lag_20,pool_price_lag_21,pool_price_lag_22,pool_price_lag_23,pool_price_lag_24,alberta_internal_load_lag_1,alberta_internal_load_lag_2,alberta_internal_load_lag_3,alberta_internal_load_lag_4,alberta_internal_load_lag_5,alberta_internal_load_lag_6,alberta_internal_load_lag_7,alberta_internal_load_lag_8,alberta_internal_load_lag_9,alberta_internal_load_lag_10,alberta_internal_load_lag_11,alberta_internal_load_lag_12,alberta_internal_load_lag_13,alberta_internal_load_lag_14,alberta_internal_load_lag_15,alberta_internal_load_lag_16,alberta_internal_load_lag_17,alberta_internal_load_lag_18,alberta_internal_load_lag_19,alberta_internal_load_lag_20,alberta_internal_load_lag_21,alberta_internal_load_lag_22,alberta_internal_load_lag_23,alberta_internal_load_lag_24
0,2025-03-19 16:00:00,10756,33.97,28.32,,,,,,,,,16,2,19,3,3,2025,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2025-03-19 17:00:00,10818,33.08,28.31,,,,,,,,,17,2,19,3,3,2025,1,33.97,,,,,,,,,,,,,,,,,,,,,,,,10756.0,,,,,,,,,,,,,,,,,,,,,,,
2,2025-03-19 18:00:00,10688,55.44,28.33,,,,,,,,,18,2,19,3,3,2025,1,33.08,33.97,,,,,,,,,,,,,,,,,,,,,,,10818.0,10756.0,,,,,,,,,,,,,,,,,,,,,,
3,2025-03-19 19:00:00,10744,59.55,28.36,,,,,,,,,19,2,19,3,3,2025,1,55.44,33.08,33.97,,,,,,,,,,,,,,,,,,,,,,10688.0,10818.0,10756.0,,,,,,,,,,,,,,,,,,,,,
4,2025-03-19 20:00:00,10899,26.38,28.35,,,,,,,,,20,2,19,3,3,2025,1,59.55,55.44,33.08,33.97,,,,,,,,,,,,,,,,,,,,,10744.0,10688.0,10818.0,10756.0,,,,,,,,,,,,,,,,,,,,


In [17]:
merged_df = merged_df[merged_df['datetime_'] >= '2025-03-20 16:00:00'] # replace by datetime on which data is fetched

merged_df.isnull().sum().sort_values(ascending=False) 

forecast_pool_price             1
rolling_30day_avg_price         1
alberta_internal_load_lag_24    0
pool_price_lag_6                0
pool_price_lag_1                0
pool_price_lag_2                0
pool_price_lag_3                0
pool_price_lag_4                0
pool_price_lag_5                0
pool_price_lag_8                0
pool_price_lag_7                0
year                            0
pool_price_lag_9                0
pool_price_lag_10               0
pool_price_lag_11               0
pool_price_lag_12               0
is_winter                       0
week_of_month                   0
month                           0
pool_price_lag_14               0
day_of_month                    0
day_of_week                     0
hour_of_day                     0
ws_fortmc                       0
ws_edmonton                     0
ws_calgary                      0
wind_generation                 0
temp_fortmc                     0
temp_edmonton                   0
temp_calgary  

In [18]:
# forward fill the NA values
merged_df.fillna(method='ffill', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


#### Save Data

In [19]:
merged_df.to_csv('/home/kevin/Downloads/BESS/Jobs/Inferencing/data/raw/merged_df.csv', index=False)