#### Library

In [61]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import pytz

# Set pandas options to display all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import os

#### Data Loading

In [27]:
import os

# Change this to your absolute BESS path
bess_path = '/Users/kishanakbari/Downloads/BESS'

os.chdir(bess_path)
print("Current working directory:", os.getcwd())

Current working directory: /Users/kishanakbari/Downloads/BESS


In [29]:
# Folder containing CSV files
folder_path = "Jobs/Inferencing/data/raw" 

df_list = []
# Loop through each file in the folder
for file in os.listdir(folder_path):
    if file.endswith(".csv"):
        file_path = os.path.join(folder_path, file)
        df_name = os.path.splitext(file)[0].split('20')[0]  # Remove the .csv extension for naming
        globals()[df_name] = pd.read_csv(file_path)  # Create a variable dynamically
        print(f"Loaded DataFrame: {df_name}, Shape: {globals()[df_name].shape}")
        df_list.append(df_name)

# Sort the list of DataFrames
df_list.sort()

Loaded DataFrame: temperature_fortmc_, Shape: (24, 3)
Loaded DataFrame: wind_generation_, Shape: (24, 3)
Loaded DataFrame: windspeed_fortmc_, Shape: (24, 3)
Loaded DataFrame: AIL_, Shape: (48, 3)
Loaded DataFrame: windspeed_calgary_, Shape: (24, 3)
Loaded DataFrame: temperature_calgary_, Shape: (24, 3)
Loaded DataFrame: temperature_edmonton_, Shape: (24, 3)
Loaded DataFrame: price_, Shape: (48, 4)
Loaded DataFrame: solar_generation_, Shape: (24, 3)
Loaded DataFrame: windspeed_edmonton_, Shape: (24, 3)


In [30]:
for df_name in df_list:
    print(f"\nDataFrame: {df_name}")
    # display(globals()[df_name].head())


DataFrame: AIL_

DataFrame: price_

DataFrame: solar_generation_

DataFrame: temperature_calgary_

DataFrame: temperature_edmonton_

DataFrame: temperature_fortmc_

DataFrame: wind_generation_

DataFrame: windspeed_calgary_

DataFrame: windspeed_edmonton_

DataFrame: windspeed_fortmc_


#### Column Name Adjustment

In [31]:
for df_name in df_list:
    print(f"\nDataFrame: {df_name}")
    globals()[df_name].drop(['Unnamed: 0'], axis=1, inplace=True)
    # display(globals()[df_name].head())


DataFrame: AIL_

DataFrame: price_

DataFrame: solar_generation_

DataFrame: temperature_calgary_

DataFrame: temperature_edmonton_

DataFrame: temperature_fortmc_

DataFrame: wind_generation_

DataFrame: windspeed_calgary_

DataFrame: windspeed_edmonton_

DataFrame: windspeed_fortmc_


In [32]:
# Rename columns
price_.rename(columns={'begin_datetime_mpt': 'datetime_'}, inplace=True)

AIL_.rename(columns={'begin_datetime_mpt': 'datetime_'}, inplace=True)

solar_generation_.rename(columns={'Date (MPT)': 'datetime_', 'Volume':'solar_generation'}, inplace=True)

wind_generation_.rename(columns={'Date (MPT)': 'datetime_', 'Volume':'wind_generation'}, inplace=True)

temperature_calgary_.rename(columns={'Timestamp_mst': 'datetime_', 'Temperature (°C)':'temp_calgary'}, inplace=True)

temperature_edmonton_.rename(columns={'Timestamp_mst': 'datetime_', 'Temperature (°C)':'temp_edmonton'}, inplace=True)

temperature_fortmc_.rename(columns={'Timestamp_mst': 'datetime_', 'Temperature (°C)':'temp_fortmc'}, inplace=True)

windspeed_calgary_.rename(columns={'Timestamp_mst': 'datetime_', 'Wind Speed (km/h)':'ws_calgary'}, inplace=True)

windspeed_edmonton_.rename(columns={'Timestamp_mst': 'datetime_', 'Wind Speed (km/h)':'ws_edmonton'}, inplace=True)

windspeed_fortmc_.rename(columns={'Timestamp_mst': 'datetime_', 'Wind Speed (km/h)':'ws_fortmc'}, inplace=True)

In [33]:
for df_name in df_list:
    print(f"\nDataFrame: {df_name}")
    # display(globals()[df_name].head())


DataFrame: AIL_

DataFrame: price_

DataFrame: solar_generation_

DataFrame: temperature_calgary_

DataFrame: temperature_edmonton_

DataFrame: temperature_fortmc_

DataFrame: wind_generation_

DataFrame: windspeed_calgary_

DataFrame: windspeed_edmonton_

DataFrame: windspeed_fortmc_


In [34]:
# convert columns to datetime
for df_name in df_list:
    print(f"\nDataFrame: {df_name}")
    try:
        globals()[df_name]['datetime_'] = pd.to_datetime(globals()[df_name]['datetime_'])
        # display(globals()[df_name].head())
    except:
        globals()[df_name]['datetime_'] = pd.to_datetime(globals()[df_name]['Timestamp'])
        globals()[df_name].drop(['Timestamp'], axis=1, inplace=True)
        # display(globals()[df_name].head())


DataFrame: AIL_

DataFrame: price_

DataFrame: solar_generation_

DataFrame: temperature_calgary_

DataFrame: temperature_edmonton_

DataFrame: temperature_fortmc_

DataFrame: wind_generation_

DataFrame: windspeed_calgary_

DataFrame: windspeed_edmonton_

DataFrame: windspeed_fortmc_


In [43]:
# print(AIL_.datetime_.min(), AIL_.datetime_.max())


for df_name in df_list:
    print(df_name)
    print(globals()[df_name].datetime_.min(), globals()[df_name].datetime_.max())
    print(globals()[df_name].isnull().sum())
    

AIL_
2025-05-21 18:00:00 2025-05-23 17:00:00
datetime_                0
alberta_internal_load    0
dtype: int64
price_
2025-05-21 18:00:00 2025-05-23 17:00:00
datetime_                  0
forecast_pool_price        3
rolling_30day_avg_price    3
dtype: int64
solar_generation_
2025-05-22 18:00:00 2025-05-23 17:00:00
datetime_           0
solar_generation    0
dtype: int64
temperature_calgary_
2025-05-22 18:00:00 2025-05-23 17:00:00
temp_calgary    0
datetime_       0
dtype: int64
temperature_edmonton_
2025-05-22 18:00:00 2025-05-23 17:00:00
temp_edmonton    0
datetime_        0
dtype: int64
temperature_fortmc_
2025-05-22 18:00:00 2025-05-23 17:00:00
temp_fortmc    0
datetime_      0
dtype: int64
wind_generation_
2025-05-22 18:00:00 2025-05-23 17:00:00
datetime_          0
wind_generation    0
dtype: int64
windspeed_calgary_
2025-05-22 18:00:00 2025-05-23 17:00:00
ws_calgary    0
datetime_     0
dtype: int64
windspeed_edmonton_
2025-05-22 18:00:00 2025-05-23 17:00:00
ws_edmonton    0
dat

In [44]:
# Initialize the merged dataframe with the first dataframe in the list
merged_df = globals()[df_list[0]]

# Loop through the remaining dataframes and merge them one by one
for df_name in df_list[1:]:
    merged_df = merged_df.merge(globals()[df_name], on='datetime_', how='outer')

# Display the merged dataframe
# display(merged_df.head())

In [45]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48 entries, 0 to 47
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   datetime_                48 non-null     datetime64[ns]
 1   alberta_internal_load    48 non-null     int64         
 2   forecast_pool_price      45 non-null     float64       
 3   rolling_30day_avg_price  45 non-null     float64       
 4   solar_generation         24 non-null     float64       
 5   temp_calgary             24 non-null     float64       
 6   temp_edmonton            24 non-null     float64       
 7   temp_fortmc              24 non-null     float64       
 8   wind_generation          24 non-null     float64       
 9   ws_calgary               24 non-null     float64       
 10  ws_edmonton              24 non-null     float64       
 11  ws_fortmc                24 non-null     float64       
dtypes: datetime64[ns](1), float64(10), int

In [46]:
merged_df.shape

(48, 12)

In [47]:
merged_df.isnull().sum()

datetime_                   0
alberta_internal_load       0
forecast_pool_price         3
rolling_30day_avg_price     3
solar_generation           24
temp_calgary               24
temp_edmonton              24
temp_fortmc                24
wind_generation            24
ws_calgary                 24
ws_edmonton                24
ws_fortmc                  24
dtype: int64

#### Feature Engineering

##### time based variables

In [48]:
merged_df['hour_of_day'] = merged_df['datetime_'].dt.hour
merged_df['day_of_week'] = merged_df['datetime_'].dt.dayofweek
merged_df['day_of_month'] = merged_df['datetime_'].dt.day
# Calculate the week of the month (from 1 to 5)
merged_df['week_of_month'] = merged_df['datetime_'].apply(lambda x: (x.day - 1) // 7 + 1)
merged_df['month'] = merged_df['datetime_'].dt.month
merged_df['year'] = merged_df['datetime_'].dt.year

# Display the updated dataframe
# display(merged_df.head())

##### seasonality

In [49]:
# Define a function to determine the season
def get_season(month):
    if month in [5, 6, 7, 8, 9]:
        return 0 # Summer
    else:
        return 1 # Winter

# Apply the function to create a new column 'season'
merged_df['is_winter'] = merged_df['month'].apply(get_season)

# Display the updated dataframe
# display(merged_df.head())

##### pool_price_lagged_values_till_24_hours

In [50]:
def generate_lagged_features(df, column, num_lags):
    for lag in range(1, num_lags + 1):
        df[f'pool_price_lag_{lag}'] = df[column].shift(lag)
    return df

# Generate 24-hour lagged values for the pool_price column
merged_df = generate_lagged_features(merged_df, 'forecast_pool_price', 24)

# Display the updated dataframe
# display(merged_df.head())

In [53]:
print(merged_df.shape)
print(merged_df.datetime_.min(), merged_df.datetime_.max())
merged_df.head()

(48, 43)
2025-05-21 18:00:00 2025-05-23 17:00:00


Unnamed: 0,datetime_,alberta_internal_load,forecast_pool_price,rolling_30day_avg_price,solar_generation,temp_calgary,temp_edmonton,temp_fortmc,wind_generation,ws_calgary,...,pool_price_lag_15,pool_price_lag_16,pool_price_lag_17,pool_price_lag_18,pool_price_lag_19,pool_price_lag_20,pool_price_lag_21,pool_price_lag_22,pool_price_lag_23,pool_price_lag_24
0,2025-05-21 18:00:00,9723,15.12,36.51,,,,,,,...,,,,,,,,,,
1,2025-05-21 19:00:00,9706,10.11,36.34,,,,,,,...,,,,,,,,,,
2,2025-05-21 20:00:00,9679,15.79,36.18,,,,,,,...,,,,,,,,,,
3,2025-05-21 21:00:00,9542,22.42,35.96,,,,,,,...,,,,,,,,,,
4,2025-05-21 22:00:00,9465,24.43,35.87,,,,,,,...,,,,,,,,,,


##### AIL_lagged_values_till_24_hours

In [54]:
def generate_lagged_features(df, column, num_lags):
    for lag in range(1, num_lags + 1):
        df[f'{column}_lag_{lag}'] = df[column].shift(lag)
    return df

# Generate 24-hour lagged values for the AIL column
merged_df = generate_lagged_features(merged_df, 'alberta_internal_load', 24)

# Display the updated dataframe
# display(merged_df.head())

In [55]:
print(merged_df.shape)
print(merged_df.datetime_.min(), merged_df.datetime_.max())
merged_df.head()

(48, 67)
2025-05-21 18:00:00 2025-05-23 17:00:00


Unnamed: 0,datetime_,alberta_internal_load,forecast_pool_price,rolling_30day_avg_price,solar_generation,temp_calgary,temp_edmonton,temp_fortmc,wind_generation,ws_calgary,...,alberta_internal_load_lag_15,alberta_internal_load_lag_16,alberta_internal_load_lag_17,alberta_internal_load_lag_18,alberta_internal_load_lag_19,alberta_internal_load_lag_20,alberta_internal_load_lag_21,alberta_internal_load_lag_22,alberta_internal_load_lag_23,alberta_internal_load_lag_24
0,2025-05-21 18:00:00,9723,15.12,36.51,,,,,,,...,,,,,,,,,,
1,2025-05-21 19:00:00,9706,10.11,36.34,,,,,,,...,,,,,,,,,,
2,2025-05-21 20:00:00,9679,15.79,36.18,,,,,,,...,,,,,,,,,,
3,2025-05-21 21:00:00,9542,22.42,35.96,,,,,,,...,,,,,,,,,,
4,2025-05-21 22:00:00,9465,24.43,35.87,,,,,,,...,,,,,,,,,,


In [62]:
print(merged_df.isnull().sum(axis=1))

0     56
1     54
2     52
3     50
4     48
5     46
6     44
7     42
8     40
9     38
10    36
11    34
12    32
13    30
14    28
15    26
16    24
17    22
18    20
19    18
20    16
21    14
22    12
23    10
24     0
25     0
26     0
27     0
28     0
29     0
30     0
31     0
32     0
33     0
34     0
35     0
36     0
37     0
38     0
39     0
40     0
41     0
42     0
43     0
44     0
45     2
46     3
47     4
dtype: int64


In [64]:
# Get the current time in MST
# mst_timezone = pytz.timezone('MST')
filter_time = datetime.now() # mst_timezone

# Convert to string format if needed
filter_time_str = filter_time.strftime('%Y-%m-%d %H:%M:%S')
print(f"Filtering data from: {filter_time_str}")

Filtering data from: 2025-05-22 17:51:17


In [66]:
merged_df[merged_df['datetime_'] >= filter_time_str].isnull().sum(axis=1)

24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
40    0
41    0
42    0
43    0
44    0
45    2
46    3
47    4
dtype: int64

In [67]:
# Get the current time in MST
# mst_timezone = pytz.timezone('MST')
filter_time = datetime.now()#datetime.now(mst_timezone)

# Convert to string format if needed
filter_time_str = filter_time.strftime('%Y-%m-%d %H:%M:%S')
print(f"Filtering data from: {filter_time_str}")
# Apply the filter
merged_df = merged_df[merged_df['datetime_'] >= filter_time_str]

merged_df.isnull().sum().sort_values(ascending=False) 

Filtering data from: 2025-05-22 17:52:17


forecast_pool_price             3
rolling_30day_avg_price         3
pool_price_lag_1                2
pool_price_lag_2                1
alberta_internal_load_lag_3     0
alberta_internal_load_lag_8     0
alberta_internal_load_lag_7     0
alberta_internal_load_lag_6     0
alberta_internal_load_lag_5     0
alberta_internal_load_lag_4     0
datetime_                       0
alberta_internal_load_lag_2     0
pool_price_lag_24               0
pool_price_lag_23               0
pool_price_lag_22               0
pool_price_lag_21               0
pool_price_lag_20               0
alberta_internal_load_lag_1     0
alberta_internal_load_lag_10    0
alberta_internal_load_lag_9     0
pool_price_lag_18               0
alberta_internal_load_lag_11    0
alberta_internal_load_lag_12    0
alberta_internal_load_lag_13    0
alberta_internal_load_lag_14    0
alberta_internal_load_lag_15    0
alberta_internal_load_lag_16    0
alberta_internal_load_lag_17    0
alberta_internal_load_lag_18    0
alberta_intern

In [68]:
# forward fill the NA values
merged_df.fillna(method='ffill', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


#### Save Data

In [69]:
merged_df.to_csv('Jobs/Inferencing/data/raw/merged_df.csv', index=False)