#### Library

In [1]:
import pandas as pd
# Set pandas options to display all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings

warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)

#### Data Loading

In [2]:
df = pd.read_csv('/home/kevin/Downloads/BESS/data/raw/2021/merged_df_2021.csv')

#### Data Cleaning

In [3]:
df.shape

(8763, 67)

In [4]:
df.head()

Unnamed: 0,datetime_,alberta_internal_load,forecast_alberta_internal_load,pool_price,forecast_pool_price,rolling_30day_avg_price,solar_generation,temp_calgary,temp_edmonton,temp_fortmc,wind_generation,ws_calgary,ws_edmonton,ws_fortmc,hour_of_day,day_of_week,week_of_month,month,is_winter,pool_price_lag_1,pool_price_lag_2,pool_price_lag_3,pool_price_lag_4,pool_price_lag_5,pool_price_lag_6,pool_price_lag_7,pool_price_lag_8,pool_price_lag_9,pool_price_lag_10,pool_price_lag_11,pool_price_lag_12,pool_price_lag_13,pool_price_lag_14,pool_price_lag_15,pool_price_lag_16,pool_price_lag_17,pool_price_lag_18,pool_price_lag_19,pool_price_lag_20,pool_price_lag_21,pool_price_lag_22,pool_price_lag_23,pool_price_lag_24,alberta_internal_load_lag_1,alberta_internal_load_lag_2,alberta_internal_load_lag_3,alberta_internal_load_lag_4,alberta_internal_load_lag_5,alberta_internal_load_lag_6,alberta_internal_load_lag_7,alberta_internal_load_lag_8,alberta_internal_load_lag_9,alberta_internal_load_lag_10,alberta_internal_load_lag_11,alberta_internal_load_lag_12,alberta_internal_load_lag_13,alberta_internal_load_lag_14,alberta_internal_load_lag_15,alberta_internal_load_lag_16,alberta_internal_load_lag_17,alberta_internal_load_lag_18,alberta_internal_load_lag_19,alberta_internal_load_lag_20,alberta_internal_load_lag_21,alberta_internal_load_lag_22,alberta_internal_load_lag_23,alberta_internal_load_lag_24
0,2021-01-01 00:00:00,9655.0,9718.0,29.92,32.91,38.45,0.0,-3.2,-5.8,-8.9,1470.686241,13.0,13.0,,0,4,53,1,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2021-01-01 01:00:00,9513.0,9573.0,27.48,27.1,38.44,0.0,2.5,-7.8,-8.6,1525.467843,30.0,13.0,,1,4,53,1,1,29.92,,,,,,,,,,,,,,,,,,,,,,,,9655.0,,,,,,,,,,,,,,,,,,,,,,,
2,2021-01-01 02:00:00,9437.0,9446.0,28.62,27.13,38.44,0.0,2.0,-10.4,-8.5,1535.146498,23.0,9.0,,2,4,53,1,1,27.48,29.92,,,,,,,,,,,,,,,,,,,,,,,9513.0,9655.0,,,,,,,,,,,,,,,,,,,,,,
3,2021-01-01 03:00:00,9376.0,9366.0,33.55,32.14,38.43,0.0,2.4,-11.9,-8.5,1484.0514,24.0,4.0,,3,4,53,1,1,28.62,27.48,29.92,,,,,,,,,,,,,,,,,,,,,,9437.0,9513.0,9655.0,,,,,,,,,,,,,,,,,,,,,
4,2021-01-01 04:00:00,9356.0,9357.0,35.36,35.64,38.43,0.0,2.4,-9.6,-8.9,1446.955595,22.0,4.0,,4,4,53,1,1,33.55,28.62,27.48,29.92,,,,,,,,,,,,,,,,,,,,,9376.0,9437.0,9513.0,9655.0,,,,,,,,,,,,,,,,,,,,


In [5]:
df.isnull().sum().sort_values(ascending=False)

ws_fortmc                         7223
temp_edmonton                      725
ws_edmonton                        725
pool_price_lag_24                   25
alberta_internal_load_lag_24        25
alberta_internal_load_lag_23        24
pool_price_lag_23                   24
alberta_internal_load_lag_22        23
pool_price_lag_22                   23
pool_price_lag_21                   22
alberta_internal_load_lag_21        22
alberta_internal_load_lag_20        21
pool_price_lag_20                   21
alberta_internal_load_lag_19        20
pool_price_lag_19                   20
alberta_internal_load_lag_18        19
pool_price_lag_18                   19
pool_price_lag_17                   18
alberta_internal_load_lag_17        18
pool_price_lag_16                   17
alberta_internal_load_lag_16        17
alberta_internal_load_lag_15        16
pool_price_lag_15                   16
pool_price_lag_14                   15
alberta_internal_load_lag_14        15
pool_price_lag_13        

In [6]:
# Filter columns that do not start with 'alberta' or 'pool'
filtered_columns = [col for col in df.columns if not col.startswith('alberta_internal_load_') and not col.startswith('pool_price_')]

# Select columns with more than 1 missing value
columns_with_missing_values = df[filtered_columns].isnull().sum()
columns_with_missing_values = columns_with_missing_values[columns_with_missing_values > 0]

# Display the columns with more than 1 missing value
print(columns_with_missing_values.index)

Index(['alberta_internal_load', 'forecast_alberta_internal_load', 'pool_price',
       'forecast_pool_price', 'rolling_30day_avg_price', 'solar_generation',
       'temp_calgary', 'temp_edmonton', 'temp_fortmc', 'wind_generation',
       'ws_calgary', 'ws_edmonton', 'ws_fortmc'],
      dtype='object')


In [7]:
# impute median for missing values

for col in columns_with_missing_values.index:
    df[col].fillna(df[col].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [8]:
df.isnull().sum()

datetime_                          0
alberta_internal_load              0
forecast_alberta_internal_load     0
pool_price                         0
forecast_pool_price                0
rolling_30day_avg_price            0
solar_generation                   0
temp_calgary                       0
temp_edmonton                      0
temp_fortmc                        0
wind_generation                    0
ws_calgary                         0
ws_edmonton                        0
ws_fortmc                          0
hour_of_day                        0
day_of_week                        0
week_of_month                      0
month                              0
is_winter                          0
pool_price_lag_1                   2
pool_price_lag_2                   3
pool_price_lag_3                   4
pool_price_lag_4                   5
pool_price_lag_5                   6
pool_price_lag_6                   7
pool_price_lag_7                   8
pool_price_lag_8                   9
p

In [9]:
# fill NAN with bfill method
lagged_columns = [f'pool_price_lag_{i}' for i in range(1, 25)] + [f'alberta_internal_load_lag_{i}' for i in range(1, 25)]

df[lagged_columns] = df[lagged_columns].fillna(method='bfill')
# Verify if NaN values are filled
print(df[lagged_columns].isnull().sum().sort_values(ascending=False))

pool_price_lag_1                0
pool_price_lag_2                0
alberta_internal_load_lag_3     0
alberta_internal_load_lag_4     0
alberta_internal_load_lag_5     0
alberta_internal_load_lag_6     0
alberta_internal_load_lag_7     0
alberta_internal_load_lag_8     0
alberta_internal_load_lag_9     0
alberta_internal_load_lag_10    0
alberta_internal_load_lag_11    0
alberta_internal_load_lag_12    0
alberta_internal_load_lag_13    0
alberta_internal_load_lag_14    0
alberta_internal_load_lag_15    0
alberta_internal_load_lag_16    0
alberta_internal_load_lag_17    0
alberta_internal_load_lag_18    0
alberta_internal_load_lag_19    0
alberta_internal_load_lag_20    0
alberta_internal_load_lag_21    0
alberta_internal_load_lag_22    0
alberta_internal_load_lag_23    0
alberta_internal_load_lag_2     0
alberta_internal_load_lag_1     0
pool_price_lag_24               0
pool_price_lag_12               0
pool_price_lag_3                0
pool_price_lag_4                0
pool_price_lag

  df[lagged_columns] = df[lagged_columns].fillna(method='bfill')


In [10]:
df.isnull().sum().sort_values(ascending=False)

datetime_                         0
alberta_internal_load_lag_8       0
pool_price_lag_18                 0
pool_price_lag_19                 0
pool_price_lag_20                 0
pool_price_lag_21                 0
pool_price_lag_22                 0
pool_price_lag_23                 0
pool_price_lag_24                 0
alberta_internal_load_lag_1       0
alberta_internal_load_lag_2       0
alberta_internal_load_lag_3       0
alberta_internal_load_lag_4       0
alberta_internal_load_lag_5       0
alberta_internal_load_lag_6       0
alberta_internal_load_lag_7       0
alberta_internal_load_lag_9       0
pool_price_lag_16                 0
alberta_internal_load_lag_10      0
alberta_internal_load_lag_11      0
alberta_internal_load_lag_12      0
alberta_internal_load_lag_13      0
alberta_internal_load_lag_14      0
alberta_internal_load_lag_15      0
alberta_internal_load_lag_16      0
alberta_internal_load_lag_17      0
alberta_internal_load_lag_18      0
alberta_internal_load_lag_19

#### Save csv

In [11]:
df.to_csv('/home/kevin/Downloads/BESS/data/raw/2021/merged_df_2021_cleaned.csv', index=False)