In [1]:
# This notebook cleans the data and creates a new csv file with the cleaned data

#### Library

In [1]:
import pandas as pd

# Lift pandas' display limits (None = no limit) so that every row and
# every column is shown whenever a frame or series is displayed.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings

warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)

#### Data Loading

In [3]:
# Read the merged raw dataset into `df`, the frame every later cell works on.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this file exists. Consider a configurable data directory.
raw_csv_path = '/home/kevin/Downloads/BESS/Jobs/Retraining/data/raw/merged_df.csv'
df = pd.read_csv(raw_csv_path)

#### Data Cleaning

In [4]:
# Display (number of rows, number of columns) of the loaded frame.
df.shape

(24244, 69)

In [5]:
# Preview the first rows of the frame to eyeball column names and values.
df.head()

Unnamed: 0,datetime_,alberta_internal_load,forecast_alberta_internal_load,pool_price,forecast_pool_price,rolling_30day_avg_price,solar_generation,temp_calgary,temp_edmonton,temp_fortmc,wind_generation,ws_calgary,ws_edmonton,ws_fortmc,hour_of_day,day_of_week,day_of_month,week_of_month,month,year,is_winter,pool_price_lag_1,pool_price_lag_2,pool_price_lag_3,pool_price_lag_4,pool_price_lag_5,pool_price_lag_6,pool_price_lag_7,pool_price_lag_8,pool_price_lag_9,pool_price_lag_10,pool_price_lag_11,pool_price_lag_12,pool_price_lag_13,pool_price_lag_14,pool_price_lag_15,pool_price_lag_16,pool_price_lag_17,pool_price_lag_18,pool_price_lag_19,pool_price_lag_20,pool_price_lag_21,pool_price_lag_22,pool_price_lag_23,pool_price_lag_24,alberta_internal_load_lag_1,alberta_internal_load_lag_2,alberta_internal_load_lag_3,alberta_internal_load_lag_4,alberta_internal_load_lag_5,alberta_internal_load_lag_6,alberta_internal_load_lag_7,alberta_internal_load_lag_8,alberta_internal_load_lag_9,alberta_internal_load_lag_10,alberta_internal_load_lag_11,alberta_internal_load_lag_12,alberta_internal_load_lag_13,alberta_internal_load_lag_14,alberta_internal_load_lag_15,alberta_internal_load_lag_16,alberta_internal_load_lag_17,alberta_internal_load_lag_18,alberta_internal_load_lag_19,alberta_internal_load_lag_20,alberta_internal_load_lag_21,alberta_internal_load_lag_22,alberta_internal_load_lag_23,alberta_internal_load_lag_24
0,2022-03-23 00:00:00,9146,9170,44.21,41.26,88.14,0.0,6.4,2.9,5.4,1543.927681,10.0,13,5,0,2,23,4,3,2022,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,2022-03-23 01:00:00,9083,9075,47.55,47.55,88.09,0.0,8.9,3.5,5.6,1548.392596,7.0,13,3,1,2,23,4,3,2022,1,44.21,,,,,,,,,,,,,,,,,,,,,,,,9146.0,,,,,,,,,,,,,,,,,,,,,,,
2,2022-03-23 02:00:00,9020,9029,47.55,47.55,88.05,0.0,6.7,4.2,6.0,1523.83849,4.0,12,6,2,2,23,4,3,2022,1,47.55,44.21,,,,,,,,,,,,,,,,,,,,,,,9083.0,9146.0,,,,,,,,,,,,,,,,,,,,,,
3,2022-03-23 03:00:00,9007,9012,47.11,47.86,88.01,0.0,5.5,3.8,4.9,1498.064082,2.0,6,9,3,2,23,4,3,2022,1,47.55,47.55,44.21,,,,,,,,,,,,,,,,,,,,,,9020.0,9083.0,9146.0,,,,,,,,,,,,,,,,,,,,,
4,2022-03-23 04:00:00,9043,9023,46.95,47.03,87.98,0.0,4.6,2.0,4.5,1507.651054,2.0,6,10,4,2,23,4,3,2022,1,47.11,47.55,47.55,44.21,,,,,,,,,,,,,,,,,,,,,9007.0,9020.0,9083.0,9146.0,,,,,,,,,,,,,,,,,,,,


In [6]:
# Missing-value count per column, columns with the most NaNs listed first.
df.isna().sum().sort_values(ascending=False)

alberta_internal_load_lag_24      24
pool_price_lag_24                 24
alberta_internal_load_lag_23      23
pool_price_lag_23                 23
alberta_internal_load_lag_22      22
pool_price_lag_22                 22
pool_price_lag_21                 21
alberta_internal_load_lag_21      21
alberta_internal_load_lag_20      20
pool_price_lag_20                 20
pool_price_lag_19                 19
alberta_internal_load_lag_19      19
pool_price_lag_18                 18
alberta_internal_load_lag_18      18
alberta_internal_load_lag_17      17
pool_price_lag_17                 17
alberta_internal_load_lag_16      16
pool_price_lag_16                 16
alberta_internal_load_lag_15      15
pool_price_lag_15                 15
pool_price_lag_14                 14
alberta_internal_load_lag_14      14
alberta_internal_load_lag_13      13
pool_price_lag_13                 13
alberta_internal_load_lag_12      12
pool_price_lag_12                 12
alberta_internal_load_lag_11      11
p

In [7]:
# Keep every column whose name does NOT begin with one of the two prefixes
# 'alberta_internal_load_' or 'pool_price_' (note the trailing underscore:
# a column named exactly 'alberta_internal_load' or 'pool_price' is kept).
lag_prefixes = ('alberta_internal_load_', 'pool_price_')
filtered_columns = [name for name in df.columns if not name.startswith(lag_prefixes)]

# Count NaNs per remaining column and keep only the columns that have
# at least one missing value (count > 0).
null_counts = df[filtered_columns].isnull().sum()
columns_with_missing_values = null_counts[null_counts > 0]

# Show the names of the columns that contain at least one missing value
print(columns_with_missing_values.index)

Index(['ws_calgary'], dtype='object')


In [8]:
# Impute each column listed in `columns_with_missing_values` with that
# column's own median.
#
# The result is assigned back to `df[col]` instead of calling
# `df[col].fillna(..., inplace=True)`: the in-place form acts on an
# intermediate object returned by `df[col]` (chained assignment). pandas 2.x
# warns about it, and with Copy-on-Write enabled it leaves `df` unchanged,
# so the imputation would silently not happen.

for col in columns_with_missing_values.index:
    df[col] = df[col].fillna(df[col].median())

In [9]:
# Missing-value count per column, in column order, after the median imputation.
df.isna().sum()

datetime_                          0
alberta_internal_load              0
forecast_alberta_internal_load     0
pool_price                         0
forecast_pool_price                0
rolling_30day_avg_price            0
solar_generation                   0
temp_calgary                       0
temp_edmonton                      0
temp_fortmc                        0
wind_generation                    0
ws_calgary                         0
ws_edmonton                        0
ws_fortmc                          0
hour_of_day                        0
day_of_week                        0
day_of_month                       0
week_of_month                      0
month                              0
year                               0
is_winter                          0
pool_price_lag_1                   1
pool_price_lag_2                   2
pool_price_lag_3                   3
pool_price_lag_4                   4
pool_price_lag_5                   5
pool_price_lag_6                   6
p

In [10]:
# Back-fill the NaNs in the 24 pool-price and 24 internal-load lag columns:
# each NaN is replaced by the next valid value further down the same column.
# NOTE(review): the NaNs sit in the first rows of each lag column, so after
# back-filling row 0's lag values equal row 0's own observation rather than a
# true past value. Confirm this is acceptable downstream, or drop the first
# 24 rows instead.
lagged_columns = [f'pool_price_lag_{i}' for i in range(1, 25)] + [f'alberta_internal_load_lag_{i}' for i in range(1, 25)]

# Use DataFrame.bfill(): the fillna(method='bfill') spelling is deprecated
# (pandas >= 2.1) and is slated for removal.
df[lagged_columns] = df[lagged_columns].bfill()
# Verify if NaN values are filled
print(df[lagged_columns].isnull().sum().sort_values(ascending=False))

alberta_internal_load_lag_24    0
alberta_internal_load_lag_23    0
pool_price_lag_22               0
pool_price_lag_21               0
pool_price_lag_20               0
pool_price_lag_19               0
pool_price_lag_18               0
pool_price_lag_17               0
pool_price_lag_16               0
pool_price_lag_15               0
pool_price_lag_14               0
pool_price_lag_13               0
pool_price_lag_12               0
pool_price_lag_11               0
pool_price_lag_10               0
pool_price_lag_9                0
pool_price_lag_8                0
pool_price_lag_7                0
pool_price_lag_6                0
pool_price_lag_5                0
pool_price_lag_4                0
pool_price_lag_3                0
pool_price_lag_2                0
pool_price_lag_23               0
pool_price_lag_24               0
alberta_internal_load_lag_1     0
alberta_internal_load_lag_13    0
alberta_internal_load_lag_22    0
alberta_internal_load_lag_21    0
alberta_intern

In [11]:
# Final check before saving: missing-value count per column, largest first.
df.isna().sum().sort_values(ascending=False)

alberta_internal_load_lag_24      0
pool_price_lag_4                  0
month                             0
year                              0
is_winter                         0
pool_price_lag_1                  0
pool_price_lag_2                  0
pool_price_lag_3                  0
pool_price_lag_5                  0
pool_price_lag_13                 0
pool_price_lag_6                  0
pool_price_lag_7                  0
pool_price_lag_8                  0
pool_price_lag_9                  0
pool_price_lag_10                 0
pool_price_lag_11                 0
week_of_month                     0
day_of_month                      0
day_of_week                       0
hour_of_day                       0
ws_fortmc                         0
ws_edmonton                       0
ws_calgary                        0
wind_generation                   0
temp_fortmc                       0
temp_edmonton                     0
temp_calgary                      0
solar_generation            

#### Save csv

In [12]:
# Write the cleaned frame to a new CSV file, without the index column.
# NOTE(review): this is an absolute, machine-specific path, and the cleaned
# file is written into the `raw` data directory. Confirm that is intended.
cleaned_csv_path = '/home/kevin/Downloads/BESS/Jobs/Retraining/data/raw/merged_df_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)