In [1]:
import pandas as pd
import numpy as np
import torch

import sys
import os
# Notebooks do not define __file__, so the *string* "__file__" is used instead:
# abspath() resolves it against the current working directory, and the two
# dirname() calls strip the fake file name and then one directory level, i.e.
# this appends the parent of the working directory to sys.path.
# Presumably this is what makes the `cleaning` package importable — verify
# against the repository layout.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath("__file__"))))
from cleaning.pipeline import strategic_fill, shift_forecast_columns

In [2]:

# The project root is the parent of the kernel's working directory
# (assumes the notebook is launched from a folder directly under the root —
# TODO confirm if the notebook is ever moved).
notebook_dir = os.getcwd()
project_root = os.path.dirname(notebook_dir)

# Raw ERCOT export lives under <project_root>/data/raw_data/.
raw_csv_parts = ('data', 'raw_data', 'ercot_data_2025_Jan.csv')
data_path = os.path.join(project_root, *raw_csv_parts)

# Echo the resolved path so a wrong working directory is easy to spot.
print(f"Trying to load from: {data_path}")
raw_data = pd.read_csv(data_path)

Trying to load from: /Users/aryanahri/epf_datasets/data/raw_data/ercot_data_2025_Jan.csv


In [3]:
# Preview the first five rows of the freshly loaded frame.
raw_data.head(n=5)

Unnamed: 0,marketday,hourending,SP_Price_Houston,SP_Price_North,SP_Price_Panh,SP_Price_South,SP_Price_West,DA_Price_Houston,DA_Price_North,DA_Price_Panh,...,ACTUAL_ERC_SLoad,ACTUAL_ERC_WLoad,ACTUAL_ERC_CWind,ACTUAL_ERC_NWind,ACTUAL_ERC_PWind,ACTUAL_ERC_SWind,ACTUAL_ERC_Wind,ACTUAL_ERC_WWind,ACTUAL_ERC_Solar,Outage
0,2023/2/1,1,36.2125,37.6125,38.22,32.9225,38.53,38.42,41.54,41.17,...,16678.09,7712.58,2542.81,92.81,55.0,1283.81,4030.99,56.56,0.24,18327
1,2023/2/1,2,34.275,36.075,36.855,30.0375,37.2525,36.5,38.65,38.39,...,16513.99,7665.68,2818.83,71.46,99.67,1010.19,4064.79,64.64,0.23,18327
2,2023/2/1,3,34.43,41.03,44.56,28.01,46.3375,36.64,39.01,38.76,...,16512.22,7644.05,3061.07,75.72,100.81,930.57,4210.19,42.02,0.23,18327
3,2023/2/1,4,31.735,46.9925,55.9375,21.3975,60.445,37.17,40.12,39.89,...,16436.49,7247.3,2864.8,88.8,60.16,1089.92,4151.2,47.52,0.23,18327
4,2023/2/1,5,33.3625,47.61,56.105,25.0425,60.3875,39.09,43.0,42.53,...,16528.84,7393.56,3047.25,104.76,71.55,1084.94,4331.33,22.83,0.24,18327


In [4]:
# Parse the market-day strings into midnight timestamps.
raw_data['marketday'] = pd.to_datetime(raw_data['marketday'])

# `hourending` is 1-based (hour 1 maps to 00:00), so the offset from midnight is
# (hourending - 1) hours. This is computed column-wise in one vectorised step
# rather than building a pd.Timedelta per row through DataFrame.apply(axis=1).
hour_offset = pd.to_timedelta(raw_data['hourending'] - 1, unit='h')
raw_data['time'] = raw_data['marketday'] + hour_offset

# Use the combined timestamp as the index; the 'time' column is consumed by it.
raw_data.set_index('time', inplace=True)

print("Data after datetime conversion:")
print(raw_data.head())


Data after datetime conversion:
                     marketday  hourending  SP_Price_Houston  SP_Price_North  \
time                                                                           
2023-02-01 00:00:00 2023-02-01           1           36.2125         37.6125   
2023-02-01 01:00:00 2023-02-01           2           34.2750         36.0750   
2023-02-01 02:00:00 2023-02-01           3           34.4300         41.0300   
2023-02-01 03:00:00 2023-02-01           4           31.7350         46.9925   
2023-02-01 04:00:00 2023-02-01           5           33.3625         47.6100   

                     SP_Price_Panh  SP_Price_South  SP_Price_West  \
time                                                                
2023-02-01 00:00:00        38.2200         32.9225        38.5300   
2023-02-01 01:00:00        36.8550         30.0375        37.2525   
2023-02-01 02:00:00        44.5600         28.0100        46.3375   
2023-02-01 03:00:00        55.9375         21.3975        60.4

In [None]:
# Disabled export step: every line below is commented out, so running this
# cell does nothing.
# NOTE(review): if re-enabled this would write 'cleaned_ercot_data.csv', whereas
# the loading cell further down reads 'ercot_data_cleaned.csv' — confirm which
# file name is intended before relying on either.
# output_path = os.path.join(project_root, 'data', 'cleaned', 'cleaned_ercot_data.csv')

# os.makedirs(os.path.dirname(output_path), exist_ok=True)

# raw_data.to_csv(output_path)
# print(f"Cleaned data saved to: {output_path}")

Cleaned data saved to: /Users/aryanahri/epf_datasets/data/cleaned/cleaned_ercot_data.csv


In [7]:
# Load the already-cleaned dataset from <project_root>/data/cleaned/.
cleaned_csv_path = os.path.join(project_root, 'data', 'cleaned', 'ercot_data_cleaned.csv')
cleaned_df = pd.read_csv(cleaned_csv_path)

In [8]:
# Show every column label of the cleaned frame as a plain Python list.
list(cleaned_df.columns)


Column names in cleaned_df:
['time', 'ACTUAL_NetLoad', 'ACTUAL_ERC_Load', 'ACTUAL_ERC_Wind', 'ACTUAL_ERC_Solar', 'NetLoad_Error', 'Load_Error', 'Wind_Error', 'Solar_Error', 'NetLoad', 'ERC_Load', 'ERC_Wind', 'ERC_Solar', 'HoD', 'DoW', 'MoY']


In [9]:
# Make sure 'time' holds datetime values (a no-op when it is already parsed)
cleaned_df['time'] = pd.to_datetime(cleaned_df['time'])

# Positional cut points for a chronological 60/20/20 split
total_points = len(cleaned_df)
train_end_idx = int(total_points * 0.6)
val_end_idx = int(total_points * 0.8)

# Timestamps at each boundary; each later split starts on the row immediately
# after the previous split's last row
timestamps = cleaned_df['time']
train_start = timestamps.iloc[0]
train_end = timestamps.iloc[train_end_idx]
val_start = timestamps.iloc[train_end_idx + 1]
val_end = timestamps.iloc[val_end_idx]
test_start = timestamps.iloc[val_end_idx + 1]
test_end = timestamps.iloc[-1]

# Boundaries in the order they should appear in the JSON config
split_bounds = {
    "train_start": train_start,
    "train_end": train_end,
    "val_start": val_start,
    "val_end": val_end,
    "test_start": test_start,
    "test_end": test_end,
}

# Note: the "+0000" suffix is a literal part of the format string; it is not
# derived from any timezone information in the data
stamp_format = "%Y-%m-%d %H:%M:%S+0000"

print("Train/Val/Test Split Dates (for JSON config):")
final_position = len(split_bounds) - 1
for position, (key, stamp) in enumerate(split_bounds.items()):
    # Every entry except the last carries a trailing comma, as in a JSON object
    separator = "" if position == final_position else ","
    print(f'"{key}": "{stamp.strftime(stamp_format)}"{separator}')


Train/Val/Test Split Dates (for JSON config):
"train_start": "2023-02-01 00:00:00+0000",
"train_end": "2024-03-30 22:00:00+0000",
"val_start": "2024-03-30 23:00:00+0000",
"val_end": "2024-08-19 06:00:00+0000",
"test_start": "2024-08-19 07:00:00+0000",
"test_end": "2025-01-07 13:00:00+0000"
