# WESM Price Prediction - ETL/Preprocessing

## Setup and "Loader"

In [17]:
import pandas as pd
import glob
import os

# Paths
PATH_GWAP = "./raw_data/GWAP/"
PATH_RTD = "./raw_data/RTD_Regional/"
PATH_OUTAGES = "./raw_data/Outages/"

# Connects extracted CSV files
def load_and_concatenate_csvs(path):
    files = glob.glob(os.path.join(path, "*.csv"))
    print(f"File Count in {path}: {len(files)}")

    dfs = []
    for file in files:
        df = pd.read_csv(file)
        dfs.append(df)

    concatenated_df = pd.concat(dfs, axis=0, ignore_index=True)
    print(f"Concatenated DataFrame Shape from {path}: {concatenated_df.shape}")
    
    return concatenated_df

## GWAP Processing

In [18]:
df_gwap = load_and_concatenate_csvs(PATH_GWAP)
df_gwap = df_gwap[df_gwap["REGION_NAME"] == "CLUZ"].copy()
df_gwap["datetime"] = pd.to_datetime(df_gwap["TIME_INTERVAL"], format='mixed', dayfirst=False)
df_gwap = df_gwap[["datetime", "GWAP"]].sort_values("datetime")
df_gwap = df_gwap.drop_duplicates(subset=['datetime']) # just in case

print(f"Final GWAP DataFrame Shape: {df_gwap.shape}")
print(df_gwap.head())

File Count in ./raw_data/GWAP/: 59
Concatenated DataFrame Shape from ./raw_data/GWAP/: (85019, 6)
Final GWAP DataFrame Shape: (16992, 2)
               datetime       GWAP
366 2025-10-28 00:05:00  2800.5685
371 2025-10-28 00:10:00  2985.7148
314 2025-10-28 00:15:00  2899.6850
377 2025-10-28 00:20:00  2992.3605
416 2025-10-28 00:25:00  2949.7466


## RTD Processing

In [19]:
df_rtd = load_and_concatenate_csvs(PATH_RTD)
df_rtd = df_rtd[df_rtd["REGION_NAME"] == "CLUZ"].copy()
df_rtd["datetime"] = pd.to_datetime(df_rtd["TIME_INTERVAL"], format='mixed', dayfirst=False)

# Extract Energy (Demand & Supply)
# Commodity Type "En" refers to energy
df_energy = df_rtd[df_rtd["COMMODITY_TYPE"] == "En"].copy()
df_energy = df_energy[["datetime", "MKT_REQT", "GENERATION"]]
df_energy.columns = ["datetime", "energy_demand_mw", "energy_supply_mw"]

# Extract Reserves (Safety Net)
# Anything that is not "En" are reserves
df_reserves = df_rtd[df_rtd["COMMODITY_TYPE"] != "En"].copy()

# Group reserves by datetime then sum them up
df_reserves = df_reserves.groupby("datetime")[["MKT_REQT", "GENERATION"]].sum().reset_index()
df_reserves.columns = ["datetime", "reserve_demand_mw", "reserve_supply_mw"]

# Engineered Feature: Shortage (mostly for the classification task; honestly not sure yet if this will affect it but i'll leave in this column for now)
# Positive Value = We are short (Danger). Negative Value = We have surplus (Safe).
df_reserves['reserve_shortage_mw'] = df_reserves['reserve_demand_mw'] - df_reserves['reserve_supply_mw']
# Merge them back
df_X = pd.merge(df_energy, df_reserves, on="datetime", how="left")
df_X = df_X.fillna(0)  # Fill NaNs with 0

print(f"Final RTD Features DataFrame Shape: {df_X.shape}")
print(df_X.head())

File Count in ./raw_data/RTD_Regional/: 59
Concatenated DataFrame Shape from ./raw_data/RTD_Regional/: (254879, 13)
Final RTD Features DataFrame Shape: (16988, 6)
             datetime  energy_demand_mw  energy_supply_mw  reserve_demand_mw  \
0 2025-10-28 00:05:00           9043.68           9095.22             1495.0   
1 2025-10-28 00:10:00           9002.76           9038.49             1495.0   
2 2025-10-28 00:15:00           8975.73           9026.83             1495.0   
3 2025-10-28 00:20:00           9027.64           9012.70             1495.0   
4 2025-10-28 00:25:00           9003.70           9001.07             1495.0   

   reserve_supply_mw  reserve_shortage_mw  
0             1495.0                  0.0  
1             1495.0                  0.0  
2             1495.0                  0.0  
3             1495.0                  0.0  
4             1495.0                  0.0  


## Outage Processing

In [20]:
df_outages = load_and_concatenate_csvs(PATH_OUTAGES)

# Drop rows with missing Resource Name
initial_len = len(df_outages)
df_outages = df_outages.dropna(subset=['RESOURCE_NAME'])
print(f"Dropped {initial_len - len(df_outages)} rows with missing Resource Name.")

# Filter to Luzon plants only (Prefix 01-03)
df_outages['prefix'] = df_outages['RESOURCE_NAME'].astype(str).str[:2]
luzon_prefixes = ['01', '02', '03']
df_outages = df_outages[df_outages['prefix'].isin(luzon_prefixes)].copy()
print(f"Filtered for Luzon (Prefixes 01-03). Remaining rows: {len(df_outages)}")

df_outages['datetime'] = pd.to_datetime(df_outages['RUN_TIME'], format='mixed', dayfirst=False)
df_out_count = df_outages.groupby('datetime').size().reset_index(name='outage_count') # count of outages per datetime

print(f"Final Outages DataFrame Shape: {df_out_count.shape}")
print(df_out_count.head())

File Count in ./raw_data/Outages/: 59
Concatenated DataFrame Shape from ./raw_data/Outages/: (256054, 7)
Dropped 59 rows with missing Resource Name.
Filtered for Luzon (Prefixes 01-03). Remaining rows: 83792
Final Outages DataFrame Shape: (16988, 2)
             datetime  outage_count
0 2025-10-28 00:05:00             4
1 2025-10-28 00:10:00             4
2 2025-10-28 00:15:00             4
3 2025-10-28 00:20:00             4
4 2025-10-28 00:25:00             4


## Merge

In [None]:
# Merge GWAP with RTD features
final_df = pd.merge(df_gwap, df_X, on="datetime", how="inner")

# Merge with Outages data
final_df = pd.merge(final_df, df_out_count, on="datetime", how="left")
final_df['outage_count'] = final_df['outage_count'].fillna(0)  # Fill NaNs with 0

# Time Lags (for autocorrelation)
final_df['GWAP_Lag_1'] = final_df['GWAP'].shift(1)    # 5 mins ago
final_df['GWAP_Lag_12'] = final_df['GWAP'].shift(12)  # 1 hour ago
final_df['GWAP_Lag_288'] = final_df['GWAP'].shift(288) # 24 hours ago (Yesterday same time)

final_df = final_df.dropna()
final_df.to_csv("final_dataset.csv", index=False)

print(f"Final Dataset Shape: {final_df.shape}")
print(f"Date Range: {final_df['datetime'].min()} to {final_df['datetime'].max()}")
print(final_df.head())

Final Dataset Shape: (16700, 11)
Date Range: 2025-10-29 00:05:00 to 2025-12-26 00:00:00
               datetime       GWAP  energy_demand_mw  energy_supply_mw  \
288 2025-10-29 00:05:00  2258.5866           9504.45           9705.25   
289 2025-10-29 00:10:00  2913.4257           9483.49           9648.89   
290 2025-10-29 00:15:00  2883.5764           9434.35           9627.98   
291 2025-10-29 00:20:00  2888.6233           9417.62           9614.19   
292 2025-10-29 00:25:00  2887.9553           9379.48           9575.17   

     reserve_demand_mw  reserve_supply_mw  reserve_shortage_mw  outage_count  \
288             1428.0             1428.0                  0.0           4.0   
289             1428.0             1428.0                  0.0           4.0   
290             1428.0             1428.0                  0.0           4.0   
291             1428.0             1428.0                  0.0           4.0   
292             1428.0             1428.0                  0.0     