# WESM Price Prediction - ETL/Preprocessing

## Setup and "Loader"

In [None]:
import pandas as pd
import glob
import os

# Paths
PATH_GWAP = "./raw_data/GWAP/"
PATH_RTD = "./raw_data/RTD/"
PATH_OUTAGES = "./raw_data/Outages/"

# Reads every extracted CSV file in a folder and concatenates them into one DataFrame
def load_and_concatenate_csvs(path):
    """Read every ``*.csv`` file directly inside *path* into one DataFrame.

    Files are read in sorted filename order so the row order of the result
    is reproducible across runs and operating systems. The file count and
    the shape of the combined frame are printed as progress output.

    Args:
        path: Directory to search (non-recursively) for ``*.csv`` files.

    Returns:
        A single DataFrame with the rows of all files stacked vertically and
        a fresh 0..n-1 index.

    Raises:
        ValueError: If *path* contains no ``*.csv`` files.
    """
    # glob returns matches in arbitrary, OS-dependent order; sort them so the
    # concatenated row order is deterministic.
    files = sorted(glob.glob(os.path.join(path, "*.csv")))
    print(f"File Count in {path}: {len(files)}")

    # pd.concat on an empty list raises an opaque "No objects to concatenate";
    # fail early with a message that names the offending directory instead.
    if not files:
        raise ValueError(f"No CSV files found in {path}")

    dfs = [pd.read_csv(file) for file in files]

    concatenated_df = pd.concat(dfs, axis=0, ignore_index=True)
    print(f"Concatenated DataFrame Shape from {path}: {concatenated_df.shape}")

    return concatenated_df

## GWAP Processing

In [None]:
# Load all GWAP CSVs, keep only the "CLUZ" region rows, and reduce the frame
# to one (datetime, GWAP) row per timestamp in chronological order.
df_gwap = load_and_concatenate_csvs(PATH_GWAP)
df_gwap = (
    df_gwap.loc[df_gwap["REGION_NAME"] == "CLUZ"]
    # Parse the TIME_INTERVAL column into a proper datetime column.
    .assign(datetime=lambda frame: pd.to_datetime(frame["TIME_INTERVAL"]))
    .loc[:, ["datetime", "GWAP"]]
    .sort_values("datetime")
    # Defensive: keep only the first row for any repeated timestamp.
    .drop_duplicates(subset=["datetime"])
)

print(f"Final GWAP DataFrame Shape: {df_gwap.shape}")
print(df_gwap.head())

## RTD Processing

In [None]:
# Load all RTD CSVs, keep only the "CLUZ" region rows, and add a parsed
# datetime column derived from TIME_INTERVAL.
df_rtd = load_and_concatenate_csvs(PATH_RTD)
df_rtd = df_rtd.loc[df_rtd["REGION_NAME"] == "CLUZ"].assign(
    datetime=lambda frame: pd.to_datetime(frame["TIME_INTERVAL"])
)

# Energy rows (COMMODITY_TYPE == "En"): requirement and generation become the
# energy demand / supply features.
df_energy = df_rtd.loc[
    df_rtd["COMMODITY_TYPE"] == "En", ["datetime", "MKT_REQT", "GENERATION"]
].rename(
    columns={
        "MKT_REQT": "energy_demand_mw",
        "GENERATION": "energy_supply_mw",
    }
)

# Reserve rows (every commodity type other than "En"): total the requirement
# and generation of all reserve rows sharing the same timestamp.
df_reserves = (
    df_rtd.loc[df_rtd["COMMODITY_TYPE"] != "En"]
    .groupby("datetime")[["MKT_REQT", "GENERATION"]]
    .sum()
    .reset_index()
    .rename(
        columns={
            "MKT_REQT": "reserve_demand_mw",
            "GENERATION": "reserve_supply_mw",
        }
    )
)

# Engineered feature: reserve shortage (kept for the classification task;
# its usefulness is not yet confirmed).
# Positive = demand exceeds supply (short); negative = surplus.
df_reserves["reserve_shortage_mw"] = df_reserves["reserve_demand_mw"].sub(
    df_reserves["reserve_supply_mw"]
)

# Attach the reserve features to every energy row; timestamps that have no
# reserve rows end up as NaN after the left join and are filled with 0.
df_X = df_energy.merge(df_reserves, on="datetime", how="left").fillna(0)

print(f"Final RTD Features DataFrame Shape: {df_X.shape}")
print(df_X.head())