# WESM Price Prediction - ETL/Preprocessing

## Setup and "Loader"

In [None]:
import pandas as pd
import glob
import os

# Paths to the raw-data directories. Each one is handed to
# load_and_concatenate_csvs, which reads every "*.csv" file directly inside it.
# Directory of GWAP CSV extracts (source of the "GWAP" column used below).
PATH_GWAP = "./raw_data/GWAP/"
# Directory of RTD CSV extracts (source of the demand/supply/reserve columns).
PATH_RTD = "./raw_data/RTD/"
# NOTE(review): not referenced in the visible part of this notebook;
# presumably the directory of outage CSV extracts - confirm where it is used.
PATH_OUTAGES = "./raw_data/Outages/"

# Loads every extracted CSV file in a directory and concatenates them into one DataFrame
def load_and_concatenate_csvs(path):
    """Load every ``*.csv`` file directly inside *path* into one DataFrame.

    Files are read in sorted filename order so that the row order of the
    result (and anything downstream that depends on it, such as which
    duplicate row survives a ``drop_duplicates``) is reproducible across
    runs and machines; ``glob.glob`` alone returns files in arbitrary order.

    Args:
        path: Directory containing the extracted CSV files.

    Returns:
        A single DataFrame with the rows of all files stacked vertically
        and a fresh ``0..n-1`` index.

    Raises:
        ValueError: If *path* contains no ``*.csv`` files.
    """
    files = sorted(glob.glob(os.path.join(path, "*.csv")))
    print(f"File Count in {path}: {len(files)}")

    if not files:
        # pd.concat would fail with an opaque "No objects to concatenate";
        # raise the same exception type but name the offending directory.
        raise ValueError(f"No CSV files found in {path!r}")

    concatenated_df = pd.concat(
        [pd.read_csv(file) for file in files],
        axis=0,
        ignore_index=True,
    )
    print(f"Concatenated DataFrame Shape from {path}: {concatenated_df.shape}")

    return concatenated_df

## GWAP Processing

In [None]:
# Build the GWAP table: one row per timestamp for the CLUZ region,
# ordered by time, keeping only the timestamp and the GWAP value.
# NOTE(review): sort_values uses its default (non-stable) sort, so which of
# several rows sharing a datetime is kept by drop_duplicates is not tied to
# the input row order - confirm that duplicates always carry the same GWAP.
df_gwap = (
    load_and_concatenate_csvs(PATH_GWAP)
    .loc[lambda frame: frame["REGION_NAME"] == "CLUZ"]
    .assign(datetime=lambda frame: pd.to_datetime(frame["TIME_INTERVAL"]))
    .loc[:, ["datetime", "GWAP"]]
    .sort_values("datetime")
    .drop_duplicates(subset=["datetime"])  # just in case
)

print(f"Final GWAP DataFrame Shape: {df_gwap.shape}")
print(df_gwap.head())

## RTD Processing

In [None]:
# Load all RTD extracts, keep the CLUZ region only and parse the timestamps.
df_rtd = load_and_concatenate_csvs(PATH_RTD)
df_rtd = df_rtd.loc[df_rtd["REGION_NAME"] == "CLUZ"].copy()
df_rtd["datetime"] = pd.to_datetime(df_rtd["TIME_INTERVAL"])
# NOTE(review): unlike the GWAP table, RTD rows are not de-duplicated here;
# if the source files can overlap, repeated rows would inflate the reserve
# sums below - confirm against the raw extracts.

# Energy rows (COMMODITY_TYPE == "En"): the market requirement is used as
# demand and the generation figure as supply.
df_energy = df_rtd.loc[
    df_rtd["COMMODITY_TYPE"] == "En",
    ["datetime", "MKT_REQT", "GENERATION"],
].rename(columns={"MKT_REQT": "demand_mw", "GENERATION": "supply_mw"})

# Every other commodity type is treated as a reserve product
# (individual types still to be looked up); total their generation
# per timestamp.
df_reserves = (
    df_rtd.loc[df_rtd["COMMODITY_TYPE"] != "En"]
    .groupby("datetime")["GENERATION"]
    .sum()
    .rename("total_reserves_mw")
    .reset_index()
)

# Attach the reserve totals to the energy rows; timestamps that have no
# reserve rows get 0 instead of NaN.
df_X = df_energy.merge(df_reserves, on="datetime", how="left").assign(
    total_reserves_mw=lambda frame: frame["total_reserves_mw"].fillna(0)
)

print(f"Final RTD Features DataFrame Shape: {df_X.shape}")
print(df_X.head())