In [1]:
import pandas as pd

# Read the three yearly `ice_electric-*final.xlsx` workbooks, one DataFrame per
# year, in chronological order.
data_2022, data_2023, data_2024 = (
    pd.read_excel(workbook_path)
    for workbook_path in (
        "Wholesale_Pricing_Data/ice_electric-2022final.xlsx",
        "Wholesale_Pricing_Data/ice_electric-2023final.xlsx",
        "Wholesale_Pricing_Data/ice_electric-2024final.xlsx",
    )
)

# Stack the yearly frames row-wise and renumber the rows 0..n-1.
yearly_frames = [data_2022, data_2023, data_2024]
eia_pricing_data = pd.concat(yearly_frames, axis=0, ignore_index=True)

In [5]:
# Lookup from each 'Price hub' label to the ISO label its rows are filed under.
# NOTE(review): the hub -> ISO assignments are taken as given here; confirm them
# against the market definitions before relying on per-ISO results.
hub_to_iso = {
    'Indiana Hub RT Peak': 'MISO',
    'Mid C Peak': 'Non-ISO (Mid-Columbia)',
    'NP15 EZ Gen DA LMP Peak': 'CAISO',
    'Nepool MH DA LMP Peak': 'ISO-NE',
    'PJM WH Real Time Peak': 'PJM',
    'Palo Verde Peak': 'CAISO',
    'SP15 EZ Gen DA LMP Peak': 'CAISO'
}

# Tag every row with its ISO. A hub that is missing from the lookup maps to NaN.
price_hubs = eia_pricing_data['Price hub']
eia_pricing_data['ISO'] = price_hubs.map(hub_to_iso)

In [7]:
# Load the previously exported table that the EIA prices will be joined onto.
temp_df = pd.read_csv(filepath_or_buffer="output.csv")

# Preview the first five rows.
temp_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,timestamp_utc,iso,Location Name,Location Type,LMP,MCC,MLC,cum_ia,cum_wd,cum_on,avg_days_to_ia,avg_days_to_wd,avg_days_to_on,avg_days_pending
0,0,2023-01-01 00:00:00+00:00,ISO-NE,UN.FRNKLNSQ13.810CC,Node,31.28,0.0,0.0,0.0,2.0,0.0,,136.5,,203.352381
1,1,2023-01-01 00:00:00+00:00,NYISO,GENESE,Node,21.14,-1.63,-0.42,0.0,11.0,0.0,,51.625,,162.119497
2,2,2023-01-01 00:00:00+00:00,NYISO,DUNWOD,Node,33.22,-11.82,1.47,0.0,11.0,0.0,,51.625,,162.119497
3,3,2023-01-01 00:00:00+00:00,NYISO,CENTRL,Node,21.88,-2.0,-0.04,0.0,11.0,0.0,,51.625,,162.119497
4,4,2023-01-01 00:00:00+00:00,NYISO,CAPITL,Node,37.7,-16.8,0.98,0.0,11.0,0.0,,51.625,,162.119497


In [9]:
# Distinct ISO labels present in the loaded table (these are the join-key values).
pd.unique(temp_df['iso'])

array(['ISO-NE', 'NYISO', 'PJM'], dtype=object)

In [11]:
# Preview the combined pricing table, including the newly added 'ISO' column.
eia_pricing_data.head(n=5)

Unnamed: 0,Price hub,Trade date,Delivery start date,Delivery \nend date,High price $/MWh,Low price $/MWh,Wtd avg price $/MWh,Change,Daily volume MWh,Number of trades,Number of counterparties,ISO
0,Indiana Hub RT Peak,2022-01-04 00:00:00,2022-01-05,2022-01-05,50.0,50.0,50.0,-11.0,800,1,2,MISO
1,Indiana Hub RT Peak,2022-01-05 00:00:00,2022-01-06,2022-01-06,70.0,68.0,69.0,19.0,4800,6,6,MISO
2,Indiana Hub RT Peak,2022-01-06 00:00:00,2022-01-07,2022-01-07,81.5,81.5,81.5,12.5,1600,2,3,MISO
3,Indiana Hub RT Peak,2022-01-07 00:00:00,2022-01-10,2022-01-10,75.0,75.0,75.0,-6.5,800,1,2,MISO
4,Indiana Hub RT Peak,2022-01-19 00:00:00,2022-01-20,2022-01-20,85.0,80.0,83.93,8.93,5600,7,8,MISO


## Aggregate EIA data into daily trades per ISO

In [11]:
# Parse the three date columns leniently: values that cannot be parsed become NaT
# instead of raising, so this cell is purely a sanity check.
trade_dates, delivery_start_dates, delivery_end_dates = (
    pd.to_datetime(eia_pricing_data[date_column], format='mixed', errors='coerce')
    for date_column in ('Trade date', 'Delivery start date', 'Delivery \nend date')
)

# Report which calendar years occur in each column (NaT values are ignored).
for heading, parsed_dates in (
    ("Trade date years:", trade_dates),
    ("Delivery start date years:", delivery_start_dates),
    ("Delivery end date years:", delivery_end_dates),
):
    print(heading, sorted(parsed_dates.dropna().dt.year.unique()))

Trade date years: [2021, 2022, 2023, 2024]
Delivery start date years: [2021, 2022, 2023, 2024]
Delivery end date years: [2021, 2022, 2023, 2024]


In [59]:
# Trade-date years kept in the daily summary. Trade dates from any other year
# that appear in the raw data are excluded.
TRADE_YEARS = [2022, 2023, 2024]

eia_daily = eia_pricing_data.copy()

# Parse 'Trade date' once: the timestamps drive the vectorized year filter below,
# while the column itself is reduced to plain `date` objects (time of day dropped).
trade_timestamps = pd.to_datetime(eia_daily['Trade date'], format='mixed')
eia_daily['Trade date'] = trade_timestamps.dt.date

# Numerator term of the volume-weighted average: price x volume per row.
eia_daily['weighted_price'] = eia_daily['Wtd avg price $/MWh'] * eia_daily['Daily volume MWh']

# Denominator term: volume of rows that actually carry a price. `sum` skips the
# NaN products in the numerator, so counting the volume of unpriced rows in the
# denominator would bias the weighted average downward.
eia_daily['priced_volume'] = eia_daily['Daily volume MWh'].where(
    eia_daily['Wtd avg price $/MWh'].notna()
)

# Keep only the wanted years *before* aggregating, then roll up per day and ISO.
# Rows whose 'ISO' is NaN (hub not in the lookup) are dropped by groupby.
# NOTE(review): 'total_counterparties' adds up per-row counts; if the same
# counterparty trades at several hubs of one ISO it is counted more than once.
eia_daily_summary = (
    eia_daily[trade_timestamps.dt.year.isin(TRADE_YEARS)]
    .groupby(['Trade date', 'ISO'])
    .agg(
        weighted_price_sum=('weighted_price', 'sum'),             # sum of (P x V)
        priced_volume=('priced_volume', 'sum'),                   # sum of V where P is known
        total_volume=('Daily volume MWh', 'sum'),                 # sum of volume
        total_trades=('Number of trades', 'sum'),                 # sum of trades
        total_counterparties=('Number of counterparties', 'sum')  # sum of counterparties
    )
    .reset_index()
)

# Volume-weighted average price; a zero denominator yields NaN rather than inf.
priced_volume_total = eia_daily_summary['priced_volume']
eia_daily_summary['Wtd avg price $/MWh'] = (
    eia_daily_summary['weighted_price_sum']
    / priced_volume_total.where(priced_volume_total.ne(0))
)

# Select and reorder the final columns (helper columns are dropped here).
eia_daily_summary = eia_daily_summary[
    ['Trade date', 'ISO', 'Wtd avg price $/MWh', 'total_volume', 'total_trades', 'total_counterparties']
]

In [61]:
# Number of (Trade date, ISO) summary rows per calendar year, followed by a blank line.
trade_years = eia_daily_summary['Trade date'].map(lambda trade_date: trade_date.year)
print(trade_years.value_counts(), '\n')

# Rich display of the daily summary itself.
eia_daily_summary

Trade date
2023    990
2022    943
2024    913
Name: count, dtype: int64 



Unnamed: 0,Trade date,ISO,Wtd avg price $/MWh,total_volume,total_trades,total_counterparties
0,2022-01-03,CAISO,58.645714,2800,7,10
1,2022-01-03,ISO-NE,86.200000,8000,10,10
2,2022-01-03,Non-ISO (Mid-Columbia),56.710000,12000,29,10
3,2022-01-03,PJM,59.510000,31200,37,29
4,2022-01-04,CAISO,54.716774,12400,31,21
...,...,...,...,...,...,...
2841,2024-12-23,ISO-NE,124.130000,13600,17,15
2842,2024-12-23,Non-ISO (Mid-Columbia),31.330000,37600,46,15
2843,2024-12-23,PJM,36.280000,32000,37,20
2844,2024-12-24,ISO-NE,153.450000,1600,2,3


## Left join existing data with EIA data

In [49]:
# Join key: calendar date of each observation, taken from the UTC timestamp.
# NOTE(review): this is the UTC date, while 'Trade date' is presumably a local
# market date; rows near midnight may land on the neighbouring day - confirm.
# NOTE(review): the join uses the trade date, not the delivery date; verify that
# this is the intended alignment between traded prices and observations.
temp_df['date'] = pd.to_datetime(temp_df['timestamp_utc']).dt.date

# Left join: every row of temp_df is kept, and rows without a matching
# (date, iso) pair in eia_daily_summary get NaN in the EIA columns.
# validate='many_to_one' makes pandas raise MergeError if the right-hand keys
# are not unique, so a bad summary can never silently multiply the left rows.
# NOTE(review): confirm that every `iso` label in temp_df also occurs among the
# 'ISO' labels of eia_daily_summary; an ISO without a counterpart stays all-NaN.
merged_df = temp_df.merge(
    eia_daily_summary,
    how='left',
    left_on=['date', 'iso'],
    right_on=['Trade date', 'ISO'],
    validate='many_to_one'
)

# Drop the join helpers and the stray index column carried in from output.csv.
merged_df = merged_df.drop(columns=['Trade date', 'ISO', 'date', 'Unnamed: 0'])

In [51]:
# Display the merged frame (pandas shows only the first and last rows of a long frame).
merged_df

Unnamed: 0,timestamp_utc,iso,Location Name,Location Type,LMP,MCC,MLC,cum_ia,cum_wd,cum_on,avg_days_to_ia,avg_days_to_wd,avg_days_to_on,avg_days_pending,Wtd avg price $/MWh,total_volume,total_trades,total_counterparties
0,2023-01-01 00:00:00+00:00,ISO-NE,UN.FRNKLNSQ13.810CC,Node,31.280000,0.000000,0.000000,0.0,2.0,0.0,,136.500,,203.352381,,,,
1,2023-01-01 00:00:00+00:00,NYISO,GENESE,Node,21.140000,-1.630000,-0.420000,0.0,11.0,0.0,,51.625,,162.119497,,,,
2,2023-01-01 00:00:00+00:00,NYISO,DUNWOD,Node,33.220000,-11.820000,1.470000,0.0,11.0,0.0,,51.625,,162.119497,,,,
3,2023-01-01 00:00:00+00:00,NYISO,CENTRL,Node,21.880000,-2.000000,-0.040000,0.0,11.0,0.0,,51.625,,162.119497,,,,
4,2023-01-01 00:00:00+00:00,NYISO,CAPITL,Node,37.700000,-16.800000,0.980000,0.0,11.0,0.0,,51.625,,162.119497,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19649829,2024-01-01 04:00:00+00:00,PJM,LOUDOUN,Node,20.770000,0.340000,0.650000,,,,,,,,,,,
19649830,2024-01-01 04:00:00+00:00,PJM,MORRISVILLE,Node,20.630000,0.330000,0.520000,,,,,,,,,,,
19649831,2024-01-01 04:00:00+00:00,PJM,OX,Node,20.770000,0.350000,0.640000,,,,,,,,,,,
19649832,2024-01-01 04:00:00+00:00,PJM,POSSUM POINT,Node,20.760000,0.360000,0.620000,,,,,,,,,,,


In [53]:
# Share of merged rows (a fraction in [0, 1], not a percentage) that received no
# EIA price from the left join.
missing_price = merged_df['Wtd avg price $/MWh'].isna()
perc_na = missing_price.mean()
perc_na

0.5926957958016338

In [55]:
# Persist the merged frame. index=False keeps the row index out of the file, so
# reading it back with pd.read_csv does not produce a spurious 'Unnamed: 0' column.
merged_df.to_csv('final_df.csv', index=False)