In [2]:
import os
import pandas as pd

def combine_quarters(folder, year=None):
    """
    Read every ``.csv`` file in ``folder`` and stack them into one DataFrame.

    Parameters
    ----------
    folder : str
        Directory to scan (non-recursive). ``os.listdir`` raises if it is missing.
    year : int or str, optional
        When given, only files whose *name* contains ``str(year)`` are read.
        This is a plain substring match, not a parsed date.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation (fresh 0..n-1 index) of the files that loaded,
        in sorted filename order. An empty DataFrame if nothing could be loaded.

    Notes
    -----
    Loading is best-effort: a file that fails to parse is reported and skipped
    rather than aborting the run, so always check the printed summary.
    """
    files = sorted(f for f in os.listdir(folder) if f.endswith('.csv'))

    if year is not None:
        files = [f for f in files if str(year) in f]

    # Human-readable scope for the log lines; avoids printing "year None".
    scope = f"year {year}" if year is not None else "all years"

    dfs = []
    failed = []

    for f in files:
        path = os.path.join(folder, f)
        try:
            dfs.append(pd.read_csv(path, low_memory=False))
            print(f"✅ Loaded {f}")
        except Exception as e:  # deliberate best-effort: report and keep going
            failed.append(f)
            print(f"⚠️ Failed to load {f}: {e}")

    if not dfs:
        if failed:
            # Files matched but none were readable: say so instead of "not found".
            print(f"⚠️ All {len(failed)} matching files failed to load for {scope} in {folder}")
        else:
            print(f"⚠️ No files found for {scope} in {folder}")
        return pd.DataFrame()

    combined = pd.concat(dfs, ignore_index=True)
    print(f"🧩 Combined {len(dfs)} files with {len(combined):,} rows for {scope}.")
    if failed:
        # Make an incomplete result visible next to the success summary.
        print(f"⚠️ Skipped {len(failed)} unreadable files: {', '.join(failed)}")
    return combined

# Load the 2023 extracts for each ISO, one source folder per market.
# The calls run top to bottom, which is also the order of the printed load log.
miso_2023_df = combine_quarters(folder='iso_data/miso_data', year=2023)
isone_2023_df = combine_quarters(folder='iso_data/isone_lmp_data', year=2023)
nyo_2023_df = combine_quarters(folder='iso_data/nyiso_combined_quarters', year=2023)
ercot_2023_df = combine_quarters(folder='iso_data/ercot_dam_outputs', year=2023)
spp_2023_df = combine_quarters(folder='iso_data/spp_lmp_quarters', year=2023)

✅ Loaded 2023_Q1.csv
✅ Loaded 2023_Q2.csv
✅ Loaded 2023_Q3.csv
✅ Loaded 2023_Q4.csv
🧩 Combined 4 files with 2,621,547 rows for year 2023.
✅ Loaded isone_lmp_2023Q1.csv
✅ Loaded isone_lmp_2023Q2.csv
✅ Loaded isone_lmp_2023Q3.csv
✅ Loaded isone_lmp_2023Q4.csv
🧩 Combined 4 files with 10,612,055 rows for year 2023.
✅ Loaded nyiso_combined_2023Q1.csv
✅ Loaded nyiso_combined_2023Q2.csv
✅ Loaded nyiso_combined_2023Q3.csv
✅ Loaded nyiso_combined_2023Q4.csv
🧩 Combined 4 files with 6,153,262 rows for year 2023.
✅ Loaded ERCOT_LMP_2023Q1.csv
✅ Loaded ERCOT_LMP_2023Q2.csv
✅ Loaded ERCOT_LMP_2023Q3.csv
✅ Loaded ERCOT_LMP_2023Q4.csv
🧩 Combined 4 files with 145,255,128 rows for year 2023.
✅ Loaded spp_lmp_2023Q1.csv
✅ Loaded spp_lmp_2023Q2.csv
✅ Loaded spp_lmp_2023Q3.csv
✅ Loaded spp_lmp_2023Q4.csv
🧩 Combined 4 files with 10,165,309 rows for year 2023.


In [3]:
import pandas as pd

# Stack the raw MISO, ERCOT and SPP frames exactly as loaded (no cleaning yet).
# Columns that a source does not have are filled with NaN for its rows.
combined_mw_south_raw_df = pd.concat(
    objs=[miso_2023_df, ercot_2023_df, spp_2023_df],
    ignore_index=True,
)

In [4]:
# Preview the first five rows, then report (rows, columns) of the raw concatenation.
print(combined_mw_south_raw_df.head(n=5))
print(f"✅ Combined raw DataFrame shape: {combined_mw_south_raw_df.shape}")

  MARKET_DAY       NODE       TYPE VALUE    HE1    HE2    HE3    HE4    HE5  \
0   1/1/2023       AECI  Interface   LMP  25.35  23.84  23.76  23.39  22.41   
1   1/1/2023       AECI  Interface   MCC  -0.76  -1.11  -0.80  -1.33  -0.87   
2   1/1/2023       AECI  Interface   MLC  -1.12  -1.20  -1.08  -1.18  -1.78   
3   1/1/2023  AECI.ALTW   Loadzone   LMP  27.08  25.70  25.31  25.63  24.41   
4   1/1/2023  AECI.ALTW   Loadzone   MCC  -0.41  -0.17  -0.03  -0.01   0.00   

     HE6  ...  GMTIntervalEnd  Settlement Location  Pnode  LMP MLC MCC MEC  \
0  23.53  ...             NaN                  NaN    NaN  NaN NaN NaN NaN   
1  -0.93  ...             NaN                  NaN    NaN  NaN NaN NaN NaN   
2  -1.18  ...             NaN                  NaN    NaN  NaN NaN NaN NaN   
3  25.37  ...             NaN                  NaN    NaN  NaN NaN NaN NaN   
4   0.00  ...             NaN                  NaN    NaN  NaN NaN NaN NaN   

  date source_file year_quarter  
0  NaN         NaN    

In [10]:
import pandas as pd

def clean_spp(df_spp):
    """
    Map raw SPP rows onto the shared LMP schema.

    Renames ``GMTIntervalEnd`` to ``timestamp_utc`` and ``Pnode`` to
    ``Location Name``, parses the timestamps as UTC, and tags every row with
    ``Location Type='Node'`` and ``iso='SPP'``. The input frame is not modified.

    Returns a DataFrame with columns
    ``timestamp_utc, Location Name, Location Type, iso, LMP, MCC, MLC``.
    """
    df = df_spp.rename(columns={
        'GMTIntervalEnd': 'timestamp_utc',
        'Pnode': 'Location Name',
    })
    # utc=True stamps naive values as UTC and converts offset-bearing values
    # (e.g. a trailing 'Z') to UTC; a separate tz_localize('UTC') would raise
    # on the latter because they are already timezone-aware.
    # NOTE(review): the source column name suggests these are interval-END
    # times; confirm that matches the convention wanted downstream.
    df['timestamp_utc'] = pd.to_datetime(df['timestamp_utc'], utc=True)
    df['Location Type'] = 'Node'
    df['iso'] = 'SPP'
    return df[['timestamp_utc', 'Location Name', 'Location Type', 'iso', 'LMP', 'MCC', 'MLC']]

def clean_ercot(df_ercot):
    """
    Map raw ERCOT rows onto the shared LMP schema.

    Builds ``timestamp_utc`` from ``deliverydate`` + ``hourending`` ("HH:MM"),
    copies ``busname`` to ``Location Name``, renames ``lmp`` to ``LMP``, and
    tags rows with ``Location Type='Node'`` and ``iso='ERCOT'``. ``MCC`` and
    ``MLC`` are filled with NaN. Works on a copy; the input is not modified.

    Returns a DataFrame with columns
    ``timestamp_utc, Location Name, Location Type, iso, LMP, MCC, MLC``.
    Rows whose date/time text cannot be parsed get ``NaT`` as timestamp.

    NOTE(review): the values are stamped as UTC without any timezone
    conversion, and unlike ``reshape_miso`` no hour is subtracted, so the stamp
    marks the END of the delivery hour. If the source times are local
    (presumably Central), both points need correcting - confirm against the
    data dictionary.
    """
    df = df_ercot.copy()

    df['deliverydate'] = pd.to_datetime(df['deliverydate'])

    # "24:00" is not a valid clock time: roll it over to 00:00 of the next day.
    mask_24 = df['hourending'] == '24:00'
    df.loc[mask_24, 'deliverydate'] += pd.Timedelta(days=1)
    df.loc[mask_24, 'hourending'] = '00:00'

    # Format the date explicitly so the text always matches the parse format
    # below, regardless of how the datetime column would stringify by default.
    datetime_str = df['deliverydate'].dt.strftime('%Y-%m-%d') + ' ' + df['hourending'].astype(str)
    timestamps = pd.to_datetime(datetime_str, format='%Y-%m-%d %H:%M', errors='coerce')
    df['timestamp_utc'] = timestamps.dt.tz_localize('UTC')

    # Standardize columns
    df['Location Name'] = df['busname']
    df['Location Type'] = 'Node'
    df['iso'] = 'ERCOT'
    # Float NaN (not None) keeps these columns numeric, so they line up with
    # the float MCC/MLC columns of the other ISOs when frames are concatenated.
    df['MCC'] = float('nan')
    df['MLC'] = float('nan')
    df = df.rename(columns={'lmp': 'LMP'})

    return df[['timestamp_utc', 'Location Name', 'Location Type', 'iso', 'LMP', 'MCC', 'MLC']]


def reshape_miso(df_miso):
    """
    Reshape the wide MISO layout (one column per hour, HE1..HE24) to long form.

    Each input row holds one price component (``VALUE``) for one node and
    market day; the result has one row per node, component and hour.

    Returns a DataFrame with columns
    ``timestamp_utc, Location Name, Location Type, Component, Price, iso``.
    The input frame is not modified.
    """
    # Hour columns are recognised by their "HE" prefix.
    hour_columns = [col for col in df_miso.columns if str(col).startswith('HE')]

    # Melt from wide to long: one row per (day, node, component, hour).
    df_long = df_miso.melt(
        id_vars=['MARKET_DAY', 'NODE', 'TYPE', 'VALUE'],
        value_vars=hour_columns,
        var_name='Hour Ending',
        value_name='Price'
    )

    # Strip the "HE" prefix and keep only purely numeric hour labels.
    df_long['Hour Ending'] = df_long['Hour Ending'].astype(str).str.replace('HE', '', regex=False)
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not trigger SettingWithCopyWarning.
    df_long = df_long[df_long['Hour Ending'].str.isnumeric()].copy()
    df_long['Hour Ending'] = df_long['Hour Ending'].astype(int)

    # Local timestamp at the START of the interval (hour ending minus one).
    df_long['Date'] = pd.to_datetime(df_long['MARKET_DAY'], format='%m/%d/%Y', errors='coerce')
    df_long['timestamp_local'] = df_long['Date'] + pd.to_timedelta(df_long['Hour Ending'] - 1, unit='h')

    # Interpret as US/Central wall-clock time and convert to UTC.
    # ambiguous='NaT' leaves the repeated fall-back hour WITHOUT a timestamp;
    # nonexistent='shift_forward' moves the skipped spring-forward hour to the
    # next valid time.
    # NOTE(review): confirm the MISO files really follow daylight saving time;
    # if they are published in a fixed standard-time offset, a fixed-offset
    # zone should be used here instead.
    df_long['timestamp_utc'] = (
        df_long['timestamp_local']
        .dt.tz_localize('US/Central', ambiguous='NaT', nonexistent='shift_forward')
        .dt.tz_convert('UTC')
    )

    # Rename columns to match final schema
    df_long = df_long.rename(columns={
        'NODE': 'Location Name',
        'TYPE': 'Location Type',
        'VALUE': 'Component'
    })

    df_long['iso'] = 'MISO'

    return df_long[['timestamp_utc', 'Location Name', 'Location Type', 'Component', 'Price', 'iso']]

def pivot_components(df_long):
    """
    Spread the values of 'Component' into one price column each.

    Rows are keyed by timestamp_utc, Location Name, Location Type and iso;
    every distinct 'Component' value becomes a column filled from 'Price'.
    When a key/component pair occurs more than once, the first value is kept.
    """
    key_columns = ['timestamp_utc', 'Location Name', 'Location Type', 'iso']

    wide = pd.pivot_table(
        df_long,
        index=key_columns,
        columns='Component',
        values='Price',
        aggfunc='first',  # 'first' also works on non-numeric (object) prices
    )

    # Move the keys back to ordinary columns and drop the leftover
    # "Component" label from the column axis.
    return wide.reset_index().rename_axis(columns=None)



def combine_all(df_spp, df_ercot, df_miso):
    """
    Clean the SPP, ERCOT and MISO frames and stack them in that order.

    MISO goes through two steps (wide-to-long reshape, then a pivot of the
    price components into columns). The result gets a fresh 0..n-1 index.
    """
    cleaned_frames = [
        clean_spp(df_spp),
        clean_ercot(df_ercot),
        pivot_components(reshape_miso(df_miso)),
    ]
    return pd.concat(cleaned_frames, ignore_index=True)

# Build the standardized SPP + ERCOT + MISO table from the raw 2023 frames.
combined_df = combine_all(
    df_spp=spp_2023_df,
    df_ercot=ercot_2023_df,
    df_miso=miso_2023_df,
)

In [11]:
# Sort chronologically and use timestamp_utc as the index.
# The guard makes this cell safe to re-run: once timestamp_utc is already the
# index, a sort -> reset_index(drop=True) -> set_index sequence would throw the
# timestamps away and then fail with a KeyError.
# set_index replaces the existing index, so no reset_index is needed first.
if combined_df.index.name != 'timestamp_utc':
    combined_df = combined_df.sort_values('timestamp_utc').set_index('timestamp_utc')


In [12]:
# Fix the column order for export; the frame's index is carried over unchanged.
combined_mw_south_df_2023 = combined_df.loc[
    :, ['iso', 'Location Name', 'Location Type', 'LMP', 'MCC', 'MLC']
]
combined_mw_south_df_2023

Unnamed: 0_level_0,iso,Location Name,Location Type,LMP,MCC,MLC
timestamp_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-01 01:00:00+00:00,ERCOT,E1BB1,Node,10.55,,
2023-01-01 01:00:00+00:00,ERCOT,EB_M032,Node,10.58,,
2023-01-01 01:00:00+00:00,ERCOT,CCRST_9U,Node,10.58,,
2023-01-01 01:00:00+00:00,ERCOT,CCRST_29Z,Node,10.58,,
2023-01-01 01:00:00+00:00,ERCOT,CCRST_9,Node,10.58,,
...,...,...,...,...,...,...
2024-01-01 06:00:00+00:00,SPP,SOUC,Node,26.2113,0.7905,1.0531
2024-01-01 06:00:00+00:00,SPP,SCSE,Node,22.8806,-0.1772,-1.3099
2024-01-01 06:00:00+00:00,SPP,VACS,Node,26.0793,0.7336,0.978
2024-01-01 06:00:00+00:00,SPP,OPPDSARPYUN5_RA,Node,23.7677,-0.021,-0.579


In [14]:
# Export the table to CSV; the index is written as the first column.
output_path = 'iso_data/combined_mw_south_2023_lmp_data.csv'
combined_mw_south_df_2023.to_csv(path_or_buf=output_path, index=True)

print(f"✅ Successfully saved to {output_path}")

✅ Successfully saved to iso_data/combined_mw_south_2023_lmp_data.csv


In [15]:
# Load every PJM CSV whose filename contains "2023" into one frame.
pjm_2023_df = combine_quarters(folder='iso_data/pjm_data', year=2023)

✅ Loaded rt_da_monthly_lmps_april2023.csv
✅ Loaded rt_da_monthly_lmps_aug2023.csv
✅ Loaded rt_da_monthly_lmps_dec2023.csv
✅ Loaded rt_da_monthly_lmps_feb2023.csv
✅ Loaded rt_da_monthly_lmps_jan2023.csv
✅ Loaded rt_da_monthly_lmps_july2023.csv
✅ Loaded rt_da_monthly_lmps_june2023.csv
✅ Loaded rt_da_monthly_lmps_march2023.csv
✅ Loaded rt_da_monthly_lmps_may2023.csv
✅ Loaded rt_da_monthly_lmps_nov2023.csv
✅ Loaded rt_da_monthly_lmps_oct2023.csv
✅ Loaded rt_da_monthly_lmps_sept2023.csv
🧩 Combined 12 files with 2,885,733 rows for year 2023.


In [17]:
# Stack the raw NYISO, ISO-NE and PJM frames exactly as loaded (no cleaning yet).
# Columns that a source does not have are filled with NaN for its rows.
combined_ne_raw_df = pd.concat(
    objs=[nyo_2023_df, isone_2023_df, pjm_2023_df],
    ignore_index=True,
)

In [19]:
# Preview the first five rows, then report (rows, columns) of the raw concatenation.
print(combined_ne_raw_df.head(n=5))
print(f"✅ Combined raw DataFrame shape: {combined_ne_raw_df.shape}")

         Time Stamp                  Name     PTID  LBMP ($/MWHr)  \
0  01/20/2023 00:00      59TH STREET_GT_1  24138.0          29.80   
1  01/20/2023 00:00      74TH STREET_GT_1  24260.0          29.87   
2  01/20/2023 00:00      74TH STREET_GT_2  24261.0          29.87   
3  01/20/2023 00:00    ADK HUDSON___FALLS  24011.0          30.89   
4  01/20/2023 00:00  ADK RESOURCE___RCVRY  23798.0          30.94   

   Marginal Cost Losses ($/MWHr)  Marginal Cost Congestion ($/MWHr)  \
0                           1.83                              -4.15   
1                           1.91                              -4.16   
2                           1.91                              -4.16   
3                           1.02                              -6.05   
4                           1.07                              -6.06   

                  Date node_type    H Hour Ending  ...  type zone  \
0  2023-01-20 00:00:00     nodal  NaN         NaN  ...   NaN  NaN   
1  2023-01-20 00:00:

In [28]:
import pandas as pd

def clean_pjm(pjm_df):
    """
    Map raw PJM rows onto the shared LMP schema.

    Uses the day-ahead price columns (``*_da``) and parses
    ``datetime_beginning_utc`` as UTC. The input frame is not modified.

    Returns a DataFrame with columns
    ``timestamp_utc, iso, Location Name, Location Type, LMP, MCC, MLC``.
    """
    standardized = pjm_df.assign(
        timestamp_utc=pd.to_datetime(pjm_df['datetime_beginning_utc'], utc=True),
        iso='PJM',
        LMP=pjm_df['total_lmp_da'],  # day-ahead; the real-time alternative is 'total_lmp_rt'
        MCC=pjm_df['congestion_price_da'],
        MLC=pjm_df['marginal_loss_price_da'],
        # Column names containing spaces cannot be written as keyword arguments.
        **{
            'Location Name': pjm_df['pnode_name'],
            'Location Type': 'Node',
        },
    )
    schema = ['timestamp_utc', 'iso', 'Location Name', 'Location Type', 'LMP', 'MCC', 'MLC']
    return standardized[schema]

def clean_nyiso(nyiso_df):
    """
    Map raw NYISO rows onto the shared LMP schema.

    ``timestamp_utc`` is the ``Date`` column parsed and stamped as UTC; the
    price columns are copied from the ``($/MWHr)`` columns. The input frame is
    not modified.

    Returns a DataFrame with columns
    ``timestamp_utc, iso, Location Name, Location Type, LMP, MCC, MLC``.

    NOTE(review): no timezone conversion is applied, so if ``Date`` holds local
    market time the result is mislabelled as UTC. Also confirm that ``Date``
    (rather than ``Time Stamp``) carries the hour of each row.
    """
    standardized = nyiso_df.assign(
        timestamp_utc=pd.to_datetime(nyiso_df['Date'], utc=True),
        iso='NYISO',
        LMP=nyiso_df['LBMP ($/MWHr)'],
        MCC=nyiso_df['Marginal Cost Congestion ($/MWHr)'],
        MLC=nyiso_df['Marginal Cost Losses ($/MWHr)'],
        # Column names containing spaces cannot be written as keyword arguments.
        **{
            'Location Name': nyiso_df['Name'],
            'Location Type': 'Node',
        },
    )
    schema = ['timestamp_utc', 'iso', 'Location Name', 'Location Type', 'LMP', 'MCC', 'MLC']
    return standardized[schema]

def clean_isone(isone_df):
    """
    Map raw ISO-NE rows onto the shared LMP schema.

    Rows whose ``Hour Ending`` is not purely numeric are dropped. The timestamp
    is ``Date`` plus (``Hour Ending`` - 1) hours, i.e. the start of the
    interval, stamped as UTC. The input frame is not modified.

    Returns a DataFrame with columns
    ``timestamp_utc, iso, Location Name, Location Type, LMP, MCC, MLC``.

    NOTE(review): no timezone conversion is applied, so if ``Date`` /
    ``Hour Ending`` are local market time the result is mislabelled as UTC -
    confirm. Non-numeric hour labels are discarded silently.
    """
    # Filter first, then copy: only the kept rows are duplicated, and the
    # column assignments below act on an independent frame (no
    # SettingWithCopyWarning).
    hour_text = isone_df['Hour Ending'].astype(str)
    df = isone_df[hour_text.str.isnumeric()].copy()
    df['Hour Ending'] = df['Hour Ending'].astype(str).astype(int)

    interval_start = pd.to_datetime(df['Date']) + pd.to_timedelta(df['Hour Ending'] - 1, unit='h')
    df['timestamp_utc'] = pd.to_datetime(interval_start, utc=True)

    df['iso'] = 'ISO-NE'
    # 'Location Name' already exists under that name in the source.
    df['Location Type'] = 'Node'  # Simplify network node to Node
    df['LMP'] = df['Locational Marginal Price']
    df['MCC'] = df['Congestion Component']
    df['MLC'] = df['Marginal Loss Component']

    return df[['timestamp_utc', 'iso', 'Location Name', 'Location Type', 'LMP', 'MCC', 'MLC']]

def combine_isos(pjm_df, nyiso_df, isone_df):
    """
    Clean the PJM, NYISO and ISO-NE frames and stack them in that order.

    The result gets a fresh 0..n-1 index.
    """
    cleaned_frames = [
        clean_pjm(pjm_df),
        clean_nyiso(nyiso_df),
        clean_isone(isone_df),
    ]
    return pd.concat(cleaned_frames, ignore_index=True)

# Build the standardized PJM + NYISO + ISO-NE table from the raw 2023 frames.
combined_ne_df = combine_isos(
    pjm_df=pjm_2023_df,
    nyiso_df=nyo_2023_df,
    isone_df=isone_2023_df,
)

In [29]:
# Sort chronologically and use timestamp_utc as the index.
# The guard makes this cell safe to re-run: once timestamp_utc is already the
# index, a sort -> reset_index(drop=True) -> set_index sequence would throw the
# timestamps away and then fail with a KeyError.
# set_index replaces the existing index, so no reset_index is needed first.
if combined_ne_df.index.name != 'timestamp_utc':
    combined_ne_df = combined_ne_df.sort_values('timestamp_utc').set_index('timestamp_utc')
combined_ne_df

Unnamed: 0_level_0,iso,Location Name,Location Type,LMP,MCC,MLC
timestamp_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-01 00:00:00+00:00,ISO-NE,UN.FRNKLNSQ13.810CC,Node,31.280000,0.000000,0.000000
2023-01-01 00:00:00+00:00,NYISO,GENESE,Node,21.140000,-1.630000,-0.420000
2023-01-01 00:00:00+00:00,NYISO,DUNWOD,Node,33.220000,-11.820000,1.470000
2023-01-01 00:00:00+00:00,NYISO,CENTRL,Node,21.880000,-2.000000,-0.040000
2023-01-01 00:00:00+00:00,NYISO,CAPITL,Node,37.700000,-16.800000,0.980000
...,...,...,...,...,...,...
2024-01-01 04:00:00+00:00,PJM,LOUDOUN,Node,20.770000,0.340000,0.650000
2024-01-01 04:00:00+00:00,PJM,MORRISVILLE,Node,20.630000,0.330000,0.520000
2024-01-01 04:00:00+00:00,PJM,OX,Node,20.770000,0.350000,0.640000
2024-01-01 04:00:00+00:00,PJM,POSSUM POINT,Node,20.760000,0.360000,0.620000


In [30]:
# Fix the column order for export; the frame's index is carried over unchanged.
combined_ne_df_2023 = combined_ne_df.loc[
    :, ['iso', 'Location Name', 'Location Type', 'LMP', 'MCC', 'MLC']
]
combined_ne_df_2023

Unnamed: 0_level_0,iso,Location Name,Location Type,LMP,MCC,MLC
timestamp_utc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-01 00:00:00+00:00,ISO-NE,UN.FRNKLNSQ13.810CC,Node,31.280000,0.000000,0.000000
2023-01-01 00:00:00+00:00,NYISO,GENESE,Node,21.140000,-1.630000,-0.420000
2023-01-01 00:00:00+00:00,NYISO,DUNWOD,Node,33.220000,-11.820000,1.470000
2023-01-01 00:00:00+00:00,NYISO,CENTRL,Node,21.880000,-2.000000,-0.040000
2023-01-01 00:00:00+00:00,NYISO,CAPITL,Node,37.700000,-16.800000,0.980000
...,...,...,...,...,...,...
2024-01-01 04:00:00+00:00,PJM,LOUDOUN,Node,20.770000,0.340000,0.650000
2024-01-01 04:00:00+00:00,PJM,MORRISVILLE,Node,20.630000,0.330000,0.520000
2024-01-01 04:00:00+00:00,PJM,OX,Node,20.770000,0.350000,0.640000
2024-01-01 04:00:00+00:00,PJM,POSSUM POINT,Node,20.760000,0.360000,0.620000


In [31]:
# Export the table to CSV; the index is written as the first column.
output_path = 'iso_data/combined_ne_2023_lmp_data.csv'
combined_ne_df_2023.to_csv(path_or_buf=output_path, index=True)

print(f"✅ Successfully saved to {output_path}")

✅ Successfully saved to iso_data/combined_ne_2023_lmp_data.csv
