# Data collection
Gather public SPP WEIS (Western Energy Imbalance Service) data from https://marketplace.spp.org/groups/operational-data-weis

In [1]:
# IPython autoreload in mode 2: imported modules are reloaded before each cell runs,
# so edits to the project source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# logging
import logging

import pandas as pd
import polars as pl
import duckdb

# logger used by this notebook
log = logging.getLogger(__name__)

# raise the 'py4j' logger threshold to ERROR so only its errors are emitted
logging.getLogger("py4j").setLevel(logging.ERROR)

# root logger: INFO and above
logging.basicConfig(level=logging.INFO)


In [3]:
# base imports
import os
import sys
from io import StringIO
from typing import List, Union, Callable
import tqdm

# logging
import logging
# NOTE(review): the previous cell already calls logging.basicConfig(level=logging.INFO)
# and creates `log`; these two lines repeat that setup.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

# data
import requests
import pandas as pd
import duckdb
# from meteostat import Hourly, Point

In [4]:
# set env vars
# Try to copy the AWS credentials from the `dbutils` secret scope into the environment.
# When that fails (e.g. `dbutils` is not defined outside DBX) fall through to the
# .env file loaded below.
try:
    os.environ['AWS_ACCESS_KEY_ID'] = dbutils.secrets.get(scope = "aws", key = "AWS_ACCESS_KEY_ID")
    os.environ['AWS_SECRET_ACCESS_KEY'] = dbutils.secrets.get(scope = "aws", key = "AWS_SECRET_ACCESS_KEY")
except Exception:
    # `Exception` rather than a bare `except:` so KeyboardInterrupt / SystemExit
    # (kernel interrupt or shutdown) are not swallowed here.
    print('not on DBX')

from dotenv import load_dotenv
# last expression of the cell: the return value of load_dotenv() is displayed as the output
load_dotenv()


not on DBX


True

In [5]:
# Only the last expression of a cell is displayed: the bucket lookup on the first
# line is evaluated but its value is never shown (the recorded output is the folder only).
os.environ.get('AWS_S3_BUCKET')
os.environ.get('AWS_S3_FOLDER')

'unity-catalog/7474645306723306/spp-weis/'

In [6]:
# Move the working directory two levels up and add ./src to the import path.
# NOTE(review): the chdir is relative, so running this cell a second time moves up
# two more levels; restart the kernel before re-running.
os.chdir('../..')
sys.path.append('./src')

In [7]:
import src.data_collection as dc

INFO:src.data_collection:number of cores available: 48
INFO:src.data_collection:N_JOBS: 47
INFO:src.data_collection:adding module path


## Mid Term Load Forecast

![_](../../imgs/mtlf.PNG)

HOUR = {0000, ..., 2300}
DAY = {01, ..., 31}

In [8]:
# WEIS timestamps are in US Central time: localize a naive timestamp to America/Chicago
pd.to_datetime("4/1/2023 07:00:00").tz_localize("America/Chicago")

Timestamp('2023-04-01 07:00:00-0500', tz='America/Chicago')

In [9]:
# Without `five_min_ceil`, the recorded output shows 07:30 reported as 08:00
dc.get_time_components('4/1/2023 07:30')

{'YEAR': '2023',
 'MONTH': '04',
 'DAY': '01',
 'HOUR': '08',
 'MINUTE': '00',
 'YM': '202304',
 'YMD': '20230401',
 'COMBINED': '202304010800',
 'timestamp': Timestamp('2023-04-01 08:00:00-0500', tz='America/Chicago'),
 'timestamp_utc': Timestamp('2023-04-01 13:00:00')}

In [10]:
# With five_min_ceil=True, the recorded output keeps 07:30 unchanged
dc.get_time_components('4/1/2023 07:30', five_min_ceil=True)

{'YEAR': '2023',
 'MONTH': '04',
 'DAY': '01',
 'HOUR': '07',
 'MINUTE': '30',
 'YM': '202304',
 'YMD': '20230401',
 'COMBINED': '202304010730',
 'timestamp': Timestamp('2023-04-01 07:30:00-0500', tz='America/Chicago'),
 'timestamp_utc': Timestamp('2023-04-01 12:30:00')}

In [11]:
# No timestamp is passed; presumably this defaults to the current time (the recorded
# output is a 2026-01-25 wall-clock value) — confirm in src.data_collection.
tc = dc.get_time_components(five_min_ceil=True)
tc

{'YEAR': '2026',
 'MONTH': '01',
 'DAY': '25',
 'HOUR': '12',
 'MINUTE': '45',
 'YM': '202601',
 'YMD': '20260125',
 'COMBINED': '202601251245',
 'timestamp': Timestamp('2026-01-25 12:45:00-0600', tz='America/Chicago'),
 'timestamp_utc': Timestamp('2026-01-25 18:45:00')}

In [12]:
# Same call with five_min_ceil=False; this rebinds `tc`.
tc = dc.get_time_components(five_min_ceil=False)
tc

{'YEAR': '2026',
 'MONTH': '01',
 'DAY': '25',
 'HOUR': '13',
 'MINUTE': '00',
 'YM': '202601',
 'YMD': '20260125',
 'COMBINED': '202601251300',
 'timestamp': Timestamp('2026-01-25 13:00:00-0600', tz='America/Chicago'),
 'timestamp_utc': Timestamp('2026-01-25 19:00:00')}

In [13]:
# Pin `tc` to a fixed timestamp; the URL-builder cells that follow read it.
tc = dc.get_time_components('6/7/2025 08:00:00')

In [14]:
# Hourly MTLF download URL for the timestamp held in `tc`
mtlf_url = dc.get_hourly_mtlf_url(tc)
print(mtlf_url)
# cell output: everything after the last 'WEIS-' with '.csv' swapped for '.parquet'
mtlf_url.rpartition('WEIS-')[2].replace('.csv','.parquet')

https://portal.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2025%2F06%2F07%2FWEIS-OP-MTLF-202506070800.csv


'OP-MTLF-202506070800.parquet'

In [15]:
# Hourly MTRF download URL for the timestamp held in `tc`
mtrf_url = dc.get_hourly_mtrf_url(tc)
print(mtrf_url)
# cell output: everything after the last 'WEIS-' with '.csv' swapped for '.parquet'
mtrf_url.rpartition('WEIS-')[2].replace('.csv','.parquet')

https://portal.spp.org/file-browser-api/download/mid-term-resource-forecast-mtrf-weis?path=%2F2025%2F06%2F07%2FWEIS-OP-MTRF-202506070800.csv


'OP-MTRF-202506070800.parquet'

In [16]:
# 5-minute (by-interval) LMP download URL for the timestamp held in `tc`
lmp_5min = dc.get_5min_lmp_url(tc)
print(lmp_5min)
# cell output: everything after the last 'WEIS-' with '.csv' swapped for '.parquet'
lmp_5min.rpartition('WEIS-')[2].replace('.csv','.parquet')

https://portal.spp.org/file-browser-api/download/lmp-by-settlement-location-weis?path=%2F2025%2F06%2FBy_Interval%2F07%2FWEIS-RTBM-LMP-SL-202506070800.csv


'RTBM-LMP-SL-202506070800.parquet'

In [17]:
# Daily (by-day) LMP download URL for the timestamp held in `tc`
lmp_daily = dc.get_daily_lmp_url(tc)
print(lmp_daily)
# cell output: everything after the last 'WEIS-' with '.csv' swapped for '.parquet'
lmp_daily.rpartition('WEIS-')[2].replace('.csv','.parquet')

https://portal.spp.org/file-browser-api/download/lmp-by-settlement-location-weis?path=%2F2025%2F06%2FBy_Day%2FWEIS-RTBM-LMP-DAILY-SL-20250607.csv


'RTBM-LMP-DAILY-SL-20250607.parquet'

In [18]:
# test error handling: append text to the URL so it no longer points at a file
# (the recorded log shows "ERROR READING URL" being logged and no traceback)
df = dc.get_csv_from_url(mtlf_url+'bad_url')
df

ERROR:src.data_collection:ERROR READING URL: https://portal.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2025%2F06%2F07%2FWEIS-OP-MTLF-202506070800.csvbad_url
ERROR:src.data_collection:


In [19]:
# test error handling: prefix the URL so its scheme is invalid
# (the recorded log shows "No connection adapters were found" and no traceback)
df = dc.get_csv_from_url('a'+mtlf_url)
df

ERROR:src.data_collection:No connection adapters were found for 'ahttps://portal.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2025%2F06%2F07%2FWEIS-OP-MTLF-202506070800.csv'


In [20]:
# test success: fetch the real MTLF file; the recorded output is a table with
# columns Interval, GMTIntervalEnd, MTLF and "Averaged Actual"
df = dc.get_csv_from_url(mtlf_url)
df

Interval,GMTIntervalEnd,MTLF,Averaged Actual
str,str,i64,str
"""06/14/2025 08:00:00""","""06/14/2025 13:00:00""",6506,
"""06/14/2025 07:00:00""","""06/14/2025 12:00:00""",6481,
"""06/14/2025 06:00:00""","""06/14/2025 11:00:00""",6425,
"""06/14/2025 05:00:00""","""06/14/2025 10:00:00""",6474,
"""06/14/2025 04:00:00""","""06/14/2025 09:00:00""",6608,
…,…,…,…
"""06/06/2025 12:00:00""","""06/06/2025 17:00:00""",7593,"""7436"""
"""06/06/2025 11:00:00""","""06/06/2025 16:00:00""",7306,"""7508"""
"""06/06/2025 10:00:00""","""06/06/2025 15:00:00""",7245,"""7543"""
"""06/06/2025 09:00:00""","""06/06/2025 14:00:00""",7151,"""7357"""


In [21]:
# test getting and processing
# The recorded output is an s3:// parquet path, so this presumably writes the processed
# file and returns its location — confirm in src.data_collection.
dc.get_process_mtlf(tc)

INFO:botocore.credentials:Found credentials in environment variables.


's3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/mtlf/OP-MTLF-202506070800.parquet'

In [22]:
# test getting data from a range of timestamps
# Cut-off for the range pulls: the current wall-clock time in US Central, made
# tz-naive, minus two days.
# `Timestamp.now(tz=...)` replaces the deprecated `Timestamp.utcnow()`; the second
# `tz_localize(None)` was dropped because the value is already tz-naive at that point.
end_ts = (
    pd.Timestamp.now(tz="America/Chicago").tz_localize(None)
    - pd.Timedelta('2D')
)

# NOTE(review): the recorded output (a 17516-item progress bar, file names from 2024-01)
# does not match n_periods=24; it looks like it came from an earlier, larger run.
# Re-run to refresh. Despite its name, `range_df` is displayed as a list of s3 paths.
range_df = dc.get_range_data_mtlf(end_ts=end_ts, n_periods=24)
range_df

ERROR:src.data_collection:error parsing: 2024-03-10 02:00:00
ERROR:src.data_collection:error parsing: 2024-11-03 01:00:00
ERROR:src.data_collection:error parsing: 2025-03-09 02:00:00
ERROR:src.data_collection:error parsing: 2025-11-02 01:00:00
  0%|                                                                                                                                                                                        | 0/17516 [00:00<?, ?it/s]INFO:src.data_collection:number of cores available: 48
INFO:src.data_collection:number of cores available: 48
INFO:src.data_collection:N_JOBS: 47
INFO:src.data_collection:N_JOBS: 47
INFO:src.data_collection:adding module path
INFO:src.data_collection:adding module path
INFO:src.data_collection:number of cores available: 48
INFO:src.data_collection:N_JOBS: 47
INFO:src.data_collection:adding module path
INFO:src.data_collection:number of cores available: 48
INFO:src.data_collection:N_JOBS: 47
INFO:src.data_collection:adding module path
I

['s3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/mtlf/OP-MTLF-202401241500.parquet',
 's3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/mtlf/OP-MTLF-202401241600.parquet',
 's3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/mtlf/OP-MTLF-202401241700.parquet',
 's3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/mtlf/OP-MTLF-202401241800.parquet',
 's3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/mtlf/OP-MTLF-202401241900.parquet',
 's3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/mtlf/OP-MTLF-202401242000.parquet',
 's3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/mtlf/OP-MTLF-202401242100.parquet',
 's3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/mtlf/OP-MTLF-202401242200.parquet',
 's3://d

In [23]:
# Inspect the first file returned by the MTLF range pull
pl.read_parquet(range_df[0])

Interval,GMTIntervalEnd,MTLF,Averaged_Actual,timestamp_mst,file_create_time_utc,url
datetime[μs],datetime[μs],f32,f32,datetime[μs],datetime[μs],str
2024-01-26 14:00:00,2024-01-26 20:00:00,8296.0,,2024-01-26 13:00:00,2024-01-24 21:00:00,"""https://portal.spp.org/file-br…"
2024-01-25 05:00:00,2024-01-25 11:00:00,7676.0,,2024-01-25 04:00:00,2024-01-24 21:00:00,"""https://portal.spp.org/file-br…"
2024-01-26 20:00:00,2024-01-27 02:00:00,9290.0,,2024-01-26 19:00:00,2024-01-24 21:00:00,"""https://portal.spp.org/file-br…"
2024-01-29 03:00:00,2024-01-29 09:00:00,7625.0,,2024-01-29 02:00:00,2024-01-24 21:00:00,"""https://portal.spp.org/file-br…"
2024-01-28 08:00:00,2024-01-28 14:00:00,8176.0,,2024-01-28 07:00:00,2024-01-24 21:00:00,"""https://portal.spp.org/file-br…"
…,…,…,…,…,…,…
2024-01-30 10:00:00,2024-01-30 16:00:00,8778.0,,2024-01-30 09:00:00,2024-01-24 21:00:00,"""https://portal.spp.org/file-br…"
2024-01-30 11:00:00,2024-01-30 17:00:00,8474.0,,2024-01-30 10:00:00,2024-01-24 21:00:00,"""https://portal.spp.org/file-br…"
2024-01-25 16:00:00,2024-01-25 22:00:00,8114.0,,2024-01-25 15:00:00,2024-01-24 21:00:00,"""https://portal.spp.org/file-br…"
2024-01-31 09:00:00,2024-01-31 15:00:00,8889.0,,2024-01-31 08:00:00,2024-01-24 21:00:00,"""https://portal.spp.org/file-br…"


In [26]:
# Keep the parquet paths only, in ascending string order
# (original note: sort so actuals are first)
parquet_files = [path for path in range_df if path.endswith('.parquet')]
parquet_files.sort()
parquet_files

['s3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/mtlf/OP-MTLF-202401241500.parquet',
 's3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/mtlf/OP-MTLF-202401241600.parquet',
 's3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/mtlf/OP-MTLF-202401241700.parquet',
 's3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/mtlf/OP-MTLF-202401241800.parquet',
 's3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/mtlf/OP-MTLF-202401241900.parquet',
 's3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/mtlf/OP-MTLF-202401242000.parquet',
 's3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/mtlf/OP-MTLF-202401242100.parquet',
 's3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/mtlf/OP-MTLF-202401242200.parquet',
 's3://d

In [None]:
# Hand the sorted MTLF file list to dc.upsert_mtlf
# (the recorded log reports 17319 files and a starting count of 193 for that run)
dc.upsert_mtlf(parquet_files)

INFO:src.data_collection:number of files upserting: 17319
  0%|                                                                                                                                                                                        | 0/17319 [00:00<?, ?it/s]INFO:src.data_collection:starting count: 193


## Mid Term Resource Forecast

![_](../../imgs/mtrf.PNG)

In [None]:
# 8760 = 365 * 24, so 8760*2 is presumably about two years of hourly periods ending at `end_ts`
# NOTE(review): `end_ts` is defined in the MTLF section; that cell must run first.
range_df = dc.get_range_data_mtrf(end_ts=end_ts,  n_periods=8760*2)
range_df

In [None]:
# Inspect the first entry returned by the MTRF range pull
pl.read_parquet(range_df[0])

In [None]:
# Parquet paths only, ascending string order (original note: sort so actuals are first)
parquet_files = sorted(filter(lambda path: path.endswith('.parquet'), range_df))
parquet_files

In [None]:

# Hand the sorted MTRF file list to dc.upsert_mtrf
dc.upsert_mtrf(parquet_files)

## LMP settlement location prices

![_](../../imgs/lmp_settlement_location.PNG)

In [None]:
# Small smoke-test pull: 10 periods ending at `end_ts` (presumably 5-minute intervals,
# going by the function name — confirm in src.data_collection)
# NOTE(review): `end_ts` is defined in the MTLF section; that cell must run first.
range_df = dc.get_range_data_interval_5min_lmps(end_ts=end_ts,  n_periods=10)
range_df

In [None]:
# Inspect the first entry returned by the 5-minute LMP range pull
pl.read_parquet(range_df[0])

In [None]:
# pl.scan_parquet('s3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/lmp.parquet').head().collect()

In [None]:
# Parquet paths only, ascending string order
parquet_files = sorted(path for path in range_df if path.endswith('.parquet'))
parquet_files

In [None]:
# Hand the sorted 5-minute LMP file list to dc.upsert_lmp
dc.upsert_lmp(parquet_files)

### Test LMP daily file

In [None]:
# Small smoke-test pull: 10 periods ending at `end_ts` (presumably daily files,
# going by the function name — confirm in src.data_collection)
# NOTE(review): `end_ts` is defined in the MTLF section; that cell must run first.
range_df = dc.get_range_data_interval_daily_lmps(end_ts=end_ts,  n_periods=10)
range_df

In [None]:
# Inspect the first entry returned by the daily LMP range pull
pl.read_parquet(range_df[0])

In [None]:
# pl.scan_parquet('s3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/lmp.parquet').head().collect()

In [None]:
# Parquet paths only, ascending string order
parquet_files = [path for path in range_df if path.endswith('.parquet')]
parquet_files.sort()
parquet_files

In [None]:
# Hand the sorted daily LMP file list to dc.upsert_lmp (the same function the 5-minute files use)
dc.upsert_lmp(parquet_files)

## Generation capacity by fuel type