# Data collection
Gather public SPP WEIS (Western Energy Imbalance Service) operational data from <https://marketplace.spp.org/groups/operational-data-weis>.

In [1]:
import os
import pandas as pd
import duckdb

import requests
from io import StringIO

import ibis
import ibis.selectors as s
ibis.options.interactive = True

# logging
import logging

In [2]:
# Configure root logging at INFO so records from imported modules (e.g. the
# INFO/ERROR lines emitted by src.data_collection) appear in the cell output,
# then create this notebook's own logger.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


In [3]:
# Move the working directory two levels up before `src.data_collection` is
# imported. The starting directory is remembered the first time this cell
# runs, so re-running the cell always lands in the same place instead of
# climbing two more levels on every execution.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir, os.pardir))

In [4]:
import src.data_collection as dc

INFO:src.data_collection:number of cores available: 4
INFO:src.data_collection:N_JOBS: 3


## Set up backfill parameters

In [5]:
# Backfill window. Each entry of `day_list` is used further down as the
# `end_ts` anchor of one collection call, spaced `num_day_step` days apart.
start_date = '2024-08-30'
end_date = '2024-09-21'
num_day_step = 7

# Uppercase 'D' is the canonical day alias (the lowercase form is deprecated in
# newer pandas). date_range already yields pd.Timestamp objects, so a plain
# list() is enough to materialise them.
# NOTE(review): with a 7-day step from 2024-08-30 the last anchor is
# 2024-09-20, so `end_date` itself is never an anchor - confirm that is intended.
day_list = list(pd.date_range(start=start_date, end=end_date, freq=f'{num_day_step}D'))
day_list[:5]

[Timestamp('2024-08-30 00:00:00'),
 Timestamp('2024-09-06 00:00:00'),
 Timestamp('2024-09-13 00:00:00'),
 Timestamp('2024-09-20 00:00:00')]

## Mid Term Load Forecast

![_](../../imgs/mtlf.PNG)

Placeholders used in the file path:

- `HOUR` = {0000, ..., 2300}
- `DAY` = {01, ..., 31}

In [6]:
# Backfill the mid-term load forecast: one upsert per anchor timestamp.
# The period count is 24 per day of the step plus one (169 for a 7-day step).
mtlf_n_periods = 24 * num_day_step + 1
for d in day_list:
    dc.collect_upsert_mtlf(
        end_ts=d,
        n_periods=mtlf_n_periods,
        backfill=True,
    )

INFO:src.data_collection:end_ts: 2024-08-30 00:00:00
INFO:src.data_collection:n_periods: 169
  0%|          | 0/169 [00:00<?, ?it/s]INFO:src.data_collection:number of cores available: 4
INFO:src.data_collection:N_JOBS: 3
INFO:src.data_collection:number of cores available: 4
INFO:src.data_collection:N_JOBS: 3
INFO:src.data_collection:number of cores available: 4
INFO:src.data_collection:N_JOBS: 3
 96%|█████████▌| 162/169 [00:21<00:00,  7.37it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2024%2F08%2F29%2FWEIS-OP-MTLF-202408291800.csv
ERROR:src.data_collection:
 96%|█████████▋| 163/169 [00:21<00:00,  7.41it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2024%2F08%2F29%2FWEIS-OP-MTLF-202408291900.csv
ERROR:src.data_collection:
100%|██████████| 169/169 [00:22<00:00

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 361 entries, 0 to 360
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Interval         361 non-null    datetime64[ns]
 1   GMTIntervalEnd   361 non-null    datetime64[ns]
 2   MTLF             361 non-null    int64         
 3   Averaged_Actual  194 non-null    float64       
 4   timestamp_mst    361 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(1), int64(1)
memory usage: 14.2 KB


100%|██████████| 169/169 [00:21<00:00,  7.77it/s]
INFO:src.data_collection:ROWS INSERTED: 0 ROWS UPDATED: 194
INFO:src.data_collection:end_ts: 2024-09-13 00:00:00
INFO:src.data_collection:n_periods: 169


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 361 entries, 0 to 360
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Interval         361 non-null    datetime64[ns]
 1   GMTIntervalEnd   361 non-null    datetime64[ns]
 2   MTLF             361 non-null    int64         
 3   Averaged_Actual  194 non-null    float64       
 4   timestamp_mst    361 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(1), int64(1)
memory usage: 14.2 KB


100%|██████████| 169/169 [00:22<00:00,  7.67it/s]
INFO:src.data_collection:ROWS INSERTED: 0 ROWS UPDATED: 194
INFO:src.data_collection:end_ts: 2024-09-20 00:00:00
INFO:src.data_collection:n_periods: 169


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 361 entries, 0 to 360
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Interval         361 non-null    datetime64[ns]
 1   GMTIntervalEnd   361 non-null    datetime64[ns]
 2   MTLF             361 non-null    int64         
 3   Averaged_Actual  194 non-null    float64       
 4   timestamp_mst    361 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(1), int64(1)
memory usage: 14.2 KB


 63%|██████▎   | 107/169 [00:14<00:08,  7.47it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2024%2F09%2F17%2FWEIS-OP-MTLF-202409171100.csv
ERROR:src.data_collection:
100%|██████████| 169/169 [00:22<00:00,  7.43it/s]
INFO:src.data_collection:ROWS INSERTED: 120 ROWS UPDATED: 74


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 361 entries, 0 to 360
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Interval         361 non-null    datetime64[ns]
 1   GMTIntervalEnd   361 non-null    datetime64[ns]
 2   MTLF             361 non-null    int64         
 3   Averaged_Actual  194 non-null    float64       
 4   timestamp_mst    361 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(1), int64(1)
memory usage: 14.2 KB


## Mid Term Resource Forecast

![_](../../imgs/mtrf.PNG)

In [7]:
# Backfill the mid-term resource forecast: one upsert per anchor timestamp.
# The period count is 24 per day of the step plus one (169 for a 7-day step).
mtrf_n_periods = 24 * num_day_step + 1
for d in day_list:
    dc.collect_upsert_mtrf(
        end_ts=d,
        n_periods=mtrf_n_periods,
        backfill=True,
    )

INFO:src.data_collection:end_ts: 2024-08-30 00:00:00
INFO:src.data_collection:n_periods: 169
 14%|█▍        | 24/169 [00:03<00:20,  7.21it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/mid-term-resource-forecast-mtrf-weis?path=%2F2024%2F08%2F24%2FWEIS-OP-MTRF-202408240000.csv
ERROR:src.data_collection:
 96%|█████████▌| 162/169 [00:20<00:00,  7.78it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/mid-term-resource-forecast-mtrf-weis?path=%2F2024%2F08%2F29%2FWEIS-OP-MTRF-202408291800.csv
ERROR:src.data_collection:
 96%|█████████▋| 163/169 [00:21<00:00,  7.74it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/mid-term-resource-forecast-mtrf-weis?path=%2F2024%2F08%2F29%2FWEIS-OP-MTRF-202408291900.csv
ERROR:src.data_collection:
100%|██████████| 169/169 [00:21<00:00,  7.74it/s]
INFO:src.data_collection:ROWS INSERTED: 0 ROWS UPDATED: 3

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Interval           362 non-null    datetime64[ns]
 1   GMTIntervalEnd     362 non-null    datetime64[ns]
 2   Wind_Forecast_MW   361 non-null    float64       
 3   Solar_Forecast_MW  361 non-null    float64       
 4   timestamp_mst      362 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(2)
memory usage: 14.3 KB


  7%|▋         | 12/169 [00:01<00:20,  7.69it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/mid-term-resource-forecast-mtrf-weis?path=%2F2024%2F08%2F30%2FWEIS-OP-MTRF-202408301200.csv
ERROR:src.data_collection:
 99%|█████████▉| 167/169 [00:21<00:00,  7.61it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/mid-term-resource-forecast-mtrf-weis?path=%2F2024%2F09%2F06%2FWEIS-OP-MTRF-202409060000.csv
ERROR:src.data_collection:
100%|██████████| 169/169 [00:22<00:00,  7.61it/s]
INFO:src.data_collection:ROWS INSERTED: 0 ROWS UPDATED: 360
INFO:src.data_collection:end_ts: 2024-09-13 00:00:00
INFO:src.data_collection:n_periods: 169


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Interval           360 non-null    datetime64[ns]
 1   GMTIntervalEnd     360 non-null    datetime64[ns]
 2   Wind_Forecast_MW   360 non-null    float64       
 3   Solar_Forecast_MW  360 non-null    float64       
 4   timestamp_mst      360 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(2)
memory usage: 14.2 KB


  1%|          | 1/169 [00:00<01:03,  2.64it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/mid-term-resource-forecast-mtrf-weis?path=%2F2024%2F09%2F06%2FWEIS-OP-MTRF-202409060000.csv
ERROR:src.data_collection:
100%|██████████| 169/169 [00:21<00:00,  7.86it/s]
INFO:src.data_collection:ROWS INSERTED: 0 ROWS UPDATED: 360
INFO:src.data_collection:end_ts: 2024-09-20 00:00:00
INFO:src.data_collection:n_periods: 169


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 361 entries, 0 to 360
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Interval           361 non-null    datetime64[ns]
 1   GMTIntervalEnd     361 non-null    datetime64[ns]
 2   Wind_Forecast_MW   360 non-null    float64       
 3   Solar_Forecast_MW  360 non-null    float64       
 4   timestamp_mst      361 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(2)
memory usage: 14.2 KB


 63%|██████▎   | 107/169 [00:14<00:08,  7.60it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/mid-term-resource-forecast-mtrf-weis?path=%2F2024%2F09%2F17%2FWEIS-OP-MTRF-202409171100.csv
ERROR:src.data_collection:
100%|██████████| 169/169 [00:22<00:00,  7.49it/s]
INFO:src.data_collection:ROWS INSERTED: 120 ROWS UPDATED: 241


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362 entries, 0 to 361
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Interval           362 non-null    datetime64[ns]
 1   GMTIntervalEnd     362 non-null    datetime64[ns]
 2   Wind_Forecast_MW   361 non-null    float64       
 3   Solar_Forecast_MW  361 non-null    float64       
 4   timestamp_mst      362 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(2)
memory usage: 14.3 KB


## LMP settlement location prices

![_](../../imgs/lmp_settlement_location.PNG)

## LMP upsert

### LMP daily file

In [9]:
# Backfill LMP from the daily files: one upsert per anchor timestamp.
# The period count is one per day of the step plus one (8 for a 7-day step).
lmp_n_periods = num_day_step + 1
for d in day_list:
    dc.collect_upsert_lmp(
        end_ts=d,
        n_periods=lmp_n_periods,
        daily_file=True,
    )

INFO:src.data_collection:end_ts: 2024-08-30 00:00:00
INFO:src.data_collection:n_periods: 8
100%|██████████| 8/8 [00:21<00:00,  2.64s/it]
INFO:src.data_collection:ROWS INSERTED: 0 ROWS UPDATED: 62,976


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62976 entries, 0 to 62975
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Interval_HE               62976 non-null  datetime64[ns]
 1   GMTIntervalEnd_HE         62976 non-null  datetime64[ns]
 2   timestamp_mst_HE          62976 non-null  datetime64[ns]
 3   Settlement_Location_Name  62976 non-null  object        
 4   PNODE_Name                62976 non-null  object        
 5   LMP                       62976 non-null  float64       
 6   MLC                       62976 non-null  float64       
 7   MCC                       62976 non-null  float64       
 8   MEC                       62976 non-null  float64       
dtypes: datetime64[ns](3), float64(4), object(2)
memory usage: 4.3+ MB


INFO:src.data_collection:end_ts: 2024-09-06 00:00:00
INFO:src.data_collection:n_periods: 8
100%|██████████| 8/8 [00:20<00:00,  2.57s/it]
INFO:src.data_collection:ROWS INSERTED: 0 ROWS UPDATED: 62,976


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62976 entries, 0 to 62975
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Interval_HE               62976 non-null  datetime64[ns]
 1   GMTIntervalEnd_HE         62976 non-null  datetime64[ns]
 2   timestamp_mst_HE          62976 non-null  datetime64[ns]
 3   Settlement_Location_Name  62976 non-null  object        
 4   PNODE_Name                62976 non-null  object        
 5   LMP                       62976 non-null  float64       
 6   MLC                       62976 non-null  float64       
 7   MCC                       62976 non-null  float64       
 8   MEC                       62976 non-null  float64       
dtypes: datetime64[ns](3), float64(4), object(2)
memory usage: 4.3+ MB


INFO:src.data_collection:end_ts: 2024-09-13 00:00:00
INFO:src.data_collection:n_periods: 8
100%|██████████| 8/8 [00:20<00:00,  2.59s/it]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62976 entries, 0 to 62975
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Interval_HE               62976 non-null  datetime64[ns]
 1   GMTIntervalEnd_HE         62976 non-null  datetime64[ns]
 2   timestamp_mst_HE          62976 non-null  datetime64[ns]
 3   Settlement_Location_Name  62976 non-null  object        
 4   PNODE_Name                62976 non-null  object        
 5   LMP                       62976 non-null  float64       
 6   MLC                       62976 non-null  float64       
 7   MCC                       62976 non-null  float64       
 8   MEC                       62976 non-null  float64       
dtypes: datetime64[ns](3), float64(4), object(2)
memory usage: 4.3+ MB


INFO:src.data_collection:ROWS INSERTED: 0 ROWS UPDATED: 62,976
INFO:src.data_collection:end_ts: 2024-09-20 00:00:00
INFO:src.data_collection:n_periods: 8
 75%|███████▌  | 6/8 [00:14<00:04,  2.35s/it]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/lmp-by-settlement-location-weis?path=%2F2024%2F09%2FBy_Day%2FWEIS-RTBM-LMP-DAILY-SL-20240920.csv
ERROR:src.data_collection:
100%|██████████| 8/8 [00:20<00:00,  2.58s/it]
INFO:src.data_collection:ROWS INSERTED: 0 ROWS UPDATED: 55,104


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55104 entries, 0 to 55103
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   Interval_HE               55104 non-null  datetime64[ns]
 1   GMTIntervalEnd_HE         55104 non-null  datetime64[ns]
 2   timestamp_mst_HE          55104 non-null  datetime64[ns]
 3   Settlement_Location_Name  55104 non-null  object        
 4   PNODE_Name                55104 non-null  object        
 5   LMP                       55104 non-null  float64       
 6   MLC                       55104 non-null  float64       
 7   MCC                       55104 non-null  float64       
 8   MEC                       55104 non-null  float64       
dtypes: datetime64[ns](3), float64(4), object(2)
memory usage: 3.8+ MB
