# Data collection
Gather public SPP WEIS operational data from <https://marketplace.spp.org/groups/operational-data-weis>.

In [1]:
import os
import pandas as pd
import duckdb

import requests
from io import StringIO

import ibis
import ibis.selectors as s
ibis.options.interactive = True

# logging
import logging

In [2]:
# Module-level logger for this notebook; the root logger is configured at INFO
# so INFO records (e.g. those emitted by src.data_collection) appear in cell output.
log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)


In [3]:
# Move two levels up from the notebook folder — presumably to the repository root,
# since the next cell imports `src.data_collection` relative to the working directory.
# Guarded on the presence of `src` so re-running this cell (without a kernel restart)
# does not climb two more directories each time and break that import.
if not os.path.isdir('src'):
    os.chdir('../..')

In [4]:
import src.data_collection as dc

INFO:src.data_collection:number of cores available: 48
INFO:src.data_collection:N_JOBS: 46


## Set up backfill parameters

In [5]:
# Backfill window configuration.
# Every entry of day_list is passed as `end_ts` to the dc.collect_upsert_* loops below.
start_date = '2023-06-01'
end_date = '2024-09-02'
num_day_step = 30  # days between consecutive window ends

# date_range only yields start_date + k * num_day_step, so it stops short of end_date
# whenever the span is not an exact multiple of the step (here the last value would be
# 2024-08-24). Iterating a DatetimeIndex already yields pd.Timestamp objects.
day_list = list(pd.date_range(start=start_date, end=end_date, freq=f'{num_day_step}d'))

# Add end_date itself as the final window end so the tail of the range is backfilled too.
# NOTE(review): this last window overlaps the previous one; assumes the upsert calls
# tolerate re-processing already-loaded periods — confirm in src.data_collection.
last_ts = pd.Timestamp(end_date)
if day_list and day_list[-1] < last_ts:
    day_list.append(last_ts)
day_list[:5]

[Timestamp('2023-06-01 00:00:00'),
 Timestamp('2023-07-01 00:00:00'),
 Timestamp('2023-07-31 00:00:00'),
 Timestamp('2023-08-30 00:00:00'),
 Timestamp('2023-09-29 00:00:00')]

## Mid Term Load Forecast

![_](../../imgs/mtlf.PNG)

- `HOUR` = {0000, ..., 2300}
- `DAY` = {01, ..., 31}

In [6]:
# Backfill the Mid Term Load Forecast one window at a time: each entry of day_list
# is used as the window end, and every call requests 24 * num_day_step + 1 periods
# (721 with the 30-day step, matching the logged n_periods).
mtlf_periods = 24 * num_day_step + 1
for window_end in day_list:
    dc.collect_upsert_mtlf(end_ts=window_end, n_periods=mtlf_periods, backfill=True)

INFO:src.data_collection:end_ts: 2023-06-01 00:00:00
INFO:src.data_collection:n_periods: 721
  0%|                                                                                                                                     | 0/721 [00:00<?, ?it/s]INFO:src.data_collection:number of cores available: 48
INFO:src.data_collection:N_JOBS: 46
INFO:src.data_collection:number of cores available: 48
INFO:src.data_collection:N_JOBS: 46
INFO:src.data_collection:number of cores available: 48
INFO:src.data_collection:N_JOBS: 46
INFO:src.data_collection:number of cores available: 48
INFO:src.data_collection:N_JOBS: 46
INFO:src.data_collection:number of cores available: 48
INFO:src.data_collection:N_JOBS: 46
INFO:src.data_collection:number of cores available: 48
INFO:src.data_collection:number of cores available: 48
INFO:src.data_collection:N_JOBS: 46
INFO:src.data_collection:N_JOBS: 46
INFO:src.data_collection:number of cores available: 48
INFO:src.data_collection:N_JOBS: 46
INFO:src.data_col

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913 entries, 0 to 912
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Interval         913 non-null    datetime64[ns]
 1   GMTIntervalEnd   913 non-null    datetime64[ns]
 2   MTLF             913 non-null    int64         
 3   Averaged_Actual  744 non-null    float64       
 4   timestamp_mst    913 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(1), int64(1)
memory usage: 35.8 KB


  9%|███████████▎                                                                                                                | 66/721 [00:02<00:22, 29.62it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2023%2F06%2F04%2FWEIS-OP-MTLF-202306040900.csv
ERROR:src.data_collection:
 19%|███████████████████████▏                                                                                                   | 136/721 [00:03<00:17, 34.34it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2023%2F06%2F06%2FWEIS-OP-MTLF-202306061900.csv
ERROR:src.data_collection:
ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2023%2F06%2F06%2FWEIS-OP-MTLF-202306061800.csv
ERRO

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913 entries, 0 to 912
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Interval         913 non-null    datetime64[ns]
 1   GMTIntervalEnd   913 non-null    datetime64[ns]
 2   MTLF             913 non-null    int64         
 3   Averaged_Actual  746 non-null    float64       
 4   timestamp_mst    913 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(1), int64(1)
memory usage: 35.8 KB


 41%|███████████████████████████████████████████████████                                                                        | 299/721 [00:08<00:11, 36.63it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2023%2F07%2F13%2FWEIS-OP-MTLF-202307131900.csv
ERROR:src.data_collection:
 42%|███████████████████████████████████████████████████▏                                                                       | 300/721 [00:08<00:11, 36.69it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2023%2F07%2F13%2FWEIS-OP-MTLF-202307132000.csv
ERROR:src.data_collection:
 42%|███████████████████████████████████████████████████▎                                                                       | 301/721 [00:08<00:11, 36.68it/s]ERROR:src.data_collection:ERROR READING URL: htt

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913 entries, 0 to 912
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Interval         913 non-null    datetime64[ns]
 1   GMTIntervalEnd   913 non-null    datetime64[ns]
 2   MTLF             913 non-null    int64         
 3   Averaged_Actual  746 non-null    float64       
 4   timestamp_mst    913 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(1), int64(1)
memory usage: 35.8 KB


 94%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏      | 681/721 [00:35<00:02, 19.24it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2023%2F08%2F29%2FWEIS-OP-MTLF-202308292100.csv
ERROR:src.data_collection:
 95%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎      | 682/721 [00:35<00:02, 19.26it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2023%2F08%2F29%2FWEIS-OP-MTLF-202308292000.csv
ERROR:src.data_collection:
 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉   | 703/721 [00:41<00:01, 17.02it/s]ERROR:src.data_collection:ERROR READING URL: htt

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913 entries, 0 to 912
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Interval         913 non-null    datetime64[ns]
 1   GMTIntervalEnd   913 non-null    datetime64[ns]
 2   MTLF             913 non-null    int64         
 3   Averaged_Actual  746 non-null    float64       
 4   timestamp_mst    913 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(1), int64(1)
memory usage: 35.8 KB


 49%|████████████████████████████████████████████████████████████▏                                                              | 353/721 [00:17<00:18, 19.61it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2023%2F09%2F14%2FWEIS-OP-MTLF-202309142000.csv
ERROR:src.data_collection:
 52%|███████████████████████████████████████████████████████████████▊                                                           | 374/721 [00:18<00:16, 20.48it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2023%2F09%2F07%2FWEIS-OP-MTLF-202309072000.csv
ERROR:src.data_collection:
 93%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎        | 670/721 [00:42<00:03, 15.94it/s]ERROR:src.data_collection:ERROR READING URL: htt

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913 entries, 0 to 912
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Interval         913 non-null    datetime64[ns]
 1   GMTIntervalEnd   913 non-null    datetime64[ns]
 2   MTLF             913 non-null    int64         
 3   Averaged_Actual  746 non-null    float64       
 4   timestamp_mst    913 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(1), int64(1)
memory usage: 35.8 KB


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊| 720/721 [00:38<00:00, 18.80it/s]ERROR:src.data_collection:HTTPSConnectionPool(host='marketplace.spp.org', port=443): Max retries exceeded with url: /file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2023%2F10%2F09%2FWEIS-OP-MTLF-202310090400.csv (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f687248e110>, 'Connection to marketplace.spp.org timed out. (connect timeout=60)'))
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 721/721 [01:06<00:00, 10.81it/s]
INFO:src.data_collection:ROWS INSERTED: 720 ROWS UPDATED: 26
INFO:src.data_collection:end_ts: 2023-11-28 00:00:00
INFO:src.data_collection:n_periods: 721
ERROR:src.data_collection:error parsing: 2023-11-05 01:00:00


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913 entries, 0 to 912
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Interval         913 non-null    datetime64[ns]
 1   GMTIntervalEnd   913 non-null    datetime64[ns]
 2   MTLF             913 non-null    int64         
 3   Averaged_Actual  746 non-null    float64       
 4   timestamp_mst    913 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(1), int64(1)
memory usage: 35.8 KB


 76%|█████████████████████████████████████████████████████████████████████████████████████████████▎                             | 546/720 [00:31<00:09, 17.55it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2023%2F11%2F21%2FWEIS-OP-MTLF-202311211800.csv
ERROR:src.data_collection:
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 720/720 [00:42<00:00, 16.78it/s]
INFO:src.data_collection:ROWS INSERTED: 721 ROWS UPDATED: 26
INFO:src.data_collection:end_ts: 2023-12-28 00:00:00
INFO:src.data_collection:n_periods: 721


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 914 entries, 0 to 913
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Interval         914 non-null    datetime64[ns]
 1   GMTIntervalEnd   914 non-null    datetime64[ns]
 2   MTLF             914 non-null    int64         
 3   Averaged_Actual  747 non-null    float64       
 4   timestamp_mst    914 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(1), int64(1)
memory usage: 35.8 KB


  1%|█▌                                                                                                                           | 9/721 [00:01<01:19,  8.93it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2023%2F11%2F28%2FWEIS-OP-MTLF-202311282200.csv
ERROR:src.data_collection:
  2%|██▉                                                                                                                         | 17/721 [00:01<00:44, 15.67it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2023%2F11%2F28%2FWEIS-OP-MTLF-202311282100.csv
ERROR:src.data_collection:
  3%|████▏                                                                                                                       | 24/721 [00:01<00:33, 20.73it/s]ERROR:src.data_collection:ERROR READING URL: htt

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913 entries, 0 to 912
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Interval         913 non-null    datetime64[ns]
 1   GMTIntervalEnd   913 non-null    datetime64[ns]
 2   MTLF             913 non-null    int64         
 3   Averaged_Actual  746 non-null    float64       
 4   timestamp_mst    913 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(1), int64(1)
memory usage: 35.8 KB


 56%|████████████████████████████████████████████████████████████████████▊                                                      | 403/721 [00:15<00:12, 25.41it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2024%2F01%2F11%2FWEIS-OP-MTLF-202401111900.csv
ERROR:src.data_collection:
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋| 719/721 [00:49<00:00, 14.56it/s]ERROR:src.data_collection:HTTPSConnectionPool(host='marketplace.spp.org', port=443): Max retries exceeded with url: /file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2024%2F01%2F05%2FWEIS-OP-MTLF-202401050900.csv (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x7f9b64877250>, 'Connection to marketplace.spp.org timed out. (connect timeout=60)'))
100%|████████████████████████

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913 entries, 0 to 912
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Interval         913 non-null    datetime64[ns]
 1   GMTIntervalEnd   913 non-null    datetime64[ns]
 2   MTLF             913 non-null    int64         
 3   Averaged_Actual  746 non-null    float64       
 4   timestamp_mst    913 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(1), int64(1)
memory usage: 35.8 KB


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 721/721 [00:54<00:00, 13.34it/s]
INFO:src.data_collection:ROWS INSERTED: 720 ROWS UPDATED: 26
INFO:src.data_collection:end_ts: 2024-03-27 00:00:00
INFO:src.data_collection:n_periods: 721
ERROR:src.data_collection:error parsing: 2024-03-10 02:00:00


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913 entries, 0 to 912
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Interval         913 non-null    datetime64[ns]
 1   GMTIntervalEnd   913 non-null    datetime64[ns]
 2   MTLF             913 non-null    int64         
 3   Averaged_Actual  746 non-null    float64       
 4   timestamp_mst    913 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(1), int64(1)
memory usage: 35.8 KB


  5%|██████                                                                                                                      | 35/720 [00:01<00:26, 26.20it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2024%2F02%2F27%2FWEIS-OP-MTLF-202402271800.csv
ERROR:src.data_collection:
 38%|██████████████████████████████████████████████▍                                                                            | 272/720 [00:16<00:27, 16.43it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2024%2F03%2F10%2FWEIS-OP-MTLF-202403100300.csv
ERROR:src.data_collection:
 50%|████████████████████████████████████████████████████████████▉                                                              | 357/720 [00:23<00:24, 15.10it/s]ERROR:src.data_collection:ERROR READING URL: htt

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 912 entries, 0 to 911
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Interval         912 non-null    datetime64[ns]
 1   GMTIntervalEnd   912 non-null    datetime64[ns]
 2   MTLF             912 non-null    int64         
 3   Averaged_Actual  744 non-null    float64       
 4   timestamp_mst    912 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(1), int64(1)
memory usage: 35.8 KB


  0%|                                                                                                                                     | 0/721 [00:00<?, ?it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2024%2F03%2F28%2FWEIS-OP-MTLF-202403281800.csv
ERROR:src.data_collection:
 65%|███████████████████████████████████████████████████████████████████████████████▋                                           | 467/721 [00:25<00:13, 18.19it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2024%2F04%2F16%2FWEIS-OP-MTLF-202404160000.csv
ERROR:src.data_collection:
 89%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████▌             | 642/721 [00:40<00:05, 15.76it/s]ERROR:src.data_collection:ERROR READING URL: htt

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913 entries, 0 to 912
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Interval         913 non-null    datetime64[ns]
 1   GMTIntervalEnd   913 non-null    datetime64[ns]
 2   MTLF             913 non-null    int64         
 3   Averaged_Actual  746 non-null    float64       
 4   timestamp_mst    913 non-null    datetime64[ns]
dtypes: datetime64[ns](3), float64(1), int64(1)
memory usage: 35.8 KB


 49%|████████████████████████████████████████████████████████████▏                                                              | 353/721 [00:18<00:19, 18.85it/s]ERROR:src.data_collection:ERROR READING URL: https://marketplace.spp.org/file-browser-api/download/systemwide-hourly-load-forecast-mtlf-vs-actual-weis?path=%2F2024%2F05%2F12%2FWEIS-OP-MTLF-202405120000.csv
ERROR:src.data_collection:
 90%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▏           | 652/721 [00:48<00:05, 13.39it/s]


KeyboardInterrupt: 

## Mid Term Resource Forecast

![_](../../imgs/mtrf.PNG)

In [None]:
# Backfill the Mid Term Resource Forecast with the same windowing as the load
# forecast: one call per entry of day_list, 24 * num_day_step + 1 periods each.
mtrf_periods = 24 * num_day_step + 1
for window_end in day_list:
    dc.collect_upsert_mtrf(end_ts=window_end, n_periods=mtrf_periods, backfill=True)

## LMP settlement location prices

![_](../../imgs/lmp_settlement_location.PNG)

## LMP upsert

### LMP daily file

In [None]:
# Backfill LMP from the daily files: one call per entry of day_list, requesting
# num_day_step + 1 periods with daily_file=True.
# NOTE(review): presumably one period per day in daily-file mode — confirm in src.data_collection.
lmp_periods = num_day_step + 1
for window_end in day_list:
    dc.collect_upsert_lmp(end_ts=window_end, n_periods=lmp_periods, daily_file=True)