# Data collection
Gather public SPP WEIS (Western Energy Imbalance Service) operational data from the SPP Marketplace portal: https://marketplace.spp.org/groups/operational-data-weis

In [0]:
# Load IPython's autoreload extension. Mode 2 reloads all imported modules before
# each cell executes, so edits to the project's source files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [0]:
import os
import sys

# logging
import logging

import pandas as pd
import polars as pl
import duckdb

# Root logging at INFO so messages from this notebook and the modules it calls are shown.
logging.basicConfig(level=logging.INFO)

# Only let ERROR and above through from the "py4j" logger.
_py4j_logger = logging.getLogger("py4j")
_py4j_logger.setLevel(logging.ERROR)

# Logger used by the notebook cells themselves.
log = logging.getLogger(__name__)


In [0]:
# base imports
import os
import sys
from io import StringIO
from typing import List, Union, Callable
import tqdm

# logging
import logging
# NOTE: logging.basicConfig does nothing when the root logger already has handlers,
# so if an earlier cell has configured logging this call is a harmless repeat.
logging.basicConfig(level=logging.INFO)
# Logger used by the notebook cells themselves.
log = logging.getLogger(__name__)

# data
import requests
import pandas as pd
import duckdb
# from meteostat import Hourly, Point

In [0]:
# set env vars
# `dbutils` only exists when the notebook runs on Databricks. Evaluating the bare
# name anywhere else raises NameError and aborts the cell, so probe for it instead.
try:
    _dbutils = dbutils
except NameError:
    _dbutils = None

if _dbutils:
    # Copy the AWS credentials from the "aws" secret scope into the process environment.
    for _secret_name in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'):
        os.environ[_secret_name] = _dbutils.secrets.get(scope="aws", key=_secret_name)

from dotenv import load_dotenv
load_dotenv()


In [0]:
# Show the S3 bucket/folder settings. A notebook cell only displays its final
# expression, so two separate lookups would hide the bucket value; collect both
# into one mapping and display that instead.
s3_settings = {name: os.environ.get(name) for name in ('AWS_S3_BUCKET', 'AWS_S3_FOLDER')}
s3_settings

In [0]:
# Move to the repository root and make ./src importable.
# NOTE(review): assumes the notebook lives two levels below a root that contains src/.
# Both steps are guarded so that re-running the cell neither climbs two more
# directories nor appends a duplicate sys.path entry.
if not os.path.isdir('src'):
    os.chdir('../..')
if './src' not in sys.path:
    sys.path.append('./src')

In [0]:
import src.data_collection as dc

## Mid Term Load Forecast

![_](../../imgs/mtlf.PNG)

HOUR = {0000, ..., 2300}
DAY = {01, ..., 31}

In [0]:
# WEIS timestamps are in US Central time
pd.to_datetime("4/1/2023 07:00:00").tz_localize("America/Chicago")

In [0]:
# explicit timestamp, default options
dc.get_time_components('4/1/2023 07:30')

In [0]:
# same timestamp with five_min_ceil=True
# NOTE(review): presumably rounds up to the next 5-minute boundary -- confirm in src/data_collection
dc.get_time_components('4/1/2023 07:30', five_min_ceil=True)

In [0]:
# no timestamp argument
# NOTE(review): presumably falls back to the current time -- confirm in src/data_collection
tc = dc.get_time_components(five_min_ceil=True)
tc

In [0]:
# no timestamp argument, five_min_ceil switched off
tc = dc.get_time_components(five_min_ceil=False)
tc

In [0]:
# fixed timestamp; this `tc` is the one the URL and download cells below use
tc = dc.get_time_components('6/7/2025 08:00:00')

In [0]:
# Build the URL with dc.get_hourly_mtlf_url for `tc` and echo it, then display the
# parquet file name derived from whatever follows the last 'WEIS-' in that URL.
mtlf_url = dc.get_hourly_mtlf_url(tc)
print(mtlf_url)
mtlf_url.rpartition('WEIS-')[2].replace('.csv', '.parquet')

In [0]:
# Build the URL with dc.get_hourly_mtrf_url for `tc` and echo it, then display the
# parquet file name derived from whatever follows the last 'WEIS-' in that URL.
mtrf_url = dc.get_hourly_mtrf_url(tc)
print(mtrf_url)
mtrf_url.rpartition('WEIS-')[2].replace('.csv', '.parquet')

In [0]:
# Build the URL with dc.get_5min_lmp_url for `tc` and echo it, then display the
# parquet file name derived from whatever follows the last 'WEIS-' in that URL.
lmp_5min = dc.get_5min_lmp_url(tc)
print(lmp_5min)
lmp_5min.rpartition('WEIS-')[2].replace('.csv', '.parquet')

In [0]:
# Build the URL with dc.get_daily_lmp_url for `tc` and echo it, then display the
# parquet file name derived from whatever follows the last 'WEIS-' in that URL.
lmp_daily = dc.get_daily_lmp_url(tc)
print(lmp_daily)
lmp_daily.rpartition('WEIS-')[2].replace('.csv', '.parquet')

In [0]:
# test error handling: junk appended to the end of the MTLF URL
df = dc.get_csv_from_url(mtlf_url+'bad_url')
df

In [0]:
# test error handling: junk prepended to the MTLF URL (presumably corrupting the scheme)
df = dc.get_csv_from_url('a'+mtlf_url)
df

In [0]:
# test success: the unmodified MTLF URL
df = dc.get_csv_from_url(mtlf_url)
df

In [0]:
# test getting and processing in one call, for the same `tc`
dc.get_process_mtlf(tc)

In [0]:
# test getting data from a range of timestamps
# End of the range: current US Central wall-clock time minus two days, as a
# tz-naive Timestamp. Timestamp.now(tz=...) replaces the deprecated
# Timestamp.utcnow() + tz_convert, and a single tz_localize(None) drops the zone.
end_ts = pd.Timestamp.now(tz="America/Chicago").tz_localize(None) - pd.Timedelta('2D')

# NOTE(review): despite the `_df` suffix, the cells below index this result and call
# str.endswith on its items, i.e. they treat it as a sequence of path strings.
range_df = dc.get_range_data_mtlf(end_ts=end_ts,  n_periods=20)
range_df

In [0]:
# peek at the first entry
# NOTE(review): assumes entry 0 is a readable parquet path; the filter in the next
# cell suggests some entries may not be -- confirm.
pl.read_parquet(range_df[0])

In [0]:
# keep only the entries whose name ends in .parquet
parquet_files = [pf for pf in range_df if pf.endswith('.parquet')]
parquet_files

In [0]:
# hand the parquet paths to the MTLF upsert
dc.upsert_mtlf(parquet_files)

## Mid Term Resource Forecast

![_](../../imgs/mtrf.PNG)

In [0]:
# same flow as the MTLF section, reusing the `end_ts` defined there
range_df = dc.get_range_data_mtrf(end_ts=end_ts,  n_periods=10)
range_df

In [0]:
# peek at the first entry (assumes it is a readable parquet path -- confirm)
pl.read_parquet(range_df[0])

In [0]:
# keep only the entries whose name ends in .parquet
parquet_files = [pf for pf in range_df if pf.endswith('.parquet')]
parquet_files

In [0]:

# hand the parquet paths to the MTRF upsert
dc.upsert_mtrf(parquet_files)

## LMP settlement location prices

![_](../../imgs/lmp_settlement_location.PNG)

In [0]:
# 5-minute interval LMP files, reusing the `end_ts` defined in the MTLF section
range_df = dc.get_range_data_interval_5min_lmps(end_ts=end_ts,  n_periods=10)
range_df

In [0]:
# peek at the first entry (assumes it is a readable parquet path -- confirm)
pl.read_parquet(range_df[0])

In [0]:
# pl.scan_parquet('s3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/lmp.parquet').head().collect()

In [0]:
# keep only the entries whose name ends in .parquet
parquet_files = [pf for pf in range_df if pf.endswith('.parquet')]
parquet_files

In [0]:
# hand the parquet paths to the LMP upsert
dc.upsert_lmp(parquet_files)

### Test LMP daily file

In [0]:
# daily LMP files, reusing the `end_ts` defined in the MTLF section
range_df = dc.get_range_data_interval_daily_lmps(end_ts=end_ts,  n_periods=10)
range_df

In [0]:
# peek at the first entry (assumes it is a readable parquet path -- confirm)
pl.read_parquet(range_df[0])

In [0]:
# pl.scan_parquet('s3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/lmp.parquet').head().collect()

In [0]:
# keep only the entries whose name ends in .parquet
parquet_files = [pf for pf in range_df if pf.endswith('.parquet')]
parquet_files

In [0]:
# the daily files go through the same dc.upsert_lmp call as the 5-minute files above
dc.upsert_lmp(parquet_files)

## Generation capacity by fuel type