# Data collection
Gather public SPP WEIS (Western Energy Imbalance Service) operational data from https://marketplace.spp.org/groups/operational-data-weis

In [None]:
# Load IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to imported code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys

# logging
import logging

import pandas as pd
import polars as pl
import duckdb

# Logger named after this module.
log = logging.getLogger(__name__)

# Root logging configuration: emit INFO and above.
logging.basicConfig(level=logging.INFO)

# Quieten the 'py4j' logger so that only ERROR and above get through.
py4j_log = logging.getLogger("py4j")
py4j_log.setLevel(logging.ERROR)


In [None]:
# base imports
import os
import sys
from io import StringIO
from typing import List, Union, Callable
import tqdm

# logging
# This repeats the logging setup from the first import cell so that this cell
# also works when run on its own. basicConfig() does nothing if the root
# logger already has handlers configured.
import logging

log = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# data
import requests
import pandas as pd
import duckdb
# from meteostat import Hourly, Point

In [None]:
# The AWS region is always set, regardless of where the notebook runs.
os.environ["AWS_DEFAULT_REGION"] = "us-west-2"

# When `dbutils` is defined (i.e. on Databricks), copy the AWS credentials and
# S3 settings from the "aws" secret scope into environment variables.
if 'dbutils' not in locals():
    print('not on DBX')
else:
    for secret_name in (
        'AWS_ACCESS_KEY_ID',
        'AWS_SECRET_ACCESS_KEY',
        'AWS_S3_BUCKET',
        'AWS_S3_FOLDER',
    ):
        os.environ[secret_name] = dbutils.secrets.get(scope="aws", key=secret_name)

# The .env file is loaded in both environments. With python-dotenv's default
# (override=False) it only fills in variables that are not already set, so
# values assigned above take precedence.
from dotenv import load_dotenv
load_dotenv()

In [None]:
# Read the S3 bucket and folder from the environment and fail fast if either
# is missing or empty. An explicit raise is used instead of a bare `assert` so
# the check still runs under `python -O` and the error names what to fix.
AWS_S3_BUCKET = os.environ.get('AWS_S3_BUCKET')
AWS_S3_FOLDER = os.environ.get('AWS_S3_FOLDER')

_missing = [
    name
    for name, value in (
        ('AWS_S3_BUCKET', AWS_S3_BUCKET),
        ('AWS_S3_FOLDER', AWS_S3_FOLDER),
    )
    if not value
]
if _missing:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing)}"
    )

In [None]:
# Move two directories up (presumably to the repository root) and put './src'
# on the import path. The guards make this cell safe to re-run: without them
# every re-execution climbs another two directories and appends a duplicate
# './src' entry to sys.path.
# NOTE(review): assumes a 'src' directory exists in the target directory and
# not next to this notebook — confirm.
if not os.path.isdir('src'):
    os.chdir('../..')
if './src' not in sys.path:
    sys.path.append('./src')

In [None]:
import src.data_collection as dc

## Mid-Term Load Forecast (MTLF)

![_](../../imgs/mtlf.PNG)

- `HOUR` = {0000, ..., 2300}
- `DAY` = {01, ..., 31}

In [None]:
# test getting data from a range of timestamps
# end_ts is the current America/Chicago wall-clock time minus two days, as a
# timezone-naive Timestamp. Timestamp.now(tz=...) replaces the deprecated
# Timestamp.utcnow().tz_convert(...), and the redundant second
# tz_localize(None) is dropped (it was a no-op on an already-naive value).
end_ts = (
    pd.Timestamp.now(tz="America/Chicago").tz_localize(None)
    - pd.Timedelta('2D')
)

# Request MTLF data for 8760 * 2 periods relative to end_ts
# (8760 = 24 * 365, so presumably two years of hourly data — confirm).
n_periods = 8760 * 2
range_df = dc.get_range_data_mtlf(end_ts=end_ts, n_periods=n_periods)

# Show the first ten entries of the result.
range_df[:10]

In [None]:
# Spot-check: read the first returned MTLF entry as a parquet file.
# NOTE(review): raises if range_df is empty or its first entry is not a
# parquet file (the next cell filters on the '.parquet' suffix) — confirm.
first_file = range_df[0]
pl.read_parquet(first_file)

In [None]:
# Keep only the entries whose name ends in '.parquet', then show the first ten.
parquet_files = [
    path
    for path in range_df
    if path.endswith('.parquet')
]
parquet_files[:10]

In [None]:
# Pass the filtered MTLF parquet files to the project's upsert routine with
# target 'mtlf'.
dc.upsert_mtlf_mtrf_lmp(parquet_files, target='mtlf')

In [None]:
# Lazily scan the stored MTLF parquet on S3 and display its first rows.
# NOTE(review): the bucket/prefix is hard-coded here rather than built from
# AWS_S3_BUCKET / AWS_S3_FOLDER — confirm they refer to the same location.
s3_path = 's3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/mtlf.parquet'
pl.scan_parquet(s3_path).head().collect()

## Mid-Term Resource Forecast (MTRF)

![_](../../imgs/mtrf.PNG)

In [None]:
# Request MTRF data for 8760 * 2 periods relative to end_ts
# (8760 = 24 * 365, so presumably two years of hourly data — confirm).
n_periods = 8760 * 2
range_df = dc.get_range_data_mtrf(end_ts=end_ts, n_periods=n_periods)

# Show the first ten entries of the result.
range_df[:10]

In [None]:
# Spot-check: read the first returned MTRF entry as a parquet file.
# NOTE(review): raises if range_df is empty or its first entry is not a
# parquet file — confirm.
first_file = range_df[0]
pl.read_parquet(first_file)

In [None]:
# Keep only the entries whose name ends in '.parquet', then show the first ten.
parquet_files = [
    path
    for path in range_df
    if path.endswith('.parquet')
]
parquet_files[:10]

In [None]:
# Pass the filtered MTRF parquet files to the project's upsert routine with
# target 'mtrf'.
dc.upsert_mtlf_mtrf_lmp(parquet_files, target='mtrf')

In [None]:
# Lazily scan the stored MTRF parquet on S3 and display its first rows.
# NOTE(review): the bucket/prefix is hard-coded here rather than built from
# AWS_S3_BUCKET / AWS_S3_FOLDER — confirm they refer to the same location.
s3_path = 's3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/mtrf.parquet'
lazy_mtrf = pl.scan_parquet(s3_path)
lazy_mtrf.head().collect()

## LMP settlement location prices

![_](../../imgs/lmp_settlement_location.PNG)

### LMP daily file

In [None]:
# Request daily LMP interval data for 365 * 2 periods relative to end_ts
# (presumably about two years of daily files — confirm).
n_periods = 365 * 2
range_df = dc.get_range_data_interval_daily_lmps(end_ts=end_ts, n_periods=n_periods)

# Show the full result.
range_df

In [None]:
# Spot-check: read the first returned daily LMP entry as a parquet file.
# NOTE(review): raises if range_df is empty or its first entry is not a
# parquet file — confirm.
first_file = range_df[0]
pl.read_parquet(first_file)

In [None]:
# Keep only the entries whose name ends in '.parquet' and show all of them.
parquet_files = [
    path
    for path in range_df
    if path.endswith('.parquet')
]
parquet_files

In [None]:
# Pass the filtered daily LMP parquet files to the project's upsert routine
# with target 'lmp'.
dc.upsert_mtlf_mtrf_lmp(parquet_files, target='lmp')

In [None]:
# Lazily scan the stored LMP parquet on S3 and display its first rows.
# NOTE(review): the bucket/prefix is hard-coded here rather than built from
# AWS_S3_BUCKET / AWS_S3_FOLDER — confirm they refer to the same location.
s3_path = 's3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/lmp.parquet'
pl.scan_parquet(s3_path).head().collect()

### LMP 5-minute file

In [None]:
# Request 5-minute LMP interval data for 24 * 12 periods relative to end_ts
# (12 five-minute intervals per hour × 24 hours, so presumably one day —
# confirm).
n_periods = 24 * 12
range_df = dc.get_range_data_interval_5min_lmps(end_ts=end_ts, n_periods=n_periods)

# Show the first ten entries of the result.
range_df[:10]

In [None]:
# Spot-check: read the first returned 5-minute LMP entry as a parquet file.
# NOTE(review): raises if range_df is empty or its first entry is not a
# parquet file — confirm.
first_file = range_df[0]
pl.read_parquet(first_file)

In [None]:
# Keep only the entries whose name ends in '.parquet', then show the first ten.
parquet_files = [
    path
    for path in range_df
    if path.endswith('.parquet')
]
parquet_files[:10]

In [None]:
# Pass the filtered 5-minute LMP parquet files to the project's upsert routine.
# They use the same 'lmp' target as the daily LMP files.
dc.upsert_mtlf_mtrf_lmp(parquet_files, target='lmp')

In [None]:
# Lazily scan the stored LMP parquet on S3 and display its first rows.
# NOTE(review): the bucket/prefix is hard-coded here rather than built from
# AWS_S3_BUCKET / AWS_S3_FOLDER — confirm they refer to the same location.
s3_path = 's3://databricks-storage-7474645306723306/unity-catalog/7474645306723306/spp-weis/data/lmp.parquet'
pl.scan_parquet(s3_path).head().collect()