In [38]:
import tomli
from pathlib import Path
import requests
import zipfile
import io
from itertools import product

In [70]:
URL_TOML_DIR = Path(Path.cwd().parent, "data", "nysio_links.toml")
WEATHER_SAVE_DIR = Path(Path.cwd().parent, "data", "raw_data", "weather")
LOAD_SAVE_DIR = Path(Path.cwd().parent, "data", "raw_data", "load")
GET_WEATHER_DATA = False  # Flag to make sure the data isnt needlessly re-downloaded
GET_LOAD_DATA = False

In [61]:
def get_zip_data(url: str, save_dir: Path) -> None:
    """Downloads and unzips zip files downloaded from the internet.
    Returns None if page is 404.

    Args:
        url (str): url pointing to zip file
        save_dir (Path): directory to save unzipped files

    Returns:
        None: None
    """
    r = requests.get(url)
    if r.status_code != 200:
        return None
    z = zipfile.ZipFile(io.BytesIO(r.content))
    z.extractall(save_dir)
    return None

In [83]:
def get_folder_size(root_dir: Path) -> int:
    """Returns the size of a folder in mb.
    Not recursive, does not include sub directories.

    Args:
        root_dir (Path): folder directory

    Returns:
        int: folder size in mb
    """
    return (
        sum(f.stat().st_size for f in root_dir.glob("**/*") if f.is_file()) / 1_000_000
    )

In [62]:
with open(URL_TOML_DIR, mode="rb") as fp:
    urls_dict = tomli.load(fp)
weather_stem = urls_dict["urls"]["weather_stem"]
load_stem = urls_dict["urls"]["load_stem"]

# Weather

The weather zip files are constructed in the following format:

* "http://mis.nyiso.com/public/csv/lfweather/"
* date of file in yearmonthday format: "20230601"
* "lfweather_csv.zip"

The weather urls are bucketed into months, so the date portion of the url is always 1. 
From the website, we can see weather data from 2008-09 through to 2023-06

In [64]:
years = [str(x) for x in range(2008, 2024)]
months = [str(x) for x in range(1, 13)]
months = ["0" + x if int(x) < 10 else x for x in months]
days = ["01"]
# combining and concatenating all possible dates into the required format
# Note some dates will be invalid, e.g 2008 before 09.
# We will handle this in the requests function
all_possible_dates = ["".join(x) for x in list(product(years, months, days))]
all_weather_urls = [
    weather_stem + date + "lfweather_csv.zip" for date in all_possible_dates
]

In [58]:
if GET_WEATHER_DATA:
    for url in all_weather_urls:
        get_zip_data(url, WEATHER_SAVE_DIR)

# Load 

The load zip files are constructed in the following format:

* "http://mis.nyiso.com/public/csv/pal/"
* date of file in yearmonthday format: "20230601"
* "pal_csv.zip"

The Load data goes back to 2001-05, we'll grab everything even though the weather data only goes back to 2008

In [67]:
load_years = [str(x) for x in range(2001, 2024)]
all_possible_dates = ["".join(x) for x in list(product(years, months, days))]
all_load_urls = [load_stem + date + "pal_csv.zip" for date in all_possible_dates]

In [71]:
if GET_LOAD_DATA:
    for url in all_load_urls:
        get_zip_data(url, LOAD_SAVE_DIR)

# Data check

In [93]:
print(len(list(Path.glob(LOAD_SAVE_DIR, "*.csv"))))
print(len(list(Path.glob(WEATHER_SAVE_DIR, "*.csv"))))

5615
5351


In [94]:
print(get_folder_size(LOAD_SAVE_DIR))
print(get_folder_size(WEATHER_SAVE_DIR))

913.177374
34.879922


5615 csv files found for load data, totalling 913 mb. 

5351 csv files found for weather data, totalling 34 mb