In [31]:
import requests
import zipfile
import io
import pandas as pd
import os

def download_and_extract_zip(url):
    """
    Downloads and extracts all CSVs from a MISO ZIP archive.
    Returns a list of DataFrames.
    """
    print(f"  Trying: {url}")
    response = requests.get(url)

    if response.status_code != 200:
        return []

    if response.content[:2] != b'PK':  # ZIP files start with 'PK'
        return []

    dfs = []
    try:
        with zipfile.ZipFile(io.BytesIO(response.content)) as z:
            for filename in z.namelist():
                if filename.endswith(".csv"):
                    print(f"    Extracting: {filename}")
                    with z.open(filename) as f:
                        try:
                            df = pd.read_csv(f, low_memory=False)
                            df["SOURCE_FILE"] = filename
                            dfs.append(df)
                        except Exception as e:
                            print(f"    Failed to parse {filename}: {e}")
    except zipfile.BadZipFile:
        print(f"    Error: File at {url} is not a valid ZIP archive.")
    return dfs

def scrape_miso_quarterly_zips(year, quarters, output_dir="miso_data"):
    """
    Downloads and processes MISO LMP ZIP files for given year and quarters,
    handling naming inconsistencies and saving each quarter separately.
    """
    os.makedirs(output_dir, exist_ok=True)

    quarter_map_variants = {
        "Q1": ["Jan-Mar", "Jan_Mar"],
        "Q2": ["Apr-Jun", "Apr_Jun"],
        "Q3": ["Jul-Sep", "Jul_Sep"],
        "Q4": ["Oct-Dec", "Oct_Dec"]
    }
    suffixes = ["DA_LMPs.zip", "DA_LMP.zip"]
    year_sep_variants = ["_", "-"]

    for q in quarters:
        print(f"\n📦 Processing {year} {q}")
        success = False
        for quarter_str in quarter_map_variants[q]:
             for sep in year_sep_variants:
                for suffix in suffixes:
                    filename = f"{year}{sep}{quarter_str}_{suffix}"
                    url = f"https://docs.misoenergy.org/marketreports/{filename}"

                    dfs = download_and_extract_zip(url)
                    if dfs:
                        combined_df = pd.concat(dfs, ignore_index=True)
                        out_path = os.path.join(output_dir, f"{year}_{q}.csv")
                        combined_df.to_csv(out_path, index=False)
                        print(f"✅ Saved to {out_path}")
                        success = True
                        break  # stop after the first successful variant
                if success:
                    break
                if not success:
                    print(f"⚠️ No valid file found for {year} {q}")



# Example usage
if __name__ == "__main__":
    scrape_miso_quarterly_zips(year=2024, quarters=["Q1", "Q2", "Q3", "Q4"])




📦 Processing 2024 Q1
  Trying: https://docs.misoenergy.org/marketreports/2024_Jan-Mar_DA_LMPs.zip
    Extracting: DA.csv
✅ Saved to miso_data/2024_Q1.csv
  Trying: https://docs.misoenergy.org/marketreports/2024_Jan_Mar_DA_LMPs.zip
  Trying: https://docs.misoenergy.org/marketreports/2024_Jan_Mar_DA_LMP.zip

📦 Processing 2024 Q2
  Trying: https://docs.misoenergy.org/marketreports/2024_Apr-Jun_DA_LMPs.zip
    Extracting: DA.csv
✅ Saved to miso_data/2024_Q2.csv
  Trying: https://docs.misoenergy.org/marketreports/2024_Apr_Jun_DA_LMPs.zip
  Trying: https://docs.misoenergy.org/marketreports/2024_Apr_Jun_DA_LMP.zip

📦 Processing 2024 Q3
  Trying: https://docs.misoenergy.org/marketreports/2024_Jul-Sep_DA_LMPs.zip
  Trying: https://docs.misoenergy.org/marketreports/2024_Jul-Sep_DA_LMP.zip
⚠️ No valid file found for 2024 Q3
  Trying: https://docs.misoenergy.org/marketreports/2024-Jul-Sep_DA_LMPs.zip
    Extracting: DA.csv
✅ Saved to miso_data/2024_Q3.csv
  Trying: https://docs.misoenergy.org/mar

In [45]:
import os
import requests
import zipfile
import io
import pandas as pd

def extract_and_save_quarter_from_nested_zip(nested_zip_bytes, nested_filename, year, output_dir):
    """
    Extracts CSVs from a nested ZIP archive and saves them as a quarterly file
    based on the name of the nested ZIP.
    """
    # Identify quarter from filename
    quarter_hint = ""
    if any(m in nested_filename for m in ["Jan", "Feb", "Mar"]):
        quarter_hint = "Q1"
    elif any(m in nested_filename for m in ["Apr", "May", "Jun"]):
        quarter_hint = "Q2"
    elif any(m in nested_filename for m in ["Jul", "Aug", "Sep"]):
        quarter_hint = "Q3"
    elif any(m in nested_filename for m in ["Oct", "Nov", "Dec"]):
        quarter_hint = "Q4"
    else:
        print(f"⚠️ Could not identify quarter from: {nested_filename}")
        return

    # Extract CSVs from nested ZIP
    dfs = []
    try:
        with zipfile.ZipFile(io.BytesIO(nested_zip_bytes)) as nested_zip:
            for filename in nested_zip.namelist():
                if filename.endswith(".csv"):
                    print(f"    📄 Extracting CSV: {filename}")
                    with nested_zip.open(filename) as f:
                        try:
                            df = pd.read_csv(f, low_memory=False)
                            df["SOURCE_FILE"] = filename
                            dfs.append(df)
                        except Exception as e:
                            print(f"      ⚠️ Failed to read {filename}: {e}")
    except zipfile.BadZipFile:
        print(f"⚠️ Invalid nested ZIP: {nested_filename}")
        return

    # Save only that quarter’s data
    if dfs:
        combined = pd.concat(dfs, ignore_index=True)
        out_path = os.path.join(output_dir, f"{year}_{quarter_hint}.csv")
        combined.to_csv(out_path, index=False)
        print(f"✅ Saved quarterly CSV: {out_path}")
    else:
        print(f"⚠️ No CSVs extracted from: {nested_filename}")

def download_and_process_annual_zip(year, output_dir="miso_data"):
    """
    Downloads a yearly MISO ZIP (which contains nested ZIPs by month/quarter),
    extracts quarterly data, and saves each quarter separately.
    """
    os.makedirs(output_dir, exist_ok=True)
    filename = f"{year}01_DA_LMPs_zip.zip"
    url = f"https://docs.misoenergy.org/marketreports/{filename}"
    print(f"\n📦 Processing annual archive for {year}: {url}")

    try:
        response = requests.get(url)
        if response.status_code != 200 or response.content[:2] != b'PK':
            print("❌ Failed to download or invalid ZIP format.")
            return

        with zipfile.ZipFile(io.BytesIO(response.content)) as outer_zip:
            for nested_name in outer_zip.namelist():
                if nested_name.endswith(".zip"):
                    print(f"  📦 Found nested ZIP: {nested_name}")
                    with outer_zip.open(nested_name) as nested_file:
                        nested_bytes = nested_file.read()
                        extract_and_save_quarter_from_nested_zip(
                            nested_zip_bytes=nested_bytes,
                            nested_filename=nested_name,
                            year=year,
                            output_dir=output_dir
                        )
    except Exception as e:
        print(f"⚠️ Error processing archive: {e}")

if __name__ == "__main__":
    download_and_process_annual_zip(2022, output_dir="miso_data")



📦 Processing annual archive for 2022: https://docs.misoenergy.org/marketreports/202201_DA_LMPs_zip.zip
  📦 Found nested ZIP: 2022_Apr-Jun_DA_LMPs.zip
    📄 Extracting CSV: 2022_Apr-Jun_DA_LMPs.csv
✅ Saved quarterly CSV: miso_data/2022_Q2.csv
  📦 Found nested ZIP: 2022_Jan-Mar_DA_LMPs.zip
    📄 Extracting CSV: 2022_Jan-Mar_DA_LMP.csv
✅ Saved quarterly CSV: miso_data/2022_Q1.csv
  📦 Found nested ZIP: 2022_Jul-Sep_DA_LMPs.zip
    📄 Extracting CSV: 2022_Jul-Sep_DA_LMPs.csv
✅ Saved quarterly CSV: miso_data/2022_Q3.csv
  📦 Found nested ZIP: 2022_Oct-Dec_DA_LMPs.zip
    📄 Extracting CSV: 2022_Oct-Dec_DA_LMPs.csv
✅ Saved quarterly CSV: miso_data/2022_Q4.csv
