In [1]:
import pandas as pd
import os
from collections import defaultdict

In [2]:
# Location of the raw city-level temperature table.
city_path = "temperatureData/GlobalLandTemperaturesByCity.csv"

# Build the working temperature table in one pass:
#   1. read the CSV, parsing the "dt" column as datetimes
#   2. keep only rows whose Country is "United States"
#   3. rename the columns used later to Date / Location / Temp_C
#   4. keep just those three columns and drop rows with any missing value
df_city = (
    pd.read_csv(city_path, parse_dates=["dt"])
    .loc[lambda frame: frame["Country"] == "United States"]
    .rename(columns={"dt": "Date", "City": "Location", "AverageTemperature": "Temp_C"})
    .loc[:, ["Date", "Location", "Temp_C"]]
    .dropna()
)


In [3]:
# Map each PJM region key to the city names whose temperature records are
# exported for that region. The names are used later in an exact-match
# filter against the "Location" column, so spelling matters.
# NOTE(review): names carry no state qualifier; if the temperature table
# holds several U.S. cities sharing a name (e.g. "Columbus", "Charleston"),
# all of their rows would match -- confirm this is acceptable.
# NOTE(review): "NI" is paired with Indiana cities here; confirm that NI is
# not meant to denote Northern Illinois.
region_to_cities = {
    "AEP": [
        "Akron", "Columbus", "Dayton", "Charleston",
        "Indianapolis", "Lexington Fayette", "Huntington",
    ],
    "COMED": [
        "Chicago", "Rockford", "Aurora", "Naperville", "Joliet",
    ],
    "DAYTON": [
        "Dayton",
    ],
    "DEOK": [
        "Cincinnati", "Lexington Fayette",
    ],
    "DOM": [
        "Charlotte", "Raleigh", "Richmond", "Virginia Beach", "Washington",
    ],
    "DUQ": [
        "Pittsburgh",
    ],
    "EKPC": [
        "Lexington Fayette", "Louisville",
    ],
    "FE": [
        "Akron", "Allentown", "Cleveland", "Newark", "Baltimore",
    ],
    "NI": [
        "South Bend", "Fort Wayne",
    ],
    # Key matches the normalized region name derived from "PJM_Load_*" files.
    "PJM_Load": [
        "Columbus", "Detroit", "Charlotte", "Chicago",
        "Baltimore", "Philadelphia", "Cleveland", "Indianapolis",
        "Louisville", "Nashville", "Jersey City", "Akron",
    ],
    "PJME": [
        "Baltimore", "Newark", "Jersey City", "Washington",
    ],
    "PJMW": [
        "Charleston", "Lexington Fayette", "Columbus", "Dayton",
    ],
}


In [4]:
# For every cleaned daily energy file, export the city temperature rows that
# belong to that file's region and fall inside the energy file's date range.
# One CSV per region is written to temperatureData_clean/.

# Create output folder (no error if it already exists)
os.makedirs("temperatureData_clean", exist_ok=True)

# Loop through all cleaned energy files. os.listdir returns entries in
# arbitrary order, so sort them to keep the processing/log order reproducible.
for energy_file in sorted(os.listdir("energyData_clean")):
    if not energy_file.endswith("_daily.csv"):
        continue

    # Extract the region from the text before the first underscore.
    # "PJM_Load_..." splits to "PJM", so map it back to the dictionary key.
    region = energy_file.split("_")[0]
    if region == "PJM":
        region = "PJM_Load"

    if region not in region_to_cities:
        print(f"⚠️ Skipping unrecognized region in file: {energy_file}")
        continue

    # Load energy data to determine actual date range (first column is the
    # date index).
    energy_path = os.path.join("energyData_clean", energy_file)
    energy_df = pd.read_csv(energy_path, index_col=0, parse_dates=True)
    date_min, date_max = energy_df.index.min(), energy_df.index.max()

    # An energy file without rows yields a missing min/max, which can neither
    # be compared against the Date column nor formatted for the file name.
    if pd.isna(date_min) or pd.isna(date_max):
        print(f"⚠️ Skipping {energy_file}: energy data has no usable date range")
        continue

    # Filter temperature data for this region's cities and date range
    # (both bounds inclusive).
    region_cities = region_to_cities[region]
    filtered_temp = df_city[
        (df_city["Location"].isin(region_cities)) &
        (df_city["Date"] >= date_min) &
        (df_city["Date"] <= date_max)
    ].copy()

    if filtered_temp.empty:
        print(f"⚠️ No temperature data found for {region} ({date_min.date()} → {date_max.date()})")
        continue

    # Format date range in filename. Note these are the bounds of the ENERGY
    # data; the temperature rows written may cover a shorter span.
    date_str_min = date_min.strftime("%Y-%m-%d")
    date_str_max = date_max.strftime("%Y-%m-%d")

    # Output name uses display name, so revert "PJM_Load" → "PJM"
    display_region = "PJM" if region == "PJM_Load" else region
    out_path = f"temperatureData_clean/{display_region}_{date_str_min}_to_{date_str_max}.csv"
    filtered_temp.to_csv(out_path, index=False)
    print(f"✅ Saved {display_region} temperature data → {out_path}")

✅ Saved AEP temperature data → temperatureData_clean/AEP_2004-10-01_to_2018-08-03.csv
✅ Saved COMED temperature data → temperatureData_clean/COMED_2011-01-01_to_2018-08-03.csv
✅ Saved DAYTON temperature data → temperatureData_clean/DAYTON_2004-10-01_to_2018-08-03.csv
✅ Saved DEOK temperature data → temperatureData_clean/DEOK_2012-01-01_to_2018-08-03.csv
✅ Saved DOM temperature data → temperatureData_clean/DOM_2005-05-01_to_2018-08-03.csv
✅ Saved DUQ temperature data → temperatureData_clean/DUQ_2005-01-01_to_2018-08-03.csv
✅ Saved EKPC temperature data → temperatureData_clean/EKPC_2013-06-01_to_2018-08-03.csv
✅ Saved FE temperature data → temperatureData_clean/FE_2011-06-01_to_2018-08-03.csv
✅ Saved NI temperature data → temperatureData_clean/NI_2004-05-01_to_2011-01-01.csv
✅ Saved PJME temperature data → temperatureData_clean/PJME_2002-01-01_to_2018-08-03.csv
✅ Saved PJMW temperature data → temperatureData_clean/PJMW_2002-04-01_to_2018-08-03.csv
✅ Saved PJM temperature data → temperatu