In [1]:
import pandas as pd
import os

# Input folder with the raw hourly CSVs and output folder for the daily files
data_dir = "energyData"
cleaned_dir = "energyData_clean"
os.makedirs(cleaned_dir, exist_ok=True)

# os.listdir returns entries in arbitrary order; sort so every run processes
# (and logs) the files in the same sequence.
file_list = sorted(f for f in os.listdir(data_dir) if f.endswith("_hourly.csv"))

for filename in file_list:
    filepath = os.path.join(data_dir, filename)
    print(f"Processing {filename}...")

    df = pd.read_csv(filepath)

    # Show the header so the column auto-detection below can be checked by eye
    print("Columns:", df.columns.tolist())

    # Timestamp column: use 'Datetime' when present, otherwise the first column
    datetime_col = 'Datetime' if 'Datetime' in df.columns else df.columns[0]

    # Load column: exact 'MW', else the first name ending in 'MW' (e.g. 'AEP_MW'),
    # else the first column that is not the timestamp column.
    candidate_cols = [c for c in df.columns if c != datetime_col]
    if not candidate_cols:
        raise ValueError(
            f"{filename}: no load column found besides {datetime_col!r}"
        )
    if 'MW' in candidate_cols:
        mw_col = 'MW'
    else:
        mw_col = next(
            (c for c in candidate_cols if str(c).upper().endswith('MW')),
            candidate_cols[0],
        )

    # Parse timestamps and force the load to numeric; unparseable entries
    # become NaT / NaN and are removed by the dropna below.
    df[datetime_col] = pd.to_datetime(df[datetime_col], errors='coerce')
    df[mw_col] = pd.to_numeric(df[mw_col], errors='coerce')

    # Drop rows missing either value, then index by time in chronological order
    df = (
        df.dropna(subset=[datetime_col, mw_col])
          .set_index(datetime_col)
          .sort_index()
    )

    # Daily mean of the load column; calendar days without any rows come out
    # as NaN and are dropped.
    # NOTE(review): a day with only a few readings still gets a mean computed
    # from those readings alone - confirm partial days are acceptable downstream.
    df_daily = df[[mw_col]].resample('D').mean().dropna()

    # Save as <original stem>_daily<ext>; splitext only touches the real
    # extension, unlike str.replace which rewrites every ".csv" in the name.
    stem, ext = os.path.splitext(filename)
    cleaned_path = os.path.join(cleaned_dir, f"{stem}_daily{ext}")
    df_daily.to_csv(cleaned_path)
    print(f"Saved cleaned file to: {cleaned_path}")

Processing AEP_hourly.csv...
Columns: ['Datetime', 'AEP_MW']
Saved cleaned file to: energyData_clean\AEP_hourly_daily.csv
Processing COMED_hourly.csv...
Columns: ['Datetime', 'COMED_MW']
Saved cleaned file to: energyData_clean\COMED_hourly_daily.csv
Processing DAYTON_hourly.csv...
Columns: ['Datetime', 'DAYTON_MW']
Saved cleaned file to: energyData_clean\DAYTON_hourly_daily.csv
Processing DEOK_hourly.csv...
Columns: ['Datetime', 'DEOK_MW']
Saved cleaned file to: energyData_clean\DEOK_hourly_daily.csv
Processing DOM_hourly.csv...
Columns: ['Datetime', 'DOM_MW']
Saved cleaned file to: energyData_clean\DOM_hourly_daily.csv
Processing DUQ_hourly.csv...
Columns: ['Datetime', 'DUQ_MW']
Saved cleaned file to: energyData_clean\DUQ_hourly_daily.csv
Processing EKPC_hourly.csv...
Columns: ['Datetime', 'EKPC_MW']
Saved cleaned file to: energyData_clean\EKPC_hourly_daily.csv
Processing FE_hourly.csv...
Columns: ['Datetime', 'FE_MW']
Saved cleaned file to: energyData_clean\FE_hourly_daily.csv
Proces

In [2]:
# Re-read every "*_daily.csv" file in the cleaned folder and report on it.
file_list = [name for name in os.listdir(cleaned_dir) if name.endswith("_daily.csv")]

for filename in file_list:
    filepath = os.path.join(cleaned_dir, filename)
    print(f"\n🔍 Validating {filename}...")

    try:
        # First column becomes the index and is parsed as dates
        df = pd.read_csv(filepath, index_col=0, parse_dates=True)

        # Structural checks; a failed assert is reported by the handler below
        index_is_datetime = isinstance(df.index, pd.DatetimeIndex)
        assert index_is_datetime, "❌ Index is not datetime"
        missing_cells = df.isnull().sum().sum()
        assert missing_cells == 0, "❌ Contains missing values"

        # Extremes of the first data column, reused by the range check and summary
        col = df.columns[0]
        lowest = df[col].min()
        highest = df[col].max()

        # Warn (without failing) when values leave the open interval (5000, 100000)
        within_range = 5000 < lowest and highest < 100000
        if not within_range:
            print(f"⚠️  {col} values out of expected MW range: min={lowest}, max={highest}")

        # Summary for this file
        print(f"✅ {filename}:")
        print(f"   Rows: {len(df)}")
        first_day, last_day = df.index.min().date(), df.index.max().date()
        print(f"   Date Range: {first_day} → {last_day}")
        print(f"   MW Range: {lowest:.2f} → {highest:.2f}")

    except Exception as err:
        # Report the problem and continue with the next file
        print(f"❌ Error in {filename}: {err}")


🔍 Validating AEP_hourly_daily.csv...
✅ AEP_hourly_daily.csv:
   Rows: 5055
   Date Range: 2004-10-01 → 2018-08-03
   MW Range: 11078.04 → 22847.88

🔍 Validating COMED_hourly_daily.csv...
✅ COMED_hourly_daily.csv:
   Rows: 2772
   Date Range: 2011-01-01 → 2018-08-03
   MW Range: 8148.75 → 19920.29

🔍 Validating DAYTON_hourly_daily.csv...
⚠️  DAYTON_MW values out of expected MW range: min=1366.3333333333333, max=3136.625
✅ DAYTON_hourly_daily.csv:
   Rows: 5055
   Date Range: 2004-10-01 → 2018-08-03
   MW Range: 1366.33 → 3136.62

🔍 Validating DEOK_hourly_daily.csv...
⚠️  DEOK_MW values out of expected MW range: min=1219.0, max=4503.458333333333
✅ DEOK_hourly_daily.csv:
   Rows: 2407
   Date Range: 2012-01-01 → 2018-08-03
   MW Range: 1219.00 → 4503.46

🔍 Validating DOM_hourly_daily.csv...
✅ DOM_hourly_daily.csv:
   Rows: 4843
   Date Range: 2005-05-01 → 2018-08-03
   MW Range: 7772.00 → 18976.62

🔍 Validating DUQ_hourly_daily.csv...
⚠️  DUQ_MW values out of expected MW range: min=1188.