In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Set style settings
sns.set_theme()
sns.set_palette("husl")

# Notebook-wide settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

def load_data():
    """
    Load both PJM and weather datasets
    """
    # Load PJM data
    pjm_path = '../data/raw/pjm_dataset/pjm_hourly_est.csv'
    df_pjm = pd.read_csv(pjm_path, parse_dates=['Datetime'])
    
    # Load weather data
    weather_path = '../data/raw/weather/noaa_data.csv'
    df_weather = pd.read_csv(weather_path, parse_dates=['timestamp'])
    
    return df_pjm, df_weather

# Load datasets
df_pjm, df_weather = load_data()

# Display basic information
print("PJM Dataset Shape:", df_pjm.shape)
print("\nWeather Dataset Shape:", df_weather.shape)

PJM Dataset Shape: (178262, 13)

Weather Dataset Shape: (148993, 31)


In [2]:
from utils.feature_engineering import merge_data

# Execute the cleaning and merging
df_merged = merge_data(df_pjm, df_weather)

# Display information about the merged dataset
print("Merged Dataset Shape:", df_merged.shape)
print("\nDate Range:")
print("Start:", df_merged['timestamp'].min())
print("End:", df_merged['timestamp'].max())
print("\nMissing Values:")
print(df_merged.isnull().sum()[df_merged.isnull().sum() > 0])
print("\nSample of merged data:")
print(df_merged.head())

Merged Dataset Shape: (36000, 34)

Date Range:
Start: 2002-01-01 00:00:00
End: 2018-06-23 02:00:00

Missing Values:
PJME    1
PJMW    1
dtype: int64

Sample of merged data:
               timestamp     PJME    PJMW  avg_wind_speed  precipitation  \
0    2002-01-01 00:00:00      NaN     NaN         15.0000            0.0   
5249 2002-01-01 01:00:00  30393.0  5038.0         14.8375            0.0   
5250 2002-01-01 02:00:00  29265.0  5038.0         14.6750            0.0   
5251 2002-01-01 03:00:00  28357.0  5038.0         14.5125            0.0   
5252 2002-01-01 04:00:00  27899.0  5038.0         14.3500            0.0   

      avg_temperature  max_temperature  min_temperature  temperature  \
0           -8.400000         3.300000       -20.100000   -20.100000   
5249        -8.782181         3.523102       -20.994105   -20.623083   
5250        -9.111362         3.736495       -21.785040   -20.162511   
5251        -9.389349         3.940488       -22.476271   -18.729084   
5252      

In [3]:
from utils.feature_engineering import create_temporal_features, create_lag_features

# Create features
df_features = create_temporal_features(df_merged)
df_features = create_lag_features(df_features)

# Display information about the new features
print("Dataset shape after feature creation:", df_features.shape)
print("\nNew temporal features:", [col for col in df_features.columns if col not in df_merged.columns])
print("\nSample of cyclic features:")
print(df_features[['hour', 'hour_sin', 'hour_cos', 'day_of_week', 'day_of_week_sin', 'day_of_week_cos']].head())
print("\nMissing values in lag features:")
lag_cols = [col for col in df_features.columns if 'lag' in col or 'rolling' in col]
print(df_features[lag_cols].isnull().sum().sort_values(ascending=False).head())

Dataset shape after feature creation: (35999, 94)

New temporal features: ['hour', 'day', 'month', 'day_of_week', 'week_of_year', 'hour_sin', 'hour_cos', 'day_of_week_sin', 'day_of_week_cos', 'month_sin', 'month_cos', 'is_morning', 'is_afternoon', 'is_evening', 'is_night', 'is_weekend', 'PJME_lag_1h', 'PJMW_lag_1h', 'PJME_lag_2h', 'PJMW_lag_2h', 'PJME_lag_3h', 'PJMW_lag_3h', 'PJME_lag_6h', 'PJMW_lag_6h', 'PJME_lag_12h', 'PJMW_lag_12h', 'PJME_lag_24h', 'PJMW_lag_24h', 'PJME_lag_48h', 'PJMW_lag_48h', 'PJME_lag_72h', 'PJMW_lag_72h', 'PJME_lag_96h', 'PJMW_lag_96h', 'PJME_lag_120h', 'PJMW_lag_120h', 'PJME_lag_144h', 'PJMW_lag_144h', 'PJME_lag_168h', 'PJMW_lag_168h', 'PJME_same_hour_1d', 'PJME_same_hour_7d', 'PJMW_same_hour_1d', 'PJMW_same_hour_7d', 'PJME_rolling_mean_6h', 'PJMW_rolling_mean_6h', 'PJME_rolling_std_6h', 'PJMW_rolling_std_6h', 'PJME_rolling_mean_12h', 'PJMW_rolling_mean_12h', 'PJME_rolling_std_12h', 'PJMW_rolling_std_12h', 'PJME_rolling_mean_24h', 'PJMW_rolling_mean_24h', 'PJM

In [4]:
from utils.feature_engineering import create_weather_features, create_interaction_features

# Create features
df_weather = create_weather_features(df_features)
df_weather = create_interaction_features(df_weather)

# Display information about the new features
print("Dataset shape after weather feature creation:", df_weather.shape)
print("\nNew weather and interaction features:", 
      [col for col in df_weather.columns if col not in df_features.columns])

# Show sample of new features
weather_features = [col for col in df_weather.columns if col not in df_features.columns]
print("\nSample of new weather features:")
print(df_weather[weather_features].head())

# Check for any unexpected missing values
missing_weather = df_weather[weather_features].isnull().sum()
print("\nMissing values in new weather features:")
print(missing_weather[missing_weather > 0])

Dataset shape after weather feature creation: (35999, 127)

New weather and interaction features: ['temp_avg', 'temp_change_chicago', 'temp_change_24h_chicago', 'temp_extreme_cold_chicago', 'temp_extreme_hot_chicago', 'temp_change_washington', 'temp_change_24h_washington', 'temp_extreme_cold_washington', 'temp_extreme_hot_washington', 'temp_change_pittsburgh', 'temp_change_24h_pittsburgh', 'temp_extreme_cold_pittsburgh', 'temp_extreme_hot_pittsburgh', 'temp_change_columbus', 'temp_change_24h_columbus', 'temp_extreme_cold_columbus', 'temp_extreme_hot_columbus', 'temp_spread', 'wind_speed_avg', 'precipitation_avg', 'weather_severity', 'temp_hour_sin', 'temp_hour_cos', 'temp_month_sin', 'temp_month_cos', 'severity_weekend', 'severity_hour', 'temp_morning', 'temp_afternoon', 'temp_evening', 'temp_night', 'temp_change_morning', 'temp_change_afternoon']

Sample of new weather features:
       temp_avg  temp_change_chicago  temp_change_24h_chicago  \
5249 -19.827572                  NaN      

In [5]:
from utils.feature_engineering import create_holiday_features, create_consumption_features

# Create final features
df_final = create_holiday_features(df_weather)
df_final = create_consumption_features(df_final)

# Display information about the new features
print("Dataset shape after final feature creation:", df_final.shape)
print("\nNew features:", [col for col in df_final.columns if col not in df_weather.columns])

# Show sample of holiday features
holiday_cols = [col for col in df_final.columns if 'holiday' in col or 'season' in col]
print("\nSample of holiday features:")
print(df_final[holiday_cols].head())

# Show summary of consumption features
consumption_cols = ['trend_signal', 'is_peak_hour', 'is_peak_hour_of_week', 
                   'consumption_change_rate', 'weekly_seasonality']
print("\nSample of consumption features:")
print(df_final[consumption_cols].head())

# Check missing values in new features
new_cols = [col for col in df_final.columns if col not in df_weather.columns]
missing_final = df_final[new_cols].isnull().sum()
print("\nMissing values in new features:")
print(missing_final[missing_final > 0])

Dataset shape after final feature creation: (35999, 142)

New features: ['is_holiday', 'is_christmas_season', 'is_thanksgiving_season', 'is_summer_holiday', 'is_day_before_holiday', 'is_day_after_holiday', 'ma_24h', 'ma_168h', 'trend_signal', 'hour_of_week', 'is_peak_hour', 'is_peak_hour_of_week', 'consumption_change_rate', 'consumption_change_rate_24h', 'weekly_seasonality']

Sample of holiday features:
      is_holiday  is_christmas_season  is_thanksgiving_season  \
5249        True                 True                   False   
5250        True                 True                   False   
5251        True                 True                   False   
5252        True                 True                   False   
5253        True                 True                   False   

      is_summer_holiday  is_day_before_holiday  is_day_after_holiday  \
5249              False                  False                 False   
5250              False                  False           

In [6]:
from utils.feature_engineering import prepare_final_features

# Prepare features for both prediction windows
df_24h = prepare_final_features(df_final, prediction_window='24h')
df_7d = prepare_final_features(df_final, prediction_window='7d')

# Display information about final feature sets
print("\n24-hour prediction feature set:")
print("Shape:", df_24h.shape)
print("\nFeature list:")
print(df_24h.columns.tolist())

print("\n7-day prediction feature set:")
print("Shape:", df_7d.shape)
print("\nFeature list:")
print(df_7d.columns.tolist())

# Show sample statistics
print("\nSample statistics for 24h features:")
print(df_24h.describe().round(3))

# Check for any remaining missing values
print("\nMissing values in 24h features:")
print(df_24h.isnull().sum()[df_24h.isnull().sum() > 0])
print("\nMissing values in 7d features:")
print(df_7d.isnull().sum()[df_7d.isnull().sum() > 0])

Dropping 23 highly correlated features: ['max_temperature_chicago', 'min_temperature_chicago', 'max_temperature_washington', 'min_temperature_washington', 'max_temperature_pittsburgh', 'min_temperature_pittsburgh', 'avg_temperature_columbus', 'max_temperature_columbus', 'min_temperature_columbus', 'temperature_columbus', 'week_of_year', 'PJMW_lag_2h', 'PJMW_lag_3h', 'PJME_same_hour_1d', 'PJMW_same_hour_1d', 'PJME_rolling_mean_6h', 'PJMW_rolling_mean_6h', 'PJMW_rolling_mean_24h', 'temp_avg', 'temp_change_columbus', 'temp_change_morning', 'ma_24h', 'hour_of_week']
Dropping 26 highly correlated features: ['max_temperature_chicago', 'min_temperature_chicago', 'max_temperature_washington', 'min_temperature_washington', 'max_temperature_pittsburgh', 'min_temperature_pittsburgh', 'avg_temperature_columbus', 'max_temperature_columbus', 'min_temperature_columbus', 'temperature_columbus', 'week_of_year', 'PJMW_lag_2h', 'PJMW_lag_3h', 'PJME_same_hour_1d', 'PJME_same_hour_7d', 'PJMW_same_hour_1d',

In [8]:
def prepare_and_save_features(df_24h, df_7d):
    """
    Prepare final feature sets and save to disk as CSV
    Note: Change the save paths according to your project structure
    Target variable: PJME (PJM East Region)
    """
    # Add timestamp back to both datasets
    df_24h_final = df_24h.copy()
    df_7d_final = df_7d.copy()
    
    # Define feature sets (excluding PJMW and keeping only necessary columns)
    cols_to_drop = ['PJMW']
    
    df_24h_final = df_24h_final.drop(columns=cols_to_drop)
    df_7d_final = df_7d_final.drop(columns=cols_to_drop)
    
    # Save to CSV format
    # Note: Modify these paths according to your project structure
    save_path_24h = '../data/processed/features_24h.csv'
    save_path_7d = '../data/processed/features_7d.csv'
    
    # Save with index (timestamp) included
    df_24h_final.to_csv(save_path_24h, index=True)
    df_7d_final.to_csv(save_path_7d, index=True)
    
    # Print information about saved datasets
    print("Saved feature sets:")
    print(f"24h prediction features shape: {df_24h_final.shape}")
    print(f"7d prediction features shape: {df_7d_final.shape}")
    print("\nFeature sets saved to:")
    print(f"24h features: {save_path_24h}")
    print(f"7d features: {save_path_7d}")
    print("\nNote: Target variable is 'PJME' (PJM East Region)")
    
    # Return the paths for reference
    return save_path_24h, save_path_7d

# Save the feature sets
save_path_24h, save_path_7d = prepare_and_save_features(df_24h, df_7d)

# Verify the saved files
import os
print("\nVerifying saved files:")
print(f"24h features file exists: {os.path.exists(save_path_24h)}")
print(f"7d features file exists: {os.path.exists(save_path_7d)}")

# Display sample of final feature set
print("\nSample of features (24h prediction set):")
pd.read_csv(save_path_24h, nrows=5)

Saved feature sets:
24h prediction features shape: (35975, 108)
7d prediction features shape: (35831, 114)

Feature sets saved to:
24h features: ../data/processed/features_24h.csv
7d features: ../data/processed/features_7d.csv

Note: Target variable is 'PJME' (PJM East Region)

Verifying saved files:
24h features file exists: True
7d features file exists: True

Sample of features (24h prediction set):


Unnamed: 0.1,Unnamed: 0,avg_wind_speed,precipitation,avg_temperature,max_temperature,min_temperature,temperature,avg_wind_speed_chicago,precipitation_chicago,avg_temperature_chicago,temperature_chicago,avg_wind_speed_washington,precipitation_washington,avg_temperature_washington,temperature_washington,avg_wind_speed_pittsburgh,precipitation_pittsburgh,avg_temperature_pittsburgh,temperature_pittsburgh,avg_wind_speed_columbus,precipitation_columbus,year,hour,day,month,day_of_week,hour_sin,hour_cos,day_of_week_sin,day_of_week_cos,month_sin,month_cos,is_morning,is_afternoon,is_evening,is_night,is_weekend,PJME_lag_1h,PJMW_lag_1h,PJME_lag_2h,PJME_lag_3h,PJME_lag_6h,PJMW_lag_6h,PJME_lag_12h,PJMW_lag_12h,PJME_lag_24h,PJMW_lag_24h,PJME_lag_48h,PJMW_lag_48h,PJME_lag_72h,PJMW_lag_72h,PJME_lag_96h,PJMW_lag_96h,PJME_lag_120h,PJMW_lag_120h,PJME_lag_144h,PJMW_lag_144h,PJME_rolling_std_6h,PJMW_rolling_std_6h,PJME_rolling_mean_12h,PJMW_rolling_mean_12h,PJME_rolling_std_12h,PJMW_rolling_std_12h,PJME_rolling_mean_24h,PJME_rolling_std_24h,PJMW_rolling_std_24h,temp_change_chicago,temp_change_24h_chicago,temp_extreme_cold_chicago,temp_extreme_hot_chicago,temp_change_washington,temp_change_24h_washington,temp_extreme_cold_washington,temp_extreme_hot_washington,temp_change_pittsburgh,temp_change_24h_pittsburgh,temp_extreme_cold_pittsburgh,temp_extreme_hot_pittsburgh,temp_change_24h_columbus,temp_extreme_cold_columbus,temp_extreme_hot_columbus,temp_spread,wind_speed_avg,precipitation_avg,weather_severity,temp_hour_sin,temp_hour_cos,temp_month_sin,temp_month_cos,severity_weekend,severity_hour,temp_morning,temp_afternoon,temp_evening,temp_night,temp_change_afternoon,is_holiday,is_christmas_season,is_thanksgiving_season,is_summer_holiday,is_day_before_holiday,is_day_after_holiday,trend_signal,is_peak_hour,is_peak_hour_of_week,consumption_change_rate,consumption_change_rate_24h,weekly_seasonality,PJME
0,5225,-0.429285,-0.467798,-0.536033,-1.322572,-1.469806,-0.769941,-1.679896,-0.408322,-1.359245,-2.142225,-0.458147,-0.464617,-1.60413,-2.208788,-1.784132,-0.553929,-1.606745,-2.186696,-1.970644,-0.51361,-1.283855,-1.412802,-1.563536,-1.187435,-0.519922,0.365548,1.228093,1.38818,-0.301818,0.298529,0.897652,-0.532182,-0.516627,-0.468896,1.254907,-0.641961,-0.091529,-0.613683,0.362116,0.787278,1.12829,-0.613697,0.198576,-0.613706,0.072107,-0.613822,,,,,,,,,,,0.889705,-1.564782,0.528409,-0.686558,-0.05097,-2.22032,0.233747,-1.035542,-2.915956,0.189709,0.064083,3.00441,-0.333462,0.398978,0.243926,3.006271,-0.333462,0.125625,0.217378,3.006737,-0.333462,0.2647,3.006271,-0.333462,0.076363,-1.807859,-0.702771,2.552127,-0.420315,-1.577194,-0.85127,-1.437188,-0.296553,-0.277211,-0.350545,-0.349371,-0.267963,-2.409065,0.29153,-0.172111,2.359741,-0.212192,-0.148381,-0.168394,5.874804,-1.012616,-0.333822,-0.336029,-0.746322,-0.583076,-1.149513,28121.0
1,5226,-0.413475,-0.467798,-0.527761,-1.319414,-1.437953,-0.729014,-1.642962,-0.408322,-1.367566,-2.065389,-0.411805,-0.464617,-1.587179,-2.108546,-1.714185,-0.553929,-1.599921,-2.125841,-1.921339,-0.51361,-1.283855,-1.27692,-1.563536,-1.187435,-0.519922,0.7063,1.084971,1.38818,-0.301818,0.298529,0.897652,-0.532182,-0.516627,-0.468896,1.254907,-0.641961,-0.376708,-0.613683,-0.091566,0.362076,1.109899,-0.613697,0.065874,-0.613706,-0.151022,-0.613822,,,,,,,,,,,0.970321,-1.564782,0.46932,-0.686558,0.205319,-2.22032,0.213871,-0.931248,-2.915956,0.548474,0.072868,3.00441,-0.333462,0.680426,0.351981,3.006271,-0.333462,0.402613,0.196675,3.006737,-0.333462,0.24808,3.006271,-0.333462,-0.009843,-1.745442,-0.702771,2.552127,-0.765793,-1.334755,-0.80733,-1.353304,-0.296553,-0.057025,-0.350545,-0.349371,-0.267963,-2.294085,0.29153,-0.172111,2.359741,-0.212192,-0.148381,-0.168394,5.874804,-1.012616,-0.333822,-0.336029,-0.38717,-0.498523,-1.149513,27437.0
2,5227,-0.397664,-0.467798,-0.519327,-1.316342,-1.405285,-0.673533,-1.606028,-0.408322,-1.376186,-1.94547,-0.365463,-0.464617,-1.570055,-1.975377,-1.644239,-0.553929,-1.593203,-2.025615,-1.872033,-0.51361,-1.283855,-1.141039,-1.563536,-1.187435,-0.519922,0.99891,0.857296,1.38818,-0.301818,0.298529,0.897652,-0.532182,-0.516627,-0.468896,1.254907,-0.641961,-0.511981,-0.613683,-0.376749,-0.091604,1.039892,-0.613697,-0.045271,-0.613706,-0.330633,-0.613822,,,,,,,,,,,0.56893,-1.564782,0.418843,-0.686558,0.414064,-2.22032,0.202389,-0.861831,-2.915956,0.856444,0.078632,3.00441,-0.333462,0.904158,0.432482,3.006271,-0.333462,0.663539,0.190484,3.006737,-0.333462,0.235746,3.006271,-0.333462,-0.05767,-1.683025,-0.702771,2.552127,-0.985584,-0.992881,-0.741411,-1.227463,-0.296553,0.163161,-0.350545,-0.349371,-0.267963,-2.121595,0.29153,-0.172111,2.359741,-0.212192,-0.148381,-0.168394,5.874804,-1.012616,-0.333822,-0.336029,-0.102733,-0.324991,-1.149513,27301.0
3,5228,-0.381854,-0.467798,-0.510768,-1.313343,-1.371973,-0.607508,-1.569095,-0.408322,-1.384992,-1.79244,-0.31912,-0.464617,-1.552822,-1.819775,-1.574292,-0.553929,-1.586588,-1.891332,-1.822728,-0.51361,-1.283855,-1.005158,-1.563536,-1.187435,-0.519922,1.223438,0.560585,1.38818,-0.301818,0.298529,0.897652,-0.532182,-0.516627,-0.468896,1.254907,-0.641961,-0.538877,-0.613683,-0.512023,-0.376785,0.787157,-0.613697,-0.060697,-0.613706,-0.42123,-0.613822,,,,,,,,,,,-0.178519,-1.564782,0.374632,-0.686558,0.571662,-2.22032,0.19841,-0.837437,-2.915956,1.093134,0.071973,3.00441,-0.333462,1.056596,0.476492,3.006271,-0.333462,0.889266,0.20051,3.006737,-0.333462,0.227391,3.006271,-0.333462,-0.144584,-1.620609,-0.702771,2.552127,-1.05719,-0.61635,-0.658502,-1.069187,-0.296553,0.383347,-0.350545,-0.349371,-0.267963,-1.904645,0.29153,-0.172111,2.359741,-0.212192,-0.148381,-0.168394,5.874804,-1.012616,-0.333822,-0.336029,0.094876,-0.15905,-1.149513,27533.0
4,5229,-0.366044,-0.467798,-0.502122,-1.310402,-1.338188,-0.535508,-1.532161,-0.408322,-1.393875,-1.618304,-0.272778,-0.464617,-1.535546,-1.653268,-1.504346,-0.553929,-1.58007,-1.730923,-1.773423,-0.51361,-1.283855,-0.869277,-1.563536,-1.187435,-0.519922,1.364582,0.215058,1.38818,-0.301818,0.298529,0.897652,-0.532182,-0.516627,-0.468896,1.254907,-0.641961,-0.492995,-0.613683,-0.538919,-0.512059,0.361976,-0.613697,0.24604,-0.613706,-0.389976,-0.613822,,,,,,,,,,,-0.937367,-1.564782,0.316695,-0.686558,0.676852,-2.22032,0.202194,-0.856735,-2.915956,1.244002,0.046933,3.00441,-0.333462,1.130686,0.48019,3.006271,-0.333462,1.062416,0.226278,3.006737,-0.333462,0.222504,3.006271,-0.333462,-0.262543,-1.558192,-0.702771,2.552127,-0.989955,-0.269996,-0.564652,-0.890024,-0.296553,0.603533,-0.350545,-0.349371,-0.267963,-1.659066,0.29153,-0.172111,2.359741,-0.212192,-0.148381,-0.168394,5.874804,-1.012616,-0.333822,-0.336029,0.43522,0.016532,-1.149513,28405.0
