# Feature Engineering Pipeline

## 1. Imports and Setup


In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

## 2. Core Feature Engineering Function

In [2]:
def create_essential_features(df, horizon_hours=2):
    """
    Builds the base frame used by the other feature helpers: parsed and
    sorted timestamps, the forecast target, and calendar components for both
    the current timestamp and the target timestamp.

    The target is looked up by timestamp (``date + horizon_hours``) instead
    of by row offset, so it always refers to the same instant as
    'target_date' / 'target_hour' / 'target_dow', whatever the sampling
    interval of the input is. For gap-free hourly data this is identical to
    ``Appliances.shift(-horizon_hours)``.

    Args:
        df: DataFrame with 'date' and 'Appliances' columns (not modified)
        horizon_hours: Prediction horizon in hours (default: 2)
    Returns:
        New DataFrame sorted by 'date' with the added columns 'target',
        'hour', 'day_of_week', 'month', 'target_date', 'target_hour' and
        'target_dow'. 'target' is NaN where no observation exists exactly
        ``horizon_hours`` after the row's timestamp.
    """
    data = df.copy()
    data['date'] = pd.to_datetime(data['date'])
    data = data.sort_values('date').reset_index(drop=True)

    # Timestamp the prediction refers to
    target_date = data['date'] + pd.Timedelta(hours=horizon_hours)

    # Timestamp -> observed usage; the index must be unique for Series.map,
    # so repeated timestamps keep their last observation
    observed = (
        data.dropna(subset=['date'])
        .drop_duplicates(subset='date', keep='last')
        .set_index('date')['Appliances']
    )

    # Create target variable (appliance usage at the target timestamp)
    data['target'] = target_date.map(observed)

    # Extract base time components
    data['hour'] = data['date'].dt.hour
    data['day_of_week'] = data['date'].dt.dayofweek
    data['month'] = data['date'].dt.month

    # Calculate target timestamp features
    data['target_date'] = target_date
    data['target_hour'] = data['target_date'].dt.hour
    data['target_dow'] = data['target_date'].dt.dayofweek

    return data


## 3. Time-Based Features (8 features)

In [3]:
def add_time_features(data):
    """
    Adds sine/cosine encodings of the calendar columns.

    Reads 'hour', 'target_hour', 'day_of_week' and 'month'; writes
    'hour_sin', 'hour_cos', 'target_hour_sin', 'dow_sin' and 'month_sin'
    onto ``data`` in place and returns the same object.
    """
    full_turn = 2 * np.pi

    # (output column, source column, cycle length, trig function)
    encodings = (
        ('hour_sin', 'hour', 24, np.sin),
        ('hour_cos', 'hour', 24, np.cos),
        ('target_hour_sin', 'target_hour', 24, np.sin),
        ('dow_sin', 'day_of_week', 7, np.sin),
        ('month_sin', 'month', 12, np.sin),
    )

    for out_col, src_col, period, trig in encodings:
        # Map the value onto the unit circle so the end of a cycle sits
        # next to its start
        data[out_col] = trig(full_turn * data[src_col] / period)

    return data


## 4. Lag Features (7 features)
######  Historical values at key intervals capture temporal dependencies

In [4]:
def add_lag_features(data):
    """
    Adds past values of 'Appliances' as 'lag_<n>h' columns.

    Each lag is a row offset (``shift(n)``), so the hour in the column name
    is accurate only when the frame has one row per hour. Columns are
    written onto ``data`` in place; the same object is returned.
    """
    usage = data['Appliances']

    # 1-3: immediate past, 6-24: recent pattern (24 = one day of hourly
    # rows), 168: one week of hourly rows
    for steps_back in (1, 2, 3, 6, 12, 24, 168):
        data[f'lag_{steps_back}h'] = usage.shift(steps_back)

    return data


## 5. Rolling Mean Features (5 features)
###### Moving averages smooth noise and capture trend levels


In [5]:
def add_rolling_means(data):
    """
    Adds rolling means of past 'Appliances' values as 'roll_<n>h_mean'.

    The series is shifted by one row before windowing, so the current row's
    own value is never part of its window. Windows are counted in rows.
    Columns are written onto ``data`` in place; the same object is returned.
    """
    history = data['Appliances'].shift(1)

    # (window length in rows, minimum observations needed for a value)
    window_specs = ((3, 1), (6, 2), (12, 4), (24, 8), (168, 56))

    for size, needed in window_specs:
        data[f'roll_{size}h_mean'] = history.rolling(size, min_periods=needed).mean()

    return data

## 6. Rolling Extremes (6 features)
###### Min/max values identify volatility ranges and boundaries


In [6]:
def add_rolling_extremes(data):
    """
    Adds rolling minima/maxima of past 'Appliances' values.

    Writes 'roll_6h_max', 'roll_6h_min', 'roll_12h_max', 'roll_12h_min',
    'roll_24h_min' and 'roll_168h_min' onto ``data`` in place.

    Returns:
        Tuple ``(data, rolled_6, rolled_24, rolled_168)`` where the last
        three are the pandas rolling-window objects over the one-row-shifted
        series, so later steps can reuse them.
    """
    # Shift by one row so a window never contains the current observation
    history = data['Appliances'].shift(1)

    win_6 = history.rolling(6, min_periods=2)
    win_12 = history.rolling(12, min_periods=4)
    win_24 = history.rolling(24, min_periods=8)
    win_168 = history.rolling(168, min_periods=56)

    # Short windows contribute both ends of the range
    for label, window in (('6h', win_6), ('12h', win_12)):
        data[f'roll_{label}_max'] = window.max()
        data[f'roll_{label}_min'] = window.min()

    # Long windows contribute only the floor
    for label, window in (('24h', win_24), ('168h', win_168)):
        data[f'roll_{label}_min'] = window.min()

    return data, win_6, win_24, win_168


## 7. Rolling Percentiles (4 features)
###### Quantiles capture distribution shape and outlier context


In [7]:
def add_rolling_percentiles(data, rolled_24, rolled_168):
    """
    Adds rolling medians and quartiles taken from existing window objects.

    Args:
        data: frame to extend (modified in place and returned)
        rolled_24: rolling-window object used for 'roll_24h_median'
        rolled_168: rolling-window object used for 'roll_168h_median',
            'roll_168h_q25' and 'roll_168h_q75'
    """
    # Long window: centre plus lower and upper quartiles
    long_window_stats = {
        'roll_168h_median': rolled_168.median(),
        'roll_168h_q25': rolled_168.quantile(0.25),
        'roll_168h_q75': rolled_168.quantile(0.75),
    }
    for column, values in long_window_stats.items():
        data[column] = values

    # Short window: centre only
    data['roll_24h_median'] = rolled_24.median()

    return data


## 8. Momentum Features (3 features)
###### Rate of change indicates acceleration or deceleration trends

In [8]:
def add_momentum_features(data):
    """
    Adds the difference between the current 'Appliances' value and its
    1-, 6- and 24-row lags as 'momentum_1h', 'momentum_6h', 'momentum_24h'.

    Requires the 'lag_1h', 'lag_6h' and 'lag_24h' columns to exist already.
    Columns are written onto ``data`` in place; the same object is returned.
    """
    current = data['Appliances']

    for span in (1, 6, 24):
        # Positive means usage rose relative to that lag
        data[f'momentum_{span}h'] = current - data[f'lag_{span}h']

    return data


## 9. Relative Position Features (6 features)
###### Normalized metrics show current value context within distributions

In [9]:
def add_relative_position(data, rolled_6, rolled_168):
    """
    Adds features that place the current 'Appliances' value relative to
    rolling statistics computed earlier.

    Requires 'roll_24h_min', 'roll_6h_min', 'roll_24h_mean', 'roll_6h_mean'
    and 'roll_168h_mean' on ``data``.

    Args:
        data: frame to extend (modified in place and returned)
        rolled_6: rolling-window object supplying the std for 'zscore_6h'
        rolled_168: rolling-window object supplying the std for 'zscore_168h'
    """
    current = data['Appliances']

    # Absolute gap above the rolling floors
    for span in ('24h', '6h'):
        data[f'dist_from_{span}_min'] = current - data[f'roll_{span}_min']

    # Ratio to the rolling means; the +1 keeps the denominator away from zero
    for span in ('24h', '6h'):
        data[f'rel_to_{span}_mean'] = current / (data[f'roll_{span}_mean'] + 1)

    # Standardised deviation from the rolling mean, again with a +1 guard
    # on the spread
    for span, window in (('168h', rolled_168), ('6h', rolled_6)):
        spread = window.std() + 1
        data[f'zscore_{span}'] = (current - data[f'roll_{span}_mean']) / spread

    return data

## 10. Volatility Feature (1 feature)
###### Range captures variability over time window

In [10]:
def add_volatility(data, rolled_24):
    """
    Adds 'range_24h': the maximum of ``rolled_24`` minus the existing
    'roll_24h_min' column.

    ``data`` is modified in place and returned.
    """
    ceiling = rolled_24.max()
    floor = data['roll_24h_min']

    # Width of the band the window moved in
    data['range_24h'] = ceiling - floor

    return data

## 11. Exponential Moving Averages (2 features)
###### EMAs weight recent values more heavily than simple moving averages

In [11]:
def add_ema_features(data):
    """
    Adds exponentially weighted means of 'Appliances' with spans 3 and 6
    as 'ema_3h' and 'ema_6h'.

    Each average is shifted by one row after it is computed, so a row only
    sees the average up to the previous observation. ``data`` is modified
    in place and returned.
    """
    usage = data['Appliances']

    for span in (3, 6):
        # adjust=False selects the recursive form of the average
        smoothed = usage.ewm(span=span, adjust=False).mean()
        data[f'ema_{span}h'] = smoothed.shift(1)

    return data

## 12. Usage Regime Feature (1 feature)
###### Categorical bins identify low/medium/high/very-high usage states

In [12]:
def add_usage_regime(data):
    """
    Bins the current 'Appliances' value into an integer usage regime.

    Regimes: 0 for [0, 100], 1 for (100, 200], 2 for (200, 300] and
    3 for anything above 300. The result is stored as the integer column
    'usage_regime' on ``data``, which is modified in place and returned.

    Raises:
        ValueError: if 'Appliances' contains NaN or negative values, since
            those fall outside every bin and cannot be cast to int.
    """
    # include_lowest=True closes the first bin on the left so a reading of
    # exactly 0 lands in regime 0 instead of becoming NaN (which would make
    # the integer cast below fail)
    data['usage_regime'] = pd.cut(data['Appliances'],
                                   bins=[0, 100, 200, 300, np.inf],
                                   labels=[0, 1, 2, 3],
                                   include_lowest=True).astype(int)

    return data

## 13. Context Flag Features (5 features)
###### Binary indicators for specific time periods and conditions

In [None]:
def add_context_flags(data):
    """
    Adds 0/1 indicator columns derived from the calendar columns.

    - 'is_night': hour is 22 or later, or 5 or earlier
    - 'is_morning': hour from 6 to 9 inclusive
    - 'is_evening': hour from 17 to 21 inclusive
    - 'is_weekend': day_of_week is 5 or 6
    - 'target_is_peak': target_hour from 18 to 20 inclusive

    ``data`` is modified in place and returned.
    """
    hour = data['hour']

    # Night wraps around midnight, so it is the union of two open-ended ranges
    data['is_night'] = (hour.ge(22) | hour.le(5)).astype(int)
    data['is_morning'] = hour.between(6, 9).astype(int)
    data['is_evening'] = hour.between(17, 21).astype(int)

    data['is_weekend'] = data['day_of_week'].ge(5).astype(int)

    # Flags whether the hour being predicted falls in the 18-20 band
    data['target_is_peak'] = data['target_hour'].between(18, 20).astype(int)

    return data

## 14. Spike Detection Features (3 features)
###### Identify local peaks, troughs, and anomalous intensity levels

In [14]:
def add_spike_detection(data):
    """
    Flags local extrema and measures spike intensity using only values
    that are available at the current row.

    A local extremum can only be confirmed once the following observation
    is known, so the flags describe the previous observation:

    - 'is_local_peak': the previous value was higher than both the value
      before it and the current value
    - 'is_local_trough': the previous value was lower than both
    - 'spike_intensity_24h': current value divided by ('roll_24h_mean' + 1)

    Comparing against ``Appliances.shift(-1)`` instead would read a future
    observation and leak it into the features.

    Requires 'Appliances' and 'roll_24h_mean'. ``data`` is modified in
    place and returned.
    """
    current = data['Appliances']
    previous = current.shift(1)
    before_previous = current.shift(2)

    # Comparisons involving the NaN rows at the start evaluate to False,
    # so those rows get 0 for both flags
    data['is_local_peak'] = ((previous > before_previous) &
                              (previous > current)).astype(int)
    data['is_local_trough'] = ((previous < before_previous) &
                                (previous < current)).astype(int)

    # Spike intensity (ratio to 24h baseline; +1 guards the denominator)
    data['spike_intensity_24h'] = current / (data['roll_24h_mean'] + 1)

    return data

## 15. Interaction Features (2 features)
###### Multiplicative features capture combined effects

In [15]:
def add_interactions(data):
    """
    Adds products of a 0/1 flag and the current 'Appliances' value:
    'evening_x_level' (from 'is_evening') and 'weekend_x_level'
    (from 'is_weekend').

    ``data`` is modified in place and returned.
    """
    level = data['Appliances']

    # The product equals the usage level when the flag is 1 and 0 otherwise
    for flag, column in (('is_evening', 'evening_x_level'),
                         ('is_weekend', 'weekend_x_level')):
        data[column] = data[flag] * level

    return data

## 16. Historical Pattern Feature (1 feature)
###### Average usage at the same hour across previous weeks


In [16]:
def add_historical_patterns(data):
    """
    Adds 'avg_this_hour': within each group of rows sharing the same
    'target_hour', the rolling mean of earlier 'Appliances' values.

    Each group is shifted by one row first, then averaged over up to 168 of
    its rows, with at least 24 observations required for a value.
    ``data`` is modified in place and returned.
    """
    def _mean_of_earlier_rows(group_usage):
        # Drop the row's own value before averaging
        earlier = group_usage.shift(1)
        return earlier.rolling(168, min_periods=24).mean()

    usage_by_target_hour = data.groupby('target_hour')['Appliances']
    data['avg_this_hour'] = usage_by_target_hour.transform(_mean_of_earlier_rows)

    return data

## 17. Trend Feature (1 feature)
###### Linear slope captures directional movement

In [17]:
def add_trend(data):
    """
    Adds 'trend_6h': the slope of a straight line fitted to the six
    'Appliances' values preceding each row.

    The series is shifted by one row before windowing, so the current value
    is not part of the fit. ``data`` is modified in place and returned.
    """
    def _fitted_slope(window):
        # Anything other than a full six-point window yields a flat slope
        if len(window) != 6:
            return 0
        positions = range(len(window))
        # Degree-1 fit; the first coefficient is the slope
        return np.polyfit(positions, window, 1)[0]

    history = data['Appliances'].shift(1)
    data['trend_6h'] = history.rolling(6).apply(_fitted_slope, raw=True)

    return data

## 18. Complete Pipeline Function

In [18]:
def engineer_all_features(df, horizon_hours=2):
    """
    Runs every feature-engineering step in order on a copy-derived frame.

    Args:
        df: DataFrame with 'date' and 'Appliances' columns
        horizon_hours: Prediction horizon passed to
            create_essential_features (default: 2)

    Returns:
        DataFrame holding the base columns, the target, and every column
        added by the individual feature steps
    """
    frame = create_essential_features(df, horizon_hours)

    # Steps that only need the frame
    for step in (add_time_features, add_lag_features, add_rolling_means):
        frame = step(frame)

    # The extremes step also hands back its rolling windows for reuse below
    frame, win_6, win_24, win_168 = add_rolling_extremes(frame)
    frame = add_rolling_percentiles(frame, win_24, win_168)
    frame = add_momentum_features(frame)
    frame = add_relative_position(frame, win_6, win_168)
    frame = add_volatility(frame, win_24)

    # Remaining steps again only need the frame; the order is kept because
    # later steps read columns written by earlier ones
    remaining_steps = (
        add_ema_features,
        add_usage_regime,
        add_context_flags,
        add_spike_detection,
        add_interactions,
        add_historical_patterns,
        add_trend,
    )
    for step in remaining_steps:
        frame = step(frame)

    return frame

## 19. Feature List Definition


In [19]:
def get_feature_names():
    """
    Returns the ordered list of model input column names.

    The names are grouped by the step that produces them and concatenated
    in that order; the group sizes add up to 55 names.
    """
    groups = (
        # Time features (8)
        ('hour_sin', 'hour_cos', 'target_hour_sin', 'dow_sin', 'month_sin',
         'day_of_week', 'target_dow', 'month'),
        # Lag features (7)
        ('lag_1h', 'lag_2h', 'lag_3h', 'lag_6h', 'lag_12h', 'lag_24h', 'lag_168h'),
        # Rolling means (5)
        ('roll_3h_mean', 'roll_6h_mean', 'roll_12h_mean', 'roll_24h_mean',
         'roll_168h_mean'),
        # Rolling extremes (6)
        ('roll_6h_max', 'roll_6h_min', 'roll_12h_max', 'roll_12h_min',
         'roll_24h_min', 'roll_168h_min'),
        # Percentiles (4)
        ('roll_168h_median', 'roll_168h_q25', 'roll_168h_q75', 'roll_24h_median'),
        # Momentum (3)
        ('momentum_1h', 'momentum_6h', 'momentum_24h'),
        # Relative position (6)
        ('dist_from_24h_min', 'dist_from_6h_min', 'rel_to_24h_mean',
         'rel_to_6h_mean', 'zscore_168h', 'zscore_6h'),
        # Volatility (1)
        ('range_24h',),
        # EMAs (2)
        ('ema_3h', 'ema_6h'),
        # Regime (1)
        ('usage_regime',),
        # Context flags (5)
        ('is_night', 'is_morning', 'is_evening', 'is_weekend', 'target_is_peak'),
        # Spike detection (3)
        ('is_local_peak', 'is_local_trough', 'spike_intensity_24h'),
        # Interactions (2)
        ('evening_x_level', 'weekend_x_level'),
        # Historical patterns (1)
        ('avg_this_hour',),
        # Trend (1)
        ('trend_6h',),
    )

    return [name for group in groups for name in group]

## 20. Usage

In [22]:
# Load the raw readings (NOTE: absolute, machine-specific path)
df = pd.read_csv('C:/Users/lekshmi/Desktop/ml projects/appliances energy prediction/KAG_energydata_complete.csv')

print(f"Raw data shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# Run the pipeline on the two columns it needs
print("\nApplying feature engineering...")
df_features = engineer_all_features(df[['date', 'Appliances']], horizon_hours=2)
print(f"Engineered data shape: {df_features.shape}")

features = get_feature_names()
print(f"\nTotal features created: {len(features)}")

# Model inputs: gaps in the feature columns (e.g. the warm-up rows of lags
# and rolling windows) are filled with 0; the target is left untouched
X = df_features[features].fillna(0)
y = df_features['target']

# Assemble the frame to save: date first, then features, target last
cols = ['date'] + features + ['target']
final_data = X.assign(target=y, date=df_features['date'])[cols]

# The shape is reported before rows with a missing value (only the target
# can still be missing here) are dropped
print(f"Final dataset shape: {final_data.shape}")
final_data = final_data.dropna()
print(f"Missing values: {final_data.isnull().sum().sum()}")

Raw data shape: (19735, 29)
Columns: ['date', 'Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint', 'rv1', 'rv2']

Applying feature engineering...
Engineered data shape: (19735, 61)

Total features created: 55
Final dataset shape: (19735, 57)
Missing values: 0


## 21. Save Engineered Features

In [None]:
import os

# Destination of the complete engineered dataset
output_path = 'data/processed/engineered_features.csv'

# Create the parent directory if it doesn't exist yet
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Write without the positional index so the file starts with the date column
final_data.to_csv(output_path, index=False)
print(f"✓ Saved processed features to: {output_path}")





✓ Saved processed features to: data/processed/engineered_features.csv


In [24]:
# One feature name per line, in training order
feature_names_path = 'data/features/feature_names.txt'

# Create the parent directory if it doesn't exist yet
os.makedirs(os.path.dirname(feature_names_path), exist_ok=True)

with open(feature_names_path, 'w') as names_file:
    names_file.write('\n'.join(features))
print(f"✓ Saved feature names to: {feature_names_path}")

✓ Saved feature names to: data/features/feature_names.txt
