In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import joblib

In [8]:
df = pd.read_csv("simulated_energy_data.csv")


In [9]:
features = [
    "home_size_sqft","residents","ac_level","climate","time_usage_type",
    "house_type","fridge","washer","dryer","dishwasher","ev_charger","pool_pump"
]
target = "total_kwh"

X = df[features]
y = df[target]


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
categorical_features = ["ac_level","climate","time_usage_type","house_type"]
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(), categorical_features)
], remainder="passthrough")


In [12]:
rf_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=200, random_state=42))
])
rf_pipeline.fit(X_train, y_train)
y_pred = rf_pipeline.predict(X_test)

In [13]:
def evaluate(y_true, y_pred, label="Model"):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"--- {label} Metrics ---")
    print(f"R¬≤ Score : {r2:.3f}")
    print(f"RMSE     : {rmse:.2f}")
    print(f"MAE      : {mae:.2f}\n")
evaluate(y_test, y_pred, label="Random Forest")

--- Random Forest Metrics ---
R¬≤ Score : 0.910
RMSE     : 77.28
MAE      : 57.14



In [None]:
from sklearn.model_selection import cross_val_score, KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_rmse = -cross_val_score(rf_pipeline, X, y, cv=kf, scoring='neg_mean_squared_error')
cv_mae = -cross_val_score(rf_pipeline, X, y, cv=kf, scoring='neg_mean_absolute_error')
cv_r2 = cross_val_score(rf_pipeline, X, y, cv=kf, scoring='r2')

print("Cross-Validation Metrics (Random Forest):")
print(f"R¬≤ CV      : {cv_r2.mean():.3f} ¬± {cv_r2.std():.3f}")
print(f"RMSE CV    : {np.sqrt(cv_rmse.mean()):.2f} ¬± {np.sqrt(cv_rmse.std()):.2f}")
print(f"MAE CV     : {cv_mae.mean():.2f} ¬± {cv_mae.std():.2f}")


üîπ Cross-Validation Metrics (Random Forest):
R¬≤ CV      : 0.915 ¬± 0.009
RMSE CV    : 79.57 ¬± 22.85
MAE CV     : 58.55 ¬± 2.78


In [4]:
# ============================================
# üî• ENERGY BILL PREDICTION ‚Äì BEST PIPELINE
# ============================================

import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor

# ============================================
# 1Ô∏è‚É£ LOAD DATA
# ============================================

df = pd.read_csv("simulated_energy_data.csv")

# ============================================
# 2Ô∏è‚É£ FEATURE ENGINEERING (STRONG SIGNAL)
# ============================================

# Appliance count
appliance_cols = ["fridge", "washer", "dryer", "dishwasher", "ev_charger", "pool_pump"]
df["appliance_count"] = df[appliance_cols].sum(axis=1)

# Residents per sqft (density effect)
df["residents_per_sqft"] = df["residents"] / df["home_size_sqft"]

# HVAC intensity proxy
df["hvac_intensity"] = df["home_size_sqft"] * df["residents"]

# High consumption household
df["has_ev_or_pool"] = ((df["ev_charger"] == 1) | (df["pool_pump"] == 1)).astype(int)

# ============================================
# 3Ô∏è‚É£ DEFINE FEATURES
# ============================================

target = "bill_usd"

categorical_features = ["ac_level", "climate", "time_usage_type"]
numeric_features = [
    "home_size_sqft",
    "residents",
    "total_kwh",
    "carbon_kg",
    "appliance_count",
    "residents_per_sqft",
    "hvac_intensity",
    "has_ev_or_pool"
]

X = df[categorical_features + numeric_features]
y = df[target]

# ============================================
# 4Ô∏è‚É£ PREPROCESSING
# ============================================

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", StandardScaler(), numeric_features)
    ]
)

# ============================================
# 5Ô∏è‚É£ MODEL (Gradient Boosting)
# ============================================

model = GradientBoostingRegressor(random_state=42)

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", model)
])

# ============================================
# 6Ô∏è‚É£ TRAIN / TEST SPLIT
# ============================================

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ============================================
# 7Ô∏è‚É£ HYPERPARAMETER TUNING
# ============================================

param_dist = {
    "regressor__n_estimators": [200, 300, 400, 500],
    "regressor__learning_rate": [0.01, 0.03, 0.05, 0.1],
    "regressor__max_depth": [3, 4, 5],
    "regressor__min_samples_split": [2, 5, 10],
    "regressor__min_samples_leaf": [1, 2, 4],
    "regressor__subsample": [0.8, 0.9, 1.0]
}

search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=25,
    cv=5,
    scoring="r2",
    verbose=1,
    n_jobs=-1,
    random_state=42
)

search.fit(X_train, y_train)

best_model = search.best_estimator_

print("\n‚úÖ Best Parameters:", search.best_params_)

# ============================================
# 8Ô∏è‚É£ TEST PERFORMANCE
# ============================================

y_pred = best_model.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print("\nüìä TEST METRICS")
print("R¬≤   :", round(r2, 4))
print("RMSE :", round(rmse, 2))
print("MAE  :", round(mae, 2))

# ============================================
# 9Ô∏è‚É£ CROSS VALIDATION (OVERFITTING CHECK)
# ============================================

cv_scores = cross_val_score(best_model, X, y, cv=5, scoring="r2")

print("\nüîπ Cross-Validation")
print("R¬≤ CV Mean :", round(cv_scores.mean(), 4))
print("R¬≤ CV Std  :", round(cv_scores.std(), 4))

# ============================================
# üîü FEATURE IMPORTANCE
# ============================================

feature_names = best_model.named_steps["preprocessor"].get_feature_names_out()
importances = best_model.named_steps["regressor"].feature_importances_

importance_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

print("\nüî• Top 15 Features")
print(importance_df.head(15))

# ============================================
# 1Ô∏è‚É£1Ô∏è‚É£ SAVE MODEL
# ============================================

joblib.dump(best_model, "best_energy_model.pkl")
print("\nüíæ Model saved as best_energy_model.pkl")

Fitting 5 folds for each of 25 candidates, totalling 125 fits

‚úÖ Best Parameters: {'regressor__subsample': 1.0, 'regressor__n_estimators': 300, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 4, 'regressor__max_depth': 3, 'regressor__learning_rate': 0.03}

üìä TEST METRICS
R¬≤   : 0.9567
RMSE : 14.69
MAE  : 12.05

üîπ Cross-Validation
R¬≤ CV Mean : 0.9633
R¬≤ CV Std  : 0.0024

üî• Top 15 Features
                              Feature  Importance
14                     num__carbon_kg    0.462383
13                     num__total_kwh    0.396096
10      cat__time_usage_type_off_peak    0.107611
8          cat__time_usage_type_mixed    0.012115
7   cat__time_usage_type_evening_peak    0.009355
9   cat__time_usage_type_morning_peak    0.006799
16            num__residents_per_sqft    0.002308
17                num__hvac_intensity    0.001416
11                num__home_size_sqft    0.001183
15               num__appliance_count    0.000298
2                cat__ac_l