In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np
import joblib

In [8]:
df = pd.read_csv("simulated_energy_data.csv")


In [9]:
features = [
    "home_size_sqft","residents","ac_level","climate","time_usage_type",
    "house_type","fridge","washer","dryer","dishwasher","ev_charger","pool_pump"
]
target = "total_kwh"

X = df[features]
y = df[target]


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
categorical_features = ["ac_level","climate","time_usage_type","house_type"]
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(), categorical_features)
], remainder="passthrough")


In [12]:
rf_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=200, random_state=42))
])
rf_pipeline.fit(X_train, y_train)
y_pred = rf_pipeline.predict(X_test)

In [13]:
def evaluate(y_true, y_pred, label="Model"):
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"--- {label} Metrics ---")
    print(f"RÂ² Score : {r2:.3f}")
    print(f"RMSE     : {rmse:.2f}")
    print(f"MAE      : {mae:.2f}\n")
evaluate(y_test, y_pred, label="Random Forest")

--- Random Forest Metrics ---
RÂ² Score : 0.910
RMSE     : 77.28
MAE      : 57.14



In [None]:
from sklearn.model_selection import cross_val_score, KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)

cv_rmse = -cross_val_score(rf_pipeline, X, y, cv=kf, scoring='neg_mean_squared_error')
cv_mae = -cross_val_score(rf_pipeline, X, y, cv=kf, scoring='neg_mean_absolute_error')
cv_r2 = cross_val_score(rf_pipeline, X, y, cv=kf, scoring='r2')

print("Cross-Validation Metrics (Random Forest):")
print(f"RÂ² CV      : {cv_r2.mean():.3f} Â± {cv_r2.std():.3f}")
print(f"RMSE CV    : {np.sqrt(cv_rmse.mean()):.2f} Â± {np.sqrt(cv_rmse.std()):.2f}")
print(f"MAE CV     : {cv_mae.mean():.2f} Â± {cv_mae.std():.2f}")


ðŸ”¹ Cross-Validation Metrics (Random Forest):
RÂ² CV      : 0.915 Â± 0.009
RMSE CV    : 79.57 Â± 22.85
MAE CV     : 58.55 Â± 2.78
