In [39]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [41]:
# Load the raw power-consumption table.
# low_memory=False makes pandas read the file in a single pass instead of
# chunk-by-chunk, so each column gets one consistently inferred dtype.
df = pd.read_csv(
    'power_consumption.csv',
    low_memory=False,
)
print("Data Loaded. Shape:", df.shape)


Data Loaded. Shape: (260640, 10)


In [43]:
# Parse the day/month/year strings; anything that does not match the format
# becomes NaT (errors='coerce') and the corresponding rows are removed.
# NOTE(review): those rows are discarded silently -- consider reporting how
# many were dropped so an unexpected date format does not go unnoticed.
parsed_dates = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')
df = df.assign(Date=parsed_dates).dropna(subset=['Date'])

# Expand the timestamp into numeric calendar features (Weekday: Monday == 0),
# then remove the original 'Date' and 'Time' columns from the frame.
date_parts = df['Date'].dt
df = df.assign(
    Day=date_parts.day,
    Month=date_parts.month,
    Year=date_parts.year,
    Weekday=date_parts.weekday,
).drop(columns=['Date', 'Time'])


In [45]:
# '?' is treated as a missing-value marker: convert it to NaN, discard every
# row that has any missing field, then cast all remaining columns to float.
df = (
    df
    .replace('?', np.nan)
    .dropna()
    .astype(float)
)
print("Data cleaned. Shape after cleaning:", df.shape)


Data cleaned. Shape after cleaning: (153229, 12)


In [47]:
# Column to predict.
target = 'Global_active_power'
y = df[target]

# Every other column of the cleaned frame is used as a predictor.
# NOTE(review): if any of these columns is derived from, or almost
# deterministically tied to, the target, the evaluation scores further down
# will be inflated by leakage -- confirm against the dataset description.
features = [column for column in df.columns if column != target]
X = df[features]


In [49]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the shuffled
# split reproducible between runs.
# NOTE(review): this is a random (shuffled) split -- if rows are consecutive
# time-ordered readings, a chronological split may be a fairer test; confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [51]:
# Random forest with 100 trees; n_jobs=-1 lets scikit-learn use all available
# cores, and the fixed seed makes the fitted forest reproducible.
model = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    random_state=42,
)
model.fit(X_train, y_train)

# Predictions for the held-out rows, used by the evaluation cells below.
y_pred = model.predict(X_test)

In [53]:
# Hold-out regression metrics. RMSE is derived from MSE so that it is reported
# in the same units as the target.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("\nModel Evaluation:")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")



Model Evaluation:
Mean Absolute Error (MAE): 0.0122
Root Mean Squared Error (RMSE): 0.0308
R² Score: 0.9993


In [55]:
# Side-by-side table of true vs. predicted values for the held-out rows.
comparison_df = pd.DataFrame({
    'Actual': y_test.values,
    'Predicted': y_pred
})

# Absolute percentage error relative to the magnitude of the actual value.
abs_error = (comparison_df['Actual'] - comparison_df['Predicted']).abs()
actual_magnitude = comparison_df['Actual'].abs()
# A percentage error is undefined when the actual value is 0. Mask those
# denominators to NaN so the row yields NaN (skipped by mean/max/min) instead
# of inf, which would otherwise dominate the summary statistics. Using the
# magnitude also keeps the error non-negative if an actual value is negative.
safe_denominator = actual_magnitude.where(actual_magnitude > 0)
comparison_df['Error (%)'] = 100 * abs_error / safe_denominator

print("\nSample Predictions (first 10):")
print(comparison_df.head(10).to_string(index=False))



Sample Predictions (first 10):
 Actual  Predicted  Error (%)
  1.408    1.40570   0.163352
  0.272    0.27312   0.411765
  0.222    0.22612   1.855856
  1.414    1.41556   0.110325
  0.574    0.55818   2.756098
  1.374    1.35670   1.259098
  0.230    0.23010   0.043478
  2.082    2.06980   0.585975
  1.352    1.35726   0.389053
  0.854    0.75284  11.845433


In [57]:
# Summary statistics of the per-row percentage error.
error_pct = comparison_df['Error (%)']
min_error = error_pct.min()
max_error = error_pct.max()
avg_error = error_pct.mean()

print("\nPrediction Error Summary:")
print(f"Average Error: {avg_error:.2f}%")
print(f"Max Error: {max_error:.2f}%")
print(f"Min Error: {min_error:.2f}%")



Prediction Error Summary:
Average Error: 1.35%
Max Error: 146.65%
Min Error: 0.00%


In [59]:
# Rank the held-out predictions by percentage error and keep the three best
# and the three worst rows.
error_column = 'Error (%)'
most_accurate = comparison_df.sort_values(by=error_column).iloc[:3]
least_accurate = comparison_df.sort_values(by=error_column, ascending=False).iloc[:3]

print("\nMost Accurate Predictions:")
print(most_accurate.to_string(index=False))

print("\nLeast Accurate Predictions:")
print(least_accurate.to_string(index=False))



Most Accurate Predictions:
 Actual  Predicted  Error (%)
  0.218      0.218        0.0
  0.218      0.218        0.0
  0.218      0.218        0.0

Least Accurate Predictions:
 Actual  Predicted  Error (%)
  0.234    0.57716 146.649573
  0.388    0.76270  96.572165
  0.276    0.50762  83.920290


In [61]:
# Mean of the target column over the cleaned frame; the message below reports
# it as kilowatts and as watts.
average_consumption_kw = df['Global_active_power'].mean()
average_consumption_w = average_consumption_kw * 1000  # kW -> W

# Backward-compatible alias: the original variable carried a "_wh" (watt-hour)
# suffix although the value is a power in watts, not an energy.
average_consumption_wh = average_consumption_w

print(f"\nOn average, each household consumes {average_consumption_kw:.4f} kW or {average_consumption_w:.2f} Watts of power.")



On average, each household consumes 1.1381 kW or 1138.12 Watts of power.
