### Importing necessary libraries
This cell imports essential Python libraries for data manipulation, model building, and evaluation.

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


### Loading the dataset
This cell reads the power consumption data from a CSV file and prints its shape.

In [5]:
df = pd.read_csv('power_consumption.csv', low_memory=False)
print("Data Loaded. Shape:", df.shape)


Data Loaded. Shape: (260640, 10)


### Displaying the first few rows
This cell shows a preview of the dataset using the `head()` function.

In [7]:
df.head()

Unnamed: 0,index,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
0,0,1/1/07,0:00:00,2.58,0.136,241.97,10.6,0,0,0.0
1,1,1/1/07,0:01:00,2.552,0.1,241.75,10.4,0,0,0.0
2,2,1/1/07,0:02:00,2.55,0.1,241.64,10.4,0,0,0.0
3,3,1/1/07,0:03:00,2.55,0.1,241.71,10.4,0,0,0.0
4,4,1/1/07,0:04:00,2.554,0.1,241.98,10.4,0,0,0.0


### Show Last Few Rows
Displaying the last five rows of the dataset.

In [9]:
df.tail()

Unnamed: 0,index,Date,Time,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
260635,260635,30/6/2007,23:55:00,2.88,0.36,239.01,12.0,0,0,18.0
260636,260636,30/6/2007,23:56:00,2.892,0.358,238.86,12.2,0,0,17.0
260637,260637,30/6/2007,23:57:00,2.882,0.28,239.05,12.0,0,0,18.0
260638,260638,30/6/2007,23:58:00,2.66,0.29,238.98,11.2,0,0,18.0
260639,260639,30/6/2007,23:59:00,2.548,0.354,239.25,10.6,0,1,17.0


### Dataset Information
Checking the structure, data types, and non-null values.

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260640 entries, 0 to 260639
Data columns (total 10 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   index                  260640 non-null  int64  
 1   Date                   260640 non-null  object 
 2   Time                   260640 non-null  object 
 3   Global_active_power    260640 non-null  object 
 4   Global_reactive_power  260640 non-null  object 
 5   Voltage                260640 non-null  object 
 6   Global_intensity       260640 non-null  object 
 7   Sub_metering_1         260640 non-null  object 
 8   Sub_metering_2         260640 non-null  object 
 9   Sub_metering_3         256869 non-null  float64
dtypes: float64(1), int64(1), object(8)
memory usage: 19.9+ MB


### Descriptive Statistics
Getting statistical summary of the dataset's numerical features.

In [13]:
df.describe()

Unnamed: 0,index,Sub_metering_3
count,260640.0,256869.0
mean,130319.5,5.831825
std,75240.431418,8.186709
min,0.0,0.0
25%,65159.75,0.0
50%,130319.5,0.0
75%,195479.25,17.0
max,260639.0,20.0


### Date Parsing and Feature Engineering
Converting 'Date' to datetime and extracting day, month, year, and weekday.

In [15]:
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')
df = df.dropna(subset=['Date'])  

df['Day'] = df['Date'].dt.day
df['Month'] = df['Date'].dt.month
df['Year'] = df['Date'].dt.year
df['Weekday'] = df['Date'].dt.weekday

df = df.drop(columns=['Date', 'Time'])


### Data Cleaning
Replacing placeholders, removing nulls, and converting data types.

In [17]:
df = df.replace('?', np.nan)
df = df.dropna()
df = df.astype(float)
print("Data cleaned. Shape after cleaning:", df.shape)


Data cleaned. Shape after cleaning: (153229, 12)


### Feature and Target Selection
Separating input features from the target variable for prediction.

In [19]:
target = 'Global_active_power'
features = df.drop(columns=[target]).columns.tolist()

X = df[features]
y = df[target]


### Split Data
Splitting the dataset into training and test sets for evaluation.

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


### Train Random Forest Model
Initializing and fitting a Random Forest Regressor model.

In [23]:
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

### Evaluate the Model
Calculating evaluation metrics: MAE, RMSE, and R² score.

In [25]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nModel Evaluation:")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")



Model Evaluation:
Mean Absolute Error (MAE): 0.0122
Root Mean Squared Error (RMSE): 0.0308
R² Score: 0.9993


### Compare Actual vs Predicted
Displaying predictions and calculating percentage error for the first 10 records.

In [27]:
comparison_df = pd.DataFrame({
    'Actual': y_test.values,
    'Predicted': y_pred
})
comparison_df['Error (%)'] = 100 * abs(comparison_df['Actual'] - comparison_df['Predicted']) / comparison_df['Actual']

print("\nSample Predictions (first 10):")
print(comparison_df.head(10).to_string(index=False))



Sample Predictions (first 10):
 Actual  Predicted  Error (%)
  1.408    1.40570   0.163352
  0.272    0.27312   0.411765
  0.222    0.22612   1.855856
  1.414    1.41556   0.110325
  0.574    0.55818   2.756098
  1.374    1.35670   1.259098
  0.230    0.23010   0.043478
  2.082    2.06942   0.604227
  1.352    1.35726   0.389053
  0.854    0.75284  11.845433


### Error Summary
Calculating average, maximum, and minimum error percentages.

In [29]:
avg_error = comparison_df['Error (%)'].mean()
max_error = comparison_df['Error (%)'].max()
min_error = comparison_df['Error (%)'].min()

print("\nPrediction Error Summary:")
print(f"Average Error: {avg_error:.2f}%")
print(f"Max Error: {max_error:.2f}%")
print(f"Min Error: {min_error:.2f}%")



Prediction Error Summary:
Average Error: 1.35%
Max Error: 146.65%
Min Error: 0.00%


### Accuracy Extremes
Identifying the top 3 most and least accurate predictions.

In [31]:
most_accurate = comparison_df.sort_values(by='Error (%)').head(3)
least_accurate = comparison_df.sort_values(by='Error (%)', ascending=False).head(3)

print("\nMost Accurate Predictions:")
print(most_accurate.to_string(index=False))

print("\nLeast Accurate Predictions:")
print(least_accurate.to_string(index=False))



Most Accurate Predictions:
 Actual  Predicted  Error (%)
  0.218      0.218        0.0
  0.218      0.218        0.0
  0.218      0.218        0.0

Least Accurate Predictions:
 Actual  Predicted  Error (%)
  0.234    0.57716 146.649573
  0.388    0.76270  96.572165
  0.276    0.50762  83.920290


### Average Power Consumption
Calculating the average power consumption in kW and converting it to Watts.

In [33]:
average_consumption_kw = df['Global_active_power'].mean()
average_consumption_wh = average_consumption_kw * 1000  # Convert kW to Watts

print(f"\nOn average, each household consumes {average_consumption_kw:.4f} kW or {average_consumption_wh:.2f} Watts of power.")



On average, each household consumes 1.1381 kW or 1138.12 Watts of power.
