# Chapter 4: Baseline Machine Learning Models

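In this notebook, we build and evaluate two baseline regressors — ordinary least squares (OLS) and Random Forest — on the energy dataset. Both models are compared using MAE, RMSE, and R² on a held-out test set, and diagnostic plots are then drawn for the Random Forest model.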

## 1. Load Libraries and Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Read the feature table from the CSV file stored under data/.
df = pd.read_csv(filepath_or_buffer='data/features.csv')

## 2. Define Features and Target

In [None]:
# Target: the 'Global_active_power' column.
y = df['Global_active_power']
# Features: every other column except 'datetime'.
X = df.drop(['Global_active_power', 'datetime'], axis=1)

## 3. Train-Test Split (80/20)

In [None]:
# Hold out the final 20% of rows as the test set. shuffle=False keeps the rows
# in their original order instead of sampling them at random.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, shuffle=False, test_size=0.2)

## 4. OLS Regression

In [None]:
# Fit ordinary least squares on the training rows (fit() returns the fitted
# estimator), then predict the held-out test rows.
ols = LinearRegression().fit(X_train, y_train)
y_pred_ols = ols.predict(X_test)

## 5. Random Forest Regression

In [None]:
# Random forest with 200 trees and a fixed random_state; fit() returns the
# fitted estimator, which is then used to predict the held-out test rows.
rf = RandomForestRegressor(random_state=42, n_estimators=200).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

## 6. Evaluation Metrics

In [None]:
def print_metrics(y_true, y_pred, model_name):
    """Print MAE, RMSE and R2 for one model's predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned element-wise with ``y_true``.
    model_name : str
        Label shown in the header line.

    Returns
    -------
    None
        The three scores are written to stdout, each formatted to three
        decimal places.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is computed as sqrt(MSE) rather than through
    # ``mean_squared_error(..., squared=False)``: the ``squared`` keyword was
    # deprecated in scikit-learn 1.4 and removed in 1.6, where passing it
    # raises a TypeError. For a single-output target the two forms are equal.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    print(f'--- {model_name} ---')
    print(f'MAE: {mae:.3f}')
    print(f'RMSE: {rmse:.3f}')
    print(f'R2: {r2:.3f}\n')

# Report both models against the same test targets, OLS first.
for label, predictions in (('OLS', y_pred_ols), ('Random Forest', y_pred_rf)):
    print_metrics(y_test, predictions, label)

## 7. Diagnostic Plots

In [None]:
# Random-forest predictions against the actual test values. The dashed red
# diagonal spans the range of y_test and marks where predicted == actual.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, y_pred_rf, alpha=0.3)
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'r--')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Random Forest: Actual vs Predicted')
plt.show()

In [None]:
# Random-forest residuals (actual minus predicted) on the test set.
residuals = y_test - y_pred_rf

# The values are drawn against their position in the test set (the axis is
# labelled 'Time'); the dashed red line marks zero error.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(residuals.values)
ax.axhline(y=0, color='r', linestyle='--')
ax.set(title='Residual Plot', xlabel='Time', ylabel='Residual')
plt.show()

In [None]:
# Pair each feature column with the fitted forest's importance score and
# order the table from most to least important.
importances = rf.feature_importances_
feat_df = (
    pd.DataFrame({'feature': X.columns, 'importance': importances})
    .sort_values(by='importance', ascending=False)
)

# Horizontal bar chart, one bar per feature.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x='importance', y='feature', data=feat_df, ax=ax)
ax.set_title('Feature Importance')
plt.show()