In [22]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
import numpy as np

# Load dataset
data = fetch_california_housing()
X = data.data
y = data.target

# Convert to pandas DataFrame
df = pd.DataFrame(X, columns=data.feature_names)
df['target'] = y

display(df.head())

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [28]:
# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# List of models to test
models = [
    ("Linear Regression", LinearRegression()),
    ("Ridge Regression", Ridge()),
    ("Decision Tree", DecisionTreeRegressor(random_state=42)),
    #("Random Forest", RandomForestRegressor(random_state=42)),
    ("Gradient Boosting", GradientBoostingRegressor(random_state=42))
]


In [37]:
# Train and test each model
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{name}:")
    print(f"  Mean Squared Error (MSE) = {mse:.4f}")
    print(f"  Root Mean Squared Error (RMSE) = {rmse:.4f}")
    print(f"  Mean Absolute Error (MAE) = {mae:.4f}")
    print(f"  R^2 Score = {r2:.4f}")
    print(f"  Sample actuals    : {y_test[:5]}")
    print(f"  Sample predictions: {y_pred[:5]}\n")
    print("----------------------------------------")

for i in range(5):
      actual = y_test[i]
      predicted = y_pred[i]
      error = predicted - actual
      print(f"  Sample {i+1}: Predicted = {predicted:.4f}, Actual = {actual:.4f}, Error = {error:.4f}")

Linear Regression:
  Mean Squared Error (MSE) = 0.5559
  Root Mean Squared Error (RMSE) = 0.7456
  Mean Absolute Error (MAE) = 0.5332
  R^2 Score = 0.5758
  Sample actuals    : [0.477   0.458   5.00001 2.186   2.78   ]
  Sample predictions: [0.71912284 1.76401657 2.70965883 2.83892593 2.60465725]

----------------------------------------
Ridge Regression:
  Mean Squared Error (MSE) = 0.5558
  Root Mean Squared Error (RMSE) = 0.7455
  Mean Absolute Error (MAE) = 0.5332
  R^2 Score = 0.5759
  Sample actuals    : [0.477   0.458   5.00001 2.186   2.78   ]
  Sample predictions: [0.71923978 1.76395141 2.70909238 2.83897613 2.60476825]

----------------------------------------
Decision Tree:
  Mean Squared Error (MSE) = 0.4952
  Root Mean Squared Error (RMSE) = 0.7037
  Mean Absolute Error (MAE) = 0.4547
  R^2 Score = 0.6221
  Sample actuals    : [0.477   0.458   5.00001 2.186   2.78   ]
  Sample predictions: [0.414   1.203   5.00001 2.17    2.257  ]

----------------------------------------


**Insights:**

Gradient Boosting significantly outperforms other models with the lowest error metrics (MSE, RMSE, MAE) and the highest R² score. It captures complex relationships in the housing data better than basic linear models or a single decision tree.

---

**Summary of Metrics:**

MSE: Sensitive to large errors/outliers.

RMSE: Same units as target, interpretable.

MAE: Robust to outliers, average error per prediction.

R²: Shows explained variance (closer to 1 is better).

Sample Predictions: Shows first few predicted vs. actual prices.