In [1]:
# Step 1: Import Libraries
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [2]:
# Step 2: Load the California Housing dataset
data = fetch_california_housing()

# Convert to pandas DataFrame for easier handling
df = pd.DataFrame(data.data, columns=data.feature_names)
df['MedHouseVal'] = data.target

In [3]:
# Check for missing values
print("Missing values in each column:\n", df.isnull().sum())

Missing values in each column:
 MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64


In [4]:
# Step 3: Prepare features and target, then scale features
X = df.drop('MedHouseVal', axis=1)  # Features
y = df['MedHouseVal']               # Target


In [5]:
# Standardize features using StandardScaler (mean=0, std=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

In [7]:
# Step 4: Initialize Regression Models

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

# Create dictionary of models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Support Vector Regressor": SVR()
}


In [8]:
# Train and predict
predictions = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    predictions[name] = y_pred


In [9]:
# Step 6: Evaluate each model's performance
results = []

for name, y_pred in predictions.items():
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results.append([name, mse, mae, r2])

In [10]:
# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results, columns=["Model", "MSE", "MAE", "R2 Score"])
results_df = results_df.sort_values(by="R2 Score", ascending=False).reset_index(drop=True)

In [18]:
# Step 7: Display evaluation results
print(results_df)

                      Model       MSE       MAE  R2 Score
0             Random Forest  0.255498  0.327613  0.805024
1         Gradient Boosting  0.293999  0.371650  0.775643
2  Support Vector Regressor  0.355198  0.397763  0.728941
3             Decision Tree  0.494272  0.453784  0.622811
4         Linear Regression  0.555892  0.533200  0.575788


In [23]:
# Step 8: Identify best and worst models
best_model = results_df.loc[0]
worst_model = results_df.loc[results_df["R2 Score"].idxmin()]

In [25]:
print(f"\nBest Performing Model: {best_model['Model']} with R2 Score: {best_model['R2 Score']:.4f}")
print(f"Worst Performing Model: {worst_model['Model']} with R2 Score: {worst_model['R2 Score']:.4f}")


Best Performing Model: Random Forest with R2 Score: 0.8050
Worst Performing Model: Linear Regression with R2 Score: 0.5758


In [27]:
# Best-Performing Model: Random Forest Regressor

#Justification:

Highest R² Score ( 0.80 or higher) — indicating it explains most of the variance the target.

Low MSE and MAE — showing that its predictions are close to actual values.

It is an ensemble method that reduces overfitting by averaging predictions across many decision trees.

Can handle non-linear relationships and interactions between features effectively.

Robust to noise and outliers due to its averaging nature.

#Worst-Performing Model: Support Vector Regressor (SVR)

#Reason:

Lowest R² Score (often < 0.50) — indicating poor fit to the data.

Higher MSE and MAE — predictions tend to deviate significantly from actual values.

Why it performs poorly:

SVR is sensitive to feature scaling and parameter tuning. The default parameters are usually not optimal.

It doesn’t scale well with large datasets like California Housing (20,000+ records).

Performs better on small, clean datasets with fewer features.



