# 1. Loading and Preprocessing

In [34]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Fetch the California housing bunch and assemble features + target into one frame
california_housing = fetch_california_housing()
data = pd.DataFrame(
    california_housing.data,
    columns=california_housing.feature_names,
).assign(MedHouseVal=california_housing.target)

In [36]:
# Render the assembled frame (feature columns plus the 'MedHouseVal' target column)
display(data)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [5]:
# Preprocessing 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Report the per-column count of null entries
print("Missing values:\n", data.isna().sum())

# Separate the target column from the predictor columns
y = data['MedHouseVal']
X = data.drop(columns='MedHouseVal')

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize features: statistics are learned from the training split only,
# then the same transformation is applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Missing values:
 MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64


1. No missing-value handling is needed: the check above reports zero missing values in every column, so no imputation or row removal is required.

2. Feature scaling: Standardization (StandardScaler) was applied because:

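- Scale-sensitive algorithms (like SVR and other distance- or gradient-based models) perform better when features are on similar scales; tree-based models (Decision Tree, Random Forest, Gradient Boosting) are largely unaffected by scaling and receive the scaled features here only so that every model is trained on the same inputs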

- Features in this dataset have different units and ranges (e.g., 'AveRooms' vs 'Latitude')

- Helps algorithms converge faster and prevents features with larger scales from dominating

# 2. Regression Algorithm Implementation

In [8]:
# Common Evaluation Function
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this call.
    X_train, y_train : training features and targets passed to ``fit``.
    X_test, y_test : held-out features and targets used for scoring.

    Returns
    -------
    dict
        ``{'MSE': ..., 'MAE': ..., 'R2': ...}`` computed from ``y_test`` and
        the model's predictions on ``X_test``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # (label, metric function) pairs; order fixes the key order of the result
    scorers = (
        ('MSE', mean_squared_error),
        ('MAE', mean_absolute_error),
        ('R2', r2_score),
    )
    return {label: scorer(y_test, predictions) for label, scorer in scorers}

In [55]:
from sklearn.linear_model import LinearRegression

# Linear regression evaluated on the unscaled train/test split
model = LinearRegression()
result = evaluate_model(
    model=model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

print("Evaluation Results:", result, sep="\n")


Evaluation Results:
{'MSE': 0.555891598695244, 'MAE': 0.5332001304956564, 'R2': 0.5757877060324511}


In [42]:
# Linear Regression
from sklearn.linear_model import LinearRegression

# Fit and score a linear model on the standardized features
lr = LinearRegression()
lr_results = evaluate_model(
    model=lr,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)
print(f"Linear Regression R²: {lr_results['R2']:.4f}")

Linear Regression R²: 0.5758


In [44]:
#  Decision Tree Regressor
from sklearn.tree import DecisionTreeRegressor

# Fit and score a single decision tree (seeded) on the standardized features
dt = DecisionTreeRegressor(random_state=42)
dt_results = evaluate_model(
    model=dt,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)
print(f"Decision Tree R²: {dt_results['R2']:.4f}")

Decision Tree R²: 0.6230


In [49]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# Fit and score a random forest (seeded) on the standardized features
rf = RandomForestRegressor(random_state=42)
rf_results = evaluate_model(
    model=rf,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)
print(f"Random Forest R²: {rf_results['R2']:.4f}")

Random Forest R²: 0.8053


In [50]:
# Gradient Boosting Regressor
from sklearn.ensemble import GradientBoostingRegressor

# Fit and score a gradient boosting model (seeded) on the standardized features
gb = GradientBoostingRegressor(random_state=42)
gb_results = evaluate_model(
    model=gb,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)
print(f"Gradient Boosting R²: {gb_results['R2']:.4f}")

Gradient Boosting R²: 0.7756


In [51]:
# Support Vector Regressor (SVR)
from sklearn.svm import SVR

# Fit and score a support vector regressor with default settings on the standardized features
svr = SVR()
svr_results = evaluate_model(
    model=svr,
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train,
    y_test=y_test,
)
print(f"SVR R²: {svr_results['R2']:.4f}")

SVR R²: 0.7276


# 3. Model Evaluation and Comparison

In [21]:
# Collect each model's metric dict under its display name (insertion order is kept)
results = dict([
    ('Linear Regression', lr_results),
    ('Decision Tree', dt_results),
    ('Random Forest', rf_results),
    ('Gradient Boosting', gb_results),
    ('SVR', svr_results),
])

# One row per model, one column per metric
results_df = pd.DataFrame(results).transpose()
print(results_df)

                        MSE       MAE        R2
Linear Regression  0.555892  0.533200  0.575788
Decision Tree      0.493969  0.453904  0.623042
Random Forest      0.255170  0.327425  0.805275
Gradient Boosting  0.293999  0.371650  0.775643
SVR                0.357004  0.398599  0.727563


## Performance Analysis

### Best-performing algorithm:

Random Forest Regressor (R² ≈ 0.805, MSE ≈ 0.255, MAE ≈ 0.327), followed by Gradient Boosting Regressor (R² ≈ 0.776, MSE ≈ 0.294, MAE ≈ 0.372)

These ensemble methods perform well because:

- They can capture complex, non-linear relationships between features and price

- Are robust to outliers and feature scales

- Reduce overfitting compared to single decision trees

- Generally show high R² scores and low MSE/MAE

### Worst-performing algorithm:

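Linear Regression (R² ≈ 0.576, with the highest MSE and MAE of all five models). Note that SVR is not among the worst performers in this run: with default settings it reaches R² ≈ 0.728 and ranks third, ahead of the single Decision Tree (R² ≈ 0.623).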

Reasons:

- Linear Regression may be too simplistic for the complex relationships in housing data

- SVR can perform poorly if not properly tuned (kernel selection, C, epsilon parameters)

- Both may struggle with interactions between features that tree-based methods capture naturally

### Key Observations:
1. Ensemble methods (Random Forest, Gradient Boosting) generally outperform simpler models

2. Decision Trees may overfit without proper pruning

3. Performance can often be improved with hyperparameter tuning for each algorithm

4. The choice between best models may depend on computational resources and need for interpretability