In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [39]:
from sklearn.datasets import fetch_california_housing
# Load the California housing dataset through scikit-learn. The returned object's
# .data, .feature_names and .target attributes are consumed by the cells that follow.
housing = fetch_california_housing()

In [40]:
# Wrap the raw feature matrix in a DataFrame, labelling columns with the dataset's feature names.
df = pd.DataFrame(data=housing.data, columns=housing.feature_names)

In [41]:
# Attach the regression target as an extra column so features and target share one frame.
# Note: this mutates `df` in place; re-running the cell simply overwrites the same column.
df['MedHouseVal'] = housing.target

In [42]:
# Preview the first rows to sanity-check the column names and value ranges.
print(df.head())

   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  


In [43]:
# Count missing values per column; all zeros means no imputation is needed.
df.isna().sum()

MedInc         0
HouseAge       0
AveRooms       0
AveBedrms      0
Population     0
AveOccup       0
Latitude       0
Longitude      0
MedHouseVal    0
dtype: int64

In [44]:
from sklearn.preprocessing import StandardScaler

# Separate the target column from the remaining predictor columns.
Y = df['MedHouseVal']
X = df.drop(columns='MedHouseVal')

# Standardise every predictor column with a single scaler fitted on X.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Re-attach the original column labels to the scaled values before previewing them.
X_scaled_df = pd.DataFrame(data=X_scaled, columns=X.columns)
print(X_scaled_df.head())


     MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  2.344766  0.982143  0.628559  -0.153758   -0.974429 -0.049597  1.052548   
1  2.332238 -0.607019  0.327041  -0.263336    0.861439 -0.092512  1.043185   
2  1.782699  1.856182  1.155620  -0.049016   -0.820777 -0.025843  1.038503   
3  0.932968  1.856182  0.156966  -0.049833   -0.766028 -0.050329  1.038503   
4 -0.012881  1.856182  0.344711  -0.032906   -0.759847 -0.085616  1.038503   

   Longitude  
0  -1.327835  
1  -1.322844  
2  -1.332827  
3  -1.337818  
4  -1.337818  


##### Steps performed
- Check for missing data: to ensure data integrity before modeling.
- Standardize features: to normalize feature scales for fair and stable modeling.

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score

In [46]:
# Target series (the 'MedHouseVal' column) used by the split and modelling cells below.
y = df['MedHouseVal']

In [47]:
# Rebuild the scaler and the scaled feature matrix from X (same computation as the
# earlier preprocessing cell, repeated here so the modelling section is self-contained).
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [48]:
# Split the *unscaled* features first, then fit the scaler on the training rows only.
# Splitting an array that was standardised with statistics from all rows lets the
# test set's mean/variance influence preprocessing (data leakage) and makes the
# held-out scores optimistic. Row membership of each split is unchanged because
# the sample count, test_size and random_state are the same as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# `scaler` is rebound to the train-only fit so that it matches the feature space
# the models below are trained in.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [49]:
# Baseline model: a linear regression fitted on the training split.
lr_model = LinearRegression()
# Kept as the cell's last expression so the fitted estimator's repr is displayed.
lr_model.fit(X_train, y_train)

LinearRegression()

In [50]:
# Predictions of the baseline model on the held-out test features.
y_pred = lr_model.predict(X_test)

In [51]:
# Score the baseline predictions against the held-out targets.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [52]:
# Report the baseline scores at full precision.
print(f"Mean Squared Error (MSE): {mse!s}")
print(f"R² Score: {r2!s}")

Mean Squared Error (MSE): 0.5558915986952442
R² Score: 0.575787706032451


In [53]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

In [54]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Train ``model`` and print its test-set MSE and R² score.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features passed to ``fit``.
        X_test: Held-out features passed to ``predict``.
        y_train: Training targets passed to ``fit``.
        y_test: Held-out targets the predictions are scored against.

    Returns:
        None. A short report headed by the model's class name is written to
        stdout, followed by a blank line.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # (label, value) pairs in the order they are reported.
    scores = (
        ("Mean Squared Error (MSE)", mean_squared_error(y_test, predictions)),
        ("R² Score", r2_score(y_test, predictions)),
    )

    print(f"{type(model).__name__} Results:")
    for label, value in scores:
        print(f"  - {label}: {value:.4f}")
    print()

In [55]:
# Instantiate the non-linear candidates; the tree-based ones get a fixed seed.
dt_model = DecisionTreeRegressor(random_state=42)
rf_model = RandomForestRegressor(random_state=42)
gb_model = GradientBoostingRegressor(random_state=42)
svr_model = SVR()  # default uses RBF kernel

# Fit and report each candidate on the same train/test split, in this order.
for candidate in (dt_model, rf_model, gb_model, svr_model):
    evaluate_model(candidate, X_train, X_test, y_train, y_test)

DecisionTreeRegressor Results:
  - Mean Squared Error (MSE): 0.4977
  - R² Score: 0.6202

RandomForestRegressor Results:
  - Mean Squared Error (MSE): 0.2569
  - R² Score: 0.8039

GradientBoostingRegressor Results:
  - Mean Squared Error (MSE): 0.2940
  - R² Score: 0.7756

SVR Results:
  - Mean Squared Error (MSE): 0.3552
  - R² Score: 0.7289



## Linear Regression
Fits a straight line (or hyperplane) through the data by minimizing the sum of squared errors between predicted and actual values.
Assumes a linear relationship between the features and the target variable.
##  Decision Tree Regressor
Splits the data into regions by asking a series of questions (if-else splits) on feature values.
Each leaf node gives a prediction (typically the average of training samples in that region).
## Random Forest Regressor
An ensemble of many decision trees.
Each tree is trained on a random subset of the data and features.
Predictions are made by averaging the outputs of all trees (reduces variance).
## Gradient Boosting Regressor
Builds trees sequentially, where each new tree learns to correct the errors of the previous ones.
Uses gradient descent to minimize a loss function.
## Support Vector Regressor (SVR)
Attempts to find a function that fits the data within a specified margin (epsilon-insensitive zone).
Uses kernel functions (e.g., RBF) to capture non-linear relationships in high-dimensional space.

### Suitability
##### Linear Regression: Baseline model; interpretable; fast
##### Decision Tree: Non-linear modeling; interpretable splits
##### Random Forest: Handles complexity and overfitting well; robust
##### Gradient Boosting: High performance on structured data; captures subtle patterns
##### SVR: Effective in non-linear space (with kernel); good for precise margin-based fit

In [56]:
# Candidate regressors keyed by the label shown in the results table.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "SVR": SVR()
}

# One record per model: fit on the training split, score on the test split.
results = []

for name, model in models.items():
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Metrics are evaluated in the order MSE, MAE, R².
    mse, mae, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )

    results.append(dict(zip(
        ("Model", "MSE", "MAE", "R²"),
        (name, round(mse, 4), round(mae, 4), round(r2, 4)),
    )))

# Tabulate and show the models ranked from best to worst R².
results_df = pd.DataFrame(results)
print(results_df.sort_values("R²", ascending=False))

               Model     MSE     MAE      R²
2      Random Forest  0.2569  0.3282  0.8039
3  Gradient Boosting  0.2940  0.3717  0.7756
4                SVR  0.3552  0.3978  0.7289
1      Decision Tree  0.4977  0.4552  0.6202
0  Linear Regression  0.5559  0.5332  0.5758


### Best and Worst performing algorithms

Best model: Random Forest is the best-performing algorithm, providing the most accurate predictions with the lowest errors (MSE, MAE) and the highest R² score.

Worst model: Linear Regression performs the worst due to its inability to capture non-linear relationships in the data, resulting in the largest errors.