In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler

In [2]:
housing_data = fetch_california_housing()
df = pd.DataFrame(housing_data.data, columns=housing_data.feature_names)
df['Target'] = housing_data.target

In [3]:
# Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.drop('Target', axis=1))

df_scaled = pd.DataFrame(scaled_features, columns=housing_data.feature_names)
df_scaled['Target'] = df['Target']

In [4]:
# Describe preprocessing steps
print("Preprocessing Steps: ")
print("1. Loaded the dataset.")
print("2. Converted to DataFrame for easy manipulation.")
print("3. Checked and confirmed no missing values.")
print("4. Standardized the features using StandardScaler.")

Preprocessing Steps: 
1. Loaded the dataset.
2. Converted to DataFrame for easy manipulation.
3. Checked and confirmed no missing values.
4. Standardized the features using StandardScaler.


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

In [6]:
X = df_scaled.drop('Target', axis=1)
y = df_scaled['Target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Linear Regression: Fits a linear equation to the dataset. Suitable for datasets with a linear relationship between features and target.

Decision Tree Regressor: Splits data into regions for predictions. Handles non-linear relationships well.

Random Forest Regressor: Uses multiple decision trees to reduce overfitting. Suitable for complex datasets.

Gradient Boosting Regressor: Builds models sequentially to correct errors. Effective on complex data.

SVR: Tries to fit the best line within a threshold around the actual points. Good for non-linear relationships.

In [7]:
# Initialize models
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regressor": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "Gradient Boosting Regressor": GradientBoostingRegressor(),
    "Support Vector Regressor (SVR)": SVR()
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(f"{name} trained and predicted.")

Linear Regression trained and predicted.
Decision Tree Regressor trained and predicted.
Random Forest Regressor trained and predicted.
Gradient Boosting Regressor trained and predicted.
Support Vector Regressor (SVR) trained and predicted.


In [8]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [9]:
# Evaluate each model
evaluation_metrics = {}
for name, model in models.items():
    predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    evaluation_metrics[name] = {'MSE': mse, 'MAE': mae, 'R²': r2}


In [10]:
# Display evaluation results
for model, metrics in evaluation_metrics.items():
    print(f"\n{model}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value}")



Linear Regression:
MSE: 0.5558915986952442
MAE: 0.5332001304956565
R²: 0.575787706032451

Decision Tree Regressor:
MSE: 0.4844127956291909
MAE: 0.451308316375969
R²: 0.6303346484397037

Random Forest Regressor:
MSE: 0.25674808471529464
MAE: 0.3280718902131784
R²: 0.8040702643384245

Gradient Boosting Regressor:
MSE: 0.2940804571354899
MAE: 0.3717234163505605
R²: 0.7755811643398038

Support Vector Regressor (SVR):
MSE: 0.3551984619989419
MAE: 0.3977630963437859
R²: 0.7289407597956462


In [11]:
best_model = min(evaluation_metrics, key=lambda x: evaluation_metrics[x]['MSE'])
worst_model = max(evaluation_metrics, key=lambda x: evaluation_metrics[x]['MSE'])
print(f"\nBest Model: {best_model}")
print(f"Worst Model: {worst_model}")


Best Model: Random Forest Regressor
Worst Model: Linear Regression
