In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from sklearn.inspection import PartialDependenceDisplay

In [3]:
# Step 1: Data Preparation
# Fetch the California housing data as pandas objects and keep only the
# non-spatial predictor columns listed below.
NON_SPATIAL_FEATURES = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup']
data = fetch_california_housing(as_frame=True)
X, y = data.data[NON_SPATIAL_FEATURES], data.target

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)


In [4]:
# Step 2: Model Selection and Training
# Candidate regressors keyed by display name; the same keys are used to look
# up each model's search space in `param_grids`.
#
# The tree-based estimators draw random numbers while fitting, so they are
# seeded to make the grid search and the reported scores repeatable. The seed
# matches the one used for the train/test split. SVR and KNeighborsRegressor
# take no random_state argument.
RANDOM_STATE = 0

models = {
    'Decision Tree': DecisionTreeRegressor(random_state=RANDOM_STATE),
    'Random Forest': RandomForestRegressor(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingRegressor(random_state=RANDOM_STATE),
    'Support Vector Machine': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor()
}

In [5]:
# Hyperparameter candidates for the grid search, keyed by model display name.
# Every numeric grid has ten evenly spaced values.
param_grids = {
    # tree depth 1..10
    'Decision Tree': {'max_depth': [depth for depth in range(1, 11)]},
    # 100, 200, ..., 1000 trees
    'Random Forest': {'n_estimators': [100 * k for k in range(1, 11)]},
    # 0.01, 0.02, ..., 0.10
    'Gradient Boosting': {'learning_rate': [pct / 100 for pct in range(1, 11)]},
    'Support Vector Machine': {'kernel': ['linear', 'poly', 'rbf']},
    # 10, 20, ..., 100 neighbours
    'K-Nearest Neighbors': {'n_neighbors': [10 * k for k in range(1, 11)]},
}

In [None]:
# Perform grid search with 5-fold cross-validation and train models
from tqdm import tqdm

# Each candidate is tuned on the training split only, scored by R2.
# GridSearchCV refits the winning configuration on all of X_train by default,
# so `best_estimator_` can be used for prediction straight away.
best_models = {}
for model_name, model in tqdm(models.items(), desc='Grid search'):
    grid_search = GridSearchCV(
        model,
        param_grids[model_name],
        cv=5,
        scoring='r2',
        n_jobs=-1,  # run the candidate/fold fits in parallel on all cores
    )
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_

 20%|████████████████▊                                                                   | 1/5 [00:03<00:12,  3.23s/it]

In [None]:
# Step 3: Model Evaluation
# Score every tuned model with R-squared on both splits. A training score far
# above the test score is the usual sign of over-fitting.
results = {
    name: {
        'Train R2': r2_score(y_train, fitted.predict(X_train)),
        'Test R2': r2_score(y_test, fitted.predict(X_test)),
    }
    for name, fitted in best_models.items()
}

In [None]:
# Step 4: Interpretability Analysis for Gradient Boosting Decision Tree
# Take the tuned gradient-boosting model from the grid-search results and
# read the importance scores it exposes after fitting.
GBDT_NAME = 'Gradient Boosting'
gbdt_model = best_models[GBDT_NAME]
feature_importance = gbdt_model.feature_importances_

In [None]:
# Partial dependence plots
# One panel per predictor, showing how the tuned GBDT's average prediction
# changes as that single feature varies.
features = list(X.columns)  # plain list of column names
fig, ax = plt.subplots(figsize=(12, 6))

# `plot_partial_dependence` was never imported here and was removed in
# scikit-learn 1.2; `PartialDependenceDisplay.from_estimator` is its
# replacement. A single Axes is used as the bounding box for the panel grid.
PartialDependenceDisplay.from_estimator(gbdt_model, X_train, features=features, ax=ax)
fig.suptitle('Partial Dependence Plots for Non-Spatial Features')
fig.subplots_adjust(top=0.9)  # leave room for the suptitle

In [None]:
# Step 5: Report
# Build the summary (header plus one line per model, scores to four decimal
# places) and print it in a single call.
report_lines = ["Model Performance:"]
report_lines.extend(
    f"{name}: Train R2 = {scores['Train R2']:.4f}, Test R2 = {scores['Test R2']:.4f}"
    for name, scores in results.items()
)
print("\n".join(report_lines))

In [None]:
# Display feature importance plot
# Bar chart of the GBDT importance scores, one bar per feature. The figure and
# axes get their own names so they do not replace the `fig`/`ax` created for
# the partial dependence plots.
importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
importance_ax.bar(features, feature_importance)
importance_ax.set_xlabel('Features')
importance_ax.set_ylabel('Feature Importance')
importance_ax.set_title('Impurity-Based Feature Importance')
# Tilt the feature names so neighbouring labels do not overlap.
importance_ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Show partial dependence plots
# Assuming Jupyter's default inline backend: the partial dependence figure was
# rendered and closed at the end of the cell that created it, so a bare
# plt.show() here has nothing left to draw. Ending the cell with the stored
# figure object displays it again in the report section.
fig