In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import os

In [None]:
# Read the CSV (a spatial-join export, judging by the file name) from the
# working directory into a DataFrame.
joined_df = pd.read_csv('spatialjoined70_v2.csv')

# Balance Classes

In [None]:
# extracting labeled locations only
# Filter the frame loaded above (`joined_df`): it is the only table this notebook
# reads, and using it keeps the cell runnable after Restart Kernel -> Run All.
labeled_df = joined_df[joined_df['label'] == 'Labeled']

In [None]:
# find the largest balanced training size: each class can contribute at most
# as many rows as the rarest class contains
if labeled_df.empty:
    raise ValueError("labeled_df is empty - there are no labeled rows to balance")
min_class_size = labeled_df['value'].value_counts().min()
print(f"Smallest class size: {min_class_size}")

# amount of each value class to sample; derived from the data rather than a
# hard-coded count so the cell keeps working when the input table changes
value_class_size = int(min_class_size)

# sample the same amount from each class.
# Every class is drawn with the same seed and the pieces are concatenated in
# sorted-key order, which keeps the `value` column and avoids
# groupby.apply(..., include_groups=True), deprecated in recent pandas releases.
balanced_df = pd.concat(
    [
        class_rows.sample(n=value_class_size, random_state=42)
        for _, class_rows in labeled_df.groupby('value')
    ],
    ignore_index=True,
)

# Prepare and Scale Features

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# predictor columns and the class target
feature_columns = ['SURFACE', 'HEIGHT', 'LAND', 'POP', 'VOLUME']
X = balanced_df[feature_columns]
y = balanced_df['value']

# min-max normalisation of every feature column
# NOTE(review): the scaler is fit on all rows here, i.e. before any train/test split
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(X)
X = pd.DataFrame(scaled_features, columns=X.columns)

# persist features and target for the modelling cells (file names have no extension)
X.to_csv('X_lowres_balanced')
y.to_csv('y_lowres_balanced')

# Randomized Hyperparameter Search

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# reload the balanced features / target written by the preparation cell;
# the first CSV column holds the row index
X = pd.read_csv('X_lowres_balanced', index_col=0)
y = pd.read_csv('y_lowres_balanced', index_col=0)

# separate out the test set first, so that nothing about the held-out rows
# can influence the preprocessing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# normalizing features: fit the scaler on the training rows only and apply the
# same transform to the test rows; the row index is preserved so later cells
# can still select rows by label
scaler = MinMaxScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# convert target variables to NumPy arrays and ravel
# (y was read as a one-column DataFrame; estimators expect a 1-D target)
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Search space: scipy `randint` entries are sampled as integer distributions,
# plain lists are sampled from their listed options
param_dist = dict(
    n_estimators=randint(100, 200),
    max_depth=[None, 10, 20, 30],
    min_samples_split=randint(2, 20),
    min_samples_leaf=randint(1, 20),
    max_features=['sqrt', 'log2', None],
    bootstrap=[True, False],
    class_weight=[None, 'balanced'],
)

# Base estimator whose hyper-parameters are being tuned
clf = RandomForestClassifier(random_state=42)

# 100 sampled candidates, 3-fold cross-validation each, ranked by macro F1,
# run on all available cores
random_search = RandomizedSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    n_iter=100,
    cv=3,
    n_jobs=-1,
    verbose=2,
    scoring='f1_macro',
    random_state=42,
)

# Run the search on the training split
random_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score
print(f"Best parameters found: {random_search.best_params_}")
print(f"Best cross-validated F1 score: {random_search.best_score_}")

# Keep a handle on the refitted best model
best_clf = random_search.best_estimator_

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Fixed forest configuration. The values fall inside the randomized-search space,
# so they are presumably the tuned result - confirm against the search output.
rf_params = {
    'n_estimators': 117,
    'bootstrap': True,
    'class_weight': None,
    'max_depth': 30,
    'max_features': 'sqrt',
    'min_samples_leaf': 1,
    'min_samples_split': 4,
    'random_state': 42,
}

# training the classifier on the training split
clf = RandomForestClassifier(**rf_params)
clf.fit(X_train, y_train)

# Classification Report & Confusion Matrix

In [None]:
# score the held-out rows, then print the per-class classification report
y_pred = clf.predict(X_test)
test_report = classification_report(y_test, y_pred)
print(test_report)

In [None]:
# CONFUSION MATRIX
from sklearn.metrics import confusion_matrix

# y_test is a NumPy array at this point (it was ravelled after the split), so it
# has no .unique() method; np.unique returns the distinct labels already sorted.
# Predicted labels are included and the same list is passed to confusion_matrix,
# which guarantees that tick labels and matrix rows/columns line up.
classes = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))
cm = confusion_matrix(y_test, y_pred, labels=classes)

plt.figure(figsize=(12, 7.5))
sns.heatmap(cm, annot=True, fmt='d', cmap=plt.cm.Blues, xticklabels=classes, yticklabels=classes)
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.xlabel('Predicted Label', fontsize=12)
plt.ylabel('True Label', fontsize=12)
plt.title('Confusion Matrix (Model 3)', fontsize=16)
plt.show()

# SHAP Values

In [None]:
sample_size = 500

# Randomly sample row positions from the test set. The generator is seeded so the
# SHAP plots are reproducible, and the draw is capped at the test-set size.
rng = np.random.default_rng(42)
sample_positions = rng.choice(
    len(X_test), size=min(sample_size, len(X_test)), replace=False)
sample_indices = X_test.index[sample_positions]

# Subset
X_sample = X_test.iloc[sample_positions]
# y_test is a NumPy array in the same row order as X_test (it was ravelled after
# the split), so it is indexed by position and re-labelled with the sampled index
y_sample = pd.Series(np.asarray(y_test)[sample_positions], index=sample_indices)

In [None]:
import shap

# Explain the fitted forest on the sampled test rows. The explainer is evaluated
# once; the per-class arrays used by the plotting cells are derived from that
# single result instead of recomputing the SHAP values a second time.
explainer = shap.TreeExplainer(clf)
explanation = explainer(X_sample)

# Normalise to a list with one (n_samples, n_features) array per class.
# Older SHAP releases return such a list from `explainer.shap_values`, newer ones
# return a single (n_samples, n_features, n_classes) array, so slicing the
# Explanation values gives the same structure on either version.
_explained = np.asarray(explanation.values)
if _explained.ndim == 3:
    shap_values = [_explained[:, :, class_pos] for class_pos in range(_explained.shape[-1])]
else:
    # single-output fallback: wrap so downstream code can still iterate per class
    shap_values = [_explained]

## Feature Importance Plot

In [None]:
# Class display names have to exist before this plot is drawn, and they must
# follow the order of clf.classes_ because the per-class SHAP arrays use that order.
class_names = ['amenity', 'leisure', 'landuse', 'shop', 'tourism']
model_classes = [str(label) for label in clf.classes_]
if set(model_classes) == set(class_names):
    # the model was trained on these very strings: adopt the model's own ordering
    class_names = model_classes

# Plot the SHAP summary plot
shap.summary_plot(shap_values, X_sample.values, plot_type="bar", class_names=class_names, feature_names=X_sample.columns, show=False)
plt.title("Feature Importance (Model 1)", fontsize=12, pad=0.01)
plt.xlabel("mean (|SHAP value|) (average impact on model output magnitude)", fontsize=10)
plt.tight_layout()
plt.show()

## Summary Plot

In [None]:
# Display names for the classes. The per-class SHAP arrays follow clf.classes_
# (sorted label order), so when the model was trained on these very strings the
# list is re-ordered to match; otherwise (e.g. numeric codes) it is used as given.
class_names = ['amenity', 'leisure', 'landuse', 'shop', 'tourism']
model_classes = [str(label) for label in clf.classes_]
if set(model_classes) == set(class_names):
    class_names = model_classes

# one beeswarm summary plot per class
for i, class_name in enumerate(class_names):
    plt.figure(figsize=(10, 12))
    shap.summary_plot(shap_values[i], X_sample.values, feature_names=X_sample.columns, show=False)
    plt.title(f'Class: {class_name}', fontsize=16)
    plt.xlabel("SHAP value (impact on model output)", fontsize=12)
    plt.yticks(fontsize=12)
    plt.show()

## Dependence Plot

In [None]:
# Grid of SHAP dependence plots: one row per class, one column per feature
feature_names = X_sample.columns.tolist()
num_classes = len(shap_values)

fig, axes = plt.subplots(
    num_classes,
    len(feature_names),
    figsize=(19.5, 15),
    sharey=True,
    sharex=True,
)

for class_idx in range(num_classes):
    class_shap = shap_values[class_idx]
    for feature_idx, feature_name in enumerate(feature_names):
        ax = axes[class_idx, feature_idx]
        shap.dependence_plot(
            feature_idx,
            class_shap,
            X_sample.values,
            feature_names=feature_names,
            ax=ax,
            show=False,
        )
        ax.set_xlabel('')
        # the first column carries the class name, every other panel the generic label
        if feature_idx == 0:
            ax.set_ylabel(class_names[class_idx], fontsize=18)
        else:
            ax.set_ylabel('SHAP value')
        # feature names act as column headers on the top row only
        if class_idx == 0:
            ax.set_title(feature_name, fontsize=18)

plt.tight_layout()
plt.show()