
# Feature visualization (PCA, Autoencoder, CatBoost)
Use the artifacts saved by `train_pca_ae_catboost` to inspect latent spaces, reconstructions, and CatBoost feature importance.


In [None]:

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from joblib import load

from src.config import INTERIM_DIR
from src.features.autoencoder import load_autoencoder, AutoencoderFeatureExtractor
from src.features.pca_svd import PCAFeatureExtractor
from src.features.catboost_model import load_catboost_model

# Apply seaborn's "notebook" plotting context to every figure created below.
sns.set_context("notebook")
# Re-bind as a Path so the `INTERIM_DIR / "file"` joins in later cells work
# regardless of the type src.config provides.
INTERIM_DIR = Path(INTERIM_DIR)


In [None]:

# Load artifacts and data
# NOTE: joblib files are pickles — only load artifacts written by your own
# training run, never files from an untrusted source.
scaler = load(INTERIM_DIR / "scaler.joblib")
pca: PCAFeatureExtractor = load(INTERIM_DIR / "pca_model.joblib")
ae_model = load_autoencoder(INTERIM_DIR / "autoencoder.pt", device="cpu")
ae = AutoencoderFeatureExtractor(ae_model, device="cpu")
cb_model = load_catboost_model(INTERIM_DIR / "catboost_model.cbm")

X_train = np.load(INTERIM_DIR / "X_train.npy")
y_train = np.load(INTERIM_DIR / "y_train.npy")
X_train_std = scaler.transform(X_train)

TILE_SIZE = 32
# Every row is treated as one flattened tile of TILE_SIZE * TILE_SIZE * n_bands
# values, so the column count must be a non-zero multiple of the tile area.
n_bands, _leftover = divmod(X_train.shape[1], TILE_SIZE * TILE_SIZE)
if _leftover or n_bands == 0:
    # Explicit raise instead of `assert`: asserts vanish under `python -O`, and
    # the old check also accepted an input with zero columns.
    raise ValueError("input dims should be tile_size^2 * bands")


In [None]:

# Load or compute latent representations

def load_or_compute(name, fn):
    """Return the array stored at ``INTERIM_DIR / name``, computing it on a miss.

    Args:
        name: File name of the cached array inside ``INTERIM_DIR``.
        fn: Zero-argument callable that produces the array when no cached
            file exists.

    Returns:
        The array read with ``np.load`` when the file exists; otherwise the
        value returned by ``fn()``, which is written with ``np.save`` first.
    """
    cache_file = INTERIM_DIR / name
    if not cache_file.exists():
        # Cache miss: compute once and persist for later runs.
        computed = fn()
        np.save(cache_file, computed)
        return computed
    return np.load(cache_file)

# Latent codes of the standardized training set: Zp_train is the output of
# pca.transform, Za_train the output of ae.transform.
# NOTE(review): an existing .npy file is reused as-is, so files left over from
# an earlier training run would be plotted silently — presumably they should be
# deleted after retraining; confirm against the training script.
Zp_train = load_or_compute("Zp_train.npy", lambda: pca.transform(X_train_std))
Za_train = load_or_compute("Za_train.npy", lambda: ae.transform(X_train_std))



## PCA variance spectrum
Variance of the training-set projections along each principal component, i.e. how much of the standardized data's variance each component captures (a proxy for information retained).


In [None]:

# Variance of the PCA projections of the training set, one value per component.
component_variance = np.var(Zp_train, axis=0)

spectrum_fig, spectrum_ax = plt.subplots(figsize=(6, 4))
spectrum_ax.plot(component_variance, marker="o")
spectrum_ax.set_title("PCA Latent Variance Spectrum")
spectrum_ax.set_xlabel("Component index")
spectrum_ax.set_ylabel("Variance (std. units)")
spectrum_ax.grid()
plt.show()



## Latent projections (colored by NDRE)
Comparing geometry of the first two latent dimensions for PCA vs Autoencoder.


In [None]:

# constrained_layout reserves room for the shared colorbar. plt.tight_layout()
# cannot handle a colorbar attached to several axes: it warns and lets the
# panels overlap the colorbar, so it is not called here.
fig, ax = plt.subplots(1, 2, figsize=(12, 5), constrained_layout=True)

# Left panel: first two PCA components, coloured by the target.
sc0 = ax[0].scatter(Zp_train[:, 0], Zp_train[:, 1], c=y_train, s=8, cmap="viridis")
ax[0].set_title("PCA latent space")
ax[0].set_xlabel("PC1")
ax[0].set_ylabel("PC2")

# Right panel: first two autoencoder latent dimensions, same colouring.
sc1 = ax[1].scatter(Za_train[:, 0], Za_train[:, 1], c=y_train, s=8, cmap="viridis")
ax[1].set_title("Autoencoder latent space")
ax[1].set_xlabel("z1")
ax[1].set_ylabel("z2")

# Both scatters colour by the same y_train array with the same colormap, so a
# single colorbar built from either one describes both panels.
cbar = fig.colorbar(sc1, ax=ax.ravel().tolist(), shrink=0.9)
cbar.set_label("NDRE (target)")
plt.show()



## Reconstructions (original vs PCA vs Autoencoder)
Visual check on how each model rebuilds a tile (band 0 shown). Differences highlight what information is retained.


In [None]:

# Index of the training tile to inspect.
idx = 0
# Slice (not index) so the sample stays 2-D, as the transformers are called on 2-D input.
x_std = X_train_std[idx : idx + 1]

# Map everything back through the scaler so all three vectors are in the
# original (un-standardized) units.
x_orig = scaler.inverse_transform(x_std)[0]
x_pca = scaler.inverse_transform(pca.inverse_transform(pca.transform(x_std)))[0]
x_ae = scaler.inverse_transform(ae.reconstruct(x_std))[0]

# NOTE(review): assumes tiles were flattened channel-last (row, col, band) —
# confirm against the code that wrote X_train.npy.
img_orig = x_orig.reshape(TILE_SIZE, TILE_SIZE, n_bands)
img_pca = x_pca.reshape(TILE_SIZE, TILE_SIZE, n_bands)
img_ae = x_ae.reshape(TILE_SIZE, TILE_SIZE, n_bands)

band = 0
band_panels = [img_orig[:, :, band], img_pca[:, :, band], img_ae[:, :, band]]
# Shared colour limits: by default each heatmap is scaled to its own min/max,
# which hides amplitude differences between the original and the reconstructions.
shared_vmin = min(np.nanmin(panel) for panel in band_panels)
shared_vmax = max(np.nanmax(panel) for panel in band_panels)

fig, ax = plt.subplots(1, 3, figsize=(12, 4))
for a, panel, title in zip(
    ax,
    band_panels,
    ["Original", "PCA recon", "AE recon"],
):
    sns.heatmap(
        panel,
        cmap="viridis",
        vmin=shared_vmin,
        vmax=shared_vmax,
        cbar=False,
        ax=a,
    )
    a.set_title(f"{title} (band {band})")
    a.axis("off")
plt.tight_layout()
plt.show()

# 1-D view: the first 512 entries of each flattened vector, overlaid.
plt.figure(figsize=(10, 3))
plt.plot(x_orig[:512], label="Original", alpha=0.8)
plt.plot(x_pca[:512], label="PCA recon", alpha=0.8)
plt.plot(x_ae[:512], label="AE recon", alpha=0.8)
plt.title("First 512 flattened pixels")
plt.xlabel("Flattened index")
plt.legend()
plt.tight_layout()
plt.show()



## PCA spatial patterns
Top components reshaped to tile space, with loadings summed across bands (the sum is signed, so bands with opposite-sign loadings can cancel).


In [None]:

# Show up to four components; clamp to what the fitted PCA actually has so the
# reshape below does not fail for a model with fewer components.
num_show = min(4, len(pca.components_))
components = pca.components_[:num_show]
# NOTE(review): assumes the same channel-last (row, col, band) flattening as
# the input tiles — confirm against the code that wrote X_train.npy.
comp_imgs = components.reshape(num_show, TILE_SIZE, TILE_SIZE, n_bands)
# Signed sum of loadings over the band axis -> one TILE_SIZE x TILE_SIZE map per component.
comp_energy = comp_imgs.sum(axis=-1)

fig, axes = plt.subplots(1, num_show, figsize=(3*num_show, 3))
# plt.subplots returns a bare Axes (not an array) when there is a single panel;
# np.atleast_1d makes the loop work in that case too.
for i, ax in enumerate(np.atleast_1d(axes)):
    # center=0 keeps zero loading at the neutral colour of the diverging map.
    sns.heatmap(comp_energy[i], cmap="coolwarm", center=0, cbar=False, ax=ax)
    ax.set_title(f"PC{i+1} spatial energy")
    ax.axis("off")
plt.tight_layout()
plt.show()



## CatBoost feature importance
Aggregated spatial importance (sum across bands) and band-level importance (sum across pixels).


In [None]:

# np.asarray: the importances may come back as a plain list (presumably
# cb_model is a CatBoost model — confirm), and .reshape needs an ndarray.
cb_importance = np.asarray(cb_model.get_feature_importance(type="FeatureImportance"))

# The reshape below only makes sense when the model has one feature per input
# pixel/band value; fail with a clear message otherwise (e.g. a model trained
# on latent features instead of raw pixels).
expected_features = TILE_SIZE * TILE_SIZE * n_bands
if cb_importance.size != expected_features:
    raise ValueError(
        f"CatBoost model reports {cb_importance.size} feature importances, "
        f"expected {expected_features} (TILE_SIZE^2 * n_bands)"
    )

# NOTE(review): assumes channel-last (row, col, band) feature order — confirm
# against the code that wrote X_train.npy.
imp_grid = cb_importance.reshape(TILE_SIZE, TILE_SIZE, n_bands)
spatial_imp = imp_grid.sum(axis=-1)  # collapse bands -> per-pixel importance
band_imp = imp_grid.sum(axis=(0, 1))  # collapse pixels -> per-band importance

fig, ax = plt.subplots(1, 2, figsize=(11, 4))
sns.heatmap(spatial_imp, cmap="magma", ax=ax[0])
ax[0].set_title("Spatial importance (all bands)")
ax[0].axis("off")

ax[1].bar(range(n_bands), band_imp)
ax[1].set_xlabel("Band index")
ax[1].set_ylabel("Importance (sum across pixels)")
ax[1].set_title("Per-band importance")
plt.tight_layout()
plt.show()



## Takeaways to record in reports
- PCA shows which global linear directions dominate variance; inspect early components for physically meaningful axes.
- Autoencoder reconstructions reveal whether nonlinear features preserve fine-grained spatial detail.
- CatBoost importance surfaces show whether the model relies on specific spatial regions or bands; compare with PCA components to spot agreement.
