All models together to be run 50 times (once per subset/decluster)

**the modelling methodology**

‚ÄúEach declustered subset (DC·µ¢) was internally partitioned (80/20) to allow model validation on independent test data, ensuring that predictive performance and model stability were assessed under spatial independence. After internal validation, models were retrained on all samples in each DC·µ¢ to maximize predictive strength and produce the final maps used for ensemble averaging. The ensemble predictions were subsequently validated using the external holdout dataset (test‚ÇÅ) derived from the original data split.‚Äù

real deal run:

## Cross-validation over decluster runs (1‚Äì50)

In [9]:
# =========================================================
# Script 1: Cross-validation over decluster runs (1‚Äì50)
# RF: 5-fold random CV
# OK: Leave-One-Out CV
# RF+OK: 5-fold CV with residual kriging
# =========================================================

import os
import glob
import warnings
import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold
from pykrige.ok import OrdinaryKriging

warnings.filterwarnings("ignore")

# -----------------------------
# PATHS & SETTINGS
# -----------------------------
DECLUSTER_DIR = "/Users/inesschwartz/Desktop/model/decluster_runs_aligned"
OUTPUT_DIR = "/Users/inesschwartz/Desktop/model/results_cv_global"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# RF hyperparameters (your tuned ones)
RF_PARAMS = {
    "n_estimators": 500,
    "min_samples_leaf": 5,
    "max_features": 0.5,
    "max_depth": 20,
    "random_state": 42,
    "n_jobs": 4,
}

# Variogram parameters from grid search
VARIOGRAM = {
    "nugget": 0.08,
    "sill": 0.21,
    "range": 50000,
    "anisotropy_scaling": 0.40,
    "anisotropy_angle": 135,
}

VARIOGRAM_MODEL = "spherical"
N_FOLDS = 5
MAX_DECLUSTERS = 50   # 1‚Äì50

# -----------------------------
# Helpers
# -----------------------------
def compute_metrics(y_true, y_pred):
    return {
        "R2": r2_score(y_true, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "MAE": mean_absolute_error(y_true, y_pred),
    }


# -----------------------------
# RF 5-fold CV (predictions for all points)
# -----------------------------
def rf_cv_5fold(X, y, k=N_FOLDS):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    y_pred = np.zeros_like(y, dtype=float)

    for train_idx, test_idx in kf.split(X):
        X_tr, X_te = X.iloc[train_idx], X.iloc[test_idx]
        y_tr = y[train_idx]

        rf = RandomForestRegressor(**RF_PARAMS)
        rf.fit(X_tr, y_tr)
        y_pred[test_idx] = rf.predict(X_te)

    metrics = compute_metrics(y, y_pred)
    return metrics, y_pred


# -----------------------------
# OK Leave-One-Out CV
# -----------------------------
def ok_loo_cv(coords, y, variogram, model=VARIOGRAM_MODEL):
    n = len(y)
    y_pred = np.zeros(n, dtype=float)

    for i in tqdm(range(n), desc="LOO OK", leave=False):
        mask = np.ones(n, dtype=bool)
        mask[i] = False

        ok = OrdinaryKriging(
            x=coords[mask, 0],
            y=coords[mask, 1],
            z=y[mask],
            variogram_model=model,
            variogram_parameters={
                "sill": variogram["sill"],
                "range": variogram["range"],
                "nugget": variogram["nugget"],
                "anisotropy_scaling": variogram["anisotropy_scaling"],
                "anisotropy_angle": variogram["anisotropy_angle"],
            },
            enable_plotting=False,
            verbose=False,
        )

        z, _ = ok.execute(
            "points",
            np.array([coords[i, 0]]),
            np.array([coords[i, 1]]),
        )
        y_pred[i] = float(z.data[0])

    metrics = compute_metrics(y, y_pred)
    return metrics, y_pred


# -----------------------------
# RF+OK 5-fold CV (residual kriging)
# -----------------------------
def rf_ok_cv_5fold(X, y, coords, variogram, k=N_FOLDS, model=VARIOGRAM_MODEL):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    y_pred_rf = np.zeros_like(y, dtype=float)
    y_pred_resid_ok = np.zeros_like(y, dtype=float)
    y_pred_hybrid = np.zeros_like(y, dtype=float)

    for train_idx, test_idx in kf.split(X):
        X_tr, X_te = X.iloc[train_idx], X.iloc[test_idx]
        y_tr, y_te = y[train_idx], y[test_idx]
        coords_tr, coords_te = coords[train_idx], coords[test_idx]

        # RF model
        rf = RandomForestRegressor(**RF_PARAMS)
        rf.fit(X_tr, y_tr)

        y_pred_rf[test_idx] = rf.predict(X_te)

        # Residuals on training data
        resid_tr = y_tr - rf.predict(X_tr)

        # OK on residuals
        ok = OrdinaryKriging(
            x=coords_tr[:, 0],
            y=coords_tr[:, 1],
            z=resid_tr,
            variogram_model=model,
            variogram_parameters={
                "sill": variogram["sill"],
                "range": variogram["range"],
                "nugget": variogram["nugget"],
                "anisotropy_scaling": variogram["anisotropy_scaling"],
                "anisotropy_angle": variogram["anisotropy_angle"],
            },
            enable_plotting=False,
            verbose=False,
        )

        z_resid, _ = ok.execute(
            "points",
            coords_te[:, 0],
            coords_te[:, 1],
        )
        y_pred_resid_ok[test_idx] = z_resid.data

        # Hybrid = RF + kriged residual
        y_pred_hybrid[test_idx] = y_pred_rf[test_idx] + y_pred_resid_ok[test_idx]

    metrics_rf = compute_metrics(y, y_pred_rf)
    metrics_resid = compute_metrics(y, y_pred_resid_ok)
    metrics_hybrid = compute_metrics(y, y_pred_hybrid)

    return metrics_rf, metrics_resid, metrics_hybrid, y_pred_rf, y_pred_resid_ok, y_pred_hybrid


# =========================================================
# MAIN LOOP OVER DECLUSTERS
# =========================================================
files = sorted(glob.glob(os.path.join(DECLUSTER_DIR, "decluster_run_*.csv")))
files = files[:MAX_DECLUSTERS]

summary_records = []
point_records = []

print(f"Running CV on {len(files)} decluster subsets...")

for i, path in enumerate(tqdm(files, desc="Declusters"), start=1):
    df = pd.read_csv(path)
    n = len(df)

    print(f"\nüìÇ Subset {i:03d} | n = {n}")

    y = df["log_soc_stock"].values
    coords = df[["X_coord", "Y_coord"]].values
    site_ids = df["site_info_id"].values if "site_info_id" in df.columns else np.arange(n)
    X = df.drop(columns=["log_soc_stock", "X_coord", "Y_coord", "site_info_id"], errors="ignore")

    # --- RF CV ---
    rf_metrics, y_rf_cv = rf_cv_5fold(X, y, k=N_FOLDS)

    # --- OK LOO ---
    ok_metrics, y_ok_loo = ok_loo_cv(coords, y, VARIOGRAM)

    # --- RF+OK 5-fold CV (residual kriging) ---
    rfok_rf_metrics, rfok_resid_metrics, rfok_hybrid_metrics, \
        y_rf_5fold, y_ok_resid_5fold, y_hybrid_5fold = rf_ok_cv_5fold(
            X, y, coords, VARIOGRAM, k=N_FOLDS
        )

    # -------- Summary metrics --------
    summary_records.append({"subset": i, "model": "RF_5fold", **rf_metrics})
    summary_records.append({"subset": i, "model": "OK_LOO", **ok_metrics})
    summary_records.append({"subset": i, "model": "RF_5fold_in_RF+OK", **rfok_rf_metrics})
    summary_records.append({"subset": i, "model": "OK_resid_5fold", **rfok_resid_metrics})
    summary_records.append({"subset": i, "model": "RF+OK_5fold", **rfok_hybrid_metrics})

    print(
        f"‚úÖ DC{i:03d} | RF R¬≤={rf_metrics['R2']:.3f} | "
        f"OK R¬≤={ok_metrics['R2']:.3f} | "
        f"RF+OK R¬≤={rfok_hybrid_metrics['R2']:.3f}"
    )

    # -------- Per-point predictions (optional but useful) --------
    for model_name, preds in {
        "RF_5fold": y_rf_cv,
        "OK_LOO": y_ok_loo,
        "RF+OK_5fold": y_hybrid_5fold,
    }.items():
        df_tmp = pd.DataFrame({
            "subset": i,
            "model": model_name,
            "site_info_id": site_ids,
            "y_true": y,
            "y_pred": preds,
            "error": y - preds,
            "abs_error": np.abs(y - preds),
        })
        point_records.append(df_tmp)

# Save outputs
summary_df = pd.DataFrame(summary_records)
points_df = pd.concat(point_records, ignore_index=True)

summary_csv = os.path.join(OUTPUT_DIR, "cv_summary_all_declusters.csv")
points_csv = os.path.join(OUTPUT_DIR, "cv_point_predictions_all_declusters.csv")

summary_df.to_csv(summary_csv, index=False)
points_df.to_csv(points_csv, index=False)

print("\n‚úÖ CV complete.")
print(f"Summary metrics ‚Üí {summary_csv}")
print(f"Point predictions ‚Üí {points_csv}")


Running CV on 50 decluster subsets...


Declusters:   0%|          | 0/50 [00:00<?, ?it/s]


üìÇ Subset 001 | n = 634


Declusters:   2%|‚ñè         | 1/50 [00:33<27:45, 33.99s/it]

‚úÖ DC001 | RF R¬≤=0.370 | OK R¬≤=0.206 | RF+OK R¬≤=0.329

üìÇ Subset 002 | n = 634


Declusters:   4%|‚ñç         | 2/50 [01:04<25:47, 32.23s/it]

‚úÖ DC002 | RF R¬≤=0.370 | OK R¬≤=0.188 | RF+OK R¬≤=0.321

üìÇ Subset 003 | n = 634


Declusters:   6%|‚ñå         | 3/50 [01:38<25:40, 32.78s/it]

‚úÖ DC003 | RF R¬≤=0.384 | OK R¬≤=0.203 | RF+OK R¬≤=0.354

üìÇ Subset 004 | n = 634


Declusters:   8%|‚ñä         | 4/50 [01:58<21:13, 27.67s/it]

‚úÖ DC004 | RF R¬≤=0.384 | OK R¬≤=0.204 | RF+OK R¬≤=0.354

üìÇ Subset 005 | n = 634


Declusters:  10%|‚ñà         | 5/50 [02:16<18:17, 24.39s/it]

‚úÖ DC005 | RF R¬≤=0.374 | OK R¬≤=0.208 | RF+OK R¬≤=0.335

üìÇ Subset 006 | n = 634


Declusters:  12%|‚ñà‚ñè        | 6/50 [02:34<16:05, 21.95s/it]

‚úÖ DC006 | RF R¬≤=0.388 | OK R¬≤=0.204 | RF+OK R¬≤=0.355

üìÇ Subset 007 | n = 634


Declusters:  14%|‚ñà‚ñç        | 7/50 [02:51<14:47, 20.63s/it]

‚úÖ DC007 | RF R¬≤=0.356 | OK R¬≤=0.188 | RF+OK R¬≤=0.301

üìÇ Subset 008 | n = 634


Declusters:  16%|‚ñà‚ñå        | 8/50 [03:08<13:34, 19.40s/it]

‚úÖ DC008 | RF R¬≤=0.358 | OK R¬≤=0.189 | RF+OK R¬≤=0.304

üìÇ Subset 009 | n = 634


Declusters:  18%|‚ñà‚ñä        | 9/50 [03:26<12:48, 18.73s/it]

‚úÖ DC009 | RF R¬≤=0.386 | OK R¬≤=0.204 | RF+OK R¬≤=0.355

üìÇ Subset 010 | n = 634


Declusters:  20%|‚ñà‚ñà        | 10/50 [03:43<12:12, 18.30s/it]

‚úÖ DC010 | RF R¬≤=0.370 | OK R¬≤=0.206 | RF+OK R¬≤=0.329

üìÇ Subset 011 | n = 634


Declusters:  22%|‚ñà‚ñà‚ñè       | 11/50 [04:00<11:43, 18.04s/it]

‚úÖ DC011 | RF R¬≤=0.375 | OK R¬≤=0.206 | RF+OK R¬≤=0.335

üìÇ Subset 012 | n = 634


Declusters:  24%|‚ñà‚ñà‚ñç       | 12/50 [04:18<11:21, 17.94s/it]

‚úÖ DC012 | RF R¬≤=0.375 | OK R¬≤=0.208 | RF+OK R¬≤=0.336

üìÇ Subset 013 | n = 634


Declusters:  26%|‚ñà‚ñà‚ñå       | 13/50 [04:36<11:00, 17.85s/it]

‚úÖ DC013 | RF R¬≤=0.385 | OK R¬≤=0.204 | RF+OK R¬≤=0.356

üìÇ Subset 014 | n = 634


Declusters:  28%|‚ñà‚ñà‚ñä       | 14/50 [04:54<10:43, 17.87s/it]

‚úÖ DC014 | RF R¬≤=0.376 | OK R¬≤=0.206 | RF+OK R¬≤=0.341

üìÇ Subset 015 | n = 634


Declusters:  30%|‚ñà‚ñà‚ñà       | 15/50 [05:12<10:30, 18.02s/it]

‚úÖ DC015 | RF R¬≤=0.370 | OK R¬≤=0.187 | RF+OK R¬≤=0.321

üìÇ Subset 016 | n = 634


Declusters:  32%|‚ñà‚ñà‚ñà‚ñè      | 16/50 [05:29<10:03, 17.74s/it]

‚úÖ DC016 | RF R¬≤=0.358 | OK R¬≤=0.189 | RF+OK R¬≤=0.304

üìÇ Subset 017 | n = 634


Declusters:  34%|‚ñà‚ñà‚ñà‚ñç      | 17/50 [05:46<09:37, 17.49s/it]

‚úÖ DC017 | RF R¬≤=0.370 | OK R¬≤=0.206 | RF+OK R¬≤=0.329

üìÇ Subset 018 | n = 634


Declusters:  36%|‚ñà‚ñà‚ñà‚ñå      | 18/50 [06:04<09:22, 17.59s/it]

‚úÖ DC018 | RF R¬≤=0.388 | OK R¬≤=0.206 | RF+OK R¬≤=0.356

üìÇ Subset 019 | n = 634


Declusters:  38%|‚ñà‚ñà‚ñà‚ñä      | 19/50 [06:21<09:06, 17.64s/it]

‚úÖ DC019 | RF R¬≤=0.357 | OK R¬≤=0.189 | RF+OK R¬≤=0.303

üìÇ Subset 020 | n = 634


Declusters:  40%|‚ñà‚ñà‚ñà‚ñà      | 20/50 [06:41<09:04, 18.15s/it]

‚úÖ DC020 | RF R¬≤=0.374 | OK R¬≤=0.206 | RF+OK R¬≤=0.333

üìÇ Subset 021 | n = 634


Declusters:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 21/50 [07:00<08:53, 18.40s/it]

‚úÖ DC021 | RF R¬≤=0.374 | OK R¬≤=0.208 | RF+OK R¬≤=0.335

üìÇ Subset 022 | n = 634


Declusters:  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 22/50 [07:18<08:30, 18.25s/it]

‚úÖ DC022 | RF R¬≤=0.357 | OK R¬≤=0.189 | RF+OK R¬≤=0.303

üìÇ Subset 023 | n = 634


Declusters:  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 23/50 [07:36<08:12, 18.24s/it]

‚úÖ DC023 | RF R¬≤=0.376 | OK R¬≤=0.206 | RF+OK R¬≤=0.341

üìÇ Subset 024 | n = 634


Declusters:  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 24/50 [07:54<07:51, 18.12s/it]

‚úÖ DC024 | RF R¬≤=0.388 | OK R¬≤=0.204 | RF+OK R¬≤=0.355

üìÇ Subset 025 | n = 634


Declusters:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 25/50 [08:12<07:32, 18.11s/it]

‚úÖ DC025 | RF R¬≤=0.386 | OK R¬≤=0.204 | RF+OK R¬≤=0.355

üìÇ Subset 026 | n = 634


Declusters:  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 26/50 [08:30<07:18, 18.26s/it]

‚úÖ DC026 | RF R¬≤=0.358 | OK R¬≤=0.189 | RF+OK R¬≤=0.304

üìÇ Subset 027 | n = 634


Declusters:  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 27/50 [08:48<06:57, 18.15s/it]

‚úÖ DC027 | RF R¬≤=0.375 | OK R¬≤=0.206 | RF+OK R¬≤=0.339

üìÇ Subset 028 | n = 634


Declusters:  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 28/50 [09:06<06:38, 18.13s/it]

‚úÖ DC028 | RF R¬≤=0.373 | OK R¬≤=0.207 | RF+OK R¬≤=0.333

üìÇ Subset 029 | n = 634


Declusters:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 29/50 [09:25<06:21, 18.16s/it]

‚úÖ DC029 | RF R¬≤=0.370 | OK R¬≤=0.206 | RF+OK R¬≤=0.329

üìÇ Subset 030 | n = 634


Declusters:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 30/50 [09:44<06:10, 18.52s/it]

‚úÖ DC030 | RF R¬≤=0.373 | OK R¬≤=0.207 | RF+OK R¬≤=0.333

üìÇ Subset 031 | n = 634


Declusters:  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 31/50 [10:17<07:15, 22.93s/it]

‚úÖ DC031 | RF R¬≤=0.388 | OK R¬≤=0.204 | RF+OK R¬≤=0.355

üìÇ Subset 032 | n = 634


Declusters:  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 32/50 [10:37<06:35, 21.99s/it]

‚úÖ DC032 | RF R¬≤=0.358 | OK R¬≤=0.188 | RF+OK R¬≤=0.303

üìÇ Subset 033 | n = 634


Declusters:  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 33/50 [11:12<07:19, 25.87s/it]

‚úÖ DC033 | RF R¬≤=0.373 | OK R¬≤=0.187 | RF+OK R¬≤=0.323

üìÇ Subset 034 | n = 634


Declusters:  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 34/50 [11:31<06:23, 23.96s/it]

‚úÖ DC034 | RF R¬≤=0.356 | OK R¬≤=0.188 | RF+OK R¬≤=0.301

üìÇ Subset 035 | n = 634


Declusters:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 35/50 [11:50<05:36, 22.43s/it]

‚úÖ DC035 | RF R¬≤=0.370 | OK R¬≤=0.206 | RF+OK R¬≤=0.329

üìÇ Subset 036 | n = 634


Declusters:  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 36/50 [12:09<04:58, 21.29s/it]

‚úÖ DC036 | RF R¬≤=0.356 | OK R¬≤=0.188 | RF+OK R¬≤=0.301

üìÇ Subset 037 | n = 634


Declusters:  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 37/50 [12:28<04:29, 20.71s/it]

‚úÖ DC037 | RF R¬≤=0.354 | OK R¬≤=0.185 | RF+OK R¬≤=0.297

üìÇ Subset 038 | n = 634


Declusters:  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 38/50 [12:51<04:16, 21.37s/it]

‚úÖ DC038 | RF R¬≤=0.375 | OK R¬≤=0.206 | RF+OK R¬≤=0.335

üìÇ Subset 039 | n = 634


Declusters:  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 39/50 [13:37<05:14, 28.55s/it]

‚úÖ DC039 | RF R¬≤=0.373 | OK R¬≤=0.207 | RF+OK R¬≤=0.333

üìÇ Subset 040 | n = 634


Declusters:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 40/50 [14:11<05:04, 30.45s/it]

‚úÖ DC040 | RF R¬≤=0.388 | OK R¬≤=0.206 | RF+OK R¬≤=0.356

üìÇ Subset 041 | n = 634


Declusters:  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 41/50 [14:55<05:08, 34.26s/it]

‚úÖ DC041 | RF R¬≤=0.371 | OK R¬≤=0.188 | RF+OK R¬≤=0.323

üìÇ Subset 042 | n = 634


Declusters:  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 42/50 [15:47<05:16, 39.59s/it]

‚úÖ DC042 | RF R¬≤=0.388 | OK R¬≤=0.207 | RF+OK R¬≤=0.356

üìÇ Subset 043 | n = 634


Declusters:  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 43/50 [16:21<04:25, 37.94s/it]

‚úÖ DC043 | RF R¬≤=0.386 | OK R¬≤=0.204 | RF+OK R¬≤=0.355

üìÇ Subset 044 | n = 634


Declusters:  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 44/50 [16:46<03:24, 34.02s/it]

‚úÖ DC044 | RF R¬≤=0.373 | OK R¬≤=0.188 | RF+OK R¬≤=0.323

üìÇ Subset 045 | n = 634


Declusters:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 45/50 [17:01<02:21, 28.36s/it]

‚úÖ DC045 | RF R¬≤=0.370 | OK R¬≤=0.206 | RF+OK R¬≤=0.329

üìÇ Subset 046 | n = 634


Declusters:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 46/50 [17:17<01:38, 24.64s/it]

‚úÖ DC046 | RF R¬≤=0.373 | OK R¬≤=0.207 | RF+OK R¬≤=0.333

üìÇ Subset 047 | n = 634


Declusters:  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 47/50 [17:32<01:05, 21.90s/it]

‚úÖ DC047 | RF R¬≤=0.370 | OK R¬≤=0.206 | RF+OK R¬≤=0.329

üìÇ Subset 048 | n = 634


Declusters:  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 48/50 [17:48<00:39, 19.98s/it]

‚úÖ DC048 | RF R¬≤=0.370 | OK R¬≤=0.185 | RF+OK R¬≤=0.319

üìÇ Subset 049 | n = 634


Declusters:  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 49/50 [18:03<00:18, 18.63s/it]

‚úÖ DC049 | RF R¬≤=0.387 | OK R¬≤=0.205 | RF+OK R¬≤=0.354

üìÇ Subset 050 | n = 634


Declusters: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [18:19<00:00, 21.99s/it]

‚úÖ DC050 | RF R¬≤=0.384 | OK R¬≤=0.204 | RF+OK R¬≤=0.354






‚úÖ CV complete.
Summary metrics ‚Üí /Users/inesschwartz/Desktop/model/results_cv_global/cv_summary_all_declusters.csv
Point predictions ‚Üí /Users/inesschwartz/Desktop/model/results_cv_global/cv_point_predictions_all_declusters.csv


## Bagged RF, OK, RF+OK maps over 50 declusters

In [11]:
# =========================================================
# Script 2: Bagged maps from 50 declusters
# Produces mean & std GeoTIFFs for:
#  RF, OK, RF+OK (log_soc_stock)
# =========================================================

import os
import glob
import warnings
import numpy as np
import pandas as pd
import rasterio
from rasterio.transform import from_origin
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor
from pykrige.ok import OrdinaryKriging

warnings.filterwarnings("ignore")

# -----------------------------
# PATHS & SETTINGS
# -----------------------------
DECLUSTER_DIR = "/Users/inesschwartz/Desktop/model/decluster_runs_aligned"
GRID_CSV = "/Users/inesschwartz/Desktop/model/covariates_stack_1km_utm33s_aligned.csv"
OUTPUT_DIR = "/Users/inesschwartz/Desktop/model/bagged_maps"
os.makedirs(OUTPUT_DIR, exist_ok=True)

GRID_X_COL = "X_coord"   # change if your grid uses different names
GRID_Y_COL = "Y_coord"

CRS_EPSG = "EPSG:32733"
PIXEL_SIZE = 1000.0       # 1 km

MAX_DECLUSTERS = 50

# RF hyperparameters (same as before)
RF_PARAMS = {
    "n_estimators": 500,
    "min_samples_leaf": 5,
    "max_features": 0.5,
    "max_depth": 20,
    "random_state": 42,
    "n_jobs": 4,
}

# Variogram parameters
VARIOGRAM = {
    "nugget": 0.08,
    "sill": 0.21,
    "range": 50000,
    "anisotropy_scaling": 0.40,
    "anisotropy_angle": 135,
}
VARIOGRAM_MODEL = "spherical"


# -----------------------------
# Load grid and build raster layout
# -----------------------------
grid = pd.read_csv(GRID_CSV)

xs = np.sort(grid[GRID_X_COL].unique())
ys = np.sort(grid[GRID_Y_COL].unique())  # ascending (south‚Üínorth)

nx = len(xs)
ny = len(ys)

print(f"Grid: nx={nx}, ny={ny}, total points={len(grid)}")

# origin for GeoTIFF (top-left): xmin, ymax
xmin = xs.min()
ymax = ys.max()

transform = from_origin(xmin, ymax, PIXEL_SIZE, PIXEL_SIZE)

# Features for RF: drop coords and any ID-like columns
X_grid = grid.drop(columns=[GRID_X_COL, GRID_Y_COL], errors="ignore")


# -----------------------------
# Helper: turn flat predictions into 2D array (ny, nx)
# -----------------------------
def predictions_to_grid(pred_flat, x_vals, y_vals):
    dfp = pd.DataFrame({
        GRID_X_COL: grid[GRID_X_COL].values,
        GRID_Y_COL: grid[GRID_Y_COL].values,
        "pred": pred_flat,
    })
    pivot = dfp.pivot(index=GRID_Y_COL, columns=GRID_X_COL, values="pred")
    # Ensure proper ordering
    pivot = pivot.reindex(index=np.sort(y_vals), columns=np.sort(x_vals))
    arr = pivot.values  # rows = y (ascending), cols = x (ascending)
    # For GeoTIFF, rows go from top (max y) to bottom (min y)
    arr = np.flipud(arr)
    return arr


# =========================================================
# MAIN: loop over declusters & accumulate predictions
# =========================================================
files = sorted(glob.glob(os.path.join(DECLUSTER_DIR, "decluster_run_*.csv")))
files = files[:MAX_DECLUSTERS]

rf_preds_all = []
ok_preds_all = []
rfok_preds_all = []

for i, path in enumerate(tqdm(files, desc="Declusters (mapping)"), start=1):
    df = pd.read_csv(path)
    y = df["log_soc_stock"].values
    coords = df[["X_coord", "Y_coord"]].values
    X_train = df.drop(columns=["log_soc_stock", "X_coord", "Y_coord", "site_info_id"], errors="ignore")

    print(f"\nüìÇ Subset {i:03d} | n = {len(df)}")

    # -------------------------------
    # 1) RF model on full data
    # -------------------------------
    rf = RandomForestRegressor(**RF_PARAMS)
    rf.fit(X_train, y)

    rf_grid_pred = rf.predict(X_grid)  # flat (len(grid),)

    # -------------------------------
    # 2) OK on full data (log_soc_stock)
    # -------------------------------
    ok = OrdinaryKriging(
        x=coords[:, 0],
        y=coords[:, 1],
        z=y,
        variogram_model=VARIOGRAM_MODEL,
        variogram_parameters={
            "sill": VARIOGRAM["sill"],
            "range": VARIOGRAM["range"],
            "nugget": VARIOGRAM["nugget"],
            "anisotropy_scaling": VARIOGRAM["anisotropy_scaling"],
            "anisotropy_angle": VARIOGRAM["anisotropy_angle"],
        },
        enable_plotting=False,
        verbose=False,
    )

    z_ok, _ = ok.execute(
        "points",
        grid[GRID_X_COL].values,
        grid[GRID_Y_COL].values,
    )
    ok_grid_pred = z_ok.data  # flat

    # -------------------------------
    # 3) RF+OK hybrid (residual kriging) on full data
    # -------------------------------
    resid = y - rf.predict(X_train)

    ok_resid = OrdinaryKriging(
        x=coords[:, 0],
        y=coords[:, 1],
        z=resid,
        variogram_model=VARIOGRAM_MODEL,
        variogram_parameters={
            "sill": VARIOGRAM["sill"],
            "range": VARIOGRAM["range"],
            "nugget": VARIOGRAM["nugget"],
            "anisotropy_scaling": VARIOGRAM["anisotropy_scaling"],
            "anisotropy_angle": VARIOGRAM["anisotropy_angle"],
        },
        enable_plotting=False,
        verbose=False,
    )

    z_resid_grid, _ = ok_resid.execute(
        "points",
        grid[GRID_X_COL].values,
        grid[GRID_Y_COL].values,
    )
    resid_grid_pred = z_resid_grid.data

    rfok_grid_pred = rf_grid_pred + resid_grid_pred

    # Store
    rf_preds_all.append(rf_grid_pred)
    ok_preds_all.append(ok_grid_pred)
    rfok_preds_all.append(rfok_grid_pred)

# Convert to arrays: shape (n_declusters, n_points)
rf_preds_all = np.vstack(rf_preds_all)      # (D, P)
ok_preds_all = np.vstack(ok_preds_all)      # (D, P)
rfok_preds_all = np.vstack(rfok_preds_all)  # (D, P)

# Mean & std over declusters (axis=0)
rf_mean = rf_preds_all.mean(axis=0)
rf_std  = rf_preds_all.std(axis=0)

ok_mean = ok_preds_all.mean(axis=0)
ok_std  = ok_preds_all.std(axis=0)

rfok_mean = rfok_preds_all.mean(axis=0)
rfok_std  = rfok_preds_all.std(axis=0)

# Turn flat -> 2D arrays (ny, nx)
rf_mean_grid   = predictions_to_grid(rf_mean, xs, ys)
rf_std_grid    = predictions_to_grid(rf_std, xs, ys)
ok_mean_grid   = predictions_to_grid(ok_mean, xs, ys)
ok_std_grid    = predictions_to_grid(ok_std, xs, ys)
rfok_mean_grid = predictions_to_grid(rfok_mean, xs, ys)
rfok_std_grid  = predictions_to_grid(rfok_std, xs, ys)


# -----------------------------
# Save GeoTIFFs
# -----------------------------
def write_tif(path, arr2d, transform, crs):
    ny, nx = arr2d.shape
    with rasterio.open(
        path,
        "w",
        driver="GTiff",
        height=ny,
        width=nx,
        count=1,
        dtype="float32",
        crs=crs,
        transform=transform,
    ) as dst:
        dst.write(arr2d.astype("float32"), 1)


write_tif(os.path.join(OUTPUT_DIR, "RF_logSOC_mean.tif"),   rf_mean_grid,   transform, CRS_EPSG)
write_tif(os.path.join(OUTPUT_DIR, "RF_logSOC_std.tif"),    rf_std_grid,    transform, CRS_EPSG)
write_tif(os.path.join(OUTPUT_DIR, "OK_logSOC_mean.tif"),   ok_mean_grid,   transform, CRS_EPSG)
write_tif(os.path.join(OUTPUT_DIR, "OK_logSOC_std.tif"),    ok_std_grid,    transform, CRS_EPSG)
write_tif(os.path.join(OUTPUT_DIR, "RFOK_logSOC_mean.tif"), rfok_mean_grid, transform, CRS_EPSG)
write_tif(os.path.join(OUTPUT_DIR, "RFOK_logSOC_std.tif"),  rfok_std_grid,  transform, CRS_EPSG)

print("\n‚úÖ Bagged maps written to:")
print(OUTPUT_DIR)


Grid: nx=1352, ny=1521, total points=1259342


Declusters (mapping):   0%|          | 0/50 [00:00<?, ?it/s]


üìÇ Subset 001 | n = 634


Declusters (mapping):   2%|‚ñè         | 1/50 [04:34<3:44:21, 274.72s/it]


üìÇ Subset 002 | n = 634


Declusters (mapping):   4%|‚ñç         | 2/50 [08:55<3:33:26, 266.80s/it]


üìÇ Subset 003 | n = 634


Declusters (mapping):   6%|‚ñå         | 3/50 [13:06<3:23:16, 259.50s/it]


üìÇ Subset 004 | n = 634


Declusters (mapping):   8%|‚ñä         | 4/50 [17:14<3:15:30, 255.02s/it]


üìÇ Subset 005 | n = 634


Declusters (mapping):  10%|‚ñà         | 5/50 [21:15<3:07:28, 249.97s/it]


üìÇ Subset 006 | n = 634


Declusters (mapping):  12%|‚ñà‚ñè        | 6/50 [25:35<3:05:49, 253.39s/it]


üìÇ Subset 007 | n = 634


Declusters (mapping):  14%|‚ñà‚ñç        | 7/50 [29:47<3:01:11, 252.83s/it]


üìÇ Subset 008 | n = 634


Declusters (mapping):  16%|‚ñà‚ñå        | 8/50 [33:45<2:53:35, 247.98s/it]


üìÇ Subset 009 | n = 634


Declusters (mapping):  18%|‚ñà‚ñä        | 9/50 [37:44<2:47:35, 245.25s/it]


üìÇ Subset 010 | n = 634


Declusters (mapping):  20%|‚ñà‚ñà        | 10/50 [41:45<2:42:43, 244.08s/it]


üìÇ Subset 011 | n = 634


Declusters (mapping):  22%|‚ñà‚ñà‚ñè       | 11/50 [45:44<2:37:28, 242.28s/it]


üìÇ Subset 012 | n = 634


Declusters (mapping):  24%|‚ñà‚ñà‚ñç       | 12/50 [49:44<2:33:03, 241.66s/it]


üìÇ Subset 013 | n = 634


Declusters (mapping):  26%|‚ñà‚ñà‚ñå       | 13/50 [54:02<2:32:07, 246.69s/it]


üìÇ Subset 014 | n = 634


Declusters (mapping):  28%|‚ñà‚ñà‚ñä       | 14/50 [58:09<2:28:04, 246.80s/it]


üìÇ Subset 015 | n = 634


Declusters (mapping):  30%|‚ñà‚ñà‚ñà       | 15/50 [1:02:03<2:21:44, 242.99s/it]


üìÇ Subset 016 | n = 634


Declusters (mapping):  32%|‚ñà‚ñà‚ñà‚ñè      | 16/50 [1:06:00<2:16:39, 241.15s/it]


üìÇ Subset 017 | n = 634


Declusters (mapping):  34%|‚ñà‚ñà‚ñà‚ñç      | 17/50 [1:10:09<2:13:51, 243.38s/it]


üìÇ Subset 018 | n = 634


Declusters (mapping):  36%|‚ñà‚ñà‚ñà‚ñå      | 18/50 [1:14:12<2:09:41, 243.18s/it]


üìÇ Subset 019 | n = 634


Declusters (mapping):  38%|‚ñà‚ñà‚ñà‚ñä      | 19/50 [1:18:12<2:05:09, 242.25s/it]


üìÇ Subset 020 | n = 634


Declusters (mapping):  40%|‚ñà‚ñà‚ñà‚ñà      | 20/50 [1:22:15<2:01:21, 242.72s/it]


üìÇ Subset 021 | n = 634


Declusters (mapping):  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 21/50 [1:26:23<1:58:04, 244.29s/it]


üìÇ Subset 022 | n = 634


Declusters (mapping):  44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 22/50 [1:30:37<1:55:15, 247.00s/it]


üìÇ Subset 023 | n = 634


Declusters (mapping):  46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 23/50 [1:34:38<1:50:24, 245.34s/it]


üìÇ Subset 024 | n = 634


Declusters (mapping):  48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 24/50 [1:38:43<1:46:17, 245.29s/it]


üìÇ Subset 025 | n = 634


Declusters (mapping):  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 25/50 [1:42:51<1:42:30, 246.03s/it]


üìÇ Subset 026 | n = 634


Declusters (mapping):  52%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè    | 26/50 [1:47:00<1:38:46, 246.94s/it]


üìÇ Subset 027 | n = 634


Declusters (mapping):  54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 27/50 [1:51:03<1:34:10, 245.67s/it]


üìÇ Subset 028 | n = 634


Declusters (mapping):  56%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå    | 28/50 [1:55:06<1:29:50, 245.00s/it]


üìÇ Subset 029 | n = 634


Declusters (mapping):  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 29/50 [1:59:15<1:26:07, 246.09s/it]


üìÇ Subset 030 | n = 634


Declusters (mapping):  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 30/50 [2:03:02<1:20:08, 240.44s/it]


üìÇ Subset 031 | n = 634


Declusters (mapping):  62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 31/50 [2:06:45<1:14:27, 235.13s/it]


üìÇ Subset 032 | n = 634


Declusters (mapping):  64%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç   | 32/50 [2:10:34<1:10:01, 233.41s/it]


üìÇ Subset 033 | n = 634


Declusters (mapping):  66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 33/50 [2:14:23<1:05:41, 231.86s/it]


üìÇ Subset 034 | n = 634


Declusters (mapping):  68%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä   | 34/50 [2:18:01<1:00:46, 227.89s/it]


üìÇ Subset 035 | n = 634


Declusters (mapping):  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 35/50 [2:21:42<56:28, 225.87s/it]  


üìÇ Subset 036 | n = 634


Declusters (mapping):  72%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè  | 36/50 [2:25:25<52:27, 224.86s/it]


üìÇ Subset 037 | n = 634


Declusters (mapping):  74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 37/50 [2:29:12<48:53, 225.66s/it]


üìÇ Subset 038 | n = 634


Declusters (mapping):  76%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 38/50 [2:33:01<45:17, 226.44s/it]


üìÇ Subset 039 | n = 634


Declusters (mapping):  78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 39/50 [2:36:40<41:07, 224.36s/it]


üìÇ Subset 040 | n = 634


Declusters (mapping):  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 40/50 [2:40:27<37:30, 225.04s/it]


üìÇ Subset 041 | n = 634


Declusters (mapping):  82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 41/50 [2:44:15<33:55, 226.12s/it]


üìÇ Subset 042 | n = 634


Declusters (mapping):  84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 42/50 [2:48:05<30:16, 227.07s/it]


üìÇ Subset 043 | n = 634


Declusters (mapping):  86%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå | 43/50 [2:51:50<26:26, 226.59s/it]


üìÇ Subset 044 | n = 634


Declusters (mapping):  88%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä | 44/50 [2:55:37<22:40, 226.78s/it]


üìÇ Subset 045 | n = 634


Declusters (mapping):  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 45/50 [2:59:29<19:01, 228.20s/it]


üìÇ Subset 046 | n = 634


Declusters (mapping):  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 46/50 [3:03:16<15:10, 227.75s/it]


üìÇ Subset 047 | n = 634


Declusters (mapping):  94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 47/50 [3:07:06<11:25, 228.46s/it]


üìÇ Subset 048 | n = 634


Declusters (mapping):  96%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 48/50 [3:11:05<07:43, 231.56s/it]


üìÇ Subset 049 | n = 634


Declusters (mapping):  98%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä| 49/50 [3:15:04<03:54, 234.03s/it]


üìÇ Subset 050 | n = 634


Declusters (mapping): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [3:19:04<00:00, 238.89s/it]



‚úÖ Bagged maps written to:
/Users/inesschwartz/Desktop/model/bagged_maps


all below just trial and error:

In [3]:
# =========================================================
# Decluster modeling loop with LOO Kriging CV + RF and RF+OK
# =========================================================

import os
import glob
import warnings

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold

from pykrige.ok import OrdinaryKriging
from tqdm import tqdm

warnings.filterwarnings("ignore")

# =========================================================
# PATHS AND PARAMETERS
# =========================================================
decluster_dir = "/Users/inesschwartz/Desktop/model/decluster_runs_aligned"
output_dir = "/Users/inesschwartz/Desktop/model/results_calibration"
os.makedirs(output_dir, exist_ok=True)

# --- RF parameters (from tuning) ---
rf_params = {
    'n_estimators': 500,
    'min_samples_leaf': 5,
    'max_features': 0.5,
    'max_depth': 20,
    'random_state': 42,
    'n_jobs': 4
}

# --- Variogram parameters (spherical + anisotropy) ---
variogram_model = "spherical"
variogram_params = {
    "nugget": 0.08,
    "sill": 0.22,
    "range": 39000,
    "anisotropy_scaling": 0.18,   # minor/major ratio
    "anisotropy_angle": 135       # degrees CCW from x-axis
}

# =========================================================
# HELPER FUNCTIONS
# =========================================================
def compute_metrics(y_true, y_pred):
    """Compute standard regression metrics."""
    return {
        "R2": r2_score(y_true, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "MAE": mean_absolute_error(y_true, y_pred),
    }


def cross_validate_rf(X, y, k=5):
    """
    k-fold CV for Random Forest.
    Returns:
        metrics (dict)
        y_pred_cv (np.ndarray): CV predictions for each observation.
    """
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    y_pred_cv = np.zeros_like(y, dtype=float)

    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train = y[train_idx]

        rf = RandomForestRegressor(**rf_params)
        rf.fit(X_train, y_train)
        y_pred_cv[test_idx] = rf.predict(X_test)

    metrics = compute_metrics(y, y_pred_cv)
    return metrics, y_pred_cv


def loo_ok_cv(coords, values, var_params, model="spherical"):
    """
    Leave-one-out CV for Ordinary Kriging.

    For each point i:
      - Fit OK on all points except i
      - Predict at point i

    Returns:
        metrics (dict)
        y_pred_loo (np.ndarray): LOO predictions for each observation.
    """
    n = len(values)
    y_true = values
    y_pred = np.zeros(n, dtype=float)

    # Unpack variogram tuple in PyKrige format:
    # (sill, range, nugget, anisotropy_scaling, anisotropy_angle)
    vk_tuple = (
        var_params["sill"],
        var_params["range"],
        var_params["nugget"],
        var_params["anisotropy_scaling"],
        var_params["anisotropy_angle"],
    )

    for i in tqdm(range(n), desc="LOO OK", leave=False):
        mask = np.ones(n, dtype=bool)
        mask[i] = False

        ok = OrdinaryKriging(
            x=coords[mask, 0],
            y=coords[mask, 1],
            z=values[mask],
            variogram_model=model,
            variogram_parameters=vk_tuple,
            enable_plotting=False,
            verbose=False,
        )

        z, _ = ok.execute("points",
                          np.array([coords[i, 0]]),
                          np.array([coords[i, 1]]))

        # z is a masked array; take its first value
        y_pred[i] = float(z.data[0])

    metrics = compute_metrics(y_true, y_pred)
    return metrics, y_pred


def loo_ok_cv_on_residuals(coords, residuals, var_params, model="spherical"):
    """
    LOO CV for OK on residuals (RF+OK hybrid).

    For each point i:
      - Fit OK on residuals of all points except i
      - Predict residual at point i

    Returns:
        metrics (dict) on residuals
        resid_pred_loo (np.ndarray): LOO predictions of residuals.
    """
    n = len(residuals)
    y_true = residuals
    y_pred = np.zeros(n, dtype=float)

    vk_tuple = (
        var_params["sill"],
        var_params["range"],
        var_params["nugget"],
        var_params["anisotropy_scaling"],
        var_params["anisotropy_angle"],
    )

    for i in tqdm(range(n), desc="LOO OK (residuals)", leave=False):
        mask = np.ones(n, dtype=bool)
        mask[i] = False

        ok = OrdinaryKriging(
            x=coords[mask, 0],
            y=coords[mask, 1],
            z=residuals[mask],
            variogram_model=model,
            variogram_parameters=vk_tuple,
            enable_plotting=False,
            verbose=False,
        )

        z, _ = ok.execute("points",
                          np.array([coords[i, 0]]),
                          np.array([coords[i, 1]]))

        y_pred[i] = float(z.data[0])

    metrics = compute_metrics(y_true, y_pred)
    return metrics, y_pred


# =========================================================
# MAIN CALIBRATION LOOP
# =========================================================
files = sorted(glob.glob(os.path.join(decluster_dir, "decluster_run_*.csv")))
n_calibration = 10  # limit to first N decluster subsets for calibration

print(f"üîÅ Running calibration for first {n_calibration} declustered subsets...")

calibration_summary = []  # per-subset per-model metrics
point_errors = []         # per-point prediction errors

for i, file in enumerate(tqdm(files[:n_calibration], desc="Decluster subsets"), start=1):
    df = pd.read_csv(file)
    n_obs = len(df)
    print(f"\nüìÇ Processing decluster subset {i:03d} | n = {n_obs}")

    target = "log_soc_stock"

    # Features: drop non-feature columns
    X = df.drop(columns=[target, "X_coord", "Y_coord", "site_info_id"], errors="ignore")
    y = df[target].values
    coords = df[["X_coord", "Y_coord"]].values

    # Keep site_info_id if present; otherwise just index
    if "site_info_id" in df.columns:
        site_ids = df["site_info_id"].values
    else:
        site_ids = np.arange(n_obs)

    print("‚öôÔ∏è Running RF CV, LOO OK, and RF+OK hybrid...")

    # ---------------------------------
    # 1) Random Forest k-fold CV
    # ---------------------------------
    rf_metrics, y_pred_rf_cv = cross_validate_rf(X, y, k=5)

    # ---------------------------------
    # 2) LOO Ordinary Kriging (spherical + anisotropy)
    # ---------------------------------
    ok_metrics, y_pred_ok_loo = loo_ok_cv(coords, y, variogram_params, model=variogram_model)

    # ---------------------------------
    # 3) RF + OK residual kriging (CV RF + LOO OK on residuals)
    # ---------------------------------
    residuals = y - y_pred_rf_cv
    ok_resid_metrics, y_pred_resid_loo = loo_ok_cv_on_residuals(
        coords, residuals, variogram_params, model=variogram_model
    )

    y_pred_hybrid = y_pred_rf_cv + y_pred_resid_loo
    hybrid_metrics = compute_metrics(y, y_pred_hybrid)

    # ---------------------------------
    # Store summary metrics
    # ---------------------------------
    calibration_summary.append({"subset": i, "model": "RF_CV", **rf_metrics})
    calibration_summary.append({"subset": i, "model": "OK_LOO", **ok_metrics})
    calibration_summary.append({"subset": i, "model": "OK_resid_LOO", **ok_resid_metrics})
    calibration_summary.append({"subset": i, "model": "RF+OK", **hybrid_metrics})

    print(
        f"‚úÖ DC{i:03d} | "
        f"RF R¬≤={rf_metrics['R2']:.3f} | "
        f"OK_LOO R¬≤={ok_metrics['R2']:.3f} | "
        f"RF+OK R¬≤={hybrid_metrics['R2']:.3f}"
    )

    # ---------------------------------
    # Store per-point prediction errors
    # ---------------------------------
    models_point_preds = {
        "RF_CV": y_pred_rf_cv,
        "OK_LOO": y_pred_ok_loo,
        "RF+OK": y_pred_hybrid,
    }

    for model_name, y_pred_model in models_point_preds.items():
        errors = y - y_pred_model
        abs_errors = np.abs(errors)

        df_err = pd.DataFrame({
            "subset": i,
            "model": model_name,
            "site_info_id": site_ids,
            "y_true": y,
            "y_pred": y_pred_model,
            "error": errors,
            "abs_error": abs_errors,
        })
        point_errors.append(df_err)

# =========================================================
# SAVE RESULTS
# =========================================================
summary_csv = os.path.join(output_dir, "calibration_results_summary.csv")
errors_csv = os.path.join(output_dir, "calibration_point_errors.csv")

pd.DataFrame(calibration_summary).to_csv(summary_csv, index=False)
pd.concat(point_errors, ignore_index=True).to_csv(errors_csv, index=False)

print("\n‚úÖ Calibration diagnostics complete.")
print(f"üìÅ Summary metrics saved to: {summary_csv}")
print(f"üìÅ Point-level prediction errors saved to: {errors_csv}")


üîÅ Running calibration for first 10 declustered subsets...


Decluster subsets:   0%|          | 0/10 [00:00<?, ?it/s]


üìÇ Processing decluster subset 001 | n = 634
‚öôÔ∏è Running RF CV, LOO OK, and RF+OK hybrid...


Decluster subsets:   0%|          | 0/10 [00:02<?, ?it/s]


TypeError: Variogram model parameters must be provided in either a list or a dict when they are explicitly specified.

Calibration results summary (DC‚ÇÅ‚Äì‚ÇÅ‚ÇÄ):
Random Forest shows consistent moderate predictive ability (R¬≤ ‚âà 0.32, RMSE ‚âà 0.44).
However, Ordinary Kriging (OK) and hybrid RF+OK show perfect interpolation (R¬≤=1.0), indicating they were evaluated on the same training data rather than withheld samples.
The next step is to implement Leave-One-Out (LOO) kriging validation to obtain unbiased performance estimates for OK and RF+OK before proceeding to ensemble modeling.

## declusters--> models --> perfomance metrics and tifs --> aggregated ensemble for final

In [2]:
#1 cross validation metrics 

# =========================================================
# SOC_spatialCV_metrics.py
# =========================================================
# Runs 5-fold spatial CV (10 km GroupKFold) for RF, OK, RF+OK.
# Saves per-fold and per-decluster metrics.
# =========================================================

import os, sys, glob, gc
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from pykrige.ok import OrdinaryKriging

# ---------- User parameters ----------
decluster_dir = "/Users/inesschwartz/Desktop/model/decluster_runs_aligned"
metrics_dir   = "/Users/inesschwartz/Desktop/model/results_spatialCV"
os.makedirs(metrics_dir, exist_ok=True)

target = "log_soc_stock"
features = [
    "MRRTF", "MRVBF", "annual_precip", "grazing_1950", "cropland_1950",
    "precip_wettest_month", "relief_TRI", "standardized_height",
    "temp_annual_range", "terrain_surf_convexity", "terrain_surf_texture",
    "tmax_mean", "valley_depth", "faosoil_id", "slope_height"
]

rf_params = dict(
    n_estimators=1000, min_samples_leaf=3, max_features=0.5,
    max_depth=None, random_state=42, n_jobs=1
)
vmodel = "gaussian"
vparams = {"nugget": 0.0, "sill": 0.18, "range": 14000}

# ---------- Batch arguments ----------
if len(sys.argv) >= 3:
    start_idx, end_idx = int(sys.argv[1]), int(sys.argv[2])
else:
    start_idx, end_idx = 1, 10

print(f"\nüöÄ Running spatial CV for declusters {start_idx}‚Äì{end_idx}")

# ---------- Helpers ----------
def block_groups(df, block_m=10000):
    gx = (df["X_coord"] // block_m).astype(int)
    gy = (df["Y_coord"] // block_m).astype(int)
    return (gx.astype(str) + "_" + gy.astype(str)).values

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def krige_predict(xy_train, z_train, xy_test):
    ok = OrdinaryKriging(
        x=xy_train[:,0], y=xy_train[:,1], z=z_train,
        variogram_model=vmodel, variogram_parameters=vparams,
        enable_plotting=False, verbose=False
    )
    pred, _ = ok.execute("points", xy_test[:,0], xy_test[:,1])
    return np.asarray(pred)

# ---------- Main loop ----------
decluster_files = sorted(glob.glob(os.path.join(decluster_dir, "decluster_run_*.csv")))
subset = decluster_files[start_idx-1:end_idx]
all_rows = []

for path in subset:
    name = os.path.basename(path)
    it = int(name.split("_")[-1].split(".")[0])
    print(f"\nüß© Decluster {it:03d}")

    df = pd.read_csv(path)
    df = df.dropna(subset=[target]+features+["X_coord","Y_coord"])

    X = df[features].astype("float32").values
    y = df[target].astype("float32").values
    XY = df[["X_coord","Y_coord"]].astype("float32").values
    groups = block_groups(df, block_m=10000)
    cv = GroupKFold(n_splits=5)

    fold = 0
    for train_idx, test_idx in cv.split(X, y, groups=groups):
        fold += 1
        Xtr, Xte = X[train_idx], X[test_idx]
        ytr, yte = y[train_idx], y[test_idx]
        XYtr, XYte = XY[train_idx], XY[test_idx]

        # RF
        rf = RandomForestRegressor(**rf_params)
        rf.fit(Xtr, ytr)
        rf_te = rf.predict(Xte)

        # OK
        ok_te = krige_predict(XYtr, ytr, XYte)

        # RF+OK
        resid_tr = ytr - rf.predict(Xtr)
        rk_te = rf_te + krige_predict(XYtr, resid_tr, XYte)

        for model_name, preds in (("RF", rf_te), ("OK", ok_te), ("RF_OK", rk_te)):
            all_rows.append({
                "decluster": it, "fold": fold, "model": model_name,
                "R2": r2_score(yte, preds),
                "RMSE": rmse(yte, preds),
                "MAE": mean_absolute_error(yte, preds)
            })
        gc.collect()

# ---------- Save metrics ----------
metrics_df = pd.DataFrame(all_rows)
out_metrics = os.path.join(metrics_dir, f"spatialCV_metrics_{start_idx:03d}_{end_idx:03d}.csv")
metrics_df.to_csv(out_metrics, index=False)

summary = (
    metrics_df.groupby(["decluster","model"])
    .agg(R2_mean=("R2","mean"), RMSE_mean=("RMSE","mean"), MAE_mean=("MAE","mean"))
    .reset_index()
)
out_summary = os.path.join(metrics_dir, f"spatialCV_summary_{start_idx:03d}_{end_idx:03d}.csv")
summary.to_csv(out_summary, index=False)

print(f"\n‚úÖ Spatial CV done for {len(subset)} declusters.")
print(f"üìÅ Metrics saved to {out_metrics}")
print(f"üìÅ Summary saved to {out_summary}")



üöÄ Running spatial CV for declusters 1‚Äì10

üß© Decluster 001


KeyboardInterrupt: 

In [None]:
## make the final ensemble rasters (once both metrics and per cluster predictions are completed)
## run inverse variance aggregation 

## REsuls still very poor...checking 1 decluster at a time

In [14]:
# =========================================================
# SOC_RF_declu001_clean.py
# =========================================================
# Random Forest: 5-fold spatial CV + full prediction (log-space)
# =========================================================

import os, gc
import numpy as np
import pandas as pd
import rasterio
from rasterio.transform import rowcol
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# ---------- Paths ----------
decluster_csv = "/Users/inesschwartz/Desktop/model/decluster_runs_aligned/decluster_run_001.csv"
pred_grid_csv = "/Users/inesschwartz/Desktop/model/covariates_stack_1km_utm33s_fixed.csv"
ref_raster    = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/terraincovs/slope_height.tif"
out_dir       = "/Users/inesschwartz/Desktop/model/ensemble_preds"
metrics_dir   = "/Users/inesschwartz/Desktop/model/results_spatialCV"
os.makedirs(out_dir, exist_ok=True)
os.makedirs(metrics_dir, exist_ok=True)

# ---------- Target & Features ----------
target = "log_soc_stock"
features = [
    "MRRTF", "MRVBF", "annual_precip", "grazing_1950", "cropland_1950",
    "precip_wettest_month", "relief_TRI", "standardized_height",
    "temp_annual_range", "terrain_surf_convexity", "terrain_surf_texture",
    "tmax_mean", "valley_depth", "faosoil_id", "slope_height"
]

rf_params = dict(
    n_estimators=1500,
    min_samples_leaf=3,
    max_features=0.5,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)

# =========================================================
# 1. Load and prepare data
# =========================================================
df = pd.read_csv(decluster_csv)
df = df.dropna(subset=[target] + features + ["X_coord", "Y_coord"])

# keep as DataFrame to preserve feature names (avoids sklearn warning)
X = df[features].astype("float32")
y = df[target].astype("float32")
XY = df[["X_coord", "Y_coord"]].astype("float32")

# =========================================================
# 2. Define spatial 5-fold CV blocks
# =========================================================
def block_groups(df, block_m=10000):
    gx = (df["X_coord"] // block_m).astype(int)
    gy = (df["Y_coord"] // block_m).astype(int)
    return (gx.astype(str) + "_" + gy.astype(str)).values

groups = block_groups(df)
cv = GroupKFold(n_splits=5)

# =========================================================
# 3. Spatial CV evaluation
# =========================================================
metrics = []

for fold, (tr, te) in enumerate(cv.split(X, y, groups=groups), 1):
    rf = RandomForestRegressor(**rf_params)
    rf.fit(X.iloc[tr], y.iloc[tr])
    y_pred = rf.predict(X.iloc[te])
    metrics.append(dict(
        fold=fold,
        R2=r2_score(y.iloc[te], y_pred),
        RMSE=np.sqrt(mean_squared_error(y.iloc[te], y_pred)),
        MAE=np.mean(np.abs(y.iloc[te] - y_pred))
    ))
    del rf
    gc.collect()

# Save and print mean metrics
metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv(f"{metrics_dir}/spatialCV_RF_001.csv", index=False)

print("\n===== Spatial CV Results (log-space) =====")
print(metrics_df)
print("------------------------------------------")
print("Mean R¬≤  :", round(metrics_df['R2'].mean(), 3))
print("Mean RMSE:", round(metrics_df['RMSE'].mean(), 3))
print("Mean MAE :", round(metrics_df['MAE'].mean(), 3))
print("==========================================\n")

# =========================================================
# 4. Full model training + prediction (still log-space)
# =========================================================
rf_full = RandomForestRegressor(**rf_params).fit(X, y)

df_pred = pd.read_csv(pred_grid_csv)
X_pred = df_pred[features].astype("float32").copy()
coords_pred = df_pred[["X_coord", "Y_coord"]].astype("float32").values

# --- Raster reference setup ---
with rasterio.open(ref_raster) as ref:
    profile = ref.profile
    transform = ref.transform
    width, height = ref.width, ref.height

rows, cols = rowcol(transform, df_pred["X_coord"], df_pred["Y_coord"])
rows, cols = np.array(rows), np.array(cols)
valid_mask = ((rows >= 0) & (rows < height) & (cols >= 0) & (cols < width))

# --- Predict (log space only) ---
pred_rf = rf_full.predict(X_pred)

# --- Write raster ---
arr = np.full((height, width), np.nan, dtype="float32")
arr[rows[valid_mask], cols[valid_mask]] = pred_rf[valid_mask].astype("float32")
profile.update(dtype="float32", count=1, compress="lzw", nodata=np.nan)

out_tif = os.path.join(out_dir, "RF_decluster_001_log.tif")
with rasterio.open(out_tif, "w", **profile) as dst:
    dst.write(arr, 1)

print(f"üíæ Saved RF log-space raster ‚Üí {out_tif}")




===== Spatial CV Results (log-space) =====
   fold        R2      RMSE       MAE
0     1  0.081217  0.481017  0.361800
1     2  0.310511  0.415188  0.304703
2     3  0.319061  0.434719  0.323934
3     4  0.332242  0.465566  0.345921
4     5  0.277523  0.453959  0.317253
------------------------------------------
Mean R¬≤  : 0.264
Mean RMSE: 0.45
Mean MAE : 0.331

üíæ Saved RF log-space raster ‚Üí /Users/inesschwartz/Desktop/model/ensemble_preds/RF_decluster_001_log.tif


## looking for best hyper parameters for RF

In [4]:
# =========================================================
# SOC_RF_param_search.py
# =========================================================
# Random Forest: spatial 5-fold CV hyperparameter search
# =========================================================

import os, itertools, gc
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# ---------- Paths ----------
decluster_csv = "/Users/inesschwartz/Desktop/model/decluster_runs_aligned/decluster_run_001.csv"
metrics_dir   = "/Users/inesschwartz/Desktop/model/results_spatialCV"
os.makedirs(metrics_dir, exist_ok=True)

# ---------- Target & Features ----------
target = "log_soc_stock"
features = [
    "MRRTF", "MRVBF", "annual_precip", "grazing_1950", "cropland_1950",
    "precip_wettest_month", "relief_TRI", "standardized_height",
    "temp_annual_range", "terrain_surf_convexity", "terrain_surf_texture",
    "tmax_mean", "valley_depth", "faosoil_id", "slope_height"
]

# ---------- Load data ----------
df = pd.read_csv(decluster_csv)
df = df.dropna(subset=[target] + features + ["X_coord", "Y_coord"])

X = df[features].astype("float32").values
y = df[target].astype("float32").values
XY = df[["X_coord", "Y_coord"]].astype("float32")

# ---------- Define 5 spatial blocks ----------
def block_groups(df, block_m=10000):
    gx = (df["X_coord"] // block_m).astype(int)
    gy = (df["Y_coord"] // block_m).astype(int)
    return (gx.astype(str) + "_" + gy.astype(str)).values

groups = block_groups(df)
cv = GroupKFold(n_splits=5)

# ---------- Parameter grid ----------
param_grid = {
    "n_estimators": [500, 1000, 1500],
    "max_depth": [10, 20, None],
    "min_samples_leaf": [1, 3, 5],
    "max_features": [0.5, "sqrt", None],
}

# ---------- Run grid search ----------
results = []
combinations = list(itertools.product(
    param_grid["n_estimators"],
    param_grid["max_depth"],
    param_grid["min_samples_leaf"],
    param_grid["max_features"]
))

print(f"üîç Testing {len(combinations)} parameter combinations...")

for (n_est, m_dep, min_leaf, m_feat) in combinations:
    fold_metrics = []
    for fold, (tr, te) in enumerate(cv.split(X, y, groups=groups), 1):
        rf = RandomForestRegressor(
            n_estimators=n_est,
            max_depth=m_dep,
            min_samples_leaf=min_leaf,
            max_features=m_feat,
            random_state=42,
            n_jobs=-1
        )
        rf.fit(X[tr], y[tr])
        y_pred = rf.predict(X[te])
        fold_metrics.append(dict(
            fold=fold,
            R2=r2_score(y[te], y_pred),
            RMSE=np.sqrt(mean_squared_error(y[te], y_pred)),
            MAE=np.mean(np.abs(y[te] - y_pred))
        ))
        del rf; gc.collect()

    dfm = pd.DataFrame(fold_metrics)
    results.append(dict(
        n_estimators=n_est,
        max_depth=m_dep,
        min_samples_leaf=min_leaf,
        max_features=m_feat,
        mean_R2=dfm["R2"].mean(),
        mean_RMSE=dfm["RMSE"].mean(),
        mean_MAE=dfm["MAE"].mean()
    ))

# ---------- Save results ----------
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="mean_R2", ascending=False)
out_csv = os.path.join(metrics_dir, "RF_param_search_results.csv")
results_df.to_csv(out_csv, index=False)

best = results_df.iloc[0]
print("\n‚úÖ Best RF parameters:")
print(best)
print(f"\nüíæ Saved full results to: {out_csv}")


üîç Testing 81 parameter combinations...

‚úÖ Best RF parameters:
n_estimators            1000
max_depth               10.0
min_samples_leaf           3
max_features             0.5
mean_R2             0.266852
mean_RMSE           0.449482
mean_MAE             0.32952
Name: 30, dtype: object

üíæ Saved full results to: /Users/inesschwartz/Desktop/model/results_spatialCV/RF_param_search_results.csv


### OK for decluster 1

In [8]:
# =========================================================
# SOC_OK_declu001_clean.py
# =========================================================
# Ordinary Kriging: Leave-One-Out CV + full national prediction (log-space)
# =========================================================

import os, gc
import numpy as np
import pandas as pd
import rasterio
from rasterio.transform import rowcol
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from pykrige.ok import OrdinaryKriging
from tqdm import tqdm

# ---------- Paths ----------
decluster_csv = "/Users/inesschwartz/Desktop/model/decluster_runs_aligned/decluster_run_001.csv"
pred_grid_csv = "/Users/inesschwartz/Desktop/model/covariates_stack_1km_utm33s_fixed.csv"
ref_raster    = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/terraincovs/slope_height.tif"
out_dir       = "/Users/inesschwartz/Desktop/model/ensemble_preds"
metrics_dir   = "/Users/inesschwartz/Desktop/model/results_spatialCV"
os.makedirs(out_dir, exist_ok=True)
os.makedirs(metrics_dir, exist_ok=True)

# ---------- Parameters ----------
target = "log_soc_stock"
variogram_model = "gaussian"
variogram_params = {"nugget": 0.000, "sill": 0.181, "range": 14206}
BLOCK_SIZE = 10000  # number of points per kriging batch

# =========================================================
# 1. Load and prepare data
# =========================================================
df = pd.read_csv(decluster_csv).dropna(subset=[target, "X_coord", "Y_coord"])
y = df[target].astype("float32").values
XY = df[["X_coord", "Y_coord"]].astype("float32").values

# =========================================================
# 2. Leave-One-Out Cross-Validation (LOO)
# =========================================================
preds = np.zeros_like(y)
n = len(y)
print(f"\nüîÅ Running Leave-One-Out CV for {n} samples...")

for i in tqdm(range(n)):
    mask = np.ones(n, dtype=bool)
    mask[i] = False
    try:
        ok = OrdinaryKriging(
            x=XY[mask, 0],
            y=XY[mask, 1],
            z=y[mask],
            variogram_model=variogram_model,
            variogram_parameters=variogram_params,
            enable_plotting=False,
            verbose=False,
        )
        pred, _ = ok.execute("points", XY[i:i+1, 0], XY[i:i+1, 1])
        preds[i] = pred[0]
    except Exception as e:
        preds[i] = np.nan
        print(f"‚ö†Ô∏è Point {i} failed: {e}")
    gc.collect()

# Evaluate CV results
cv_mask = ~np.isnan(preds)
R2 = r2_score(y[cv_mask], preds[cv_mask])
RMSE = np.sqrt(mean_squared_error(y[cv_mask], preds[cv_mask]))
MAE = np.mean(np.abs(y[cv_mask] - preds[cv_mask]))

cv_results = pd.DataFrame([{"R2": R2, "RMSE": RMSE, "MAE": MAE, "n_valid": cv_mask.sum()}])
cv_csv = os.path.join(metrics_dir, "LOO_OK_001.csv")
cv_results.to_csv(cv_csv, index=False)

print("\n===== Ordinary Kriging LOO Results (log-space) =====")
print(cv_results.round(4))
print("====================================================\n")

# =========================================================
# 3. National prediction (log-space)
# =========================================================
print("üåç Generating national OK predictions (log-space)...")

df_pred = pd.read_csv(pred_grid_csv)
coords_pred = df_pred[["X_coord", "Y_coord"]].astype("float32").values

with rasterio.open(ref_raster) as ref:
    profile = ref.profile
    transform = ref.transform
    width, height = ref.width, ref.height

rows, cols = rowcol(transform, df_pred["X_coord"], df_pred["Y_coord"])
rows, cols = np.array(rows), np.array(cols)
valid_mask = ((rows >= 0) & (rows < height) & (cols >= 0) & (cols < width))

# Fit final OK on all data
ok_full = OrdinaryKriging(
    x=XY[:, 0],
    y=XY[:, 1],
    z=y,
    variogram_model=variogram_model,
    variogram_parameters=variogram_params,
    enable_plotting=False,
    verbose=False,
)

# Predict in manageable chunks
preds_full = []
for i in tqdm(range(0, len(coords_pred), BLOCK_SIZE)):
    sub = coords_pred[i:i + BLOCK_SIZE]
    pred_block, _ = ok_full.execute("points", sub[:, 0], sub[:, 1])
    preds_full.append(pred_block)
pred_ok = np.concatenate(preds_full)

# =========================================================
# 4. Write raster (still in log-space)
# =========================================================
arr = np.full((height, width), np.nan, dtype="float32")
arr[rows[valid_mask], cols[valid_mask]] = pred_ok[valid_mask].astype("float32")

profile.update(dtype="float32", count=1, compress="lzw", nodata=np.nan)
out_tif = os.path.join(out_dir, "OK_decluster_001_log.tif")

with rasterio.open(out_tif, "w", **profile) as dst:
    dst.write(arr, 1)

print(f"üíæ Saved OK log-space raster ‚Üí {out_tif}")



üîÅ Running Leave-One-Out CV for 626 samples...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 626/626 [01:15<00:00,  8.31it/s]



===== Ordinary Kriging LOO Results (log-space) =====
       R2    RMSE     MAE  n_valid
0  0.0566  0.5122  0.3895      626

üåç Generating national OK predictions (log-space)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 126/126 [00:48<00:00,  2.61it/s]


üíæ Saved OK log-space raster ‚Üí /Users/inesschwartz/Desktop/model/ensemble_preds/OK_decluster_001_log.tif


### variogram hyper parameter tuning (re doing variogram parameters)

In [8]:
# =========================================================
#   Decluster Modeling with LOO OK CV + RF CV + RF+OK CV
#   Fully Patched Version ‚Äî using grid-search best variogram
# =========================================================

import os
import glob
import warnings
import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold
from pykrige.ok import OrdinaryKriging

warnings.filterwarnings("ignore")

# =========================================================
# PATHS AND PARAMETERS
# =========================================================
decluster_dir = "/Users/inesschwartz/Desktop/model/decluster_runs_aligned"
output_dir = "/Users/inesschwartz/Desktop/model/results_calibration"
os.makedirs(output_dir, exist_ok=True)

# --- Random Forest Parameters ---
rf_params = {
    'n_estimators': 500,
    'min_samples_leaf': 5,
    'max_features': 0.5,
    'max_depth': 20,
    'random_state': 42,
    'n_jobs': 4
}

# === Best variogram from your grid search ===
best_variogram = {
    "nugget": 0.08,
    "sill": 0.21,
    "range": 50000,
    "anisotropy_scaling": 0.40,
    "anisotropy_angle": 135
}

variogram_model = "spherical"

# =========================================================
# METRICS + CV HELPER FUNCTIONS
# =========================================================
def compute_metrics(y_true, y_pred):
    return {
        "R2": r2_score(y_true, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),
        "MAE": mean_absolute_error(y_true, y_pred)
    }

def cross_validate_rf(X, y, k=5):
    """RF k-fold CV returning metrics + per-point CV predictions."""
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    y_pred_cv = np.zeros_like(y, dtype=float)

    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train = y[train_idx]

        rf = RandomForestRegressor(**rf_params)
        rf.fit(X_train, y_train)
        y_pred_cv[test_idx] = rf.predict(X_test)

    metrics = compute_metrics(y, y_pred_cv)
    return metrics, y_pred_cv


# =========================================================
#   LOO Ordinary Kriging Cross-Validation
# =========================================================
def loo_ok_cv(coords, values, var_params, model="spherical"):
    n = len(values)
    preds = np.zeros(n)

    for i in tqdm(range(n), desc="LOO OK", leave=False):
        mask = np.ones(n, dtype=bool)
        mask[i] = False

        ok = OrdinaryKriging(
            x=coords[mask, 0],
            y=coords[mask, 1],
            z=values[mask],
            variogram_model=model,
            variogram_parameters={
                "sill": var_params["sill"],
                "range": var_params["range"],
                "nugget": var_params["nugget"],
                "anisotropy_scaling": var_params["anisotropy_scaling"],
                "anisotropy_angle": var_params["anisotropy_angle"],
            },
            enable_plotting=False,
            verbose=False
        )

        z, _ = ok.execute(
            "points",
            np.array([coords[i, 0]]),
            np.array([coords[i, 1]])
        )

        preds[i] = float(z.data[0])

    metrics = compute_metrics(values, preds)
    return metrics, preds


# =========================================================
#   LOO OK on Residuals for RF+OK hybrid
# =========================================================
def loo_ok_residuals(coords, residuals, var_params, model="spherical"):
    n = len(residuals)
    preds = np.zeros(n)

    for i in tqdm(range(n), desc="LOO OK residuals", leave=False):
        mask = np.ones(n, dtype=bool)
        mask[i] = False

        ok = OrdinaryKriging(
            x=coords[mask, 0],
            y=coords[mask, 1],
            z=residuals[mask],
            variogram_model=model,
            variogram_parameters={
                "sill": var_params["sill"],
                "range": var_params["range"],
                "nugget": var_params["nugget"],
                "anisotropy_scaling": var_params["anisotropy_scaling"],
                "anisotropy_angle": var_params["anisotropy_angle"],
            },
            enable_plotting=False,
            verbose=False
        )

        z, _ = ok.execute(
            "points",
            np.array([coords[i, 0]]),
            np.array([coords[i, 1]])
        )

        preds[i] = float(z.data[0])

    metrics = compute_metrics(residuals, preds)
    return metrics, preds


# =========================================================
# MAIN LOOP OVER DECLUSTER FILES
# =========================================================
files = sorted(glob.glob(os.path.join(decluster_dir, "decluster_run_*.csv")))
n_calibration = 10

print(f"üîÅ Running calibration for first {n_calibration} decluster subsets...")

summary_records = []
error_records = []

for idx, file in enumerate(tqdm(files[:n_calibration], desc="Decluster sets"), start=1):
    df = pd.read_csv(file)
    n = len(df)

    print(f"\nüìÇ Subset {idx:03d} | n={n}")

    y = df["log_soc_stock"].values
    coords = df[["X_coord", "Y_coord"]].values
    site_ids = df["site_info_id"].values if "site_info_id" in df.columns else np.arange(n)
    X = df.drop(columns=["log_soc_stock", "X_coord", "Y_coord", "site_info_id"], errors="ignore")

    # 1. RF CV
    rf_metrics, y_pred_rf = cross_validate_rf(X, y, k=5)

    # 2. OK CV (using best variogram)
    ok_metrics, y_pred_ok = loo_ok_cv(coords, y, best_variogram, model=variogram_model)

    # 3. RF + OK hybrid
    residuals = y - y_pred_rf
    ok_resid_metrics, y_pred_resid = loo_ok_residuals(coords, residuals, best_variogram, model=variogram_model)
    y_pred_hybrid = y_pred_rf + y_pred_resid
    hybrid_metrics = compute_metrics(y, y_pred_hybrid)

    summary_records.append({"subset": idx, "model": "RF_CV", **rf_metrics})
    summary_records.append({"subset": idx, "model": "OK_LOO", **ok_metrics})
    summary_records.append({"subset": idx, "model": "OK_resid_LOO", **ok_resid_metrics})
    summary_records.append({"subset": idx, "model": "RF+OK", **hybrid_metrics})

    print(
        f"‚úÖ DC{idx:03d} | RF R¬≤={rf_metrics['R2']:.3f} | "
        f"OK R¬≤={ok_metrics['R2']:.3f} | "
        f"RF+OK R¬≤={hybrid_metrics['R2']:.3f}"
    )


print("\nüéâ DONE!")


üîÅ Running calibration for first 10 decluster subsets...


Decluster sets:   0%|          | 0/10 [00:00<?, ?it/s]


üìÇ Subset 001 | n=634


Decluster sets:  10%|‚ñà         | 1/10 [00:54<08:09, 54.39s/it]

‚úÖ DC001 | RF R¬≤=0.370 | OK R¬≤=0.206 | RF+OK R¬≤=0.259

üìÇ Subset 002 | n=634


Decluster sets:  20%|‚ñà‚ñà        | 2/10 [01:49<07:18, 54.79s/it]

‚úÖ DC002 | RF R¬≤=0.370 | OK R¬≤=0.188 | RF+OK R¬≤=0.244

üìÇ Subset 003 | n=634


Decluster sets:  30%|‚ñà‚ñà‚ñà       | 3/10 [02:42<06:17, 53.93s/it]

‚úÖ DC003 | RF R¬≤=0.384 | OK R¬≤=0.203 | RF+OK R¬≤=0.278

üìÇ Subset 004 | n=634


Decluster sets:  40%|‚ñà‚ñà‚ñà‚ñà      | 4/10 [03:50<05:57, 59.56s/it]

‚úÖ DC004 | RF R¬≤=0.384 | OK R¬≤=0.204 | RF+OK R¬≤=0.278

üìÇ Subset 005 | n=634


Decluster sets:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 5/10 [04:47<04:52, 58.57s/it]

‚úÖ DC005 | RF R¬≤=0.374 | OK R¬≤=0.208 | RF+OK R¬≤=0.265

üìÇ Subset 006 | n=634


Decluster sets:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 6/10 [05:44<03:51, 57.97s/it]

‚úÖ DC006 | RF R¬≤=0.388 | OK R¬≤=0.204 | RF+OK R¬≤=0.278

üìÇ Subset 007 | n=634


Decluster sets:  70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 7/10 [06:44<02:56, 58.68s/it]

‚úÖ DC007 | RF R¬≤=0.356 | OK R¬≤=0.188 | RF+OK R¬≤=0.229

üìÇ Subset 008 | n=634


Decluster sets:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 8/10 [07:51<02:02, 61.36s/it]

‚úÖ DC008 | RF R¬≤=0.358 | OK R¬≤=0.189 | RF+OK R¬≤=0.233

üìÇ Subset 009 | n=634


Decluster sets:  90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 9/10 [08:43<00:58, 58.56s/it]

‚úÖ DC009 | RF R¬≤=0.386 | OK R¬≤=0.204 | RF+OK R¬≤=0.281

üìÇ Subset 010 | n=634


Decluster sets: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [09:43<00:00, 58.32s/it]

‚úÖ DC010 | RF R¬≤=0.370 | OK R¬≤=0.206 | RF+OK R¬≤=0.259

üéâ DONE!





## RF + OK of residuals (Decluster 1)

In [9]:
# =========================================================
# SOC_RFOK_declu001_clean.py
# =========================================================
# Hybrid Regression-Kriging (RF + OK): log-space modeling
# =========================================================

import os, gc
import numpy as np
import pandas as pd
import rasterio
from rasterio.transform import rowcol
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from pykrige.ok import OrdinaryKriging
from tqdm import tqdm

# ---------- Paths ----------
decluster_csv = "/Users/inesschwartz/Desktop/model/decluster_runs_aligned/decluster_run_001.csv"
pred_grid_csv = "/Users/inesschwartz/Desktop/model/covariates_stack_1km_utm33s_fixed.csv"
ref_raster    = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/terraincovs/slope_height.tif"
out_dir       = "/Users/inesschwartz/Desktop/model/ensemble_preds"
metrics_dir   = "/Users/inesschwartz/Desktop/model/results_spatialCV"
os.makedirs(out_dir, exist_ok=True)
os.makedirs(metrics_dir, exist_ok=True)

# ---------- Model parameters ----------
target = "log_soc_stock"
features = [
    "MRRTF", "MRVBF", "annual_precip", "grazing_1950", "cropland_1950",
    "precip_wettest_month", "relief_TRI", "standardized_height",
    "temp_annual_range", "terrain_surf_convexity", "terrain_surf_texture",
    "tmax_mean", "valley_depth", "faosoil_id", "slope_height"
]

rf_params = dict(
    n_estimators=1000,
    min_samples_leaf=3,
    max_features=0.5,
    max_depth=10,
    random_state=42,
    n_jobs=-1
)

variogram_model = "gaussian"
variogram_params = {"nugget": 0.000, "sill": 0.181, "range": 14206}
BLOCK_SIZE = 10000  # prediction chunk size

# =========================================================
# 1. Load and prepare data
# =========================================================
df = pd.read_csv(decluster_csv).dropna(subset=[target] + features + ["X_coord", "Y_coord"])
X = df[features].astype("float32")
y = df[target].astype("float32")
XY = df[["X_coord", "Y_coord"]].astype("float32").values

# =========================================================
# 2. Random Forest base model
# =========================================================
print("\nüå≤ Training Random Forest base model (log-space)...")
rf = RandomForestRegressor(**rf_params)
rf.fit(X, y)
y_rf_pred = rf.predict(X)
residuals = y - y_rf_pred

# Evaluate RF only
rf_R2 = r2_score(y, y_rf_pred)
rf_RMSE = np.sqrt(mean_squared_error(y, y_rf_pred))
rf_MAE = np.mean(np.abs(y - y_rf_pred))
print(f"RF only ‚Üí R¬≤={rf_R2:.3f}, RMSE={rf_RMSE:.3f}, MAE={rf_MAE:.3f}")

# =========================================================
# 3. Ordinary Kriging on RF residuals
# =========================================================
print("\n‚öôÔ∏è  Fitting Ordinary Kriging on RF residuals...")
ok = OrdinaryKriging(
    x=XY[:, 0],
    y=XY[:, 1],
    z=residuals,
    variogram_model=variogram_model,
    variogram_parameters=variogram_params,
    enable_plotting=False,
    verbose=False,
)

# =========================================================
# 4. Leave-One-Out CV for hybrid (RF + OK)
# =========================================================
print("\nüîÅ Running Leave-One-Out CV for RF+OK hybrid...")
n = len(y)
preds_loo = np.zeros_like(y)

for i in tqdm(range(n)):
    mask = np.ones(n, dtype=bool)
    mask[i] = False
    try:
        rf_i = RandomForestRegressor(**rf_params)
        rf_i.fit(X.iloc[mask], y.iloc[mask])
        rf_pred_i = rf_i.predict(X.iloc[i:i+1])
        res_i = y.iloc[mask] - rf_i.predict(X.iloc[mask])
        ok_i = OrdinaryKriging(
            x=XY[mask, 0],
            y=XY[mask, 1],
            z=res_i,
            variogram_model=variogram_model,
            variogram_parameters=variogram_params,
            enable_plotting=False,
            verbose=False,
        )
        krig_pred_i, _ = ok_i.execute("points", XY[i:i+1, 0], XY[i:i+1, 1])
        preds_loo[i] = rf_pred_i + krig_pred_i[0]
    except Exception as e:
        preds_loo[i] = np.nan
        print(f"‚ö†Ô∏è LOO {i} failed: {e}")
    gc.collect()

mask_valid = ~np.isnan(preds_loo)
R2 = r2_score(y[mask_valid], preds_loo[mask_valid])
RMSE = np.sqrt(mean_squared_error(y[mask_valid], preds_loo[mask_valid]))
MAE = np.mean(np.abs(y[mask_valid] - preds_loo[mask_valid]))
cv_results = pd.DataFrame([{"R2": R2, "RMSE": RMSE, "MAE": MAE, "n_valid": mask_valid.sum()}])
cv_csv = os.path.join(metrics_dir, "LOO_RFOK_001.csv")
cv_results.to_csv(cv_csv, index=False)

print("\n===== RF + OK LOO Results (log-space) =====")
print(cv_results.round(4))
print("===========================================\n")

# =========================================================
# 5. National prediction (hybrid RF+OK)
# =========================================================
print("üåç Generating national hybrid predictions (log-space)...")
df_pred = pd.read_csv(pred_grid_csv)
X_pred = df_pred[features].astype("float32")
coords_pred = df_pred[["X_coord", "Y_coord"]].astype("float32").values

# Predict base RF component
rf_pred_grid = rf.predict(X_pred)

# Krige residuals across prediction grid
preds_resid = []
for i in tqdm(range(0, len(coords_pred), BLOCK_SIZE)):
    sub = coords_pred[i:i + BLOCK_SIZE]
    pred_block, _ = ok.execute("points", sub[:, 0], sub[:, 1])
    preds_resid.append(pred_block)
pred_resid_full = np.concatenate(preds_resid)

# Hybrid = RF prediction + Kriged residuals
pred_hybrid = rf_pred_grid + pred_resid_full

# =========================================================
# 6. Write raster (log-space)
# =========================================================
with rasterio.open(ref_raster) as ref:
    profile = ref.profile
    transform = ref.transform
    width, height = ref.width, ref.height

rows, cols = rowcol(transform, df_pred["X_coord"], df_pred["Y_coord"])
rows, cols = np.array(rows), np.array(cols)
valid_mask = ((rows >= 0) & (rows < height) & (cols >= 0) & (cols < width))

arr = np.full((height, width), np.nan, dtype="float32")
arr[rows[valid_mask], cols[valid_mask]] = pred_hybrid[valid_mask].astype("float32")
profile.update(dtype="float32", count=1, compress="lzw", nodata=np.nan)
out_tif = os.path.join(out_dir, "RFOK_decluster_001_log.tif")

with rasterio.open(out_tif, "w", **profile) as dst:
    dst.write(arr, 1)

print(f"üíæ Saved RF+OK log-space raster ‚Üí {out_tif}")



üå≤ Training Random Forest base model (log-space)...
RF only ‚Üí R¬≤=0.701, RMSE=0.288, MAE=0.207

‚öôÔ∏è  Fitting Ordinary Kriging on RF residuals...

üîÅ Running Leave-One-Out CV for RF+OK hybrid...


  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + krig_pred_i[0]
  preds_loo[i] = rf_pred_i + kri


===== RF + OK LOO Results (log-space) =====
       R2    RMSE     MAE  n_valid
0  0.2695  0.4507  0.3299      626

üåç Generating national hybrid predictions (log-space)...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 126/126 [00:48<00:00,  2.60it/s]


üíæ Saved RF+OK log-space raster ‚Üí /Users/inesschwartz/Desktop/model/ensemble_preds/RFOK_decluster_001_log.tif
