All three models (RF, OK, and RF+OK) are run together 50 times, once per declustered subset.

**the modelling methodology**

“Each declustered subset (DCᵢ) was internally partitioned (80/20) to allow model validation on independent test data, ensuring that predictive performance and model stability were assessed under spatial independence. After internal validation, models were retrained on all samples in each DCᵢ to maximize predictive strength and produce the final maps used for ensemble averaging. The ensemble predictions were subsequently validated using the external holdout dataset (test₁) derived from the original data split.”

In [1]:
# No cross-validation in this cell: all three models (RF, OK, RF+OK) are run together 50 times, once per declustered subset.
#  =========================================================
# STEP — Run RF, OK, and RF+OK on all declustered subsets
# =========================================================

import os
import glob
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from pykrige.ok import OrdinaryKriging

# --- Paths ---
# Input folder with the declustered subsets, the CSV that receives all
# metrics, and the folder for per-iteration predictions (created if missing).
decluster_dir = "/Users/inesschwartz/Desktop/model/decluster_runs"
results_file = "/Users/inesschwartz/Desktop/model/results_all_models.csv"
output_dir = "/Users/inesschwartz/Desktop/model/predictions"
os.makedirs(output_dir, exist_ok=True)

# --- Hyperparameters (from tuning step) ---
# Unpacked as keyword arguments into RandomForestRegressor.
rf_params = dict(
    n_estimators=1500,
    min_samples_leaf=3,
    max_features=0.5,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
)

# --- Fixed variogram parameters (from stability analysis) ---
# 'model' and 'variogram_parameters' are passed unchanged to OrdinaryKriging.
variogram_params = dict(
    model='exponential',
    variogram_parameters=dict(nugget=0.0022, sill=0.22, range=15000),
)

# --- Helper for metrics ---
def compute_metrics(y_true, y_pred):
    """Score predictions against observations.

    Returns a dict with the keys 'R2', 'RMSE' (square root of the mean
    squared error) and 'MAE', in that order.
    """
    mse = mean_squared_error(y_true, y_pred)
    scores = {}
    scores['R2'] = r2_score(y_true, y_pred)
    scores['RMSE'] = np.sqrt(mse)
    scores['MAE'] = mean_absolute_error(y_true, y_pred)
    return scores

# --- Results container ---
# Collects one row per (iteration, model) combination.
results = []

# --- Files ---
files = sorted(glob.glob(os.path.join(decluster_dir, "decluster_run_*.csv")))
print(f"Running RF, OK, and RF+OK for {len(files)} declustered subsets... ⏳")

# =========================================================
# MAIN LOOP — one pass per declustered subset
# =========================================================
for i, file in enumerate(files, start=1):
    df = pd.read_csv(file)
    target = 'log_soc_stock'
    # Target, coordinates and the site id are kept out of the RF predictors.
    non_predictors = [target, 'X_coord', 'Y_coord', 'site_info_id']

    # --- Train/test split ---
    # Random (non-spatial) 80/20 hold-out with a fixed seed.
    train, test = train_test_split(df, test_size=0.2, random_state=42)
    X_train = train.drop(columns=non_predictors, errors='ignore')
    X_test = test.drop(columns=non_predictors, errors='ignore')
    y_train, y_test = train[target].values, test[target].values

    coords_train = train[['X_coord', 'Y_coord']].values
    coords_test = test[['X_coord', 'Y_coord']].values
    cx_train, cy_train = coords_train[:, 0], coords_train[:, 1]
    cx_test, cy_test = coords_test[:, 0], coords_test[:, 1]

    # Kriging settings shared by plain OK and by the residual kriging below.
    kriging_kwargs = dict(
        variogram_model=variogram_params['model'],
        variogram_parameters=variogram_params['variogram_parameters'],
        enable_plotting=False,
        verbose=False,
    )

    # =====================================================
    # 1) Ordinary Kriging (OK) on the target itself
    # =====================================================
    ok = OrdinaryKriging(x=cx_train, y=cy_train, z=y_train, **kriging_kwargs)
    y_pred_ok, _ = ok.execute('points', cx_test, cy_test)
    metrics_ok = compute_metrics(y_test, y_pred_ok)
    results.append({'iteration': i, 'model': 'OK', **metrics_ok})

    # =====================================================
    # 2) Random Forest (RF) on the covariates
    # =====================================================
    rf = RandomForestRegressor(**rf_params)
    rf.fit(X_train, y_train)
    y_pred_rf = rf.predict(X_test)
    metrics_rf = compute_metrics(y_test, y_pred_rf)
    results.append({'iteration': i, 'model': 'RF', **metrics_rf})

    # =====================================================
    # 3) Hybrid RF + OK (residual kriging)
    # =====================================================
    # Residuals are the RF errors on its own training rows; they are kriged
    # to the test locations and added to the RF test predictions.
    residuals = y_train - rf.predict(X_train)
    ok_resid = OrdinaryKriging(x=cx_train, y=cy_train, z=residuals, **kriging_kwargs)
    residual_pred, _ = ok_resid.execute('points', cx_test, cy_test)
    y_pred_rfok = y_pred_rf + residual_pred

    metrics_rfok = compute_metrics(y_test, y_pred_rfok)
    results.append({'iteration': i, 'model': 'RF+OK', **metrics_rfok})

    print(f"✅ Iteration {i:02d} complete | RF R²={metrics_rf['R2']:.3f}, OK R²={metrics_ok['R2']:.3f}, RF+OK R²={metrics_rfok['R2']:.3f}")

    # --- Save iteration predictions (optional) ---
    # Test-set coordinates and observed target next to the three predictions.
    out = test[['X_coord', 'Y_coord', target]].copy()
    for column, predicted in (
        ('pred_RF', y_pred_rf),
        ('pred_OK', y_pred_ok),
        ('pred_RF_OK', y_pred_rfok),
    ):
        out[column] = predicted
    out.to_csv(os.path.join(output_dir, f"predictions_iter_{i:03d}.csv"), index=False)

# =====================================================
# Save all metrics
# =====================================================
results_df = pd.DataFrame(results)
results_df.to_csv(results_file, index=False)
print(f"\n📁 All model results saved to: {results_file}")


Running RF, OK, and RF+OK for 50 declustered subsets... ⏳
✅ Iteration 01 complete | RF R²=0.295, OK R²=0.111, RF+OK R²=0.312
✅ Iteration 02 complete | RF R²=0.231, OK R²=0.084, RF+OK R²=0.240
✅ Iteration 03 complete | RF R²=0.237, OK R²=0.094, RF+OK R²=0.256
✅ Iteration 04 complete | RF R²=0.244, OK R²=0.088, RF+OK R²=0.259
✅ Iteration 05 complete | RF R²=0.286, OK R²=0.111, RF+OK R²=0.303
✅ Iteration 06 complete | RF R²=0.246, OK R²=0.087, RF+OK R²=0.260
✅ Iteration 07 complete | RF R²=0.279, OK R²=0.108, RF+OK R²=0.292
✅ Iteration 08 complete | RF R²=0.280, OK R²=0.108, RF+OK R²=0.293
✅ Iteration 09 complete | RF R²=0.232, OK R²=0.088, RF+OK R²=0.246
✅ Iteration 10 complete | RF R²=0.290, OK R²=0.110, RF+OK R²=0.306
✅ Iteration 11 complete | RF R²=0.292, OK R²=0.109, RF+OK R²=0.307
✅ Iteration 12 complete | RF R²=0.295, OK R²=0.112, RF+OK R²=0.312
✅ Iteration 13 complete | RF R²=0.236, OK R²=0.088, RF+OK R²=0.250
✅ Iteration 14 complete | RF R²=0.278, OK R²=0.108, RF+OK R²=0.293
✅ It

In [4]:
# Calibration diagnostics only. Caveat: OK and RF+OK are scored on their own training points in this cell, so their metrics are not a valid accuracy estimate.
# =========================================================
# STEP — Decluster modeling loop (Calibration Diagnostics Only)
# =========================================================

import os
import glob
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold
from pykrige.ok import OrdinaryKriging
import warnings
warnings.filterwarnings("ignore")

# =========================================================
# PATHS AND PARAMETERS
# =========================================================
# Input folder with the declustered subsets and the folder that receives the
# calibration table (created if missing).
decluster_dir = "/Users/inesschwartz/Desktop/model/decluster_runs"
output_dir = "/Users/inesschwartz/Desktop/model/results_calibration"
os.makedirs(output_dir, exist_ok=True)

# --- RF parameters (from tuning) ---
# Unpacked as keyword arguments into RandomForestRegressor.
rf_params = dict(
    n_estimators=1500,
    min_samples_leaf=3,
    max_features=0.5,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
)

# Variogram parameters
variogram_model = 'exponential'
variogram_params = dict(nugget=0.0022, sill=0.22, range=15000)

# =========================================================
# HELPER FUNCTIONS
# =========================================================
def compute_metrics(y_true, y_pred):
    """Return R2, RMSE and MAE for a set of predictions.

    The result is a dict with the keys 'R2', 'RMSE' (square root of the mean
    squared error) and 'MAE', in that order.
    """
    mse = mean_squared_error(y_true, y_pred)
    scores = {}
    scores['R2'] = r2_score(y_true, y_pred)
    scores['RMSE'] = np.sqrt(mse)
    scores['MAE'] = mean_absolute_error(y_true, y_pred)
    return scores

def cross_validate_rf(X, y, k=5):
    """Average Random Forest metrics over a shuffled k-fold split.

    Args:
        X: Predictor DataFrame (rows are selected positionally with ``.iloc``).
        y: Target array aligned with ``X`` and indexable by integer arrays.
        k: Number of folds.

    Returns:
        Dict mapping each metric name to its mean over the ``k`` folds.
    """
    splitter = KFold(n_splits=k, shuffle=True, random_state=42)
    fold_scores = []

    for fit_rows, eval_rows in splitter.split(X):
        # A fresh forest per fold, trained on the other k-1 folds only.
        forest = RandomForestRegressor(**rf_params)
        forest.fit(X.iloc[fit_rows], y[fit_rows])
        held_out_pred = forest.predict(X.iloc[eval_rows])
        fold_scores.append(compute_metrics(y[eval_rows], held_out_pred))

    return pd.DataFrame(fold_scores).mean().to_dict()

def run_ok(coords, values, variogram_params, model='exponential'):
    """Build an Ordinary Kriging model with the supplied variogram parameters.

    Args:
        coords: Array whose columns 0 and 1 hold the x and y coordinates.
        values: Values observed at ``coords``.
        variogram_params: Mapping with the keys 'sill', 'range' and 'nugget'.
        model: Variogram model name passed to PyKrige.

    Returns:
        The constructed ``OrdinaryKriging`` object; no prediction is made here.
    """
    # Forward only the three expected keys, whatever else the mapping holds.
    fixed_variogram = {
        name: variogram_params[name] for name in ('sill', 'range', 'nugget')
    }
    return OrdinaryKriging(
        x=coords[:, 0],
        y=coords[:, 1],
        z=values,
        variogram_model=model,
        variogram_parameters=fixed_variogram,
        enable_plotting=False,
        verbose=False,
    )

def run_rf_ok(X, y, coords, variogram_params):
    """Fit hybrid RF + OK (residual kriging) and score it on its training data.

    The forest is fitted on all of ``X``/``y``, its residuals are kriged back
    onto the same ``coords``, and the sum is compared with ``y``. Nothing is
    withheld, so the returned metrics are in-sample.

    Args:
        X: Predictor DataFrame.
        y: Target array aligned with ``X``.
        coords: Array whose columns 0 and 1 hold the x and y coordinates.
        variogram_params: Mapping with the keys 'sill', 'range' and 'nugget'.

    Returns:
        Dict of metrics from ``compute_metrics``.
    """
    forest = RandomForestRegressor(**rf_params)
    forest.fit(X, y)
    rf_fitted = forest.predict(X)

    # Krige the RF residuals at the very locations they were computed for.
    resid_kriger = run_ok(coords, y - rf_fitted, variogram_params)
    kriged_resid, _ = resid_kriger.execute('points', coords[:, 0], coords[:, 1])

    return compute_metrics(y, rf_fitted + kriged_resid.data)

# =========================================================
# MAIN CALIBRATION LOOP
# =========================================================
files = sorted(glob.glob(os.path.join(decluster_dir, "decluster_run_*.csv")))
n_calibration = 10  # limit to first 10 DCs

print(f"🔁 Running calibration for first {n_calibration} declustered subsets...")

calibration_results = []

for i, file in enumerate(files[:n_calibration], start=1):
    df = pd.read_csv(file)
    print(f"\n📂 Processing decluster subset {i:03d} | n = {len(df)}")

    target = 'log_soc_stock'
    # Target, coordinates and the site id are kept out of the RF predictors.
    non_predictors = [target, 'X_coord', 'Y_coord', 'site_info_id']
    X = df.drop(columns=non_predictors, errors='ignore')
    y = df[target].values
    coords = df[['X_coord', 'Y_coord']].values

    print("⚙️ Running internal 5-fold CV and validation...")

    # --- RF: shuffled 5-fold cross-validation ---
    rf_cv_metrics = cross_validate_rf(X, y, k=5)

    # --- OK: resubstitution, not leave-one-out ---
    # The model is built from every point and then predicted at those same
    # points, so these metrics are in-sample.
    ok = run_ok(coords, y, variogram_params)
    y_pred_ok, _ = ok.execute('points', coords[:, 0], coords[:, 1])
    ok_metrics = compute_metrics(y, y_pred_ok)

    # --- RF+OK residual kriging (also scored on the training points) ---
    hybrid_metrics = run_rf_ok(X, y, coords, variogram_params)

    # --- Store results: one row per model ---
    calibration_results.extend([
        {'subset': i, 'model': 'RF_CV', **rf_cv_metrics},
        {'subset': i, 'model': 'OK', **ok_metrics},
        {'subset': i, 'model': 'RF+OK', **hybrid_metrics},
    ])

    print(f"✅ DC{i:03d} | RF R²={rf_cv_metrics['R2']:.3f} | OK R²={ok_metrics['R2']:.3f} | Hybrid R²={hybrid_metrics['R2']:.3f}")

# =========================================================
# SAVE RESULTS
# =========================================================
out_csv = os.path.join(output_dir, "calibration_results.csv")
pd.DataFrame(calibration_results).to_csv(out_csv, index=False)

print("\n✅ Calibration diagnostics complete.")
print(f"📁 Results saved to: {out_csv}")


🔁 Running calibration for first 10 declustered subsets...

📂 Processing decluster subset 001 | n = 629
⚙️ Running internal 5-fold CV and validation...
✅ DC001 | RF R²=0.333 | OK R²=1.000 | Hybrid R²=1.000

📂 Processing decluster subset 002 | n = 629
⚙️ Running internal 5-fold CV and validation...
✅ DC002 | RF R²=0.304 | OK R²=1.000 | Hybrid R²=1.000

📂 Processing decluster subset 003 | n = 629
⚙️ Running internal 5-fold CV and validation...
✅ DC003 | RF R²=0.321 | OK R²=1.000 | Hybrid R²=1.000

📂 Processing decluster subset 004 | n = 629
⚙️ Running internal 5-fold CV and validation...
✅ DC004 | RF R²=0.318 | OK R²=1.000 | Hybrid R²=1.000

📂 Processing decluster subset 005 | n = 629
⚙️ Running internal 5-fold CV and validation...
✅ DC005 | RF R²=0.325 | OK R²=1.000 | Hybrid R²=1.000

📂 Processing decluster subset 006 | n = 629
⚙️ Running internal 5-fold CV and validation...
✅ DC006 | RF R²=0.317 | OK R²=1.000 | Hybrid R²=1.000

📂 Processing decluster subset 007 | n = 629
⚙️ Running inte

Calibration results summary (DC₁–₁₀):
Random Forest shows consistent moderate predictive ability (R² ≈ 0.32, RMSE ≈ 0.44).
However, Ordinary Kriging (OK) and hybrid RF+OK show perfect interpolation (R²=1.0), indicating they were evaluated on the same training data rather than withheld samples.
The next step is to implement Leave-One-Out (LOO) kriging validation to obtain unbiased performance estimates for OK and RF+OK before proceeding to ensemble modeling.

In [1]:
# =========================================================
# STEP — Decluster modeling loop with Spatial K-Fold & LOO
# =========================================================

import os
import glob
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold
from pykrige.ok import OrdinaryKriging
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# =========================================================
# PATHS AND PARAMETERS
# =========================================================
# Input folder with the declustered subsets and the folder that receives the
# calibration table (created if missing).
decluster_dir = "/Users/inesschwartz/Desktop/model/decluster_runs"
output_dir = "/Users/inesschwartz/Desktop/model/results_final"
os.makedirs(output_dir, exist_ok=True)

# Random Forest tuned parameters
# Unpacked as keyword arguments into RandomForestRegressor.
rf_params = dict(
    n_estimators=1500,
    min_samples_leaf=3,
    max_features=0.5,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
)

# Variogram parameters (from calibration)
variogram_model = 'exponential'
variogram_params = dict(nugget=0.0022, sill=0.22, range=15000)

# =========================================================
# HELPER FUNCTIONS
# =========================================================
def compute_metrics(y_true, y_pred):
    """Compute R2, RMSE and MAE for predictions against observations.

    The result is a dict with the keys 'R2', 'RMSE' (square root of the mean
    squared error) and 'MAE', in that order.
    """
    mse = mean_squared_error(y_true, y_pred)
    scores = {}
    scores['R2'] = r2_score(y_true, y_pred)
    scores['RMSE'] = np.sqrt(mse)
    scores['MAE'] = mean_absolute_error(y_true, y_pred)
    return scores


def spatial_kfold(coords, n_splits=5):
    """
    Build contiguous folds along the X + Y diagonal.

    Samples are ordered by the sum of their two coordinates and that ordering
    is cut into ``n_splits`` consecutive chunks, so every test fold is a band
    of samples with similar X + Y. This is a simple approximation of spatial
    blocking, not a clustering of the points.

    Args:
        coords: 2-D array with x in column 0 and y in column 1.
        n_splits: Number of folds; must satisfy 2 <= n_splits <= len(coords).

    Returns:
        List of ``(train_idx, test_idx)`` tuples of integer position arrays.
        Fold sizes differ by at most one, with the larger folds first.
        ``train_idx`` is sorted ascending; ``test_idx`` follows the X + Y order.

    Raises:
        ValueError: If ``n_splits`` is below 2 or exceeds the sample count,
            either of which would yield an empty train or test fold.
    """
    n_samples = len(coords)
    if not 2 <= n_splits <= n_samples:
        raise ValueError(
            f"n_splits must be between 2 and the number of samples "
            f"({n_samples}), got {n_splits}"
        )

    # Sort by X + Y so that consecutive positions are spatially close
    # along the diagonal.
    order = np.argsort(coords[:, 0] + coords[:, 1])
    all_idx = np.arange(n_samples)

    # array_split gives the first (n_samples % n_splits) chunks one extra item.
    return [
        (np.setdiff1d(all_idx, test_idx), test_idx)
        for test_idx in np.array_split(order, n_splits)
    ]


def cross_validate_rf_spatial(X, y, coords, n_splits=5):
    """Average Random Forest metrics over the folds from ``spatial_kfold``.

    Args:
        X: Predictor DataFrame (rows are selected positionally with ``.iloc``).
        y: Target array aligned with ``X`` and indexable by integer arrays.
        coords: Coordinate array handed to ``spatial_kfold``.
        n_splits: Number of folds.

    Returns:
        Dict mapping each metric name to its mean over the folds.
    """
    fold_scores = []
    for fit_rows, eval_rows in spatial_kfold(coords, n_splits):
        # A fresh forest per fold, trained on the remaining folds only.
        forest = RandomForestRegressor(**rf_params)
        forest.fit(X.iloc[fit_rows], y[fit_rows])
        held_out_pred = forest.predict(X.iloc[eval_rows])
        fold_scores.append(compute_metrics(y[eval_rows], held_out_pred))

    averaged = pd.DataFrame(fold_scores).mean()
    return averaged.to_dict()


def loo_validate_ok(coords, values, variogram_params, model='exponential'):
    """Leave-one-out cross-validation of Ordinary Kriging.

    Each sample in turn is withheld, a kriging model is built from the other
    samples with the supplied variogram parameters, and the withheld location
    is predicted.

    Args:
        coords: Array whose columns 0 and 1 hold the x and y coordinates.
        values: NumPy array of observations aligned with ``coords``.
        variogram_params: Mapping with the keys 'sill', 'range' and 'nugget'.
        model: Variogram model name passed to PyKrige.

    Returns:
        Dict of metrics comparing ``values`` with the held-out predictions.
    """
    n_obs = len(values)
    held_out_preds = np.zeros(n_obs)

    for held_out in range(n_obs):
        # True everywhere except at the sample being withheld.
        keep = np.ones(n_obs, dtype=bool)
        keep[held_out] = False

        fixed_variogram = {
            'sill': variogram_params['sill'],
            'range': variogram_params['range'],
            'nugget': variogram_params['nugget'],
        }
        kriger = OrdinaryKriging(
            x=coords[keep, 0],
            y=coords[keep, 1],
            z=values[keep],
            variogram_model=model,
            variogram_parameters=fixed_variogram,
            enable_plotting=False,
            verbose=False,
        )

        target_x = np.array([coords[held_out, 0]])
        target_y = np.array([coords[held_out, 1]])
        estimate, _ = kriger.execute('points', target_x, target_y)
        held_out_preds[held_out] = estimate.data[0]

    return compute_metrics(values, held_out_preds)


def loo_validate_rf_ok(X, y, coords, variogram_params, model='exponential'):
    """Leave-one-out validation of the hybrid RF + OK (residual kriging) model.

    For each sample, a Random Forest is refitted on the remaining samples, its
    residuals on those samples are kriged to the withheld location, and the
    kriged residual is added to the forest's prediction for the withheld row.

    One forest is trained per sample, so the run time grows with
    ``len(y)`` full RF fits.

    NOTE(review): the kriged residuals are the forest's errors on its own
    training rows, and the variogram parameters are reused unchanged for the
    residuals — confirm that both are intended.

    Args:
        X: Predictor DataFrame (rows are selected with ``.iloc``).
        y: NumPy array of targets aligned with ``X``.
        coords: Array whose columns 0 and 1 hold the x and y coordinates.
        variogram_params: Mapping with the keys 'sill', 'range' and 'nugget'.
        model: Variogram model name passed to PyKrige. Defaults to
            'exponential', the value that used to be hard-coded here, so
            existing calls behave as before.

    Returns:
        Dict of metrics comparing ``y`` with the held-out hybrid predictions.
    """
    # The variogram does not change between iterations; build it once.
    variogram = {
        'sill': variogram_params['sill'],
        'range': variogram_params['range'],
        'nugget': variogram_params['nugget'],
    }

    n_samples = len(y)
    preds = np.zeros(n_samples)
    for i in range(n_samples):
        # True everywhere except at the sample being withheld.
        mask = np.ones(n_samples, dtype=bool)
        mask[i] = False

        rf = RandomForestRegressor(**rf_params)
        rf.fit(X.iloc[mask], y[mask])
        residuals = y[mask] - rf.predict(X.iloc[mask])

        ok = OrdinaryKriging(
            x=coords[mask, 0],
            y=coords[mask, 1],
            z=residuals,
            variogram_model=model,
            variogram_parameters=variogram,
            enable_plotting=False,
            verbose=False,
        )
        zhat, _ = ok.execute('points',
                             np.array([coords[i, 0]]),
                             np.array([coords[i, 1]]))

        # RF trend at the withheld row plus the kriged residual correction.
        preds[i] = rf.predict(X.iloc[[i]])[0] + zhat.data[0]

    return compute_metrics(y, preds)


# =========================================================
# MAIN CALIBRATION LOOP
# =========================================================
files = sorted(glob.glob(os.path.join(decluster_dir, "decluster_run_*.csv")))
calibration_results = []

# Only the first ten subsets take part in this calibration run.
calibration_files = files[:10]
print(f"🔁 Running RF (Spatial K-Fold), OK (LOO), and RF+OK (LOO) for {len(calibration_files)} decluster subsets...")

progress = tqdm(calibration_files, desc="Calibration progress", ncols=100)
for i, file in enumerate(progress, start=1):
    df = pd.read_csv(file)
    target = 'log_soc_stock'
    # Target, coordinates and the site id are kept out of the RF predictors.
    non_predictors = [target, 'X_coord', 'Y_coord', 'site_info_id']
    X = df.drop(columns=non_predictors, errors='ignore')
    y = df[target].values
    coords = df[['X_coord', 'Y_coord']].values

    rf_spatial = cross_validate_rf_spatial(X, y, coords, n_splits=5)
    ok_loo = loo_validate_ok(coords, y, variogram_params, model=variogram_model)
    hybrid_loo = loo_validate_rf_ok(X, y, coords, variogram_params)

    # One row per model for this subset.
    calibration_results.extend([
        {'subset': i, 'model': 'RF_SpatialKFold', **rf_spatial},
        {'subset': i, 'model': 'OK_LOO', **ok_loo},
        {'subset': i, 'model': 'RF+OK_LOO', **hybrid_loo},
    ])

# --- Save calibration results ---
calib_path = os.path.join(output_dir, "calibration_results_spatial.csv")
pd.DataFrame(calibration_results).to_csv(calib_path, index=False)

print(f"\n📁 Calibration results saved to: {calib_path}")
print("✅ Spatial K-Fold + LOO calibration complete.")


🔁 Running RF (Spatial K-Fold), OK (LOO), and RF+OK (LOO) for 10 decluster subsets...


Calibration progress: 100%|█████████████████████████████████████| 10/10 [3:18:05<00:00, 1188.54s/it]


📁 Calibration results saved to: /Users/inesschwartz/Desktop/model/results_final/calibration_results_spatial.csv
✅ Spatial K-Fold + LOO calibration complete.





In [1]:
## ok rf-ok trial

 # =========================================================
# STEP — Calibration test for OK and RF+OK (5 declusters)
# =========================================================

import os
import glob
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from pykrige.ok import OrdinaryKriging
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# =========================================================
# PATHS AND PARAMETERS
# =========================================================
# Input folder with the declustered subsets and the folder that receives the
# calibration table (created if missing).
decluster_dir = "/Users/inesschwartz/Desktop/model/decluster_runs"
output_dir = "/Users/inesschwartz/Desktop/model/results_calibration_ok"
os.makedirs(output_dir, exist_ok=True)

# Variogram parameters (from your calibration step)
variogram_model = 'exponential'
variogram_params = dict(nugget=0.0022, sill=0.22, range=15000)

# Random Forest tuned parameters
# Unpacked as keyword arguments into RandomForestRegressor.
rf_params = dict(
    n_estimators=1500,
    min_samples_leaf=3,
    max_features=0.5,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
)

# =========================================================
# HELPER FUNCTIONS
# =========================================================
def compute_metrics(y_true, y_pred):
    """Compute R2, RMSE and MAE for predictions against observations.

    The result is a dict with the keys 'R2', 'RMSE' (square root of the mean
    squared error) and 'MAE', in that order.
    """
    mse = mean_squared_error(y_true, y_pred)
    scores = {}
    scores['R2'] = r2_score(y_true, y_pred)
    scores['RMSE'] = np.sqrt(mse)
    scores['MAE'] = mean_absolute_error(y_true, y_pred)
    return scores


def loo_validate_ok(coords, values, variogram_params, model='exponential'):
    """Leave-one-out cross-validation of Ordinary Kriging.

    Each sample in turn is withheld, a kriging model is built from the other
    samples with the supplied variogram parameters, and the withheld location
    is predicted.

    Args:
        coords: Array whose columns 0 and 1 hold the x and y coordinates.
        values: NumPy array of observations aligned with ``coords``.
        variogram_params: Mapping with the keys 'sill' (total sill), 'range'
            and 'nugget'.
        model: Variogram model name passed to PyKrige.

    Returns:
        Dict of metrics comparing ``values`` with the held-out predictions.
    """
    # Dict form on purpose: PyKrige reads a 'sill' key as the total sill,
    # whereas its list form expects the partial sill (sill - nugget) first.
    # Passing [sill, range, nugget] as a list would inflate the sill by the
    # nugget. The dict is loop-invariant, so it is built once.
    variogram = {
        'sill': variogram_params['sill'],
        'range': variogram_params['range'],
        'nugget': variogram_params['nugget'],
    }

    n_samples = len(values)
    preds = np.zeros(n_samples)
    for i in range(n_samples):
        # True everywhere except at the sample being withheld.
        mask = np.ones(n_samples, dtype=bool)
        mask[i] = False

        ok = OrdinaryKriging(
            x=coords[mask, 0],
            y=coords[mask, 1],
            z=values[mask],
            variogram_model=model,
            variogram_parameters=variogram,
            enable_plotting=False,
            verbose=False,
        )

        zhat, _ = ok.execute('points',
                             np.array([coords[i, 0]]),
                             np.array([coords[i, 1]]))
        preds[i] = zhat.data[0]

    return compute_metrics(values, preds)


def loo_validate_rf_ok(X, y, coords, variogram_params, model='exponential'):
    """Leave-one-out validation of the hybrid RF + OK (residual kriging) model.

    For each sample, a Random Forest is refitted on the remaining samples, its
    residuals on those samples are kriged to the withheld location, and the
    kriged residual is added to the forest's prediction for the withheld row.

    One forest is trained per sample, so the run time grows with
    ``len(y)`` full RF fits.

    NOTE(review): the kriged residuals are the forest's errors on its own
    training rows, and the variogram parameters are reused unchanged for the
    residuals — confirm that both are intended.

    Args:
        X: Predictor DataFrame (rows are selected with ``.iloc``).
        y: NumPy array of targets aligned with ``X``.
        coords: Array whose columns 0 and 1 hold the x and y coordinates.
        variogram_params: Mapping with the keys 'sill' (total sill), 'range'
            and 'nugget'.
        model: Variogram model name passed to PyKrige. Defaults to
            'exponential', the value that used to be hard-coded here, so
            existing calls behave as before.

    Returns:
        Dict of metrics comparing ``y`` with the held-out hybrid predictions.
    """
    # Dict form on purpose: PyKrige reads a 'sill' key as the total sill,
    # whereas its list form expects the partial sill (sill - nugget) first.
    # The dict is loop-invariant, so it is built once.
    variogram = {
        'sill': variogram_params['sill'],
        'range': variogram_params['range'],
        'nugget': variogram_params['nugget'],
    }

    n_samples = len(y)
    preds = np.zeros(n_samples)
    for i in range(n_samples):
        # True everywhere except at the sample being withheld.
        mask = np.ones(n_samples, dtype=bool)
        mask[i] = False

        rf = RandomForestRegressor(**rf_params)
        rf.fit(X.iloc[mask], y[mask])
        residuals = y[mask] - rf.predict(X.iloc[mask])

        ok = OrdinaryKriging(
            x=coords[mask, 0],
            y=coords[mask, 1],
            z=residuals,
            variogram_model=model,
            variogram_parameters=variogram,
            enable_plotting=False,
            verbose=False,
        )

        zhat, _ = ok.execute('points',
                             np.array([coords[i, 0]]),
                             np.array([coords[i, 1]]))

        # RF trend at the withheld row plus the kriged residual correction.
        preds[i] = rf.predict(X.iloc[[i]])[0] + zhat.data[0]

    return compute_metrics(y, preds)

# =========================================================
# MAIN CALIBRATION LOOP
# =========================================================
# Trial run: only the first five declustered subsets are used.
all_subset_files = sorted(glob.glob(os.path.join(decluster_dir, "decluster_run_*.csv")))
files = all_subset_files[:5]
results = []

print(f"🔁 Running OK and RF+OK (LOO) for {len(files)} declustered subsets...")

progress = tqdm(files, desc="Calibration progress", ncols=100)
for i, file in enumerate(progress, start=1):
    df = pd.read_csv(file)
    target = 'log_soc_stock'
    # Target, coordinates and the site id are kept out of the RF predictors.
    non_predictors = [target, 'X_coord', 'Y_coord', 'site_info_id']
    X = df.drop(columns=non_predictors, errors='ignore')
    y = df[target].values
    coords = df[['X_coord', 'Y_coord']].values

    ok_metrics = loo_validate_ok(coords, y, variogram_params, model=variogram_model)
    hybrid_metrics = loo_validate_rf_ok(X, y, coords, variogram_params)

    # One row per model for this subset.
    results.extend([
        {'subset': i, 'model': 'OK_LOO', **ok_metrics},
        {'subset': i, 'model': 'RF+OK_LOO', **hybrid_metrics},
    ])

# =========================================================
# SAVE & DISPLAY RESULTS
# =========================================================
results_df = pd.DataFrame(results)
out_path = os.path.join(output_dir, "ok_rfok_calibration_results.csv")
results_df.to_csv(out_path, index=False)

print(f"\n📁 Calibration results saved to: {out_path}")
print("✅ Calibration complete.\n")

# Per-model mean of each metric across the processed subsets.
metric_columns = ["R2", "RMSE", "MAE"]
print("Summary (mean across 5 subsets):")
print(results_df.groupby("model")[metric_columns].mean().round(3))


🔁 Running OK and RF+OK (LOO) for 5 declustered subsets...


Calibration progress: 100%|███████████████████████████████████████| 5/5 [1:55:39<00:00, 1387.82s/it]


📁 Calibration results saved to: /Users/inesschwartz/Desktop/model/results_calibration_ok/ok_rfok_calibration_results.csv
✅ Calibration complete.

Summary (mean across 5 subsets):
              R2   RMSE    MAE
model                         
OK_LOO     0.085  0.514  0.392
RF+OK_LOO  0.332  0.439  0.315





In [None]:
## full dc runs

# =========================================================
# STEP — Decluster modeling loop with Spatial K-Fold & LOO
# =========================================================

import os
import glob
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold
from pykrige.ok import OrdinaryKriging
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

# =========================================================
# PATHS AND PARAMETERS
# =========================================================
# Input folder with the declustered subsets and the folder that receives the
# results table (created if missing).
decluster_dir = "/Users/inesschwartz/Desktop/model/decluster_runs"
output_dir = "/Users/inesschwartz/Desktop/model/results_final"
os.makedirs(output_dir, exist_ok=True)

# Random Forest tuned parameters
# Unpacked as keyword arguments into RandomForestRegressor.
rf_params = dict(
    n_estimators=1500,
    min_samples_leaf=3,
    max_features=0.5,
    max_depth=None,
    random_state=42,
    n_jobs=-1,
)

# Variogram parameters (from your calibration step)
variogram_model = 'exponential'
variogram_params = dict(nugget=0.0022, sill=0.22, range=15000)

# =========================================================
# HELPER FUNCTIONS
# =========================================================
def compute_metrics(y_true, y_pred):
    """Compute R2, RMSE and MAE for predictions against observations.

    The result is a dict with the keys 'R2', 'RMSE' (square root of the mean
    squared error) and 'MAE', in that order.
    """
    mse = mean_squared_error(y_true, y_pred)
    scores = {}
    scores['R2'] = r2_score(y_true, y_pred)
    scores['RMSE'] = np.sqrt(mse)
    scores['MAE'] = mean_absolute_error(y_true, y_pred)
    return scores


def spatial_kfold(coords, n_splits=5):
    """
    Build contiguous folds along the X + Y diagonal.

    Samples are ordered by the sum of their two coordinates and that ordering
    is cut into ``n_splits`` consecutive chunks, so every test fold is a band
    of samples with similar X + Y. This is a simple approximation of spatial
    blocking, not a clustering of the points.

    Args:
        coords: 2-D array with x in column 0 and y in column 1.
        n_splits: Number of folds; must satisfy 2 <= n_splits <= len(coords).

    Returns:
        List of ``(train_idx, test_idx)`` tuples of integer position arrays.
        Fold sizes differ by at most one, with the larger folds first.
        ``train_idx`` is sorted ascending; ``test_idx`` follows the X + Y order.

    Raises:
        ValueError: If ``n_splits`` is below 2 or exceeds the sample count,
            either of which would yield an empty train or test fold.
    """
    n_samples = len(coords)
    if not 2 <= n_splits <= n_samples:
        raise ValueError(
            f"n_splits must be between 2 and the number of samples "
            f"({n_samples}), got {n_splits}"
        )

    # Sort by X + Y so that consecutive positions are spatially close
    # along the diagonal.
    order = np.argsort(coords[:, 0] + coords[:, 1])
    all_idx = np.arange(n_samples)

    # array_split gives the first (n_samples % n_splits) chunks one extra item.
    return [
        (np.setdiff1d(all_idx, test_idx), test_idx)
        for test_idx in np.array_split(order, n_splits)
    ]


def cross_validate_rf_spatial(X, y, coords, n_splits=5):
    """Average Random Forest metrics over the folds from ``spatial_kfold``.

    Args:
        X: Predictor DataFrame (rows are selected positionally with ``.iloc``).
        y: Target array aligned with ``X`` and indexable by integer arrays.
        coords: Coordinate array handed to ``spatial_kfold``.
        n_splits: Number of folds.

    Returns:
        Dict mapping each metric name to its mean over the folds.
    """
    fold_scores = []
    for fit_rows, eval_rows in spatial_kfold(coords, n_splits):
        # A fresh forest per fold, trained on the remaining folds only.
        forest = RandomForestRegressor(**rf_params)
        forest.fit(X.iloc[fit_rows], y[fit_rows])
        held_out_pred = forest.predict(X.iloc[eval_rows])
        fold_scores.append(compute_metrics(y[eval_rows], held_out_pred))

    averaged = pd.DataFrame(fold_scores).mean()
    return averaged.to_dict()


def loo_validate_ok(coords, values, variogram_params, model='exponential'):
    """Leave-one-out cross-validation of Ordinary Kriging.

    Each sample in turn is withheld, a kriging model is built from the other
    samples with the supplied variogram parameters, and the withheld location
    is predicted.

    Args:
        coords: Array whose columns 0 and 1 hold the x and y coordinates.
        values: NumPy array of observations aligned with ``coords``.
        variogram_params: Mapping with the keys 'sill' (total sill), 'range'
            and 'nugget'.
        model: Variogram model name passed to PyKrige.

    Returns:
        Dict of metrics comparing ``values`` with the held-out predictions.
    """
    # Dict form on purpose: PyKrige reads a 'sill' key as the total sill,
    # whereas its list form expects the partial sill (sill - nugget) first.
    # Passing [sill, range, nugget] as a list would inflate the sill by the
    # nugget. The dict is loop-invariant, so it is built once.
    variogram = {
        'sill': variogram_params['sill'],
        'range': variogram_params['range'],
        'nugget': variogram_params['nugget'],
    }

    n_samples = len(values)
    preds = np.zeros(n_samples)
    for i in range(n_samples):
        # True everywhere except at the sample being withheld.
        mask = np.ones(n_samples, dtype=bool)
        mask[i] = False

        ok = OrdinaryKriging(
            x=coords[mask, 0],
            y=coords[mask, 1],
            z=values[mask],
            variogram_model=model,
            variogram_parameters=variogram,
            enable_plotting=False,
            verbose=False,
        )

        zhat, _ = ok.execute('points',
                             np.array([coords[i, 0]]),
                             np.array([coords[i, 1]]))
        preds[i] = zhat.data[0]

    return compute_metrics(values, preds)


def loo_validate_rf_ok(X, y, coords, variogram_params, model='exponential'):
    """Leave-one-out validation of the hybrid RF + OK (residual kriging) model.

    For each sample, a Random Forest is refitted on the remaining samples, its
    residuals on those samples are kriged to the withheld location, and the
    kriged residual is added to the forest's prediction for the withheld row.

    One forest is trained per sample, so the run time grows with
    ``len(y)`` full RF fits.

    NOTE(review): the kriged residuals are the forest's errors on its own
    training rows, and the variogram parameters are reused unchanged for the
    residuals — confirm that both are intended.

    Args:
        X: Predictor DataFrame (rows are selected with ``.iloc``).
        y: NumPy array of targets aligned with ``X``.
        coords: Array whose columns 0 and 1 hold the x and y coordinates.
        variogram_params: Mapping with the keys 'sill' (total sill), 'range'
            and 'nugget'.
        model: Variogram model name passed to PyKrige. Defaults to
            'exponential', the value that used to be hard-coded here, so
            existing calls behave as before.

    Returns:
        Dict of metrics comparing ``y`` with the held-out hybrid predictions.
    """
    # Dict form on purpose: PyKrige reads a 'sill' key as the total sill,
    # whereas its list form expects the partial sill (sill - nugget) first.
    # The dict is loop-invariant, so it is built once.
    variogram = {
        'sill': variogram_params['sill'],
        'range': variogram_params['range'],
        'nugget': variogram_params['nugget'],
    }

    n_samples = len(y)
    preds = np.zeros(n_samples)
    for i in range(n_samples):
        # True everywhere except at the sample being withheld.
        mask = np.ones(n_samples, dtype=bool)
        mask[i] = False

        rf = RandomForestRegressor(**rf_params)
        rf.fit(X.iloc[mask], y[mask])
        residuals = y[mask] - rf.predict(X.iloc[mask])

        ok = OrdinaryKriging(
            x=coords[mask, 0],
            y=coords[mask, 1],
            z=residuals,
            variogram_model=model,
            variogram_parameters=variogram,
            enable_plotting=False,
            verbose=False,
        )

        zhat, _ = ok.execute('points',
                             np.array([coords[i, 0]]),
                             np.array([coords[i, 1]]))

        # RF trend at the withheld row plus the kriged residual correction.
        preds[i] = rf.predict(X.iloc[[i]])[0] + zhat.data[0]

    return compute_metrics(y, preds)

# =========================================================
# MAIN CALIBRATION LOOP # can take out now that I ran and liked the results?
# =========================================================
# NOTE(review): this cell is headed "full dc runs" but still keeps only the
# first five files — confirm whether the [:5] slice should be removed.
all_subset_files = sorted(glob.glob(os.path.join(decluster_dir, "decluster_run_*.csv")))
files = all_subset_files[:5]
results = []

print(f"🔁 Running OK and RF+OK (LOO) for {len(files)} declustered subsets...")

progress = tqdm(files, desc="Calibration progress", ncols=100)
for i, file in enumerate(progress, start=1):
    df = pd.read_csv(file)
    target = 'log_soc_stock'
    # Target, coordinates and the site id are kept out of the RF predictors.
    non_predictors = [target, 'X_coord', 'Y_coord', 'site_info_id']
    X = df.drop(columns=non_predictors, errors='ignore')
    y = df[target].values
    coords = df[['X_coord', 'Y_coord']].values

    ok_metrics = loo_validate_ok(coords, y, variogram_params, model=variogram_model)
    hybrid_metrics = loo_validate_rf_ok(X, y, coords, variogram_params)

    # One row per model for this subset.
    results.extend([
        {'subset': i, 'model': 'OK_LOO', **ok_metrics},
        {'subset': i, 'model': 'RF+OK_LOO', **hybrid_metrics},
    ])

# =========================================================
# SAVE & DISPLAY RESULTS
# =========================================================
results_df = pd.DataFrame(results)
out_path = os.path.join(output_dir, "ok_rfok_calibration_results.csv")
results_df.to_csv(out_path, index=False)

print(f"\n📁 Calibration results saved to: {out_path}")
print("✅ Calibration complete.\n")

# Per-model mean of each metric across the processed subsets.
metric_columns = ["R2", "RMSE", "MAE"]
print("Summary (mean across 5 subsets):")
print(results_df.groupby("model")[metric_columns].mean().round(3))
