In [24]:
import rasterio
import numpy as np
from scipy.optimize import minimize
from bayes_opt import BayesianOptimization
import os

def load_raster(raster_path: str) -> tuple[np.ndarray, rasterio.transform.Affine, rasterio.crs.CRS]:
    """Loads a raster into a NumPy array and returns its metadata."""
    with rasterio.open(raster_path) as src:
        array = src.read(1)
        transform = src.transform
        crs = src.crs
    return array, transform, crs

def write_raster(array: np.ndarray, transform: rasterio.transform.Affine, crs: rasterio.crs.CRS, output_path: str):
    """Writes a NumPy array to a GeoTIFF raster."""
    with rasterio.open(
        output_path,
        'w',
        driver='GTiff',
        height=array.shape[0],
        width=array.shape[1],
        count=1,
        dtype=array.dtype,
        transform=transform,
        crs=crs,
    ) as dst:
        dst.write(array, 1)

def calculate_rmse(predictions: np.ndarray, reference: np.ndarray) -> float:
    """Calculates the Root Mean Squared Error between two arrays."""
    return np.sqrt(np.mean((predictions - reference) ** 2))

def average_ensemble(prediction_arrays: list[np.ndarray]) -> np.ndarray:
    """Calculates the average ensemble of multiple prediction arrays."""
    return np.mean(np.stack(prediction_arrays), axis=0)

def maximize_rmse(weights: np.ndarray, prediction_arrays: list[np.ndarray], reference_array: np.ndarray) -> float:
    """Calculates the negative RMSE for optimization (since we want to maximize RMSE)."""
    ensemble = np.average(np.stack(prediction_arrays), axis=0, weights=weights)
    return -calculate_rmse(ensemble, reference_array)

def ensemble_prediction(pred_paths: list[str], ref_path: str, 
                        init_points:int=10, n_iter:int=100,
                        avge=True, opte=True):
    """
    init_points=5 n_iter=15 as baselines 
    Calculates and saves the average and optimized (maximizing RMSE) ensemble predictions.

    Args:
        pred_paths (list[str]): List of paths to the prediction raster files.
        ref_path (str): Path to the reference raster file.
        avge (bool, optional): Whether to calculate and save the average ensemble. Defaults to True.
        opte (bool, optional): Whether to calculate and save the optimized ensemble (maximizing RMSE). Defaults to True.

    Returns:
        tuple: A tuple containing the file paths of the average ensemble raster (if avge=True)
               and the optimized ensemble raster (if opte=True). Returns None for either if the
               corresponding argument is False.
    """
    prediction_arrays = []
    for path in pred_paths:
        array, transform, crs = load_raster(path)
        prediction_arrays.append(array)

    reference_array, _, _ = load_raster(ref_path)

    output_dir = os.path.dirname(pred_paths[0])
    avge_fn = None
    opte_fn = None

    if avge:
        avg_ensemble_array = average_ensemble(prediction_arrays)
        avge_fn = os.path.join(output_dir, "AVGe.tif")
        write_raster(avg_ensemble_array, transform, crs, avge_fn)
        print(f"Average ensemble saved to: {avge_fn}")

    if opte:
        num_predictions = len(prediction_arrays)
        print(f'num_predictions @{num_predictions}')

        def bayesian_optimization_function(w1, w2, w3, w4):
            weights = [w1, w2, w3, w4]
            ensemble = np.average(np.stack(prediction_arrays), axis=0, weights=weights)
            return -calculate_rmse(ensemble, reference_array)

        pbounds = {f'w{i+1}': (0, 1) for i in range(num_predictions)}
        print(f'pbounds @{pbounds}')

        optimizer = BayesianOptimization(
            f=bayesian_optimization_function,
            pbounds=pbounds,
            random_state=1,
        )

        
        optimizer.maximize(
            init_points=init_points,
            n_iter=n_iter,
        )

        best_weights = [optimizer.max['params'][f'w{i+1}'] for i in range(num_predictions)]
        print(print(f'best_weights @{best_weights}'))
        optimized_ensemble_array = np.average(np.stack(prediction_arrays), axis=0, weights=best_weights)
        opte_fn = os.path.join(output_dir, f"OPTe_{init_points}_{n_iter}.tif")
        write_raster(optimized_ensemble_array, transform, crs, opte_fn)
        print('==='*40)
        print(f"Optimized ensemble (maximizing RMSE) saved to: {opte_fn}")
        print(f"Optimized weights: {best_weights}")

    return avge_fn, opte_fn



In [25]:
from glob import glob 
pfiles = glob("/home/ljp238/Downloads/SAGA_DEV/N13E103_GWRd_svs_*_fmin.tif")
rfile = "/media/ljp238/12TBWolf/BRCHIEVE/TILES12/N13E103/N13E103_edem_egm.tif"
init_points = 10#0
n_iter=100
avg_raster, opt_raster = ensemble_prediction(pfiles, rfile,init_points, n_iter)

if avg_raster:
    print(f"Average raster saved at: {avg_raster}")
if opt_raster:
    print(f"Optimized raster saved at: {opt_raster}")

Average ensemble saved to: /home/ljp238/Downloads/SAGA_DEV/AVGe.tif
num_predictions @4
pbounds @{'w1': (0, 1), 'w2': (0, 1), 'w3': (0, 1), 'w4': (0, 1)}
|   iter    |  target   |    w1     |    w2     |    w3     |    w4     |
-------------------------------------------------------------------------
| [39m1        [39m | [39m-13.27   [39m | [39m0.417    [39m | [39m0.7203   [39m | [39m0.0001144[39m | [39m0.3023   [39m |
| [35m2        [39m | [35m-12.89   [39m | [35m0.1468   [39m | [35m0.09234  [39m | [35m0.1863   [39m | [35m0.3456   [39m |
| [39m3        [39m | [39m-12.97   [39m | [39m0.3968   [39m | [39m0.5388   [39m | [39m0.4192   [39m | [39m0.6852   [39m |
| [39m4        [39m | [39m-13.2    [39m | [39m0.2045   [39m | [39m0.8781   [39m | [39m0.02739  [39m | [39m0.6705   [39m |
| [39m5        [39m | [39m-13.15   [39m | [39m0.4173   [39m | [39m0.5587   [39m | [39m0.1404   [39m | [39m0.1981   [39m |
| [39m6        [39m | [39m

In [19]:
# if __name__ == '__main__':
#     # Create dummy raster files for testing
#     import numpy.ma as ma

#     def create_dummy_raster(file_path, data):
#         transform = rasterio.transform.from_origin(0, 0, 1, 1)
#         with rasterio.open(
#             file_path,
#             'w',
#             driver='GTiff',
#             height=data.shape[0],
#             width=data.shape[1],
#             count=1,
#             dtype=data.dtype,
#             crs='EPSG:4326',
#             transform=transform,
#         ) as dst:
#             dst.write(data, 1)

#     dummy_dir = "dummy_rasters"
#     os.makedirs(dummy_dir, exist_ok=True)

#     pred_paths_dummy = [
#         os.path.join(dummy_dir, "pred1.tif"),
#         os.path.join(dummy_dir, "pred2.tif"),
#         os.path.join(dummy_dir, "pred3.tif"),
#         os.path.join(dummy_dir, "pred4.tif"),
#     ]
#     ref_path_dummy = os.path.join(dummy_dir, "ref.tif")

#     data_ref = np.array([[10, 12], [15, 18]], dtype=np.float32)
#     data_pred1 = np.array([[11, 13], [14, 17]], dtype=np.float32)
#     data_pred2 = np.array([[9, 11], [16, 19]], dtype=np.float32)
#     data_pred3 = np.array([[12, 14], [13, 16]], dtype=np.float32)
#     data_pred4 = np.array([[8, 10], [17, 20]], dtype=np.float32)

#     create_dummy_raster(ref_path_dummy, data_ref)
#     create_dummy_raster(pred_paths_dummy[0], data_pred1)
#     create_dummy_raster(pred_paths_dummy[1], data_pred2)
#     create_dummy_raster(pred_paths_dummy[2], data_pred3)
#     create_dummy_raster(pred_paths_dummy[3], data_pred4)

#     avg_raster, opt_raster = ensemble_prediction(pred_paths_dummy, ref_path_dummy)

#     if avg_raster:
#         print(f"Average raster saved at: {avg_raster}")
#     if opt_raster:
#         print(f"Optimized raster saved at: {opt_raster}")

#     # Clean up dummy files
#     for path in pred_paths_dummy + [ref_path_dummy, avg_raster, opt_raster]:
#         if path and os.path.exists(path):
#             os.remove(path)
#     os.rmdir(dummy_dir)

['/home/ljp238/Downloads/SAGA_DEV/N13E103_GWRd_svs_dw2_fmin.tif',
 '/home/ljp238/Downloads/SAGA_DEV/N13E103_GWRd_svs_dw3_fmin.tif',
 '/home/ljp238/Downloads/SAGA_DEV/N13E103_GWRd_svs_dw0_fmin.tif',
 '/home/ljp238/Downloads/SAGA_DEV/N13E103_GWRd_svs_dw1_fmin.tif']

Average ensemble saved to: /home/ljp238/Downloads/SAGA_DEV/AVGe.tif
num_predictions @4
pbounds @{'w1': (0, 1), 'w2': (0, 1), 'w3': (0, 1), 'w4': (0, 1)}
|   iter    |  target   |    w1     |    w2     |    w3     |    w4     |
-------------------------------------------------------------------------
| [39m1        [39m | [39m-13.27   [39m | [39m0.417    [39m | [39m0.7203   [39m | [39m0.0001144[39m | [39m0.3023   [39m |
| [35m2        [39m | [35m-12.89   [39m | [35m0.1468   [39m | [35m0.09234  [39m | [35m0.1863   [39m | [35m0.3456   [39m |
| [39m3        [39m | [39m-12.97   [39m | [39m0.3968   [39m | [39m0.5388   [39m | [39m0.4192   [39m | [39m0.6852   [39m |
| [39m4        [39m | [39m-13.2    [39m | [39m0.2045   [39m | [39m0.8781   [39m | [39m0.02739  [39m | [39m0.6705   [39m |
| [39m5        [39m | [39m-13.15   [39m | [39m0.4173   [39m | [39m0.5587   [39m | [39m0.1404   [39m | [39m0.1981   [39m |
| [35m6        [39m | [35m

In [None]:
#import optuna
# ask, why are you not using optuna 
# add verbosity prining stages like in logging 
# can we add time to the baysian ensemble 