In [1]:
## random forest model to calculate soc map

## what I need: 
# 1) cov_final = "/Users/inesschwartz/Desktop/final_training_dataset.csv"
# X = cov_final(drop.log_soc_stock)
# Y = cov_final(log_soc_stock)
# W = sample_weight
# 1km aligned raster stack covariates for prediction (not yet built)

#steps to build model
# 1)  train weighted RF using  sample_weight
# 2) Out of sample CV predictions per point
# 3) spacial cross val
    # Split points into spatial folds / blocks
    # Train model on training folds with sample_weight
    # Predict held-out fold
    #          Compute metrics: weighted RMSE, R², bias
    #       Diagnostic plots: observed vs predicted, residual spatial patterns 
# 4) Random Forest SOC map apply final, tuned RF model (trained with the declustering sample_weight) to the 1 km raster stack of predictor covariates.
# 5) Uncertainty: model variance

### Create prediction grid aligned with final covariates

In [2]:
# ================================================
# Create prediction grid aligned with final covariates
# ================================================

import os
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
from shapely.geometry import Point

# -----------------------------
# 1️⃣ Load final training dataset
# -----------------------------
cov_final = pd.read_csv("/Users/inesschwartz/Desktop/final_training_dataset.csv")
predictor_cols = [col for col in cov_final.columns if col not in ['log_soc_stock', 'sample_weight']]

# -----------------------------
# 2️⃣ Define raster folders / files
# -----------------------------
dem_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/dem_1km/"
bioclim_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/bioclimatic32733_cleaned/"

# Individual categorical rasters
ecosystem_raster = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/labelled_ecosystems32733_1km.tif"
litho_raster = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/lithology_1km.tif"
landsurface_raster = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/landsurfaceforms_1km.tif"
soil_raster = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/angola_soil_data_raster_1km.tif"

# Helper to list valid .tif files only
def list_valid_tifs(folder):
    return [
        os.path.join(folder, f)
        for f in os.listdir(folder)
        if f.endswith('.tif') and not f.startswith('._')
    ]

terrain_files = list_valid_tifs(dem_folder)
bioclim_files = list_valid_tifs(bioclim_folder)


In [13]:
#  get the filenames (without full path)
print("\nbioclim filenames only:")
print([os.path.basename(f) for f in bioclim_files])



bioclim filenames only:
['isothermality_32733.tif', 'max_temp_warmest_month32733.tif', 'mean_temp_driest_quarter32733.tif', 'mean_temp_warmest_quarter32733.tif', 'mean_temp_wettest_quarter32733.tif', 'min_temp_coldest_month32733.tif', 'precip_coldest_quarter32733.tif', 'precip_driest_month32733.tif', 'precip_driest_quarter32733.tif', 'precip_warmest_quarter32733.tif', 'precip_wettest_month32733.tif', 'precip_wettest_quarter32733.tif', 'temp_annual_range32733.tif', 'temp_seasonality32733.tif', 'precip_seasonality2.tif', 'annual_precip2.tif', 'annual_mean_temp.tif', 'mean_temp_coldest_quarter32733.tif', '2mean_temp_coldest_quarter32733.tif']


In [12]:
cov_final.columns

Index(['X_coord', 'Y_coord', 'site_info_id', 'profile', 'district',
       'landsurface_value', 'litho_value', 'formation', 'annual_mean_temp',
       'annual_precip', 'isothermality', 'precip_driest_month',
       'precip_seasonality', 'precip_wettest_month', 'temp_annual_range',
       'temp_seasonality', 'MRVBF', 'RLD', 'aspect', 'aspect_cos',
       'aspect_sin', 'DEM', 'flow_accumulation', 'relief', 'slope', 'TWI',
       'valleydepth', 'faosoil_id', 'log_soc_stock'],
      dtype='object')

In [18]:
# ================================================
# Fast prediction grid aligned with final covariates
# ================================================

import os
import numpy as np
import pandas as pd
import geopandas as gpd
import rioxarray as rxr
from shapely.geometry import Point
import xarray as xr

# -----------------------------
# 1️⃣ Load final training dataset
# -----------------------------
cov_final = pd.read_csv("/Users/inesschwartz/Desktop/final_training_dataset.csv")
predictor_cols = [c for c in cov_final.columns if c not in ['log_soc_stock', 'sample_weight']]

# -----------------------------
# 2️⃣ Define raster folders / files
# -----------------------------
dem_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/"
bioclim_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/bioclimatic32733_cleaned/"

ecosystem_raster = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/labelled_ecosystems32733_1km.tif"
litho_raster = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/lithology_1km.tif"
landsurface_raster = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/landsurfaceforms_1km.tif"
soil_raster = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/angola_soil_data_raster_1km.tif"

# -----------------------------
# 3️⃣ Map raster filenames to predictor columns
# -----------------------------
raster_to_colname = {
    # terrain
    "aspect_cos_1km": "aspect_cos",
    "aspect_sin_1km.tif": "aspect_sin",
    "dem_filledfiltered_1km.tif": "DEM",
    "MRVBF_1km.tif": "MRVBF",
    "RLD_1km.tif": "RLD",
    "flow_accumulation_1km.tif": "flow_accumulation",
    "relief_1km.tif": "relief",
    "slope_1km.tif": "slope",
    "twi_300m_1km.tif": "TWI",
    "valleydepth2_1km.tif": "valleydepth",
    # bioclim
    "annual_mean_temp.tif": "annual_mean_temp",
    "precip_seasonality2.tif": "precip_seasonality",
    "precip_wettest_month.tif": "precip_wettest_month",
    "temp_annual_range32733.tif": "temp_annual_range",
    "temp_seasonality32733.tif": "temp_seasonality",
    # categorical
    ecosystem_raster: "formation",
    landsurface_raster: "landsurface_value",
    litho_raster: "litho_value",
    soil_raster: "faosoil_id"
}

# -----------------------------
# 4️⃣ Load rasters as a stack using rioxarray
# -----------------------------
all_rasters = list(raster_to_colname.keys())
stack_list = []

for raster in all_rasters:
    da = rxr.open_rasterio(raster, masked=True).squeeze()
    da = da.rename(raster_to_colname[raster])
    stack_list.append(da)

# Merge into a single xarray dataset
covariate_stack = xr.merge(stack_list)

# -----------------------------
# 5️⃣ Clip to soil polygons
# -----------------------------
soil_gpkg = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/angola_soil_data2_32733.gpkg"
soil = gpd.read_file(soil_gpkg)

covariate_stack = covariate_stack.rio.clip(soil.geometry, soil.crs, drop=True)

# -----------------------------
# 6️⃣ Flatten to 2D DataFrame for prediction
# -----------------------------
flat_stack = covariate_stack.to_array().stack(pixel=("y", "x")).transpose("pixel", "variable")
X_grid = flat_stack.to_pandas()
X_grid.reset_index(drop=True, inplace=True)

# Handle categorical variables
categorical_cols = [v for v in raster_to_colname.values() if X_grid[v].dtype != float]
for col in categorical_cols:
    X_grid[col] = X_grid[col].astype(str)
X_grid_encoded = pd.get_dummies(X_grid, columns=categorical_cols, drop_first=True)

# Align columns to training dataset
for col in predictor_cols:
    if col not in X_grid_encoded.columns:
        X_grid_encoded[col] = 0
X_grid_encoded = X_grid_encoded[predictor_cols]

# -----------------------------
# 7️⃣ Save prediction grid
# -----------------------------
out_csv = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/prediction_grid_covariates_fast.csv"
X_grid_encoded.to_csv(out_csv, index=False)

print("✅ Prediction grid ready:", X_grid_encoded.shape)
print(f"Saved to: {out_csv}")


DataSourceError: /Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/angola_soil_data2_32733.gpkg: No such file or directory

### Run model

In [None]:
## model

# ================================================
# Weighted Random Forest for SOC mapping
# ================================================

import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# -----------------------------
# 1️⃣ Load final training dataset
# -----------------------------
cov_final = pd.read_csv("/Users/inesschwartz/Desktop/final_training_dataset.csv")

# Separate predictors, target, and sample weights
X = cov_final.drop(columns=['log_soc_stock', 'sample_weight'])
y = cov_final['log_soc_stock']
sample_weight = cov_final['sample_weight']

predictor_cols = X.columns.tolist()

# -----------------------------
# 2️⃣ Spatial cross-validation setup
# -----------------------------
# If coordinates are available for block assignment:
coords_available = all(col in cov_final.columns for col in ['X_coord','Y_coord'])
if coords_available:
    # Assign spatial blocks (example: 10 km blocks)
    block_size = 10000  # meters
    cov_final['block_x'] = np.digitize(cov_final['X_coord'], np.arange(cov_final['X_coord'].min(), cov_final['X_coord'].max()+block_size, block_size)) - 1
    cov_final['block_y'] = np.digitize(cov_final['Y_coord'], np.arange(cov_final['Y_coord'].min(), cov_final['Y_coord'].max()+block_size, block_size)) - 1
    cov_final['block_id'] = cov_final['block_x'].astype(str) + "_" + cov_final['block_y'].astype(str)
else:
    # If no spatial info, fallback to normal CV
    cov_final['block_id'] = np.arange(len(cov_final))

# 5-fold GroupKFold based on spatial blocks
n_folds = 5
gkf = GroupKFold(n_splits=n_folds)

# -----------------------------
# 3️⃣ Out-of-sample / spatial CV
# -----------------------------
cv_results = []

for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups=cov_final['block_id'])):
    print(f"Fold {fold+1}: Train={len(train_idx)} Test={len(test_idx)}")

    # Subset data
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    w_train = sample_weight.iloc[train_idx]

    # Train weighted RF
    rf = RandomForestRegressor(
        n_estimators=500, 
        max_features='sqrt', 
        n_jobs=-1, 
        random_state=42
    )
    rf.fit(X_train, y_train, sample_weight=w_train)

    # Predict held-out fold
    y_pred = rf.predict(X_test)

    # Compute metrics
    mse = mean_squared_error(y_test, y_pred, sample_weight=sample_weight.iloc[test_idx])
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    bias = np.mean(y_pred - y_test)
    
    cv_results.append({'fold': fold+1, 'RMSE': rmse, 'R2': r2, 'bias': bias})

    # Diagnostic plot: observed vs predicted
    plt.figure(figsize=(6,6))
    sns.scatterplot(x=y_test, y=y_pred, alpha=0.6)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
    plt.xlabel("Observed log(SOC)")
    plt.ylabel("Predicted log(SOC)")
    plt.title(f"Observed vs Predicted - Fold {fold+1}")
    plt.show()

# Summary of CV results
cv_df = pd.DataFrame(cv_results)
print("\nSpatial CV Summary:")
print(cv_df.describe())

# -----------------------------
# 4️⃣ Train final RF on full dataset
# -----------------------------
rf_final = RandomForestRegressor(
    n_estimators=500, 
    max_features='sqrt', 
    n_jobs=-1, 
    random_state=42
)
rf_final.fit(X, y, sample_weight=sample_weight)
print("Final RF trained on full dataset.")

# -----------------------------
# 5️⃣ Predict SOC on raster stack / prediction grid
# -----------------------------
# Load prepared prediction grid CSV (with covariates aligned to training dataset)
X_grid = pd.read_csv("/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/prediction_grid_covariates.csv")

# Ensure columns match training predictors
missing_cols = [c for c in predictor_cols if c not in X_grid.columns]
for c in missing_cols:
    X_grid[c] = 0
X_grid = X_grid[predictor_cols]

# Predict SOC
soc_pred = rf_final.predict(X_grid)
X_grid['log_soc_stock_pred'] = soc_pred

# Optional: save prediction grid with SOC predictions
out_csv = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/prediction_grid_SOC.csv"
X_grid.to_csv(out_csv, index=False)
print(f"Prediction grid with SOC saved to: {out_csv}")

# -----------------------------
# 6️⃣ Uncertainty estimation (variance across trees)
# -----------------------------
tree_preds = np.stack([tree.predict(X_grid) for tree in rf_final.estimators_], axis=0)
soc_variance = np.var(tree_preds, axis=0)
X_grid['soc_variance'] = soc_variance
print("Uncertainty (variance) added to prediction grid.")

# Optional: save variance as separate column or layer
