## make sure all rasters align


# Final feature set: the 20 covariate names, grouped by theme for readability.
final_features = [
    'MRRTF', 'MRVBF', 'aspect',
    'bio12', 'bio17', 'bio18', 'bio19', 'bio5', 'bio7',
    'cropland', 'ecoforms', 'faosoil_id',
    'midslope_position', 'normalized_height', 'rangeland',
    'slope', 'slope_height', 'standardized_height',
    'terrain_surf_texture', 'tmax',
]

In [1]:
## Check that all rasters share CRS, resolution, dimensions, and transform

import rasterio, os

# Updated rasters.
# Every covariate lives in the same folder, so the folder is stated once and
# each entry only names its file; moving the data means editing one line.
covariate_dir = (
    "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/"
    "processed_data/covariates_1km_cleaned"
)

# Covariate name -> file name inside covariate_dir.
# Insertion order is significant: the alignment check uses the first raster
# that exists on disk as its template.
raster_files = {
    "aspect": "aspect_1km.tif",
    "MRRTF": "MRRTF_1km.tif",
    "MRVBF": "MRVBF_1km.tif",

    "bio5": "bio5_1km.tif",
    "bio12": "bio12_1km.tif",
    "bio17": "bio17_1km.tif",
    "bio18": "bio18_1km.tif",
    "bio19": "bio19_1km.tif",
    "bio7": "bio7_1km.tif",

    "tmax": "tmax_1km.tif",

    "slope_height": "slope_height_1km.tif",

    # These two files do not follow the "<name>_1km.tif" pattern.
    "cropland": "cropland_avg1940_1960_percent_1km.tif_pct_1km.tif",
    "rangeland": "rangeland_avg1940_1960_percent_1km.tif_pct_1km.tif",

    "ecoforms": "ecoforms_1km.tif",
    "faosoil_id": "faosoil_id_1km.tif",

    "midslope_position": "midslope_position_1km.tif",
    "normalized_height": "normalized_height_1km.tif",
    "slope": "slope_1km.tif",
    "standardized_height": "standardized_height_1km.tif",
    "terrain_surf_texture": "terrain_surf_texture_1km.tif",
}

# Covariate name -> absolute path, in the same order as raster_files.
rasters = {
    name: f"{covariate_dir}/{file_name}"
    for name, file_name in raster_files.items()
}

# Properties (CRS string, resolution, width, height, transform) of the first
# raster found on disk; every later raster is compared against them.
tmpl = None

for name, path in rasters.items():

    # A missing file is reported and skipped; it never becomes the template.
    if not os.path.exists(path):
        print(f"{name}: FILE NOT FOUND ‚Üí {path}")
        continue

    # Only plain values are kept, so the dataset can be closed before reporting.
    with rasterio.open(path) as src:
        info = (str(src.crs), src.res, src.width, src.height, src.transform)

    # The first readable raster defines the reference grid.
    if tmpl is None:
        tmpl = info
        for line in (
            "\n=== TEMPLATE RASTER ===",
            f"Template: {name}",
            f"CRS: {info[0]}",
            f"Resolution: {info[1]}",
            f"Dimensions: {info[2]} x {info[3]}",
            "",
        ):
            print(line)
        continue

    # Exact, property-by-property comparison with the template.
    ok = all(mine == ref for mine, ref in zip(info, tmpl))
    print(f"{name}: {'OK' if ok else 'MISMATCH'}")
    if ok:
        continue

    # Dump the offending raster's properties for inspection.
    for line in (
        f"  CRS: {info[0]}",
        f"  RES: {info[1]}",
        f"  DIM: {info[2]} x {info[3]}",
        f"  TRANSFORM: {info[4]}",
        "---",
    ):
        print(line)



=== TEMPLATE RASTER ===
Template: aspect
CRS: EPSG:32733
Resolution: (1000.0, 1000.0)
Dimensions: 1352 x 1523

MRRTF: OK
MRVBF: OK
bio5: OK
bio12: OK
bio17: OK
bio18: OK
bio19: OK
bio7: OK
tmax: OK
slope_height: OK
cropland: OK
rangeland: OK
ecoforms: OK
faosoil_id: OK
midslope_position: OK
normalized_height: OK
slope: OK
standardized_height: OK
terrain_surf_texture: OK


## aligning rasters

In [2]:
# import os
# import rasterio
# import numpy as np
# from rasterio.warp import reproject, Resampling

# # =========================================================
# # 1Ô∏è‚É£  PATHS AND SETTINGS
# # =========================================================
# template = "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/TRI.tif"

# output_dir = "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/processed_data/aligned_1km_rasterio"
# os.makedirs(output_dir, exist_ok=True)

# rasters = {
#     "slope_height": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/terraincovs/slope_height.tif",
#     "annual_precip": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/bioclimatic32733_cleaned/annual_precip2.tif",
#     "precip_wettest_month": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/bioclimatic32733_cleaned/precip_wettest_month1k.tif",
#     "tmax_mean": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/bioclimatic32733_cleaned/tmax_mean_mean1k.tif",
#     "MRRTF": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/terraincovs/MRRTF.tif",
#     "MRVBF": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/terraincovs/MRVBF.tif",
#     "standardized_height": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/terraincovs/standardized_height.tif",
#     "relief_TRI": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/terraincovs/relief_TRI.tif",
#     "terrain_surf_convexity": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/terraincovs/terrain_surf_convexity.tif",
#     "terrain_surf_texture": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/terraincovs/terrain_surf_texture.tif",
#     "valley_depth": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/terraincovs/valley_depth.tif",
#     "faosoil_id": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/angola_soil_gpkg_stuff/angola_soil_1k.tif",
#     "cropland_1950": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/hyde_lu_percent/hyde_lu_1km/cropland_1950_percent.tif",
#     "grazing_1950": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/hyde_lu_percent/hyde_lu_1km/grazing_1950_percent.tif",
#     "temp_annual_range": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/bioclimatic32733_cleaned/temp_annual_range1k.tif"
# }

# categorical = {"faosoil_id"}  # use nearest neighbor for categorical
# continuous = set(rasters.keys()) - categorical

# # =========================================================
# # 2Ô∏è‚É£  LOAD TEMPLATE (REFERENCE GRID)
# # =========================================================
# with rasterio.open(template) as ref:
#     ref_meta = ref.meta.copy()
#     crs = ref.crs
#     xres, yres = ref.res
#     ref_transform = ref.transform
#     width, height = ref.width, ref.height
#     xmin, ymin, xmax, ymax = ref.bounds

# # Fix Y order if needed
# if ymax < ymin:
#     ymin, ymax = ymax, ymin

# print(f"‚úÖ Reference grid: {crs}, {xres} m | size {width} √ó {height}")
# print(f"Bounds: {xmin:.1f}, {ymin:.1f}, {xmax:.1f}, {ymax:.1f}")

# # =========================================================
# # 3Ô∏è‚É£  HELPER FUNCTION TO CHECK ALIGNMENT
# # =========================================================
# def is_aligned(src, ref, tol_res=1e-6, tol_origin=1e-3):
#     same_crs = str(src.crs) == str(ref.crs)
#     same_res = all(abs(a - b) < tol_res for a, b in zip(src.res, ref.res))
#     same_dims = src.width == ref.width and src.height == ref.height
#     same_origin = all(abs(a - b) < tol_origin for a, b in zip(src.transform[2:], ref.transform[2:]))
#     return same_crs and same_res and same_dims and same_origin

# # =========================================================
# # 4Ô∏è‚É£  ALIGN EACH RASTER TO TEMPLATE
# # =========================================================
# for name, path in rasters.items():
#     out = os.path.join(output_dir, f"{name}_aligned1k.tif")
#     with rasterio.open(path) as src, rasterio.open(template) as ref:
#         if is_aligned(src, ref):
#             print(f"‚úÖ {name}: already aligned.")
#             continue

#         print(f"‚öôÔ∏è {name}: MISMATCH ‚Äî resampling to match slope_height grid.")

#         # Choose resampling method
#         resampling = Resampling.nearest if name in categorical else Resampling.bilinear

#         # Prepare output array
#         data = src.read(1)
#         dest = np.full((ref.height, ref.width), src.nodata if src.nodata is not None else -9999, dtype=data.dtype)

#         reproject(
#             source=data,
#             destination=dest,
#             src_transform=src.transform,
#             src_crs=src.crs,
#             dst_transform=ref.transform,
#             dst_crs=ref.crs,
#             resampling=resampling
#         )

#         # Save aligned raster
#         meta = ref_meta.copy()
#         meta.update({
#             "dtype": dest.dtype,
#             "count": 1,
#             "compress": "lzw",
#             "nodata": src.nodata if src.nodata is not None else -9999
#         })

#         with rasterio.open(out, "w", **meta) as dst:
#             dst.write(dest, 1)

#         print(f"‚úÖ Saved aligned raster: {out}")

# print("\nüéâ All rasters are now aligned to slope_height 1 km grid.")


In [3]:
# # Quick check after fixing
# for name, path in rasters.items():
#     with rasterio.open(path) as src:
#         info = (src.res, src.width, src.height, src.transform)
#         print(name, info)


aspect ((1000.0, 1000.0), 1352, 1523, Affine(1000.0, 0.0, 143382.578,
       0.0, -1000.0, 9517305.0427))
MRRTF ((1000.0, 1000.0), 1352, 1523, Affine(1000.0, 0.0, 143382.578,
       0.0, -1000.0, 9517305.0427))
MRVBF ((1000.0, 1000.0), 1352, 1523, Affine(1000.0, 0.0, 143382.578,
       0.0, -1000.0, 9517305.0427))
bio5 ((1000.0, 1000.0), 1352, 1523, Affine(1000.0, 0.0, 143382.578,
       0.0, -1000.0, 9517305.0427))
bio12 ((1000.0, 1000.0), 1352, 1523, Affine(1000.0, 0.0, 143382.578,
       0.0, -1000.0, 9517305.0427))
bio17 ((1000.0, 1000.0), 1352, 1523, Affine(1000.0, 0.0, 143382.578,
       0.0, -1000.0, 9517305.0427))
bio18 ((1000.0, 1000.0), 1352, 1523, Affine(1000.0, 0.0, 143382.578,
       0.0, -1000.0, 9517305.0427))
bio19 ((1000.0, 1000.0), 1352, 1523, Affine(1000.0, 0.0, 143382.578,
       0.0, -1000.0, 9517305.0427))
bio7 ((1000.0, 1000.0), 1352, 1523, Affine(1000.0, 0.0, 143382.578,
       0.0, -1000.0, 9517305.0427))
tmax ((1000.0, 1000.0), 1352, 1523, Affine(1000.0, 0.0, 

## Build prediction grid

In [5]:
import rasterio
from rasterio import sample
import numpy as np
import pandas as pd
import os

# =========================================================
# 1. COVARIATE FOLDER AND TEMPLATE RASTER (reference 1 km grid)
# =========================================================
# The folder is stated once; every path below is derived from it, so moving
# the data means editing a single line.
covariate_dir = (
    "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/"
    "processed_data/covariates_1km_cleaned"
)
template_path = f"{covariate_dir}/slope_height_1km.tif"

# =========================================================
# 2. ALIGNED 1 km COVARIATE RASTERS (cleaned folder)
# =========================================================
# Covariate name -> file name inside covariate_dir. Insertion order is the
# column order of the prediction grid built from this dict.
raster_files = {
    "aspect": "aspect_1km.tif",
    "MRRTF": "MRRTF_1km.tif",
    "MRVBF": "MRVBF_1km.tif",

    "bio5": "bio5_1km.tif",
    "bio12": "bio12_1km.tif",
    "bio17": "bio17_1km.tif",
    "bio18": "bio18_1km.tif",
    "bio19": "bio19_1km.tif",
    "bio7": "bio7_1km.tif",

    "tmax": "tmax_1km.tif",

    # Same file as template_path: the template is also sampled as a covariate.
    "slope_height": "slope_height_1km.tif",

    # These two files do not follow the "<name>_1km.tif" pattern.
    "cropland": "cropland_avg1940_1960_percent_1km.tif_pct_1km.tif",
    "rangeland": "rangeland_avg1940_1960_percent_1km.tif_pct_1km.tif",

    "ecoforms": "ecoforms_1km.tif",
    "faosoil_id": "faosoil_id_1km.tif",

    "midslope_position": "midslope_position_1km.tif",
    "normalized_height": "normalized_height_1km.tif",
    "slope": "slope_1km.tif",
    "standardized_height": "standardized_height_1km.tif",
    "terrain_surf_texture": "terrain_surf_texture_1km.tif",
}

# Covariate name -> absolute path, in the same order as raster_files.
rasters = {
    name: f"{covariate_dir}/{file_name}"
    for name, file_name in raster_files.items()
}

# =========================================================
# 3. BUILD COORDINATE GRID FROM TEMPLATE
# =========================================================
with rasterio.open(template_path) as tmpl:
    transform = tmpl.transform
    nodata = tmpl.nodata
    base = tmpl.read(1)
    height, width = tmpl.height, tmpl.width

# Mask valid pixels: finite and, when the template declares a nodata value,
# different from it.
base_mask = np.isfinite(base)
if nodata is not None:
    base_mask &= base != nodata

# Row/column indices of the valid pixels only. np.nonzero walks the mask in
# row-major order, i.e. the same order as flattening the whole grid and then
# filtering it, but no coordinate tuple is ever built for a masked-out pixel.
rows, cols = np.nonzero(base_mask)

if rows.size:
    # 1-D index arrays in, map coordinates out (xy() defaults to pixel centres).
    xs, ys = rasterio.transform.xy(transform, rows, cols)
    # Depending on the rasterio version, xy() may hand back plain lists rather
    # than arrays; normalise so the code below does not rely on either form.
    xs = np.asarray(xs, dtype=float).ravel()
    ys = np.asarray(ys, dtype=float).ravel()
else:
    # Template without any valid pixel: keep an empty, well-typed grid.
    xs = np.empty(0, dtype=float)
    ys = np.empty(0, dtype=float)

# (x, y) pairs in row-major pixel order; reused when sampling the covariates.
coords = list(zip(xs, ys))

print(f"üìè Prediction grid: {len(coords):,} valid pixels in template.")

grid = pd.DataFrame({"X_coord": xs, "Y_coord": ys})

# =========================================================
# 4. SAMPLE EACH RASTER BY COORDINATE
# =========================================================
for name, path in rasters.items():
    if os.path.exists(path):
        with rasterio.open(path) as src:
            # One sampled value per coordinate, cast to float so NaN can mark gaps.
            vals = np.array(list(sample.sample_gen(src, coords))).squeeze().astype(float)

            # Cells equal to -9999, or to the declared NoData value (when it is
            # a real number), are replaced with NaN.
            invalid = vals == -9999
            if src.nodata is not None and not np.isnan(src.nodata):
                invalid |= vals == src.nodata
            vals[invalid] = np.nan

            grid[name] = vals
            nan_pct = np.isnan(vals).mean() * 100
            print(f"‚úÖ Sampled {name} ({nan_pct:.2f}% NaN)")
    else:
        # No column is added for a raster that is not on disk.
        print(f"‚ö†Ô∏è Missing raster: {path}")

# =========================================================
# 5. CLEAN CATEGORICAL VARIABLES
# =========================================================
categorical_cols = ["faosoil_id", "ecoforms"]

# Nullable integer dtype keeps missing cells as <NA> instead of forcing floats.
for col in [c for c in categorical_cols if c in grid.columns]:
    grid[col] = grid[col].astype("Int32")


# =========================================================
# 6. SAVE PREDICTION GRID
# =========================================================
out_csv = "/Users/inesschwartz/Desktop/model/covariates_stack_1km_utm33s_fixed.csv"
grid.to_csv(out_csv, index=False)

n_rows, n_cols = grid.shape
print(f"\n‚úÖ Saved prediction grid:\n{out_csv}")
print(f"üßæ Shape: {n_rows} rows √ó {n_cols} columns")

# =========================================================
# 7. MISSING DATA SUMMARY
# =========================================================
# Share of missing cells per column, in percent, largest first.
nan_share = grid.isna().mean().mul(100).round(2)
print("\nüîç Missing-data summary (% NaN):")
print(nan_share.sort_values(ascending=False))


üìè Prediction grid: 1,259,342 valid pixels in template.
‚úÖ Sampled aspect (1.29% NaN)
‚úÖ Sampled MRRTF (0.16% NaN)
‚úÖ Sampled MRVBF (0.16% NaN)
‚úÖ Sampled bio5 (0.13% NaN)
‚úÖ Sampled bio12 (0.13% NaN)
‚úÖ Sampled bio17 (0.13% NaN)
‚úÖ Sampled bio18 (0.13% NaN)
‚úÖ Sampled bio19 (0.13% NaN)
‚úÖ Sampled bio7 (0.13% NaN)
‚úÖ Sampled tmax (0.33% NaN)
‚úÖ Sampled slope_height (0.00% NaN)
‚úÖ Sampled cropland (0.12% NaN)
‚úÖ Sampled rangeland (0.12% NaN)
‚úÖ Sampled ecoforms (0.33% NaN)
‚úÖ Sampled faosoil_id (0.60% NaN)
‚úÖ Sampled midslope_position (0.00% NaN)
‚úÖ Sampled normalized_height (0.00% NaN)
‚úÖ Sampled slope (0.00% NaN)
‚úÖ Sampled standardized_height (0.00% NaN)
‚úÖ Sampled terrain_surf_texture (0.00% NaN)

‚úÖ Saved prediction grid:
/Users/inesschwartz/Desktop/model/covariates_stack_1km_utm33s_fixed.csv
üßæ Shape: 1259342 rows √ó 22 columns

üîç Missing-data summary (% NaN):
aspect                  1.29
faosoil_id              0.60
ecoforms                0.33
tmax   

In [2]:
import pandas as pd
# Read only the first five rows of the saved prediction grid so the column
# names and a few values can be inspected without loading the whole CSV.
pred_path = "/Users/inesschwartz/Desktop/model/covariates_stack_1km_utm33s_fixed.csv"
pred = pd.read_csv(pred_path, nrows=5)

# Last expression of the cell: the notebook renders it as a table.
pred.head()





Unnamed: 0,X_coord,Y_coord,aspect,MRRTF,MRVBF,bio5,bio12,bio17,bio18,bio19,...,slope_height,cropland,rangeland,ecoforms,faosoil_id,midslope_position,normalized_height,slope,standardized_height,terrain_surf_texture
0,249882.578,9516805.0,0.824236,1.914895,2.349721,291.0,1275.0,2.0,565.0,2.0,...,191.591599,1.266659,1.578006,113,,0.337769,0.668884,0.080887,456.468018,54.297634
1,250882.578,9516805.0,0.851345,0.927454,1.811098,291.0,1275.0,2.0,565.0,2.0,...,168.131836,1.480068,1.119408,113,,0.237123,0.618561,0.105386,375.31485,53.434582
2,251882.578,9516805.0,0.792205,0.237248,1.096868,291.0,1275.0,2.0,565.0,2.0,...,154.49498,1.693477,0.66081,3,,0.145248,0.572624,0.078765,301.832153,52.560249
3,248882.578,9515805.0,4.258573,4.926786,8.853493,291.0,1275.0,2.0,565.0,2.0,...,209.756134,1.000946,1.839693,113,,0.342683,0.671342,0.05233,464.359985,56.328823
4,249882.578,9515805.0,1.593083,4.958667,8.911603,291.0,1275.0,2.0,565.0,2.0,...,209.205551,1.188077,1.480109,113,89.0,0.373117,0.686558,0.030373,493.57309,55.501736


## Aligning declustered subsets with train_final and the prediction grid

In [3]:
import pandas as pd

# Inputs: the feature-selected training table and the sampled prediction grid.
train_path = "/Users/inesschwartz/Desktop/model/traning_datasets/train_final1.csv"
pred_path = "/Users/inesschwartz/Desktop/model/covariates_stack_1km_utm33s_fixed.csv"

# Both tables are read in full.
train = pd.read_csv(train_path)
pred = pd.read_csv(pred_path)

# Remove the site identifier when present so it is not picked up as a feature.
if 'site_info_id' in train.columns:
    train = train.drop(columns=['site_info_id'])
    print("‚úÖ Dropped 'site_info_id' from training dataframe.")

# Columns that are never features.
target = "log_soc_stock"
coord_cols = ["X_coord", "Y_coord"]

# Feature names in the order they appear in the training table.
train_features = [c for c in train.columns if c not in [target, *coord_cols]]

# Split them into those the prediction grid has and those it lacks,
# keeping the training order in both lists.
shared_features = []
missing = []
for feature in train_features:
    if feature in pred.columns:
        shared_features.append(feature)
    else:
        missing.append(feature)

if missing:
    print(f"‚ö†Ô∏è Missing in prediction grid: {missing}")

# Coordinates first, then the shared features in training order.
pred_reordered = pred[[*coord_cols, *shared_features]]

# Write the reordered grid next to the original one.
out_path = "/Users/inesschwartz/Desktop/model/covariates_stack_1km_utm33s_aligned.csv"
pred_reordered.to_csv(out_path, index=False)

print(f"\n‚úÖ Saved reordered prediction grid:\n{out_path}")
print(f"üßæ Columns now match training features ({len(shared_features)} shared features).")



‚úÖ Saved reordered prediction grid:
/Users/inesschwartz/Desktop/model/covariates_stack_1km_utm33s_aligned.csv
üßæ Columns now match training features (20 shared features).


In [1]:
# =========================================================
# 02. Create grid-based declustered training subsets (aligned)
# =========================================================
import pandas as pd
import numpy as np
import os
from shapely.geometry import Point
import geopandas as gpd

# ---------------------------------------------------------
# STEP 1 - Load aligned, filtered, feature-selected training data
# ---------------------------------------------------------
train_filtered_csv = "/Users/inesschwartz/Desktop/model/traning_datasets/train_final1.csv"
df = pd.read_csv(train_filtered_csv)

target = 'log_soc_stock'

# Final feature set (20 covariates)
final_features = [
    'MRRTF', 'MRVBF', 'aspect', 'bio12', 'bio17',
    'bio18', 'bio19', 'bio5', 'bio7', 'cropland',
    'ecoforms', 'faosoil_id', 'midslope_position', 'normalized_height', 'rangeland',
    'slope', 'slope_height', 'standardized_height', 'terrain_surf_texture', 'tmax',
]

# ---------------------------------------------------------
# Verify required columns exist
# ---------------------------------------------------------
# Target, site id and coordinates come first, then the covariates.
required = [target, 'site_info_id', 'X_coord', 'Y_coord', *final_features]
missing_cols = [col for col in required if col not in df.columns]

# Stop immediately when the table lacks anything the later steps rely on.
if len(missing_cols) > 0:
    raise ValueError(f"‚ùå Missing required columns: {missing_cols}")

# Work on an independent copy restricted to the required columns.
df_final = df.loc[:, required].copy()

print(f"‚úÖ Training dataset loaded with {len(df_final)} samples.")
print("   Required columns preserved: site_info_id, X_coord, Y_coord, target, covariates")

# ---------------------------------------------------------
# STEP 2 - Convert to GeoDataFrame
# ---------------------------------------------------------
# One point per sample, built from its X/Y columns and tagged as EPSG:32733.
point_geometry = gpd.points_from_xy(df_final["X_coord"], df_final["Y_coord"])
gdf = gpd.GeoDataFrame(df_final, geometry=point_geometry, crs="EPSG:32733")

print(f"‚úÖ GeoDataFrame created with CRS {gdf.crs} and {len(gdf)} points.")

# ---------------------------------------------------------
# STEP 3 - Grid-based declustering (10 km x 10 km)
# ---------------------------------------------------------
decluster_dir = "/Users/inesschwartz/Desktop/model/decluster_runs_aligned"
os.makedirs(decluster_dir, exist_ok=True)

grid_size = 10000  # 10 km cells
xmin, ymin, xmax, ymax = gdf.total_bounds

print(f"üîç Declustering extent:")
print(f"   xmin={xmin:.0f}, ymin={ymin:.0f}, xmax={xmax:.0f}, ymax={ymax:.0f}")

# Which samples fall in which cell does not depend on the iteration, so the
# spatial queries run once here instead of being repeated in every one of the
# 50 runs. Cells are visited in the same order as before (x outer, y inner),
# so every run produces exactly the same rows in the same order.
# NOTE(review): .cx selects by bounding-box intersection, so a point lying
# exactly on a shared cell edge is presumably returned for both neighbouring
# cells - confirm whether that matters for this dataset.
occupied_cells = []
for x in np.arange(xmin, xmax, grid_size):
    for y in np.arange(ymin, ymax, grid_size):

        # Select samples in the grid cell
        cell = gdf.cx[x:x + grid_size, y:y + grid_size]

        if len(cell) > 0:
            occupied_cells.append(cell)

# IMPORTANT: keep all required columns
out_cols = ['site_info_id', 'X_coord', 'Y_coord', target] + final_features

for i in range(1, 51):  # 50 declustering iterations
    # One sample per occupied cell; the run number is the seed, which makes
    # each run reproducible and different from the others.
    sampled_rows = [
        members.sample(1, random_state=i) for members in occupied_cells
    ]

    if sampled_rows:
        decluster_i = pd.concat(sampled_rows)

        out_path = os.path.join(decluster_dir, f"decluster_run_{i:03d}.csv")
        decluster_i[out_cols].to_csv(out_path, index=False)

        print(f"‚úÖ decluster_run_{i:03d}.csv saved with {len(decluster_i)} samples")
    else:
        print(f"‚ö†Ô∏è decluster_run_{i:03d}: no samples found ‚Äî skipped.")

print("\nüéâ Completed creating 50 declustering subsets")
print(f"üìÅ Files saved in: {decluster_dir}")


‚úÖ Training dataset loaded with 736 samples.
   Required columns preserved: site_info_id, X_coord, Y_coord, target, covariates
‚úÖ GeoDataFrame created with CRS EPSG:32733 and 736 points.
üîç Declustering extent:
   xmin=174545, ymin=7998325, xmax=1457459, ymax=9508187
‚úÖ decluster_run_001.csv saved with 634 samples
‚úÖ decluster_run_002.csv saved with 634 samples
‚úÖ decluster_run_003.csv saved with 634 samples
‚úÖ decluster_run_004.csv saved with 634 samples
‚úÖ decluster_run_005.csv saved with 634 samples
‚úÖ decluster_run_006.csv saved with 634 samples
‚úÖ decluster_run_007.csv saved with 634 samples
‚úÖ decluster_run_008.csv saved with 634 samples
‚úÖ decluster_run_009.csv saved with 634 samples
‚úÖ decluster_run_010.csv saved with 634 samples
‚úÖ decluster_run_011.csv saved with 634 samples
‚úÖ decluster_run_012.csv saved with 634 samples
‚úÖ decluster_run_013.csv saved with 634 samples
‚úÖ decluster_run_014.csv saved with 634 samples
‚úÖ decluster_run_015.csv saved with 634 s

## Output quality is poor; diagnostic steps below:

In [28]:
import os, glob
import numpy as np
import rasterio

# Folder holding the per-run prediction rasters; the ensemble rasters are
# written back into the same folder.
input_dir = "/Users/inesschwartz/Desktop/model/ensemble_preds"
output_dir = input_dir

# RF is log1p; OK and RF_OK are linear
# aggregate() applies expm1 only to models marked "log1p" here.
MODEL_SCALE = {"RF": "log1p", "OK": "linear", "RF_OK": "linear"}

def aggregate(model):
    """Write per-pixel ensemble mean and std rasters for one model.

    Every ``{model}_decluster_*.tif`` in ``input_dir`` is read (band 1).
    Models whose ``MODEL_SCALE`` entry is ``"log1p"`` are converted back with
    ``expm1``; non-finite pixels are ignored and negative values are set to 0.
    Mean and population standard deviation are taken per pixel over the runs
    that have a finite value there. For ``"OK"`` and ``"RF_OK"`` the mean is
    additionally clipped to ``[0, 99.5th percentile]``.

    Results go to ``ensemble_mean_{model}.tif`` and ``ensemble_std_{model}.tif``
    in ``output_dir`` as LZW-compressed float32 with nodata ``-9999``.

    Args:
        model: Key of ``MODEL_SCALE`` and prefix of the input file names.

    Returns:
        None. When no raster matches, a warning is printed and nothing is
        written.
    """
    run_paths = sorted(glob.glob(os.path.join(input_dir, f"{model}_decluster_*.tif")))
    if len(run_paths) == 0:
        print(f"‚ö†Ô∏è No rasters for {model}")
        return

    # Grid shape and output profile are taken from the first run.
    with rasterio.open(run_paths[0]) as first:
        profile = first.profile
        shape = (first.height, first.width)

    # Per-pixel running totals over all runs.
    running_sum = np.zeros(shape, dtype=float)
    running_sq = np.zeros(shape, dtype=float)
    n_valid = np.zeros(shape, dtype=int)

    for run_path in run_paths:
        with rasterio.open(run_path) as src:
            band = src.read(1).astype(float)
            if src.nodata is not None:
                band[band == src.nodata] = np.nan

        # back-transform RF only
        if MODEL_SCALE[model] == "log1p":
            band = np.expm1(band)

        # Keep finite pixels only and clamp tiny negatives from interpolation.
        finite = np.isfinite(band)
        vals = band[finite]
        vals = np.where(vals < 0, 0.0, vals)

        running_sum[finite] += vals
        running_sq[finite] += vals ** 2
        n_valid[finite] += 1

    # Pixels never observed in any run stay NaN.
    seen = n_valid > 0
    mean_arr = np.full(shape, np.nan, float)
    std_arr = np.full(shape, np.nan, float)

    counts = n_valid[seen]
    mean_arr[seen] = running_sum[seen] / counts
    # E[x^2] - E[x]^2, floored at 0 to absorb rounding error.
    variance = (running_sq[seen] / counts) - mean_arr[seen] ** 2
    std_arr[seen] = np.sqrt(np.maximum(variance, 0))

    # Optional: trim extreme OK / RF_OK spikes
    if model in ("OK", "RF_OK"):
        upper = np.nanpercentile(mean_arr, 99.5)
        mean_arr = np.clip(mean_arr, 0, upper)

    # Use a conventional nodata for GeoTIFF compatibility
    out_nodata = -9999.0
    for stat, arr in (("mean", mean_arr), ("std", std_arr)):
        prof = profile.copy()
        prof.update(dtype="float32", count=1, compress="lzw", nodata=out_nodata)
        arr_out = np.where(np.isfinite(arr), arr, out_nodata).astype("float32")
        out_path = os.path.join(output_dir, f"ensemble_{stat}_{model}.tif")
        with rasterio.open(out_path, "w", **prof) as dst:
            dst.write(arr_out, 1)

    print(f"‚úÖ {model}: wrote mean/std in linear SOC units")

# Build the ensemble mean/std rasters for each of the three model types.
for m in ("RF", "OK", "RF_OK"):
    aggregate(m)
print("üéØ Done.")


‚úÖ RF: wrote mean/std in linear SOC units
‚úÖ OK: wrote mean/std in linear SOC units
‚úÖ RF_OK: wrote mean/std in linear SOC units
üéØ Done.


In [29]:
import os
import glob
import numpy as np
import rasterio
import pandas as pd

# === Paths ===
raster_dir = "/Users/inesschwartz/Desktop/model/ensemble_preds"  # <-- change if needed
pattern = "*.tif"


def summarize_nodata(arr, nodata_val):
    """Count missing-value markers in ``arr`` and the cells that remain valid.

    Args:
        arr: Float array holding one raster band.
        nodata_val: NoData value declared by the raster, or None / NaN when
            it declares none that can be compared with ``==``.

    Returns:
        dict with the keys ``nodata_value``, ``total_pixels``, ``NaN_count``,
        ``-9999_count``, ``nodata_count``, ``valid_count`` and ``valid_%``.
        The three marker counts are reported independently and may overlap
        (e.g. when the declared NoData value is -9999). ``valid_count`` is
        based on the union of the three masks, so a cell matching several
        markers is subtracted only once.
    """
    # Boolean masks
    is_nan = np.isnan(arr)
    is_neg9999 = arr == -9999
    if nodata_val is not None and not np.isnan(nodata_val):
        is_nodata = arr == nodata_val
    else:
        is_nodata = np.zeros(arr.shape, dtype=bool)

    total = arr.size
    # Union, not sum: summing the counts double-counts -9999 cells whenever
    # -9999 is also the declared NoData value.
    invalid = is_nan | is_neg9999 | is_nodata
    valid = total - int(np.count_nonzero(invalid))

    return {
        "nodata_value": nodata_val,
        "total_pixels": total,
        "NaN_count": int(np.count_nonzero(is_nan)),
        "-9999_count": int(np.count_nonzero(is_neg9999)),
        "nodata_count": int(np.count_nonzero(is_nodata)),
        "valid_count": valid,
        # An empty band has no meaningful percentage.
        "valid_%": 100 * valid / total if total else float("nan"),
    }


# === Initialize summary list ===
records = []

for path in sorted(glob.glob(os.path.join(raster_dir, pattern))):
    name = os.path.basename(path)
    with rasterio.open(path) as src:
        arr = src.read(1).astype("float64")
        nodata_val = src.nodata

    records.append({"raster": name, **summarize_nodata(arr, nodata_val)})

# === Save summary table ===
# Columns are named explicitly so sorting and printing also work when the
# folder holds no raster (empty table instead of a KeyError).
df = pd.DataFrame(
    records,
    columns=[
        "raster", "nodata_value", "total_pixels", "NaN_count",
        "-9999_count", "nodata_count", "valid_count", "valid_%",
    ],
)
df.sort_values("valid_%", ascending=True, inplace=True)

# Print summary nicely
pd.set_option("display.max_rows", None)
print("\nüìä Raster NoData Summary:\n")
print(df[["raster", "nodata_value", "NaN_count", "-9999_count", "nodata_count", "valid_%"]])

# # === Optional: export to CSV ===
# out_csv = os.path.join(raster_dir, "nodata_summary.csv")
# df.to_csv(out_csv, index=False)
# print(f"\nüíæ Summary saved to: {out_csv}")



üìä Raster NoData Summary:

                     raster  nodata_value  NaN_count  -9999_count  \
35   ensemble_std_RF_OK.tif       -9999.0          0       859673   
30     ensemble_mean_OK.tif       -9999.0          0       859673   
31     ensemble_mean_RF.tif       -9999.0          0       859673   
34      ensemble_std_RF.tif       -9999.0          0       859673   
33      ensemble_std_OK.tif       -9999.0          0       859673   
32  ensemble_mean_RF_OK.tif       -9999.0          0       859673   
29     RF_decluster_010.tif           NaN     859673            0   
28     RF_decluster_009.tif           NaN     859673            0   
27     RF_decluster_008.tif           NaN     859673            0   
18  RF_OK_decluster_009.tif           NaN     859673            0   
25     RF_decluster_006.tif           NaN     859673            0   
24     RF_decluster_005.tif           NaN     859673            0   
23     RF_decluster_004.tif           NaN     859673            0   
22  