## Technical check

### Checking that my covariate data prep procedures abide by fao norms

In [1]:
# bioclim -- # replace no data/bad values with no available data
bioclimraster_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/bioclimatic32733/"

# annual_precip2 = -9999
# isothermality_32733 = 0
# max_temp_warmest_month32733 = -3.40282e+38
# annual_mean_temp = -3.40282e+38
# mean_temp_warmest_quarter32733 = -3.40282e+38	
# mean_temp_driest_quarter32733 = -3.40282e+38	
# mean_temp_wettest_quarter32733 = -3.40282e+38	
# precip_coldest_quarter32733 = -9999
# min_temp_coldest_month32733 = -3.40282e+38	
# precip_driest_month32733 = -9999
# precip_driest_quarter32733 = -9999
# precip_seasonality2 = -9999
# precip_wettest_month32733 = -9999
# precip_warmest_quarter32733 = -9999
# temp_annual_range32733 = -3.40282e+38	
#temp_seasonality32733 = -3.40282e+38	
# precip_wettest_quarter32733 = -9999




In [3]:
import rasterio
import numpy as np
import os
import glob
import pandas as pd

# --- Folder with your DEM / terrain rasters ---
terrain_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/DEM_characteristics"

# --- List of files and their type (continuous vs categorical) ---
terrain_layers = {
    "MRVBF.tif": "continuous",
    "aspect_cos.tif": "continuous",
    "aspect_sin.tif": "continuous",
    "aspect.tif": "continuous",
    "aspect_classes.tif": "categorical",
    "slope.tif": "continuous",
    "RLD.tif": "continuous",
    "relief.tif": "continuous",
    "roughness.tif": "continuous",
    "DEM_filledfiltered.tif": "continuous",
    "TWI_300.tif": "continuous",
    "flow_accumulation.tif": "continuous",
    "curve_plan.tif": "continuous",
    "curv_profile.tif": "continuous",
    "ridge_levels.tif": "categorical",
    "valley_depth2.tif": "continuous"
}

# --- Function to check gdal/qgis hisotry to confirm resampling method resampling method ---


KeyboardInterrupt: 

Replacing NoData to not available data:

In the FAO Soil Organic Carbon Mapping Cookbook, when they say “not available data”, they mean that cells should be flagged as missing without assigning an arbitrary placeholder value like -9999 or 255. Instead, the raster should use the NoData field in the raster metadata (so GIS software knows these pixels are unavailable)

What script does:
- Writes everything as float32.
- Stores NaN values directly in the raster.
- Removes the nodata tag → so “not available” is represented only by NaN, not by a fixed code like -9999.

This matches FAO’s guideline: no arbitrary placeholder values, only true missing data markers.

In [4]:
## didn't use

import os
import glob
import numpy as np
import rasterio

# ========== USER CONFIG ==========
raster_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/bioclimatic32733/"
output_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/bioclimatic32733_cleaned/"
os.makedirs(output_folder, exist_ok=True)

# List raster base-names that are categorical (cannot store NaN)
categorical_names = []  # e.g. ["lithology2", "soil_type_id"]

# Map raster names to their sentinel (bad) values
bad_values_dict = {
    "annual_precip2": -9999,
    "isothermality_32733": 0,
    "max_temp_warmest_month32733": -3.40282e+38,
    "annual_mean_temp": -3.40282e+38,
    "mean_temp_warmest_quarter32733": -3.40282e+38,
    "mean_temp_driest_quarter32733": -3.40282e+38,
    "mean_temp_wettest_quarter32733": -3.40282e+38,
    "precip_coldest_quarter32733": -9999,
    "min_temp_coldest_month32733": -3.40282e+38,
    "precip_driest_month32733": -9999,
    "precip_driest_quarter32733": -9999,
    "precip_seasonality2": -9999,
    "precip_wettest_month32733": -9999,
    "precip_warmest_quarter32733": -9999,
    "temp_annual_range32733": -3.40282e+38,
    "temp_seasonality32733": -3.40282e+38,
    "precip_wettest_quarter32733": -9999
}
# =================================

raster_files = glob.glob(os.path.join(raster_folder, "*.tif"))
raster_files.sort()

for raster_path in raster_files:
    raster_name = os.path.splitext(os.path.basename(raster_path))[0]
    bad_value = bad_values_dict.get(raster_name, None)
    out_path = os.path.join(output_folder, os.path.basename(raster_path))

    with rasterio.open(raster_path) as src:
        meta = src.meta.copy()
        data = src.read(1).astype(np.float32)

        # --- Build valid data mask ---
        valid_mask = np.ones_like(data, dtype=bool)

        if bad_value is not None:
            valid_mask &= ~np.isclose(data, bad_value, rtol=1e-05, atol=1e-06)

        # Extra safeguard: exclude extreme float fill values
        valid_mask &= data > -1e20

        # --- Check if raster is categorical ---
        is_categorical = (raster_name in categorical_names) or src.dtypes[0].startswith(("int", "uint"))

        if is_categorical:
            # Keep categorical as int32 with nodata
            nodata_val = -9999
            out_arr = data.astype(np.int32)
            out_arr[~valid_mask] = nodata_val
            meta.update(dtype="int32", nodata=nodata_val, count=1)

            with rasterio.open(out_path, "w", **meta) as dst:
                dst.write(out_arr, 1)

            print(f"{raster_name}: categorical written with nodata={nodata_val}, valid pixels={valid_mask.mean()*100:.2f}%")

        else:
            # Continuous raster: float32 with internal mask
            meta.update(dtype="float32", count=1)
            meta.pop("nodata", None)  # remove nodata to avoid misinterpretation
            mask_uint8 = valid_mask.astype(np.uint8) * 255  # 255 = valid, 0 = unavailable

            with rasterio.open(out_path, "w", **meta) as dst:
                dst.write(data, 1)          # keep original sentinel values
                dst.write_mask(mask_uint8)  # internal mask flags unavailable pixels

            # Quick stats for valid pixels only
            finite_vals = data[valid_mask]
            if finite_vals.size > 0:
                vmin, vmax, vmean = finite_vals.min(), finite_vals.max(), finite_vals.mean()
                print(f"{raster_name}: continuous written with internal mask, valid pixels={valid_mask.mean()*100:.2f}%, min={vmin:.4g}, max={vmax:.4g}, mean={vmean:.4g}")
            else:
                print(f"{raster_name}: continuous written but NO valid pixels!")


annual_mean_temp: continuous written with internal mask, valid pixels=57.43%, min=17.41, max=27.59, mean=21.64
annual_precip2: categorical written with nodata=-9999, valid pixels=57.43%
isothermality_32733: continuous written with internal mask, valid pixels=57.63%, min=45.48, max=77, mean=64.37
max_temp_warmest_month32733: continuous written with internal mask, valid pixels=57.43%, min=24.94, max=35.27, mean=31.04
mean_temp_driest_quarter32733: continuous written with internal mask, valid pixels=57.43%, min=15.38, max=24.5, mean=18.9
mean_temp_warmest_quarter32733: continuous written with internal mask, valid pixels=57.43%, min=19.15, max=29.49, mean=23.31
mean_temp_wettest_quarter32733: continuous written with internal mask, valid pixels=57.43%, min=45, max=77, mean=64.35
min_temp_coldest_month32733: continuous written with internal mask, valid pixels=57.40%, min=4.1, max=20.1, mean=9.602
precip_coldest_quarter32733: categorical written with nodata=-9999, valid pixels=57.43%
precip_d

## resolution check 
### checking resolution abides by fao norms

In [1]:
import rasterio
import glob
import os

# --- Input folder (your DEM rasters) ---
terrain_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/DEM_characteristics"

# --- Get list of DEM files ---
terrain_files = sorted(glob.glob(os.path.join(terrain_folder, "*.tif")))

print("📏 DEM raster resolutions:")
for raster_path in terrain_files:
    with rasterio.open(raster_path) as src:
        res_x, res_y = src.res  # pixel size in map units (usually meters)
        print(f"{os.path.basename(raster_path)} → {res_x} x {res_y}")


📏 DEM raster resolutions:
MRVBF.tif → 300.0 x 300.0
RLD.tif → 300.0 x 300.0
aspect.tif → 30.591177525448536 x 30.59117752441045
aspect_classes.tif → 0.0002777776034428655 x 0.00027777786828493456
aspect_cos.tif → 30.591177525448536 x 30.59117752441045
aspect_sin.tif → 30.591177525448536 x 30.59117752441045
dem_filledfiltered.tif → 300.0 x 300.0
flow_accumulation.tif → 300.0 x 300.0
relief.tif → 300.0 x 300.0
ridge_levels.tif → 300.0 x 300.0
roughness.tif → 300.0 x 300.0
slope.tif → 30.591177525448536 x 30.59117752441045
twi_300m.tif → 300.0 x 300.0
valleydepth2.tif → 300.0 x 300.0


In [2]:
# og dem resolutionimport rasterio
import os

# --- Path to your original DEM ---
og_dem = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_intermediary/dem_utm.tif"

print("📏 Original DEM resolution:")
with rasterio.open(og_dem) as src:
    res_x, res_y = src.res  # pixel size in map units (usually meters)
    print(f"{os.path.basename(og_dem)} → {res_x} x {res_y}")


📏 Original DEM resolution:
dem_utm.tif → 30.591177524925087 x 30.591177524925087


In [None]:
## setting resolution to 300m (the resolution that matches a majority of my covariates) 

import rasterio
from rasterio.warp import calculate_default_transform, reproject, Resampling
import glob
import os

# --- Input and output folders ---
input_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/DEM_characteristics/"
output_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/DEM_characteristics_300m/"

os.makedirs(output_folder, exist_ok=True)

# --- Target resolution ---
target_res = (300, 300)  # (xres, yres) in meters

# --- Loop through all .tif files ---
for raster_path in glob.glob(os.path.join(input_folder, "*.tif")):
    with rasterio.open(raster_path) as src:
        # Define target transform and metadata
        transform, width, height = calculate_default_transform(
            src.crs,
            src.crs,   # same CRS, just resampling
            src.width,
            src.height,
            *src.bounds,
            resolution=target_res
        )

        kwargs = src.meta.copy()
        kwargs.update({
            "crs": src.crs,
            "transform": transform,
            "width": width,
            "height": height
        })

        # Output path
        out_path = os.path.join(output_folder, os.path.basename(raster_path))

        # Perform reprojection / resampling
        with rasterio.open(out_path, "w", **kwargs) as dst:
            for i in range(1, src.count + 1):
                reproject(
                    source=rasterio.band(src, i),
                    destination=rasterio.band(dst, i),
                    src_transform=src.transform,
                    src_crs=src.crs,
                    dst_transform=transform,
                    dst_crs=src.crs,
                    resampling=Resampling.average  # use bilinear for continuous, nearest for categorical
                )

        print(f"✅ Resampled {os.path.basename(raster_path)} → 300 m")

print("🎯 All rasters saved in:", output_folder)

In [6]:
## all covariates resolution check

import rasterio
import geopandas as gpd
import glob
import os

# --- Paths ---
bioclimraster_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/bioclimatic32733/"
ecosyst_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/labelled_ecosystems32733.tif"
soil_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/angola_soil_data2_32733.gpkg"
lithology_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/lithology2.gpkg"
landsurface_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/landsurfaceforms/landsurfaceforms.tif"

print("📏 Checking raster resolutions...")

# --- Bioclimatic rasters ---
bioclim_files = sorted(glob.glob(os.path.join(bioclimraster_folder, "*.tif")))
for raster_path in bioclim_files:
    with rasterio.open(raster_path) as src:
        res_x, res_y = src.res
        print(f"Bioclim {os.path.basename(raster_path)} → {res_x:.2f} x {res_y:.2f}")

# --- Ecosystem raster ---
with rasterio.open(ecosyst_path) as src:
    res_x, res_y = src.res
    print(f"Ecosystem.tif → {res_x:.2f} x {res_y:.2f}")

# --- Landsurface raster ---
with rasterio.open(landsurface_path) as src:
    res_x, res_y = src.res
    print(f"Landsurfaceforms.tif → {res_x:.2f} x {res_y:.2f}")

print("\n🌍 Checking vector datasets (no true resolution)...")

# --- Soil vector ---
soil = gpd.read_file(soil_path)
print(f"Soil vector: {len(soil)} features, bounds = {soil.total_bounds}")

# --- Lithology vector ---
lith = gpd.read_file(lithology_path)
print(f"Lithology vector: {len(lith)} features, bounds = {lith.total_bounds}")


📏 Checking raster resolutions...
Bioclim annual_mean_temp.tif → 18355.02 x 18355.02
Bioclim annual_precip2.tif → 18355.02 x 18355.02
Bioclim isothermality_32733.tif → 18116.64 x 18141.59
Bioclim max_temp_warmest_month32733.tif → 18355.00 x 18355.00
Bioclim mean_temp_driest_quarter32733.tif → 18355.00 x 18355.00
Bioclim mean_temp_warmest_quarter32733.tif → 18355.00 x 18355.00
Bioclim mean_temp_wettest_quarter32733.tif → 18355.00 x 18355.00
Bioclim min_temp_coldest_month32733.tif → 18355.00 x 18355.00
Bioclim precip_coldest_quarter32733.tif → 18355.00 x 18355.00
Bioclim precip_driest_month32733.tif → 18355.00 x 18355.00
Bioclim precip_driest_quarter32733.tif → 18355.00 x 18355.00
Bioclim precip_seasonality2.tif → 18355.02 x 18355.02
Bioclim precip_warmest_quarter32733.tif → 18355.00 x 18355.00
Bioclim precip_wettest_month32733.tif → 18355.00 x 18355.00
Bioclim precip_wettest_quarter32733.tif → 18355.00 x 18355.00
Bioclim temp_annual_range32733.tif → 18355.00 x 18355.00
Bioclim temp_seaso