In [1]:
import pandas as pd
import os
import rasterio
import numpy as np
from rasterio.warp import reproject, Resampling
from rasterio.mask import mask

# Load the SOC sample table from disk.
# NOTE(review): `points2` is not referenced by any later cell visible here (the
# pipeline cell re-reads the same CSV into `points`) — confirm this is still needed.
points2 = pd.read_csv("/Users/inesschwartz/Desktop/soc_samples.csv")

### Standardize all rasters to 1000 m resolution with the same extent and CRS, clip them to the Angola boundary, then extract values at the SOC sample points and export a training CSV.

In [3]:
# ============================================================
# ANGOLA SOC PROJECT — FULL CLEANING & EXTRACTION PIPELINE
# FINAL VERSION (NOVEMBER 2025)
# ============================================================

import os
import numpy as np
import pandas as pd
import geopandas as gpd
import rasterio
from rasterio.mask import mask
from rasterio.warp import reproject, Resampling


# ============================================================
# USER SETTINGS
# ============================================================

# CSV of sample points; read in the extraction step, which uses its X_coord / Y_coord columns.
samples_path = "/Users/inesschwartz/Desktop/soc_samples.csv"

# Polygon layer whose geometries are used to clip every raster.
mask_gpkg = "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/processed_data/buffered1km_angola_adm0.gpkg"

# Raster whose CRS, transform, width and height define the common output grid.
reference_raster = "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/processed_data/buffered_angola_raster_1km.tif"

# Folder that receives the standardized "<name>_1km.tif" files; created here if missing.
clean_raster_dir = "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/processed_data/covariates_1km_cleaned"
os.makedirs(clean_raster_dir, exist_ok=True)

# Destination of the final table (sample points plus one column per raster).
output_csv = "/Users/inesschwartz/Desktop/train_covariates_1km.csv"

# Folder scanned for land-use *.tif files that are appended to rasters_raw.
landuse_dir = "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/LULC_hyde_percent_1km_averaged/"


# ============================================================
# BASE RASTERS
# ============================================================

# Maps each covariate name to the path of its source raster. The name is reused
# as the stem of the standardized output file ("<name>_1km.tif") and as the
# column name in the exported table, so renaming a key renames both.
rasters_raw = {
    # Climate
    "tmin": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/processed_data/tmin_mean.tif",
    "tmax": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/processed_data/tmax_mean.tif",
    "precip_sum": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/processed_data/precip_avg_sum.tif",

    # Bioclim (BIO1–BIO19)
    # range(1, 20) expands to the nineteen keys "bio1" ... "bio19".
    **{
        f"bio{i}": f"/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/bioclimatic/bio{i}.tif"
        for i in range(1, 20)
    },

    # Terrain continuous
    "TWI": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/TWI.tif",
    "flow_accumulation": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/flow_accumulation.tif",
    "slope": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/slope.tif",
    "MRRTF": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/MRRTF.tif",
    "MRVBF": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/MRVBF.tif",
    "ridge_level": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/ridge_level.tif",
    "valley_depth": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/valley_depth.tif",
    "terrain_surf_texture": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/terrain_surf_texture.tif",
    # NOTE(review): the key below is spelled "terrain_surv_conv" while the file is
    # "terrain_surf_conv.tif" — looks like a typo, but the key is the output column
    # and file name, so confirm nothing downstream relies on it before renaming.
    "terrain_surv_conv": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/terrain_surf_conv.tif",
    "midslope_position": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/midslope_position.tif",
    "normalized_height": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/normalized_height.tif",
    "slope_height": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/slope_height.tif",
    "standardized_height": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/standardized_height.tif",
    "slope_length": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/slope_length.tif",
    "hillshade": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/hillshade.tif",
    "aspect_sin": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/aspect_sin.tif",
    "aspect_cos": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/aspect_cos.tif",
    "aspect": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/aspect.tif",
    "total_curve": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/total_curve.tif",
    "plan_curve": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/plan_curve.tif",
    "gen_curve": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/gen_curve.tif",
    "profile_curve": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/profile_curve.tif",
    "min_curve": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/min_curve.tif",
    "max_curve": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/max_curve.tif",
    "DEM": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/terraincovs/DEM.tif",

    # Categorical
    # These four keys must stay in sync with the `categorical` set defined below.
    "landsurface_forms": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/landsurfaceforms.tif",
    "lithology": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/lithology.tif",
    "ecoforms": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/intermediate_data/ecoforms.tif",
    "faosoil_id": "/Volumes/One_Touch/angola_soils_thesis/gis_features_updated/processed_data/soiltype.tif"
}


# ============================================================
# LAND-USE ADDITION
# ============================================================

# Register every land-use GeoTIFF found in `landuse_dir` under the name
# "<cleaned file stem>_pct".
# sorted() makes the registration order — and therefore the column order of the
# exported table — independent of the filesystem's arbitrary listing order.
for f in sorted(os.listdir(landuse_dir)):
    # Skip "._" sidecar files and anything that is not a .tif.
    if f.startswith("._") or not f.endswith(".tif"):
        continue

    # Strip the extension before building the name. Without this, "x.tif" was
    # registered as "x.tif_pct" and written to disk as "x.tif_pct_1km.tif",
    # which forced a later clean-up of the ".tif" fragment in the column names.
    stem = os.path.splitext(f)[0]
    cleaned = stem.replace("conv_", "").replace("_utm33s_1km_percent", "").replace("AD", "")
    varname = cleaned + "_pct"
    rasters_raw[varname] = os.path.join(landuse_dir, f)

print(f"Added {len(rasters_raw)} total raster variables.")


# ============================================================
# CATEGORICAL LIST
# ============================================================

# Names of the covariates that hold class codes rather than measurements.
# Membership in this set is what selects the categorical branch (uint8,
# nodata 255, nearest-neighbour) when a raster is standardized.
categorical = set(
    ["landsurface_forms", "lithology", "ecoforms", "faosoil_id"]
)


# ============================================================
# LOAD MASK + REFERENCE GRID
# ============================================================

# Clip polygons, reprojected to EPSG:32733 and unpacked into a plain list of
# geometries (the form rasterio's mask() takes).
boundary = gpd.read_file(mask_gpkg).to_crs("EPSG:32733")
shapes = list(boundary.geometry)

# Grid definition shared by every standardized raster, copied from the
# reference raster while it is open.
with rasterio.open(reference_raster) as ref:
    dst_crs, dst_transform = ref.crs, ref.transform
    dst_width, dst_height = ref.width, ref.height
    dst_res = ref.res

print("Reference grid loaded.")


# ============================================================
# STANDARDIZATION FUNCTION — 
# ============================================================

def standardize_raster(in_path, out_path, is_cat):
    """
    Clip one raster to the boundary polygons, resample it onto the reference
    1 km grid and write the result as a single-band, LZW-compressed GeoTIFF.

    Parameters
    ----------
    in_path : str
        Path of the source raster. It must carry a CRS and contain exactly
        one band.
    out_path : str
        Path of the GeoTIFF to create.
    is_cat : bool
        True  -> uint8 output, nodata 255, nearest-neighbour resampling.
        False -> float32 output, nodata -9999.0, bilinear resampling.

    Raises
    ------
    ValueError
        If the source has no CRS or more than one band. rasterio's mask()
        may also raise (e.g. when the polygons do not overlap the raster).

    Notes
    -----
    Reads the module-level `boundary`, `dst_crs`, `dst_transform`,
    `dst_width` and `dst_height`.
    """

    # Value written into pixels outside the polygons (and into pixels that
    # were already nodata in the source) during clipping.
    nodata_clip = 255 if is_cat else -9999.0

    # -------------------------------------------------------------
    # 1. CLIP THE RASTER FIRST
    # -------------------------------------------------------------
    with rasterio.open(in_path) as src:
        src_crs = src.crs

        if src_crs is None:
            raise ValueError(f"ERROR: {in_path} has no CRS; cannot clip or reproject it.")

        # The output is written as one band, so refuse multi-band input with a
        # clear message instead of failing later on an array reshape.
        if src.count != 1:
            raise ValueError(
                f"ERROR: {in_path} has {src.count} bands; a single-band raster is required."
            )

        # mask() compares polygon and raster coordinates directly, so the
        # polygons must be expressed in the *source* raster's CRS. When the
        # CRS already matches, this reprojection changes nothing.
        clip_shapes = list(boundary.to_crs(src_crs.to_wkt()).geometry)

        # indexes=1 (an int, not a list) makes mask() return a 2D
        # (rows, cols) array directly, including for clips that are a single
        # row or column wide.
        arr, clipped_transform = mask(
            src,
            shapes=clip_shapes,
            crop=True,
            filled=True,
            nodata=nodata_clip,
            indexes=1
        )

    # -------------------------------------------------------------
    # 2. PREP DESTINATION ARRAY (ALSO 2D)
    # -------------------------------------------------------------
    # NOTE(review): the categorical branch stores codes as uint8 with 255 as
    # nodata — confirm every categorical source only uses codes 0-254.
    if is_cat:
        dtype_out, nodata, res_method = "uint8", 255, Resampling.nearest
    else:
        dtype_out, nodata, res_method = "float32", -9999.0, Resampling.bilinear

    # Start from an all-nodata grid so cells the source never reaches are
    # reported as missing rather than as zero.
    dest = np.full((dst_height, dst_width), nodata, dtype=dtype_out)

    # -------------------------------------------------------------
    # 3. REPROJECT 2D → 2D
    # -------------------------------------------------------------
    reproject(
        source=arr,
        destination=dest,
        src_transform=clipped_transform,
        src_crs=src_crs,
        src_nodata=nodata_clip,
        dst_transform=dst_transform,
        dst_crs=dst_crs,
        dst_nodata=nodata,
        resampling=res_method
    )

    # -------------------------------------------------------------
    # 4. SAVE AS 1-BAND GEOTIFF
    # -------------------------------------------------------------
    meta = {
        "driver": "GTiff",
        "height": dst_height,
        "width": dst_width,
        "count": 1,
        "crs": dst_crs,
        "transform": dst_transform,
        "dtype": dtype_out,
        "nodata": nodata,
        "compress": "lzw"
    }

    with rasterio.open(out_path, "w", **meta) as dst:
        # `dest` was allocated with dtype_out, so no cast is needed here.
        dst.write(dest, 1)


# ============================================================
# RUN STANDARDIZATION
# ============================================================

# name -> path of each standardized raster that was written successfully.
rasters_clean = {}
# name -> error message of each raster that could not be standardized.
failed_rasters = {}

for name, in_path in rasters_raw.items():
    out_path = os.path.join(clean_raster_dir, f"{name}_1km.tif")

    print(f"→ Processing {name}...")
    try:
        is_cat = name in categorical
        standardize_raster(in_path, out_path, is_cat)
    except Exception as e:
        # Best effort: one bad raster must not stop the rest of the batch.
        failed_rasters[name] = str(e)
        print(f"⚠️ FAILED for {name}: {e}")
    else:
        # Register the output only after it has actually been written, so the
        # extraction step never opens a missing file or a stale file left over
        # from an earlier run.
        rasters_clean[name] = out_path
        print(f"✓ Saved: {out_path}")

if failed_rasters:
    print(
        f"⚠️ {len(failed_rasters)} raster(s) failed and will be skipped "
        f"during extraction: {sorted(failed_rasters)}"
    )


# ============================================================
# EXTRACT VALUES TO POINTS
# ============================================================

points = pd.read_csv(samples_path)

# Build point geometries from the X_coord / Y_coord columns, declared as EPSG:32733.
gdf = gpd.GeoDataFrame(
    points,
    geometry=gpd.points_from_xy(points["X_coord"], points["Y_coord"]),
    crs="EPSG:32733"
)

coords = [(pt.x, pt.y) for pt in gdf.geometry]

for name, path in rasters_clean.items():
    # A raster whose standardization failed has no file to sample; skip it
    # instead of aborting the whole extraction.
    if not os.path.exists(path):
        print(f"⚠️ Skipping {name}: {path} does not exist.")
        continue

    print(f"Extracting {name}...")
    with rasterio.open(path) as src:
        # masked=True returns masked arrays, so cells equal to the raster's
        # nodata value are flagged as missing instead of being handed back as
        # the raw sentinel (-9999 / 255). NOTE(review): reads falling outside
        # the grid are expected to come back masked as well — confirm for the
        # installed rasterio version.
        sampled = [v[:1] for v in src.sample(coords, masked=True)]

    if not sampled:
        # No sample points: add an empty column so the table schema is unchanged.
        gdf[name] = np.array([], dtype="float64")
        continue

    values = np.ma.concatenate(sampled)
    missing = np.ma.getmaskarray(values)

    if np.issubdtype(values.dtype, np.integer):
        # Integer (categorical) rasters: nullable integer column, so class
        # codes stay integers and missing cells become <NA>.
        gdf[name] = pd.arrays.IntegerArray(values.filled(0).astype("int64"), missing.copy())
    else:
        # Floating-point rasters: keep the raster dtype, missing cells become NaN.
        gdf[name] = values.filled(np.nan)


# ============================================================
# EXPORT TRAINING CSV
# ============================================================

# The geometry column was built from X_coord / Y_coord, which remain in the
# table, so it is dropped before the attributes are written to disk.
training_table = gdf.drop(columns="geometry")
training_table.to_csv(output_csv, index=False)
print(f"\n=== COMPLETE ===\nFinal dataset saved to: {output_csv}")


Added 62 total raster variables.
Reference grid loaded.
→ Processing tmin...
✓ Saved: /Volumes/One_Touch/angola_soils_thesis/gis_features_updated/processed_data/covariates_1km_cleaned/tmin_1km.tif
→ Processing tmax...
✓ Saved: /Volumes/One_Touch/angola_soils_thesis/gis_features_updated/processed_data/covariates_1km_cleaned/tmax_1km.tif
→ Processing precip_sum...
✓ Saved: /Volumes/One_Touch/angola_soils_thesis/gis_features_updated/processed_data/covariates_1km_cleaned/precip_sum_1km.tif
→ Processing bio1...
✓ Saved: /Volumes/One_Touch/angola_soils_thesis/gis_features_updated/processed_data/covariates_1km_cleaned/bio1_1km.tif
→ Processing bio2...
✓ Saved: /Volumes/One_Touch/angola_soils_thesis/gis_features_updated/processed_data/covariates_1km_cleaned/bio2_1km.tif
→ Processing bio3...
✓ Saved: /Volumes/One_Touch/angola_soils_thesis/gis_features_updated/processed_data/covariates_1km_cleaned/bio3_1km.tif
→ Processing bio4...
✓ Saved: /Volumes/One_Touch/angola_soils_thesis/gis_features_upda

In [4]:
import pandas as pd

csv_path = "/Users/inesschwartz/Desktop/train_covariates_1km.csv"

df = pd.read_csv(csv_path)


def _tidy_column_name(raw_name):
    """Return `raw_name` with the column-name clean-up rules applied in order."""
    tidy = raw_name.replace(".tif", "")        # drop ".tif" fragments inside the name
    tidy = tidy.replace("_pct_pct", "_pct")    # collapse a doubled "_pct" suffix
    tidy = tidy.replace("pct_", "pct")         # remove the underscore that follows "pct"
    return tidy.replace("__", "_")             # collapse double underscores


# Old name -> new name for every column; unchanged names map to themselves.
cleaned_cols = {col: _tidy_column_name(col) for col in df.columns}

df = df.rename(columns=cleaned_cols)

# Write the renamed table next to the original CSV.
out_path = "/Users/inesschwartz/Desktop/train_covariates_1km_cleanvars.csv"
df.to_csv(out_path, index=False)

print("Renamed CSV saved to:", out_path)

print("\nSample of renamed columns:\n", list(df.columns[:30]))


Renamed CSV saved to: /Users/inesschwartz/Desktop/train_covariates_1km_cleanvars.csv

Sample of renamed columns:
 ['Unnamed: 0', 'site_info_id', 'profile', 'X_coord', 'Y_coord', 'log_soc_stock', 'soc_stock_0_30cm', 'district', 'year', 'tmin', 'tmax', 'precip_sum', 'bio1', 'bio2', 'bio3', 'bio4', 'bio5', 'bio6', 'bio7', 'bio8', 'bio9', 'bio10', 'bio11', 'bio12', 'bio13', 'bio14', 'bio15', 'bio16', 'bio17', 'bio18']


In [5]:
# Preview the first rows of the renamed training table.
df.head()

Unnamed: 0.1,Unnamed: 0,site_info_id,profile,X_coord,Y_coord,log_soc_stock,soc_stock_0_30cm,district,year,tmin,...,cropland_avg1940_1960_percent_1km_pct,grazing_avg1940_1960_percent_1km_pct,ir_norice_avg1940_1960_percent_1km_pct,ir_rice_avg1940_1960_percent_1km_pct,pasture_avg1940_1960_percent_1km_pct,rf_norice_avg1940_1960_percent_1km_pct,rf_rice_avg1940_1960_percent_1km_pct,tot_irri_avg1940_1960_percent_1km_pct,tot_rainfed_avg1940_1960_percent_1km_pct,tot_rice_avg1940_1960_percent_1km_pct
0,0,2139,100_56,637881.888723,8608926.0,1.42,3.124415,Huambo,1956.0,12.332118,...,5.450137,53.989456,0.0,0.0,53.989456,5.450137,0.0,0.0,5.450137,0.0
1,1,1927,100_58,385725.69329,8669325.0,1.76,4.829379,Benguela,1958.0,18.68019,...,4.969031,45.26047,0.0,0.0,0.0,4.969031,0.0,0.0,4.969031,0.0
2,2,17,100_59,248538.63635,9488118.0,0.84,1.305039,Cabinda,1959.0,20.888538,...,1.279943,15.747161,0.0,0.0,6.365431,1.279943,0.0,0.0,1.279943,0.0
3,3,1701,100_63,840008.131292,8731220.0,1.17,2.223432,Malanje,1963.0,13.723731,...,0.194317,11.321901,0.0,0.0,0.0,0.194317,0.0,0.0,0.194317,0.0
4,4,1934,101A_58,384135.495798,8666721.0,1.48,3.384241,Benguela,1958.0,18.693819,...,3.274993,44.783375,0.0,0.0,0.0,3.274993,0.0,0.0,3.274993,0.0
