### SOC Stock Model Work

The plan: 
- **Gather the covariates** using the SCORPAN model 
- **add harmonized and transformed soc** (response)
- **clean and process training data** -- Take out variables with moderate to high amnts of missing data, impute variables w low to mod
- **Handle categorical variables** --> Convert text/categorical variables (e.g., moist_color_name, friability) to numeric codes or one-hot encoding.
- **Handle numeric predictors**. Scale/normalize if using regression methods sensitive to magnitude (optional, regression kriging is usually OK without scaling).Ensure no extreme outliers that could bias the model.

In [1]:
## Prepare Training Data to estimate SOC stock throughout Angola
import pandas as pd

training_data = pd.read_csv("/Users/inesschwartz/Desktop/training_data_table_final.csv")

In [2]:
training_data.columns

Index(['site_info_id', 'X_coord', 'Y_coord', 'profile', 'district', 'MRVBF',
       'RLD', 'aspect', 'aspect_classes', 'aspect_cos', 'aspect_sin',
       'dem_filledfiltered', 'flow_accumulation', 'relief', 'ridge_levels',
       'roughness', 'slope', 'twi_300m', 'valleydepth2', 'aspect_label',
       'annual_mean_temp', 'annual_precip2', 'isothermality_32733',
       'max_temp_warmest_month32733', 'mean_temp_driest_quarter32733',
       'mean_temp_warmest_quarter32733', 'mean_temp_wettest_quarter32733',
       'min_temp_coldest_month32733', 'precip_coldest_quarter32733',
       'precip_driest_month32733', 'precip_driest_quarter32733',
       'precip_seasonality2', 'precip_warmest_quarter32733',
       'precip_wettest_month32733', 'precip_wettest_quarter32733',
       'temp_annual_range32733', 'temp_seasonality32733', 'landsurface_value',
       'landsurface_label', 'eco_value', 'eco_subclass_clean',
       'eco_class_clean', 'eco_division_clean', 'eco_subclass_code',
       'eco_class

identify numeric vs categorical variables

In [6]:
training_data.columns

Index(['site_info_id', 'X_coord', 'Y_coord', 'profile', 'district', 'MRVBF',
       'RLD', 'aspect', 'aspect_classes', 'aspect_cos', 'aspect_sin',
       'dem_filledfiltered', 'flow_accumulation', 'relief', 'ridge_levels',
       'roughness', 'slope', 'twi_300m', 'valleydepth2', 'annual_mean_temp',
       'annual_precip2', 'isothermality_32733', 'max_temp_warmest_month32733',
       'mean_temp_driest_quarter32733', 'mean_temp_warmest_quarter32733',
       'mean_temp_wettest_quarter32733', 'min_temp_coldest_month32733',
       'precip_coldest_quarter32733', 'precip_driest_month32733',
       'precip_driest_quarter32733', 'precip_seasonality2',
       'precip_warmest_quarter32733', 'precip_wettest_month32733',
       'precip_wettest_quarter32733', 'temp_annual_range32733',
       'temp_seasonality32733', 'landsurface_value', 'landsurface_label',
       'eco_value', 'eco_subclass_clean', 'eco_class_clean',
       'eco_division_clean', 'eco_subclass_code', 'eco_class_code',
       'eco_div

In [12]:
## identify numerica vs categorical variables

import pandas as pd

# Load your dataset
training_data

# Separate numeric and categorical predictors
numeric_cols = training_data.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = training_data.select_dtypes(include=['object']).columns.tolist()

# Remove response variable from predictors
numeric_cols.remove('log_soc_stock')  # response variable

print("Numeric predictors:")
print(numeric_cols)

print("\nCategorical predictors:")
print(categorical_cols)


Numeric predictors:
['MRVBF', 'RLD', 'aspect', 'aspect_classes', 'aspect_cos', 'aspect_sin', 'dem_filledfiltered', 'flow_accumulation', 'relief', 'ridge_levels', 'roughness', 'slope', 'twi_300m', 'valleydepth2', 'annual_mean_temp', 'annual_precip2', 'isothermality_32733', 'max_temp_warmest_month32733', 'mean_temp_driest_quarter32733', 'mean_temp_warmest_quarter32733', 'mean_temp_wettest_quarter32733', 'min_temp_coldest_month32733', 'precip_coldest_quarter32733', 'precip_driest_month32733', 'precip_driest_quarter32733', 'precip_seasonality2', 'precip_warmest_quarter32733', 'precip_wettest_month32733', 'precip_wettest_quarter32733', 'temp_annual_range32733', 'temp_seasonality32733', 'landsurface_value', 'eco_value', 'eco_subclass_code', 'eco_class_code', 'eco_division_code', 'faosoil_id']

Categorical predictors:
['DOMSOI', 'africa_lithology_90m.img.vat_lithology']


In [11]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# ======================================================
# --- Handle missing data and drop unwanted columns ---
# ======================================================

# Drop specific unwanted columns
cols_to_drop = ['profile', 'site_info_id', 'district', 'landsurface_label', 'eco_subclass_clean', 'eco_class_clean', 'eco_division_clean', 'FAOSOIL', 'X_coord', 'Y_coord']
training_data1 = training_data
training_data1.drop(columns=cols_to_drop, inplace=True, errors='ignore')


# ======================================================
# --- Feature selection with categorical + numeric ---
# ======================================================

# Separate target
y = training_data1['log_soc_stock']
X = training_data1.drop(columns=['log_soc_stock'], errors='ignore')

# Automatically detect categorical and numeric columns
all_categorical = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Safeguard: keep only categorical columns with <= 50 unique categories
cat_limit = 50
categorical_cols = [col for col in all_categorical if X[col].nunique() <= cat_limit]

print("Categorical columns included:", categorical_cols)
print("Categorical columns skipped (too many categories):",
      [col for col in all_categorical if col not in categorical_cols])
print("Numeric columns:", numeric_cols)

# Preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ],
    remainder='drop'
)

# Model
model = RandomForestRegressor(random_state=42)

# Pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Fit model
pipeline.fit(X, y)

# Get feature names after OHE
ohe_feature_names = []
if categorical_cols:
    ohe_feature_names = pipeline.named_steps['preprocessor'] \
        .named_transformers_['cat'] \
        .get_feature_names_out(categorical_cols)

all_feature_names = np.concatenate([ohe_feature_names, numeric_cols])

# Feature importances
importances = pipeline.named_steps['model'].feature_importances_
feature_importance = pd.DataFrame({
    'feature': all_feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print("\nTop 25 Features by Importance:")
print(feature_importance.head(25))


Categorical columns included: ['DOMSOI', 'africa_lithology_90m.img.vat_lithology']
Categorical columns skipped (too many categories): []
Numeric columns: ['MRVBF', 'RLD', 'aspect', 'aspect_classes', 'aspect_cos', 'aspect_sin', 'dem_filledfiltered', 'flow_accumulation', 'relief', 'ridge_levels', 'roughness', 'slope', 'twi_300m', 'valleydepth2', 'annual_mean_temp', 'annual_precip2', 'isothermality_32733', 'max_temp_warmest_month32733', 'mean_temp_driest_quarter32733', 'mean_temp_warmest_quarter32733', 'mean_temp_wettest_quarter32733', 'min_temp_coldest_month32733', 'precip_coldest_quarter32733', 'precip_driest_month32733', 'precip_driest_quarter32733', 'precip_seasonality2', 'precip_warmest_quarter32733', 'precip_wettest_month32733', 'precip_wettest_quarter32733', 'temp_annual_range32733', 'temp_seasonality32733', 'landsurface_value', 'eco_value', 'eco_subclass_code', 'eco_class_code', 'eco_division_code', 'faosoil_id']

Top 25 Features by Importance:
                           feature  

In [23]:
## Create prediction grid aligned with soil polygons
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# 1) Load soil polygons
soil_gpkg = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/angola_soil_data2_32733.gpkg"
soil = gpd.read_file(soil_gpkg)

print("Soil CRS:", soil.crs)
print("Soil columns:", soil.columns.tolist())

# 2) Soil bounding box
minx, miny, maxx, maxy = soil.total_bounds
print("Soil bounds:", soil.total_bounds)

# 3) Grid resolution (adjust spacing here, in metres since CRS=32733)
step = 300  # e.g. 3 km grid spacing
gridx = np.arange(minx, maxx, step)
gridy = np.arange(miny, maxy, step)

# 4) Create 2D meshgrid and flatten
xx, yy = np.meshgrid(gridx, gridy)
grid_points = np.column_stack([xx.ravel(), yy.ravel()])

# 5) Build GeoDataFrame
grid_df = gpd.GeoDataFrame(
    {'geometry': [Point(xy) for xy in grid_points]},
    crs=soil.crs
)

print("Number of raw grid points:", len(grid_df))

# 6) Clip grid to soil extent (points that fall inside polygons)
grid_clipped = gpd.sjoin(
    grid_df, 
    soil[['geometry']], 
    how="inner", 
    predicate="within"
).drop(columns=['index_right'])

print("Number of clipped grid points:", len(grid_clipped))

# 7) Spatial join: assign DOMSOI / FAOSOIL to points
grid_with_soil = gpd.sjoin(
    grid_clipped, 
    soil[['geometry','DOMSOI','FAOSOIL']], 
    how="left", 
    predicate="within"
)

print(grid_with_soil.head())


Soil CRS: EPSG:32733
Soil columns: ['SNUM', 'FAOSOIL', 'DOMSOI', 'PHASE1', 'PHASE2', 'MISCLU1', 'MISCLU2', 'PERMAFROST', 'CNTCODE', 'CNTNAME', 'SQKM', 'COUNTRY', 'faosoil_id', 'geometry']
Soil bounds: [ 150953.32714802 7996288.35591393 1492659.74526484 9514684.09008703]
Number of raw grid points: 227136
Number of clipped grid points: 138274
                            geometry  index_right DOMSOI  FAOSOIL
769  POINT (1113953.327 7999288.356)          297     Je  Je39-2a
770  POINT (1116953.327 7999288.356)          297     Je  Je39-2a
771  POINT (1119953.327 7999288.356)          297     Je  Je39-2a
772  POINT (1122953.327 7999288.356)          297     Je  Je39-2a
773  POINT (1125953.327 7999288.356)          297     Je  Je39-2a


## pipeline to map Random Forest

In [2]:
# ================================================
# 1) Load training dataset and fit Random Forest
# ================================================
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load training data
training_data = pd.read_csv("/Users/inesschwartz/Desktop/training_data_table_final.csv")

# Separate target
y = training_data['log_soc_stock']

# Drop only the target + columns you don't want as predictors
X = training_data.drop(columns=[
    'log_soc_stock', 'landsurface_label', 
    'eco_subclass_clean', 'eco_class_clean', 'eco_division_clean', 
    'FAOSOIL'
], errors='ignore')

# Optionally exclude coords and profile from predictors
predictor_cols = [c for c in X.columns if c not in ['X_coord', 'Y_coord', 'site_info_id', 'district']]
X = X[predictor_cols]

# Detect categorical and numeric columns
all_categorical = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# Keep only categorical columns with <=50 unique categories
cat_limit = 50
categorical_cols = [col for col in all_categorical if X[col].nunique() <= cat_limit]

print("Categorical columns included:", categorical_cols)
print("Categorical columns skipped:", [col for col in all_categorical if col not in categorical_cols])
print("Numeric columns:", numeric_cols)

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', 'passthrough', numeric_cols)
    ],
    remainder='drop'
)

# Random Forest model
model = RandomForestRegressor(random_state=42)

# Full pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Fit pipeline
pipeline.fit(X, y)

# ================================================
# 2) Get feature names from training pipeline
# ================================================
ohe_feature_names = []
if categorical_cols:
    ohe_feature_names = pipeline.named_steps['preprocessor'] \
        .named_transformers_['cat'] \
        .get_feature_names_out(categorical_cols)

# Combine with numeric columns
all_feature_names = np.concatenate([ohe_feature_names, numeric_cols])
print("Total features after preprocessing:", len(all_feature_names))

# ================================================
# 3) Create prediction grid in UTM 33S
# ================================================
from shapely.geometry import Point
import geopandas as gpd

# Angola bounding box in UTM meters
minx, miny, maxx, maxy = 130235.0721, 7982540.7382, 1509635.0721, 9514640.7382
res = 300  # grid spacing = raster resolution

gridx = np.arange(minx, maxx + res, res)
gridy = np.arange(miny, maxy + res, res)
xx, yy = np.meshgrid(gridx, gridy)
grid_points = np.column_stack([xx.ravel(), yy.ravel()])

print("Number of prediction points:", len(grid_points))

grid_df = gpd.GeoDataFrame(
    {'geometry': [Point(xy) for xy in grid_points]},
    crs="EPSG:32733"
)




Categorical columns included: ['DOMSOI', 'lithology']
Categorical columns skipped: ['profile']
Numeric columns: ['MRVBF', 'RLD', 'aspect', 'aspect_classes', 'aspect_cos', 'aspect_sin', 'dem_filledfiltered', 'flow_accumulation', 'relief', 'ridge_levels', 'roughness', 'slope', 'twi_300m', 'valleydepth2', 'aspect_label', 'annual_mean_temp', 'annual_precip2', 'isothermality_32733', 'max_temp_warmest_month32733', 'mean_temp_driest_quarter32733', 'mean_temp_warmest_quarter32733', 'mean_temp_wettest_quarter32733', 'min_temp_coldest_month32733', 'precip_coldest_quarter32733', 'precip_driest_month32733', 'precip_driest_quarter32733', 'precip_seasonality2', 'precip_warmest_quarter32733', 'precip_wettest_month32733', 'precip_wettest_quarter32733', 'temp_annual_range32733', 'temp_seasonality32733', 'landsurface_value', 'eco_value', 'eco_subclass_code', 'eco_class_code', 'eco_division_code', 'faosoil_id']
Total features after preprocessing: 75
Number of prediction points: 23491692


In [4]:
import os

out_gpkg = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/prediction_grid_300m.gpkg"

# Remove if exists
if os.path.exists(out_gpkg):
    os.remove(out_gpkg)

# Save GeoPackage
grid_df.to_file(out_gpkg, driver="GPKG")
print("Prediction grid saved successfully!")


Prediction grid saved successfully!


ERROR 1: sqlite_rtree_bl_serialize() failed with database disk image is malformed


In [1]:
# ================================================
# 4) Extract covariate values at prediction grid points
# ================================================
import rasterio
import geopandas as gpd
from rasterio.sample import sample_gen

# Use the prediction grid GeoDataFrame (grid_df) created earlier
grid_coords = [(geom.x, geom.y) for geom in grid_df.geometry]

# Create DataFrame for covariates
X_grid = pd.DataFrame(index=range(len(grid_coords)))

for raster_path in covariate_rasters:
    col_name = raster_to_colname.get(raster_path, os.path.basename(raster_path).split('.')[0])
    with rasterio.open(raster_path) as src:
        # Much faster: pass the full coords list to src.sample once
        values = list(src.sample(grid_coords))
        # Flatten array (each val is an array like [number])
        X_grid[col_name] = np.array(values).ravel()

# Handle categorical variables
categorical_cols_grid = ['labelled_ecosystems32733', 'landsurfaceforms', 'lithology_raster', 'soil_raster']
for col in categorical_cols_grid:
    if col in X_grid.columns:
        X_grid[col] = X_grid[col].astype("Int64").astype(str)  # ensure categorical

# One-hot encode categorical covariates
X_grid_encoded = pd.get_dummies(X_grid, columns=categorical_cols_grid, drop_first=True)

# Align columns with training features
for col in all_feature_names:
    if col not in X_grid_encoded.columns:
        X_grid_encoded[col] = 0

X_grid_encoded = X_grid_encoded[all_feature_names]
print("Prediction grid ready with covariates:", X_grid_encoded.shape)


NameError: name 'grid_df' is not defined

In [None]:
## 4.5 save grid w covariates gpkg

# Merge covariates with the GeoDataFrame grid
grid_with_covariates = grid_df.copy()
grid_with_covariates = grid_with_covariates.reset_index(drop=True)
grid_with_covariates = pd.concat([grid_with_covariates, X_grid_encoded.reset_index(drop=True)], axis=1)

# Save as GeoPackage
grid_with_covariates.to_file(
    "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/output/prediction_grid_covariates.gpkg",
    layer="grid_covariates",
    driver="GPKG"
)
print("Prediction grid with covariates saved as GeoPackage.")


In [None]:
# ================================================
# 5) Predict SOC on the grid and save raster
# ================================================
from rasterio.transform import from_origin

# Predict log SOC
pred_log_soc = pipeline.predict(X_grid_encoded)
pred_soc = np.exp(pred_log_soc)

# Reshape predictions to grid shape
grid_shape = (len(gridy), len(gridx))
pred_grid = pred_soc.reshape(grid_shape)

# Define raster transform
transform = from_origin(minx, maxy, res, res)

# Save as GeoTIFF
output_raster = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/output/angola_soc_map.tif"
with rasterio.open(
    output_raster,
    'w',
    driver='GTiff',
    height=pred_grid.shape[0],
    width=pred_grid.shape[1],
    count=1,
    dtype=pred_grid.dtype,
    crs="EPSG:32733",
    transform=transform,
) as dst:
    dst.write(pred_grid, 1)

print(f"SOC map raster saved to: {output_raster}")