In [None]:
## ANNK model to calculate soc map

## What I need:
# 1) cov_final = "/Users/inesschwartz/Desktop/final_training_dataset.csv"
#      X = cov_final without the log_soc_stock and sample_weight columns (predictors)
#      Y = cov_final['log_soc_stock'] (target)
#      W = cov_final['sample_weight'] (sample weights)
# 2) 1 km aligned raster stack of covariates for prediction (not yet built)

# Steps to build the model
# 1) ANN model training
# 2) Compute regression residuals: residual = SOC - ANN prediction
# 3) Spatial cross-validation
    # Split points into spatial folds / blocks
    # Train model on training folds with sample_weight
    # Predict held-out fold
    # Compute metrics: weighted RMSE, R², bias
    # Diagnostic plots: observed vs predicted, residual spatial patterns
# 4) Residual variogram & ordinary kriging (OK)
# 5) Add kriged residuals
# 6) ANNK: ML_pred + kriged residual
    # i.e. regression_pred + kriged residual
# 7) Uncertainty: model variance + kriging variance


In [None]:
# ================================================
# ANN + Kriging (ANNK) for SOC mapping
# ================================================

import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from pykrige.ok import OrdinaryKriging
import matplotlib.pyplot as plt
import seaborn as sns

# -----------------------------
# 1️⃣ Load final training dataset
# -----------------------------
cov_final = pd.read_csv("/Users/inesschwartz/Desktop/final_training_dataset.csv")

# Pull the target and the per-sample weights out as plain NumPy arrays
y = cov_final['log_soc_stock'].values
sample_weight = cov_final['sample_weight'].values

# Every remaining column is treated as a predictor.
# NOTE(review): if X_coord / Y_coord are present in the CSV they stay in X and are
# fed to the ANN as covariates — confirm this is intended.
X = cov_final.drop(columns=['log_soc_stock', 'sample_weight'])
predictor_cols = list(X.columns)

# Standardize predictors for the ANN; the fitted scaler is reused on the prediction grid
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# -----------------------------
# 2️⃣ Define ANN model
# -----------------------------
def build_ann(input_dim):
    """Build and compile the feed-forward regression network.

    Architecture: Dense(64, relu) -> Dense(32, relu) -> Dense(1, linear),
    compiled with the Adam optimizer and mean-squared-error loss.

    Parameters
    ----------
    input_dim : int
        Number of input features (columns of the predictor matrix).

    Returns
    -------
    A compiled, untrained Keras ``Sequential`` model.
    """
    layer_stack = [
        Dense(64, activation='relu', input_dim=input_dim),
        Dense(32, activation='relu'),
        Dense(1, activation='linear'),
    ]
    network = Sequential(layer_stack)
    network.compile(loss='mse', optimizer='adam')
    return network

# One network input per scaled predictor column
n_predictors = X_scaled.shape[1]
ann_model = build_ann(n_predictors)

# -----------------------------
# 3️⃣ Train ANN with sample weights
# -----------------------------
# Stop once val_loss has not improved for 10 epochs and restore the best weights seen
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# validation_split=0.2 makes Keras hold out 20% of the supplied rows, which is
# what provides the val_loss that early stopping monitors.
fit_options = {
    'sample_weight': sample_weight,
    'validation_split': 0.2,
    'epochs': 200,
    'batch_size': 32,
    'callbacks': [early_stop],
    'verbose': 1,
}
history = ann_model.fit(X_scaled, y, **fit_options)

# In-sample ANN predictions; residual = observed - predicted (kriged later)
y_pred = ann_model.predict(X_scaled).flatten()
residuals = y - y_pred
cov_final['residuals'] = residuals

# -----------------------------
# 4️⃣ Spatial cross-validation
# -----------------------------
# Spatial blocking is only possible when both coordinate columns exist
coords_available = {'X_coord', 'Y_coord'}.issubset(cov_final.columns)
if coords_available:
    block_size = 10000  # 10 km blocks (assumes coordinates are in metres — TODO confirm CRS)
    # Bin each axis into block_size-wide intervals starting at that axis' minimum
    for coord_col, block_col in (('X_coord', 'block_x'), ('Y_coord', 'block_y')):
        coord_values = cov_final[coord_col]
        bin_edges = np.arange(coord_values.min(), coord_values.max() + block_size, block_size)
        cov_final[block_col] = np.digitize(coord_values, bin_edges) - 1
    # One group label per (block_x, block_y) cell
    cov_final['block_id'] = cov_final['block_x'].astype(str) + "_" + cov_final['block_y'].astype(str)
else:
    # No coordinates: every row becomes its own group, so the grouped CV below
    # is no longer spatial
    cov_final['block_id'] = np.arange(len(cov_final))

n_folds = 5
gkf = GroupKFold(n_splits=n_folds)

# Grouped K-fold: all rows sharing a block_id land in the same fold, so a block
# is never split between training and test data.
cv_results = []
cv_splits = gkf.split(X_scaled, y, groups=cov_final['block_id'])
for fold, (train_idx, test_idx) in enumerate(cv_splits):
    # Fit the scaler on the training fold only. Reusing X_scaled (scaled with
    # statistics from ALL rows) would leak held-out information into training.
    fold_scaler = StandardScaler().fit(X.iloc[train_idx])
    X_train = fold_scaler.transform(X.iloc[train_idx])
    X_test = fold_scaler.transform(X.iloc[test_idx])
    y_train, y_test = y[train_idx], y[test_idx]
    w_train, w_test = sample_weight[train_idx], sample_weight[test_idx]

    fold_model = build_ann(X_train.shape[1])
    # validation_split is required here: EarlyStopping monitors 'val_loss', which
    # only exists when validation data is supplied. Without it the callback never
    # triggers and every fold trains for the full 200 epochs, unlike the final model.
    fold_model.fit(
        X_train, y_train,
        sample_weight=w_train,
        validation_split=0.2,
        epochs=200,
        batch_size=32,
        callbacks=[EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)],
        verbose=0,
    )

    y_pred_fold = fold_model.predict(X_test).flatten()

    # All three metrics use the held-out sample weights so they are mutually
    # consistent (previously only the RMSE was weighted).
    mse = mean_squared_error(y_test, y_pred_fold, sample_weight=w_test)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred_fold, sample_weight=w_test)
    bias = np.average(y_pred_fold - y_test, weights=w_test)

    cv_results.append({'fold': fold+1, 'RMSE': rmse, 'R2': r2, 'bias': bias})

    # Diagnostic plot: observed vs predicted with the 1:1 reference line
    fig, ax = plt.subplots(figsize=(6, 6))
    sns.scatterplot(x=y_test, y=y_pred_fold, alpha=0.6, ax=ax)
    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
    ax.set_xlabel("Observed log(SOC)")
    ax.set_ylabel("Predicted log(SOC)")
    ax.set_title(f"Observed vs Predicted - Fold {fold+1}")
    plt.show()

cv_df = pd.DataFrame(cv_results)
print("\nSpatial CV Summary:")
# Summarise the metric columns only; statistics of the fold index are meaningless
print(cv_df[['RMSE', 'R2', 'bias']].describe())

# -----------------------------
# 5️⃣ Kriging of residuals
# -----------------------------
# Sample locations and the ANN residuals observed at them.
# NOTE(review): unlike the CV blocking step there is no fallback here — this
# raises KeyError when X_coord / Y_coord are absent from cov_final.
x, y_coords = (cov_final[axis_col].values for axis_col in ('X_coord', 'Y_coord'))
residual_values = cov_final['residuals'].values

# Ordinary kriging of the residuals with a spherical variogram model;
# verbose / enable_plotting make PyKrige report and plot the fitted variogram
kriging_options = {
    'variogram_model': 'spherical',
    'verbose': True,
    'enable_plotting': True,
}
ok_model = OrdinaryKriging(x, y_coords, residual_values, **kriging_options)

# -----------------------------
# 6️⃣ Load prediction grid / raster stack covariates
# -----------------------------
X_grid = pd.read_csv("/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/prediction_grid_covariates.csv")

# Keep the grid coordinates before the frame is reduced to the predictor columns
grid_x = X_grid['X_coord'].values
grid_y = X_grid['Y_coord'].values

# Ensure columns match training predictors
missing_cols = [c for c in predictor_cols if c not in X_grid.columns]
if missing_cols:
    # Zero-filling is only meaningful for columns where 0 is a valid value
    # (e.g. an indicator column for a class absent from the grid). Make the
    # substitution visible instead of silently fabricating covariates.
    print(f"WARNING: {len(missing_cols)} training predictor(s) missing from the "
          f"prediction grid were filled with 0: {missing_cols}")
for c in missing_cols:
    X_grid[c] = 0

# Same column order as at training time; .copy() yields an independent frame so
# that adding result columns later does not trigger SettingWithCopyWarning.
X_grid = X_grid[predictor_cols].copy()

# Scale prediction grid with the scaler fitted on the training predictors
X_grid_scaled = scaler.transform(X_grid)

# ANN predictions on grid
ann_grid_pred = ann_model.predict(X_grid_scaled).flatten()

# Krige residuals on grid: returns the kriged values and the kriging variance
# at each (grid_x, grid_y) point
kriged_residuals, kriged_var = ok_model.execute('points', grid_x, grid_y)

# -----------------------------
# 7️⃣ Hybrid ANNK SOC map
# -----------------------------
# Hybrid prediction = ANN trend + kriged residual (still on the log_soc_stock scale)
SOC_ANNK = ann_grid_pred + kriged_residuals

# Attach the hybrid prediction and the kriging variance to the grid table
X_grid = X_grid.assign(
    log_soc_stock_ANNK=SOC_ANNK,
    kriging_var=kriged_var,
)

# Save final ANNK SOC map
out_csv = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/prediction_grid_SOC_ANNK.csv"
X_grid.to_csv(out_csv, index=False)
print(f"ANNK SOC map saved to: {out_csv}")
