In [None]:
## regression kriging model to calculate soc map

## Inputs required:
# 1) cov_final = "/Users/inesschwartz/Desktop/final_training_dataset.csv"
#    X = cov_final without the log_soc_stock and sample_weight columns (predictors)
#    y = cov_final['log_soc_stock'] (target)
#    w = cov_final['sample_weight'] (per-sample weights)
# 2) 1 km aligned raster stack of covariates for prediction (not yet built)

# Steps to build the model
# 1) Regression step of regression kriging: weighted regression using sample_weight
# 2) Compute regression residuals: residual = log_soc_stock - reg_pred
# 3) Spatial cross-validation
#    - Split points into spatial folds / blocks
#    - Train the model on the training folds with sample_weight
#    - Predict the held-out fold
#    - Compute metrics: weighted RMSE, R², bias
#    - Diagnostic plots: observed vs. predicted, residual spatial patterns
# 4) Residual variogram and ordinary kriging (OK)
# 5) Add kriged residuals
# 6) Regression kriging map
#    - regression_pred + kriged residual
# 7) Uncertainty: kriging variance


In [None]:
# ================================================
# Regression Kriging for SOC mapping
# ================================================

import warnings

import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from pykrige.ok import OrdinaryKriging

# -----------------------------
# 1️⃣ Load final training dataset
# -----------------------------
# Training table holding the covariates, the regression target
# (log_soc_stock) and a per-row weight (sample_weight).
cov_final = pd.read_csv("/Users/inesschwartz/Desktop/final_training_dataset.csv")

# Pull out the target and the weights; every remaining column is used as a
# predictor.
# NOTE(review): if the CSV also carries X_coord / Y_coord (or an ID column),
# those end up in X as regression predictors too — confirm this is intended.
y = cov_final['log_soc_stock']
sample_weight = cov_final['sample_weight']
X = cov_final.drop(columns=['log_soc_stock', 'sample_weight'])

# Predictor names in design-matrix order; reused later to align the
# prediction grid with the training columns.
predictor_cols = list(X.columns)

# -----------------------------
# 2️⃣ Weighted Regression for RK
# -----------------------------
# Trend component of regression kriging: a linear model fitted with the
# per-sample weights (a RandomForestRegressor could be swapped in here).
reg_model = LinearRegression().fit(X, y, sample_weight=sample_weight)

# In-sample trend predictions and the residuals (observed - predicted) that
# the kriging step interpolates later.
reg_pred = reg_model.predict(X)
residuals = y - reg_pred
cov_final['residuals'] = residuals

# Optional diagnostic: distribution of the regression residuals.
fig, ax = plt.subplots(figsize=(6, 4))
ax.hist(residuals, bins=30, alpha=0.7)
ax.set_xlabel("Residual (Observed - Predicted)")
ax.set_ylabel("Number of points")
ax.set_title("Residuals from weighted regression")
plt.show()

# -----------------------------
# 3️⃣ Spatial cross-validation (weighted)
# -----------------------------
# Samples are grouped into square spatial blocks so that all points of a
# block fall into the same fold (GroupKFold never splits a group).
coords_available = all(col in cov_final.columns for col in ['X_coord','Y_coord'])
if coords_available:
    block_size = 10000  # block edge length in coordinate units (e.g., 10 km blocks)
    cov_final['block_x'] = np.digitize(cov_final['X_coord'], np.arange(cov_final['X_coord'].min(), cov_final['X_coord'].max()+block_size, block_size)) - 1
    cov_final['block_y'] = np.digitize(cov_final['Y_coord'], np.arange(cov_final['Y_coord'].min(), cov_final['Y_coord'].max()+block_size, block_size)) - 1
    cov_final['block_id'] = cov_final['block_x'].astype(str) + "_" + cov_final['block_y'].astype(str)
else:
    # Without coordinates every sample is its own group, i.e. the split
    # degenerates to a non-spatial K-fold.
    cov_final['block_id'] = np.arange(len(cov_final))

n_folds = 5
gkf = GroupKFold(n_splits=n_folds)

cv_results = []
for fold, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups=cov_final['block_id'])):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    w_train, w_test = sample_weight.iloc[train_idx], sample_weight.iloc[test_idx]

    # Weighted regression per fold (only the regression trend is validated
    # here, not the kriged residual component).
    fold_model = LinearRegression()
    fold_model.fit(X_train, y_train, sample_weight=w_train)
    y_pred = fold_model.predict(X_test)

    # Metrics: all three use the held-out sample weights so that RMSE, R²
    # and bias describe the same weighted population.
    mse = mean_squared_error(y_test, y_pred, sample_weight=w_test)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred, sample_weight=w_test)
    bias = np.average(y_pred - y_test.to_numpy(), weights=w_test.to_numpy())

    cv_results.append({'fold': fold+1, 'RMSE': rmse, 'R2': r2, 'bias': bias})

    # Diagnostic plot: observed vs predicted with the 1:1 line for reference
    plt.figure(figsize=(6,6))
    sns.scatterplot(x=y_test, y=y_pred, alpha=0.6)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
    plt.xlabel("Observed log(SOC)")
    plt.ylabel("Predicted log(SOC)")
    plt.title(f"Observed vs Predicted - Fold {fold+1}")
    plt.show()

# One row per fold; describe() summarises the spread of the metrics.
cv_df = pd.DataFrame(cv_results)
print("\nSpatial CV Summary:")
print(cv_df.describe())

# -----------------------------
# 4️⃣ Variogram & Ordinary Kriging of residuals
# -----------------------------
# Kriging needs point locations. The cross-validation section tolerates a
# dataset without coordinates, so check here and fail with an explicit
# message instead of an opaque KeyError from the column lookup.
missing_coord_cols = [c for c in ('X_coord', 'Y_coord') if c not in cov_final.columns]
if missing_coord_cols:
    raise KeyError(
        f"Cannot krige residuals: cov_final is missing coordinate column(s) {missing_coord_cols}"
    )

# Coordinates and the regression residuals observed at them
x = cov_final['X_coord'].values
y_coords = cov_final['Y_coord'].values
residuals_values = cov_final['residuals'].values

# Fit OK model: a spherical variogram is fitted to the residuals when the
# object is constructed; verbose / enable_plotting report the fit.
# NOTE(review): samples sharing identical coordinates can make the kriging
# system singular — confirm the training set has no duplicate locations.
ok_model = OrdinaryKriging(
    x, y_coords, residuals_values,
    variogram_model='spherical',
    verbose=True,
    enable_plotting=True
)

# -----------------------------
# 5️⃣ Load prediction grid / raster stack covariates
# -----------------------------
# Grid table: one row per prediction location with its coordinates and
# covariate values.
X_grid = pd.read_csv("/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/prediction_grid_covariates.csv")
grid_x = X_grid['X_coord'].values
grid_y = X_grid['Y_coord'].values

# Ensure columns match training predictors.
# A predictor absent from the grid is filled with 0 so the model can still
# run, but that value is arbitrary for the regression — report it loudly
# instead of silently producing distorted predictions.
missing_cols = [c for c in predictor_cols if c not in X_grid.columns]
if missing_cols:
    warnings.warn(
        f"Prediction grid is missing {len(missing_cols)} training predictor(s); "
        f"filling with 0, predictions may be unreliable: {missing_cols}"
    )
for c in missing_cols:
    X_grid[c] = 0
# Keep only the training predictors, in the training column order.
X_grid = X_grid[predictor_cols]

# Regression prediction on grid (trend component of regression kriging)
reg_grid_pred = reg_model.predict(X_grid)

# -----------------------------
# 6️⃣ Kriging residuals on grid
# -----------------------------
# Interpolate the regression residuals at each grid location; the call
# returns the kriged values and the kriging variance at the same points.
# NOTE(review): on a large grid this single call may need a lot of memory —
# consider chunking the grid or a different backend if it does not fit.
kriged_residuals, kriged_var = ok_model.execute('points', grid_x, grid_y)

# -----------------------------
# 7️⃣ Regression Kriging SOC map
# -----------------------------
# Destination of the final regression kriging table
out_csv = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/prediction_grid_SOC_RK.csv"

# RK prediction = regression trend + kriged residual (presumably on the log
# scale, given the name of the training target).
SOC_rk_pred = reg_grid_pred + kriged_residuals

# Attach the prediction and its uncertainty to the grid covariates.
# kriging_var covers the kriged residual only, not the regression error.
X_grid['log_soc_stock_RK'] = SOC_rk_pred
X_grid['kriging_var'] = kriged_var

# Save final regression kriging map
X_grid.to_csv(out_csv, index=False)
print(f"Regression Kriging SOC map saved to: {out_csv}")
