### trying predictionk grid w different paramenters

In [None]:
import rasterio
from rasterio import sample
import numpy as np
import pandas as pd
import os

# =========================================================
# 1Ô∏è‚É£  TEMPLATE RASTER (reference grid)
# =========================================================
template_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/terraincovs/slope_height.tif"

# =========================================================
# 2Ô∏è‚É£  ALIGNED RASTERS (1 km, EPSG:32733)
# =========================================================
rasters = {
    "slope_height": template_path,
    "annual_precip": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/aligned_1km_auto/annual_precip_aligned1k.tif",
    "precip_wettest_month": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/aligned_1km_auto/precip_wettest_month_aligned1k.tif",
    "tmax_mean": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/bioclimatic32733_cleaned/tmax_mean_mean1k.tif",
    "MRRTF": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/terraincovs/MRRTF.tif",
    "MRVBF": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/terraincovs/MRVBF.tif",
    "standardized_height": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/terraincovs/standardized_height.tif",
    "relief_TRI": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/terraincovs/relief_TRI.tif",
    "terrain_surf_convexity": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/terraincovs/terrain_surf_convexity.tif",
    "terrain_surf_texture": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/terraincovs/terrain_surf_texture.tif",
    "valley_depth": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/terraincovs/valley_depth.tif",
    "faosoil_id": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/angola_soil_gpkg_stuff/angola_soil_1k.tif",
    "cropland_1950": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/aligned_1km_auto/cropland_1950_aligned1k.tif",
    "grazing_1950": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/aligned_1km_auto/grazing_1950_aligned1k.tif",
    "temp_annual_range": "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/aligned_1km_auto/temp_annual_range_aligned1k.tif"
}

# =========================================================
# 3Ô∏è‚É£  BUILD COORDINATE GRID FROM TEMPLATE
# =========================================================
with rasterio.open(template_path) as tmpl:
    transform = tmpl.transform
    nodata = tmpl.nodata
    base = tmpl.read(1)
    height, width = tmpl.height, tmpl.width

    # Build coordinate list
    xs, ys = np.meshgrid(np.arange(width), np.arange(height))
    xs, ys = rasterio.transform.xy(transform, ys, xs)
    coords = list(zip(xs.flat, ys.flat))

    # Mask valid pixels
    base_mask = np.isfinite(base)
    if nodata is not None:
        base_mask &= base != nodata
    coords = [c for c, m in zip(coords, base_mask.flat) if m]

print(f"üìè Prediction grid: {len(coords):,} valid pixels in template.")

# Create base DataFrame
grid = pd.DataFrame(coords, columns=["X_coord", "Y_coord"])

# =========================================================
# 4Ô∏è‚É£  SAMPLE EACH RASTER BY COORDINATE
# =========================================================
for name, path in rasters.items():
    if not os.path.exists(path):
        print(f"‚ö†Ô∏è Missing raster: {path}")
        continue
    with rasterio.open(path) as src:
        vals = np.array(list(sample.sample_gen(src, coords))).squeeze()

        # --- Convert to float to allow NaN replacement safely ---
        vals = vals.astype(float)

        # --- Replace nodata / placeholders with NaN ---
        if src.nodata is not None and not np.isnan(src.nodata):
            vals[vals == src.nodata] = np.nan
        vals[vals == -9999] = np.nan

        grid[name] = vals
        print(f"‚úÖ Sampled {name} ({np.isnan(vals).mean()*100:.2f}% missing)")

# =========================================================
# 5Ô∏è‚É£  OPTIONAL: CLEAN CATEGORICAL VARIABLE
# =========================================================
if "faosoil_id" in grid.columns:
    grid["faosoil_id"] = grid["faosoil_id"].astype("Int32")

# =========================================================
# 6Ô∏è‚É£  SAVE PREDICTION GRID
# =========================================================
out_csv = "/Users/inesschwartz/Desktop/model/covariates_stack_1km_utm33s_fixed.csv"
grid.to_csv(out_csv, index=False)

print(f"\n‚úÖ Wrote prediction covariate stack:\n{out_csv}")
print(f"üßæ Grid shape: {grid.shape[0]} rows √ó {grid.shape[1]} columns")

# =========================================================
# 7Ô∏è‚É£  QUICK MISSING DATA SUMMARY
# =========================================================
print("\nüîç Missing-data summary (% of pixels with NaN):")
print((grid.isna().mean() * 100).round(2).sort_values(ascending=False))

In [3]:
import pandas as pd

# --- Load training dataset (used for model training) ---
decluster_dir = "/Users/inesschwartz/Desktop/model/decluster_runs_aligned"
os.makedirs(decluster_dir, exist_ok=True)

train = pd.read_csv(decluster_dir)

# #drop site info id
# if 'site_info_id' in train.columns:
#     train.drop(columns=['site_info_id'], inplace=True)

# --- Load prediction grid ---
pred_path = "/Users/inesschwartz/Desktop/model/covariates_stack_1km_utm33s_fixed.csv"
pred = pd.read_csv(pred_path, nrows=5)  # read first few rows for speed

# --- Identify feature columns (excluding target + coords) ---
target = "log_soc_stock"
coord_cols = ["X_coord", "Y_coord"]

train_features = [c for c in train.columns if c not in [target] + coord_cols]
pred_features = [c for c in pred.columns if c not in coord_cols]

# --- Compare features ---
missing_in_pred = [f for f in train_features if f not in pred_features]
extra_in_pred = [f for f in pred_features if f not in train_features]
order_diff = [f for f, g in zip(train_features, pred_features) if f != g]

print("‚úÖ TRAIN feature count:", len(train_features))
print("‚úÖ PREDICTION feature count:", len(pred_features))
print("\nüß© Missing in prediction grid:", missing_in_pred or "None")
print("üîÅ Extra in prediction grid:", extra_in_pred or "None")
print("‚ö†Ô∏è Order mismatch:", order_diff or "None")

# --- Optional: enforce same column order before prediction ---
if not missing_in_pred and not extra_in_pred:
    pred = pred[["X_coord", "Y_coord"] + train_features]
    print("\n‚úÖ Columns aligned for prediction use.")
else:
    print("\n‚ö†Ô∏è Fix column mismatches before running model predictions.")


NameError: name 'os' is not defined