### Making training dataset

1. bring on chemical and physical soil variables (*to what depth??*)

2. extract vectors (soil type, lithology, and ecosystem type) per profile/sample site
3. extract rasters (bioclimatic and DEM/terrain) per profile/sample site
4. merge into one training dataset


In [29]:
## Load data from database

# === 1. Imports ===
import pandas as pd
from sqlalchemy import create_engine

# === 2. Connect to PostgreSQL ===
USER = "inesschwartz"
PASSWORD = "aa4862aa"
HOST = "localhost"
PORT = "5432"
DB = "soils_angola"

engine = create_engine(f"postgresql://{USER}:{PASSWORD}@{HOST}:{PORT}/{DB}")

# === 3. Load tables ===
analyses = pd.read_sql("SELECT * FROM analyses", engine)
morpho = pd.read_sql("SELECT * FROM morpho", engine)
usable_sites = pd.read_sql("SELECT * FROM usable_site_info", engine)
samples = pd.read_sql("SELECT * FROM samples", engine)


# Subset analyses, keep all morpho columns
analyses_subset = analyses.copy()
morpho_subset = morpho.copy()

# Filter only profiles present in site_info
valid_profiles = usable_sites['profile'].dropna().unique()
analyses_filtered = analyses_subset[analyses_subset['profile'].isin(valid_profiles)]
morpho_filtered = morpho_subset[morpho_subset['profile'].isin(valid_profiles)]

# Ensure merge keys have same dtype
analyses_filtered['sample_id'] = analyses_filtered['sample_id'].astype(str)
morpho_filtered['sample_id'] = morpho_filtered['sample_id'].astype(str)

# === 5. Merge analyses and morpho ===
combined_data = pd.merge(
    analyses_filtered,
    morpho_filtered,
    on=['profile', 'morpho_id'],
    how='inner',
    suffixes=('', '_morpho')
)

combined_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  analyses_filtered['sample_id'] = analyses_filtered['sample_id'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  morpho_filtered['sample_id'] = morpho_filtered['sample_id'].astype(str)


Unnamed: 0,lab_sample_id,analysis_id,morpho_id,sample_id,profile,soil_biology_id,eg,thick_sand,fine_sand,silt,...,dry_chroma,moist_color_name,moist_hue,moist_value,moist_chroma,compaction,durability,friability,thick_contents_count,thick_contents_nature
0,458,Bg_113/57_4_1,Bg_113/57_4_1,5040,113_57,,59.0,42.9,13.1,10.9,...,6.0,,,,,,,,,
1,894,Bg_253/57_4_1,Bg_253/57_4_1,5456,253_57,,,,0.0,0.0,...,7.0,,,,,Irregularmente pouco compacto e medianamente c...,"Muito firme (com alguns torroes da mesma cor, ...",,Raro,Saibro quartzoso e de gres
2,7,B_109/62_1_1,B_109/62_1_1,11011,109_62,,1.0,15.3,48.5,17.6,...,2.0,Pardo-avermelhado,5YR,4.0,3.0,Pequena,Brando e brando a ligeiramente duro,,,
3,1549,Cb_20/59_3_1,Cb_20/59_3_1,7277,20_59,,,,,,...,6.0,Pardo-avermelhado,5YR,4.0,4.0,,,,,
4,5965,Mj_27/63_3_1,Mj_27/63_3_1,13391,27_63,,,,,,...,6.0,Laranja a pardo-avermelhado-escuro,5YR,4.0,6.0,Media a grande,Ligeiramente duro,,,


In [30]:
# === 6. Join with site_info ===
merged_final = pd.merge(
    combined_data,
    usable_sites,
    on='profile',
    how='inner'
)

# === 7. Optionally merge 'year' from samples ===
if 'sample_id' in samples.columns and 'year' in samples.columns:
    merged_final = pd.merge(
        merged_final,
        samples[['sample_id', 'year']],
        on='sample_id',
        how='left'
    )

merged_final.head()

Unnamed: 0,lab_sample_id,analysis_id,morpho_id,sample_id,profile,soil_biology_id,eg,thick_sand,fine_sand,silt,...,site_info_id,X_coord,Y_coord,district,geo_features_id,climate_id,topo_id,soil_type_id,geom,year
0,458,Bg_113/57_4_1,Bg_113/57_4_1,5040,113_57,,59.0,42.9,13.1,10.9,...,2400,14.624539,-13.31507,Benguela,197,197,197,112,0101000020E6100000000000A0C33F2D40000000E050A1...,1957.0
1,894,Bg_253/57_4_1,Bg_253/57_4_1,5456,253_57,,,,0.0,0.0,...,2045,15.001662,-12.3417,Benguela,1945,1945,1945,1194,0101000020E6100000000000E0D9002E4000000040F3AE...,1957.0
2,7,B_109/62_1_1,B_109/62_1_1,11011,109_62,,1.0,15.3,48.5,17.6,...,2523,16.778372,-13.896562,Bie,137,137,137,76,0101000020E61000000000006043C73040000000200ACB...,1962.0
3,1549,Cb_20/59_3_1,Cb_20/59_3_1,7277,20_59,,,,,,...,33,12.553629,-4.762017,Cabinda,1288,1288,1288,789,0101000020E610000000000040751B2940000000404E0C...,1959.0
4,5965,Mj_27/63_3_1,Mj_27/63_3_1,13391,27_63,,,,,,...,1060,17.135303,-9.595469,Malanje,2105,2105,2105,1289,0101000020E610000000000040A322314000000060E130...,1963.0


In [31]:
# List of columns to drop
cols_to_drop = [
    'geo_features_id',
    'climate_id',
    'topo_id',
    'soil_type_id',
    'geom',
    'field_sample_code',  
    'depth',
    'al',
    'si',
    'p',
    's',
    'cl',
    'ti',
    'cr',
    'mn',
    'fe',
    'co',
    'ni',
    'cu',
    'zn',
    'arsenic',
    'se',
    'rb',
    'sr',
    'zr',
    'nb',
    'mo',
    'cd',
    'sn',
    'sb',
    'ba',
    'ta',
    'w',
    'pt',
    'au',
    'hg',
    'tl',
    'pb',
    'bi',
    'th',
    'u'
]

# Drop these columns if they exist in the DataFrame
merged_final = merged_final.drop(columns=[col for col in cols_to_drop if col in merged_final.columns])

# Check the remaining columns
list(merged_final.columns)

['lab_sample_id',
 'analysis_id',
 'morpho_id',
 'sample_id',
 'profile',
 'soil_biology_id',
 'eg',
 'thick_sand',
 'fine_sand',
 'silt',
 'clay',
 'eq_hum',
 'atm_1/3',
 'atm_15',
 'caco3',
 'gypsum',
 'free_iron',
 'organic_carbon',
 'total_n',
 'p205',
 'organic_material',
 'ph_h2o',
 'ph_kcl',
 'Ca++',
 'Mg++',
 'Na+',
 'K+',
 'exchangable_bases_sum',
 'cec',
 'v',
 'conductivity',
 'soluble_sodium',
 'Min_<0,002',
 'Min_0,05-0,02',
 'Min_0,2-0,05',
 'Min_2-0,2',
 'porosity',
 'bulk_density',
 'sample_depth',
 'sample_id_morpho',
 'horizon_layer',
 'upper_depth',
 'lower_depth',
 'moisture_degree',
 'root_quantity',
 'root_diameter',
 'texture',
 'structure_type',
 'structure_class',
 'structure_degree',
 'pore_diameter',
 'pore_quantity',
 'pore_shape',
 'dry_color_name',
 'dry_hue',
 'dry_value',
 'dry_chroma',
 'moist_color_name',
 'moist_hue',
 'moist_value',
 'moist_chroma',
 'compaction',
 'durability',
 'friability',
 'thick_contents_count',
 'thick_contents_nature',
 'site

In [32]:
import numpy as np
import pandas as pd
from scipy.interpolate import UnivariateSpline
from collections import Counter

# ------------------------------
# 1) Mass-preserving spline function (robust)
# ------------------------------
def fit_mpspline(depth_top, depth_bottom, values, lam=1.0, target_depths=[0,30]):
    depth_top = np.array(depth_top)
    depth_bottom = np.array(depth_bottom)
    values = np.array(values)
    
    # Remove NaNs
    mask = ~np.isnan(depth_top) & ~np.isnan(depth_bottom) & ~np.isnan(values)
    depth_top = depth_top[mask]
    depth_bottom = depth_bottom[mask]
    values = values[mask]
    
    n = len(values)
    if n == 0:
        return np.nan
    elif n == 1:
        # One horizon → just return the value
        return values[0]
    elif n == 2:
        # Two horizons → thickness-weighted average
        thickness = depth_bottom - depth_top
        return np.average(values, weights=thickness)
    
    # For 3+ horizons → spline
    midpoints = (depth_top + depth_bottom)/2
    sort_idx = np.argsort(midpoints)
    midpoints = midpoints[sort_idx]
    values = values[sort_idx]
    
    # Remove duplicate midpoints
    unique_midpoints, unique_indices = np.unique(midpoints, return_index=True)
    values = values[unique_indices]
    
    # Ensure spline order fits number of points
    k = min(3, len(values)-1)
    if k < 1:
        return np.nan
    
    spline = UnivariateSpline(unique_midpoints, values, s=lam, k=k)
    
    top, bottom = target_depths
    integral = spline.integral(top, bottom)
    return integral / (bottom - top)

# ------------------------------
# 2) Identify variable types
# ------------------------------
drop_cols = [
    'lab_sample_id','analysis_id','morpho_id','sample_id','sample_id_morpho',
    'site_info_id','X_coord','Y_coord','profile',
    'upper_depth','lower_depth','sample_depth'
]

numeric_cols = merged_final.select_dtypes(include=np.number).columns.tolist()
numeric_cols = [col for col in numeric_cols if col not in drop_cols]

categorical_cols = merged_final.select_dtypes(include=['object','category']).columns.tolist()
categorical_cols = [col for col in categorical_cols if col not in drop_cols]

# ------------------------------
# 3) Create harmonized dataset per profile
# ------------------------------
profiles = merged_final['profile'].unique()
harmonized_list = []

for pid in profiles:
    profile = merged_final[merged_final['profile'] == pid].copy()
    if profile.empty:
        continue
    
    profile_harmonized = {'profile': pid}
    
    # Numeric variables
    for col in numeric_cols:
        if col in ['organic_carbon','total_n','bulk_density']:
            profile_harmonized[col] = fit_mpspline(
                depth_top=profile['upper_depth'],
                depth_bottom=profile['lower_depth'],
                values=profile[col],
                lam=1.0,
                target_depths=[0,30]
            )
        else:
            thickness = profile['lower_depth'] - profile['upper_depth']
            valid_mask = ~profile[col].isna() & ~thickness.isna()
            if valid_mask.sum() > 0:
                profile_harmonized[col] = np.average(
                    profile[col][valid_mask], 
                    weights=thickness[valid_mask]
                )
            else:
                profile_harmonized[col] = np.nan
    
    # Categorical variables → mode
    for col in categorical_cols:
        vals = profile[col].dropna()
        if not vals.empty:
            profile_harmonized[col] = vals.mode().iloc[0]
        else:
            profile_harmonized[col] = np.nan
    
    # Metadata
    meta_cols = ['X_coord','Y_coord','district','year']
    for col in meta_cols:
        if col in profile.columns:
            profile_harmonized[col] = profile[col].iloc[0]
    
    harmonized_list.append(profile_harmonized)

# ------------------------------
# 4) Convert to DataFrame
# ------------------------------
profile_0_30cm = pd.DataFrame(harmonized_list)

print(f"Shape: {profile_0_30cm.shape} (profiles x variables)")


Shape: (1488, 62) (profiles x variables)


In [33]:
profile_0_30cm.head()

Unnamed: 0,profile,eg,thick_sand,fine_sand,silt,clay,eq_hum,atm_15,free_iron,organic_carbon,...,moist_value,moist_chroma,compaction,durability,friability,thick_contents_count,thick_contents_nature,district,X_coord,Y_coord
0,113_57,31.933333,46.273684,15.352632,9.594737,28.8,17.318182,,1.646364,0.784545,...,,,,,,,,Benguela,14.624539,-13.31507
1,253_57,,,0.0,0.0,,,,,,...,,,Irregularmente pouco compacto e medianamente c...,"Muito firme (com alguns torroes da mesma cor, ...",,Raro,Saibro quartzoso e de gres,Benguela,15.001662,-12.3417
2,109_62,11.2,12.8625,37.7525,20.93,28.4475,21.1975,9.4,2.28375,0.45116,...,4.0,6.0,Media,,,,,Bie,16.778372,-13.896562
3,20_59,,,,,,,,,,...,,,,,,,,Cabinda,12.553629,-4.762017
4,27_63,,1.0,45.318261,7.532174,46.117391,,,,,...,4.0,6.0,Media a grande,,,,,Malanje,17.135303,-9.595469


In [34]:
# Count unique profiles
n_profiles = profile_0_30cm['profile'].nunique()
print(f"Unique profiles: {n_profiles}")

Unique profiles: 1488


In [35]:
profile_0_30cm.columns

Index(['profile', 'eg', 'thick_sand', 'fine_sand', 'silt', 'clay', 'eq_hum',
       'atm_15', 'free_iron', 'organic_carbon', 'p205', 'organic_material',
       'ph_h2o', 'ph_kcl', 'cec', 'soluble_sodium', 'porosity', 'bulk_density',
       'horizon_layer', 'year', 'soil_biology_id', 'atm_1/3', 'caco3',
       'gypsum', 'total_n', 'Ca++', 'Mg++', 'Na+', 'K+',
       'exchangable_bases_sum', 'v', 'conductivity', 'Min_<0,002',
       'Min_0,05-0,02', 'Min_0,2-0,05', 'Min_2-0,2', 'moisture_degree',
       'root_quantity', 'root_diameter', 'texture', 'structure_type',
       'structure_class', 'structure_degree', 'pore_diameter', 'pore_quantity',
       'pore_shape', 'dry_color_name', 'dry_hue', 'dry_value', 'dry_chroma',
       'moist_color_name', 'moist_hue', 'moist_value', 'moist_chroma',
       'compaction', 'durability', 'friability', 'thick_contents_count',
       'thick_contents_nature', 'district', 'X_coord', 'Y_coord'],
      dtype='object')

## Extract soil data and lithology from .tif to csv per profile

In [37]:
import geopandas as gpd
import pandas as pd
import os
import re  # for regex

# --- Input paths ---
points_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/usable_site_info_epsg32733.gpkg"
soil_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/angola_soil_data2_32733.gpkg"
lithology_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/lithology2.gpkg"


# --- Load data ---
points = gpd.read_file(points_path)
soil = gpd.read_file(soil_path)[["FAOSOIL", "DOMSOI", "faosoil_id", "geometry"]]
lithology = gpd.read_file(lithology_path)[["africa_lithology_90m.img.vat_lithology", "geometry"]]

# --- Ensure CRS matches ---
if points.crs != soil.crs:
    soil = soil.to_crs(points.crs)
if points.crs != lithology.crs:
    lithology = lithology.to_crs(points.crs)

# --- Spatial join ---
points_soil = gpd.sjoin(points, soil, how="left", predicate="intersects")

# Drop index_right from first join
if "index_right" in points_soil.columns:
    points_soil = points_soil.drop(columns=["index_right"])

points_soil_lith = gpd.sjoin(points_soil, lithology, how="left", predicate="intersects")

# Drop index_right from second join
if "index_right" in points_soil_lith.columns:
    points_soil_lith = points_soil_lith.drop(columns=["index_right"])

# --- Remove numbers from lithology column ---
# Example: "1. Carbonate" → "Carbonate"
points_soil_lith["africa_lithology_90m.img.vat_lithology"] = points_soil_lith[
    "africa_lithology_90m.img.vat_lithology"
].apply(lambda x: re.sub(r"^\d+\.\s*", "", str(x)))

# Drop multiple columns
points_soil_lith_clean = points_soil_lith
points_soil_lith_clean = points_soil_lith_clean.drop(columns=["geo_features_id", "climate_id", "topo_id", "soil_type_id"])
points_soil_lith_clean = points_soil_lith_clean.drop(columns=["geometry"])

# --- Save clean CSV ---
output_csv = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soil_samples_with_vector_clean.csv"
points_soil_lith_clean.to_csv(output_csv, index=False)

print("✅ CSV saved at:", output_csv)

✅ CSV saved at: /Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soil_samples_with_vector_clean.csv


In [38]:
points_soil_lith_clean.columns

Index(['site_info_id', 'profile', 'X_coord', 'Y_coord', 'district', 'FAOSOIL',
       'DOMSOI', 'faosoil_id', 'africa_lithology_90m.img.vat_lithology'],
      dtype='object')

In [40]:
# Count unique profiles
n_profiles1 = points_soil_lith_clean['profile'].nunique()
print(f"Unique profiles: {n_profiles1}")

Unique profiles: 1488


## Extract the landsurfaceforms raster values per profile

In [3]:
## extract the “landsurfaceforms” raster values at your sample points

import rasterio
import geopandas as gpd
import pandas as pd

# --- Load your points ---
points_path = "/Volumes/One_Touch/angola_thesis_gis/GIS_Angola/tables/joined_landsurf_test.gpkg"
points = gpd.read_file(points_path)

# --- Load your raster ---
raster_path = "/Volumes/One_Touch/angola_thesis_gis/GIS_Angola/data_processed/landsurfaceforms/landsurfaceforms.tif"
raster = rasterio.open(raster_path)

# --- Ensure points are in the same CRS as the raster ---
if points.crs != raster.crs:
    points = points.to_crs(raster.crs)

# --- Extract raster values at point locations ---
coords = [(x, y) for x, y in zip(points.geometry.x, points.geometry.y)]
points['landsurface_value'] = [val[0] for val in raster.sample(coords)]

# --- Optional: convert codes to descriptive categories ---
# Example: update according to your raster's legend
landsurface_lookup = {
    1: "smooth_plains",
    2: "irregular_plains",
    3: "escarpments",
    4: "hills",
    5: "breaks",
    6: "low_mountains",
    7: "high_mountains/deep_canyons"
}
points['landsurface_label'] = points['landsurface_value'].map(landsurface_lookup)

# --- Save to CSV ---
output_csv = "/Volumes/One_Touch/angola_thesis_gis/GIS_Angola/tables/landsurface_sample_points.csv"
points.drop(columns='geometry').to_csv(output_csv, index=False)

print("CSV saved at:", output_csv)


CSV saved at: /Volumes/One_Touch/angola_thesis_gis/GIS_Angola/tables/landsurface_sample_points.csv


## extract the ecosystem raster values per profile

In [43]:
import rasterio
import numpy as np

ecosyst_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/labelled_ecosystems32733.tif"

with rasterio.open(ecosyst_path) as src:
    band1 = src.read(1)  # read first band
    unique_vals = np.unique(band1[band1 != src.nodata])  # exclude nodata if defined

print("Unique raster values in band 1:", unique_vals)

Unique raster values in band 1: [  0   1   3   4   6   8  51  61  66  67  68  87  88  89  96  97  98 101
 112 113 116 117 118 119 120 122 123 143 151 157 166 167 168 169 196 198
 199 200 201 202 204 211 216 236 241 246 271 304 601 602 800]


In [46]:
import rasterio
import geopandas as gpd
import pandas as pd
from dbfread import DBF

# --- Load your points ---
points_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/usable_site_info_epsg32733.gpkg"
points = gpd.read_file(points_path)

# --- Load raster (ecosystem.tif) ---
ecosyst_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/labelled_ecosystems32733.tif"
raster = rasterio.open(ecosyst_path)

# --- Ensure CRS match ---
if points.crs != raster.crs:
    points = points.to_crs(raster.crs)

# --- Extract raster values (band 1) ---
coords = [(x, y) for x, y in zip(points.geometry.x, points.geometry.y)]
points['eco_value'] = [val[0] for val in raster.sample(coords)]

# --- Read the VAT (.dbf) ---
vat_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_raw/Africa Terrestrial Ecosystems/africa_labeledecosystems_90m/africa_labeledecosystems_90m.tif.vat.dbf"
vat_records = DBF(vat_path, load=True)
vat_df = pd.DataFrame(iter(vat_records))

# --- Build lookup tables ---
subclass_lookup = dict(zip(vat_df['Value'], vat_df['subclass']))
class_lookup    = dict(zip(vat_df['Value'], vat_df['class']))
division_lookup = dict(zip(vat_df['Value'], vat_df['Division']))

# --- Map values to labels ---
points['eco_subclass'] = points['eco_value'].map(subclass_lookup)
points['eco_class']    = points['eco_value'].map(class_lookup)
points['eco_division'] = points['eco_value'].map(division_lookup)

# --- Check for unmatched raster codes ---
unique_raster_vals = set(points['eco_value'].unique())
vat_vals = set(vat_df['Value'].unique())
missing = unique_raster_vals - vat_vals

print("Unique raster values in points:", sorted(unique_raster_vals))
print("Unique VAT values:", sorted(vat_vals)[:20], "...")
print("Missing values (in raster but not in VAT):", missing)

# --- Save results ---
output_csv = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/ecosystem_w_points.csv"
points.drop(columns='geometry').to_csv(output_csv, index=False)

print("CSV saved at:", output_csv)


Unique raster values in points: [-2147483648, 0, 1, 3, 4, 66, 67, 89, 96, 97, 98, 113, 116, 118, 123, 143, 157, 200, 202, 204, 211, 216, 236, 246, 602]
Unique VAT values: [0, 1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 21, 27, 51, 52, 61] ...
Missing values (in raster but not in VAT): {-2147483648}
CSV saved at: /Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/ecosystem_w_points.csv


## bioclimatic rasters to csv for each sample point

In [6]:
#### bioclimatic rasters to csv for each sample point

import rasterio
import geopandas as gpd
import pandas as pd
import glob
import os
import numpy as np

# --- Input paths ---
points_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/usable_site_info_epsg32733.gpkg"
bioclimraster_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/bioclimatic32733/"

# --- Load point data ---
points = gpd.read_file(points_path)

# --- Get list of rasters (assuming .tif files) ---
bioclimraster_files = sorted(glob.glob(os.path.join(bioclimraster_folder, "*.tif")))

# --- Reproject points once (using first raster as reference) ---
if bioclimraster_files:
    with rasterio.open(bioclimraster_files[0]) as src_ref:
        if points.crs != src_ref.crs:
            points = points.to_crs(src_ref.crs)

# --- Extract raster values for each point ---
coords = [(x, y) for x, y in zip(points.geometry.x, points.geometry.y)]

for raster_path in bioclimraster_files:
    name = os.path.splitext(os.path.basename(raster_path))[0]  # e.g. bio1.tif → "bio1"
    with rasterio.open(raster_path) as src:
        values = []
        for val in src.sample(coords):
            # Handle NoData / masked values
            if val is None or np.isnan(val[0]):
                values.append(np.nan)
            else:
                values.append(val[0])
        points[name] = values

# --- Save to CSV (without geometry column) ---
output_csv = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soil_samples_with_bioclim1.csv"
points.drop(columns="geometry").to_csv(output_csv, index=False)

print("✅ CSV saved at:", output_csv)


✅ CSV saved at: /Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soil_samples_with_bioclim1.csv


## terrain / DEM features to csv for each sample point

In [8]:
## terrain / DEM features to csv for each sample point


import rasterio
import geopandas as gpd
import pandas as pd
import glob
import os
import numpy as np

# --- Input paths ---
points_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/usable_site_info_epsg32733.gpkg"
terrain_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/DEM_characteristics"

# --- Load point data ---
points = gpd.read_file(points_path)

# --- Get list of rasters (assuming .tif files) ---
terrain_files = sorted(glob.glob(os.path.join(terrain_folder, "*.tif")))

# --- Reproject points once (using first raster as reference) ---
if terrain_files:
    with rasterio.open(terrain_files[0]) as src_ref:
        if points.crs != src_ref.crs:
            points = points.to_crs(src_ref.crs)

# --- Extract raster values for each point ---
coords = [(x, y) for x, y in zip(points.geometry.x, points.geometry.y)]

for raster_path in terrain_files:
    name = os.path.splitext(os.path.basename(raster_path))[0]
    with rasterio.open(raster_path) as src:
        values = []
        for val in src.sample(coords):
            if val is None or np.isnan(val[0]):
                values.append(np.nan)
            else:
                values.append(val[0])
        points[name] = values
    print(f"✅ Extracted {name}")

# --- Add categorical labels for aspect_classes ---
if "aspect_classes" in points.columns:
    aspect_lookup = {
        1: "N", 2: "NE", 3: "E", 4: "SE",
        5: "S", 6: "SW", 7: "W", 8: "NW"
    }
    points["aspect_label"] = points["aspect_classes"].map(aspect_lookup)

# --- ridge_levels left as numeric (bands 100–500).
#     If you want labels like "100-200m", we can map them here later.

# --- Save to CSV (without geometry column) ---
output_csv = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soil_samples_with_terrain.csv"
points.drop(columns="geometry").to_csv(output_csv, index=False)

print("✅ CSV saved at:", output_csv)


✅ Extracted MRVBF
✅ Extracted RLD
✅ Extracted aspect
✅ Extracted aspect_classes
✅ Extracted aspect_cos
✅ Extracted aspect_sin
✅ Extracted dem_filledfiltered
✅ Extracted flow_accumulation
✅ Extracted relief
✅ Extracted ridge_levels
✅ Extracted roughness
✅ Extracted slope
✅ Extracted twi_300m
✅ Extracted valleydepth2
✅ CSV saved at: /Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soil_samples_with_terrain.csv
