In [1]:
# === 1. Imports ===
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from sqlalchemy import create_engine

# === 2. Connect to PostgreSQL ===
USER = "inesschwartz"
PASSWORD = "aa4862aa"
HOST = "localhost"
PORT = "5432"
DB = "soils_angola"

engine = create_engine(f"postgresql://{USER}:{PASSWORD}@{HOST}:{PORT}/{DB}")

# === 3. Load tables from Postgres ===
analyses = pd.read_sql("SELECT * FROM analyses", engine)
morpho = pd.read_sql("SELECT * FROM morpho", engine)
samples = pd.read_sql("SELECT * FROM samples", engine)

# === 4. Load usable_sites from GeoPackage (already in UTM 33S) ===
gpkg_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/usable_site_info_epsg32733_clean.gpkg"
usable_sites = gpd.read_file(gpkg_path, layer="usable_sites_clean")

print("usable_sites CRS:", usable_sites.crs)
print("usable_sites bounds:", usable_sites.total_bounds)

# --- Keep only relevant columns for merging ---
usable_sites_subset = usable_sites[['profile', 'X_coord', 'Y_coord', 'site_info_id', 'district']].copy()

# --- Filter analyses and morpho by valid profiles ---
valid_profiles = usable_sites_subset['profile'].dropna().unique()
analyses_filtered = analyses[analyses['profile'].isin(valid_profiles)].copy()
morpho_filtered = morpho[morpho['profile'].isin(valid_profiles)].copy()

# --- Ensure merge keys are strings ---
analyses_filtered.loc[:, 'sample_id'] = analyses_filtered['sample_id'].astype(str)
morpho_filtered.loc[:, 'sample_id'] = morpho_filtered['sample_id'].astype(str)

# --- Merge analyses and morpho ---
combined_data = pd.merge(
    analyses_filtered,
    morpho_filtered,
    on=['profile', 'morpho_id'],
    how='inner',
    suffixes=('', '_morpho')
)

# --- Merge with usable_sites (retain UTM coordinates) ---
merged_final = pd.merge(
    combined_data,
    usable_sites_subset,
    on='profile',
    how='inner'
)

# --- Merge 'year' from samples if present ---
if 'sample_id' in samples.columns and 'year' in samples.columns:
    merged_final = pd.merge(
        merged_final,
        samples[['sample_id', 'year']],
        on='sample_id',
        how='left'
    )

# --- Convert to GeoDataFrame using X/Y coordinates (assume currently in UTM meters from GeoPackage) ---
merged_final_gdf = gpd.GeoDataFrame(
    merged_final,
    geometry=gpd.points_from_xy(merged_final['X_coord'], merged_final['Y_coord']),
    crs="EPSG:32733"  # UTM Zone 33S
)

# convert from degrees to utm
merged_final_gdf = gpd.GeoDataFrame(
    merged_final,
    geometry=gpd.points_from_xy(merged_final['X_coord'], merged_final['Y_coord']),
    crs="EPSG:4326"
).to_crs("EPSG:32733")
merged_final_gdf['X_coord'] = merged_final_gdf.geometry.x
merged_final_gdf['Y_coord'] = merged_final_gdf.geometry.y

# --- Check results ---
print("merged_final_gdf CRS:", merged_final_gdf.crs)
print("merged_final_gdf bounds:", merged_final_gdf.total_bounds)
print("Merged dataset columns:", merged_final_gdf.columns)
print("Number of rows:", merged_final_gdf.shape[0])


usable_sites CRS: EPSG:32733
usable_sites bounds: [ 174544.65507126 7998325.21414117 1482544.10707881 9508186.76580929]
merged_final_gdf CRS: EPSG:32733
merged_final_gdf bounds: [ 174544.60741366 7998325.19151826 1482544.14618381 9508186.76347845]
Merged dataset columns: Index(['lab_sample_id', 'analysis_id', 'morpho_id', 'sample_id', 'profile',
       'soil_biology_id', 'eg', 'thick_sand', 'fine_sand', 'silt',
       ...
       'durability', 'friability', 'thick_contents_count',
       'thick_contents_nature', 'X_coord', 'Y_coord', 'site_info_id',
       'district', 'year', 'geometry'],
      dtype='object', length=108)
Number of rows: 6880


  analyses_filtered.loc[:, 'sample_id'] = analyses_filtered['sample_id'].astype(str)


In [2]:
import pandas as pd

# --- Create a regular DataFrame from the GeoDataFrame ---
merged_final_df = merged_final_gdf.copy()

# --- Ensure X_coord and Y_coord are included (from geometry if needed) ---
if 'geometry' in merged_final_df.columns:
    merged_final_df['X_coord'] = merged_final_df.geometry.x
    merged_final_df['Y_coord'] = merged_final_df.geometry.y

# --- Drop the geometry column ---
merged_final_df = merged_final_df.drop(columns=['geometry'], errors='ignore')

# --- Save to CSV ---
output_csv = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/merged_final.csv"
merged_final_df.to_csv(output_csv, index=False)

print(f"CSV saved at: {output_csv}")
print(f"Rows: {merged_final_df.shape[0]}, Columns: {merged_final_df.shape[1]}")


CSV saved at: /Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/merged_final.csv
Rows: 6880, Columns: 107


In [4]:
# List of columns to drop
cols_to_drop = [
    'geo_features_id',
    'climate_id',
    'topo_id',
    'soil_type_id',
    'geom',
    'field_sample_code',  
    'depth',
    'al',
    'si',
    'p',
    's',
    'cl',
    'ti',
    'cr',
    'mn',
    'fe',
    'co',
    'ni',
    'cu',
    'zn',
    'arsenic',
    'se',
    'rb',
    'sr',
    'zr',
    'nb',
    'mo',
    'cd',
    'sn',
    'sb',
    'ba',
    'ta',
    'w',
    'pt',
    'au',
    'hg',
    'tl',
    'pb',
    'bi',
    'th',
    'u'
]

# Drop these columns if they exist in the DataFrame
merged_final = merged_final.drop(columns=[col for col in cols_to_drop if col in merged_final.columns])

# Check the remaining columns
list(merged_final.columns)

['lab_sample_id',
 'analysis_id',
 'morpho_id',
 'sample_id',
 'profile',
 'soil_biology_id',
 'eg',
 'thick_sand',
 'fine_sand',
 'silt',
 'clay',
 'eq_hum',
 'atm_1/3',
 'atm_15',
 'caco3',
 'gypsum',
 'free_iron',
 'organic_carbon',
 'total_n',
 'p205',
 'organic_material',
 'ph_h2o',
 'ph_kcl',
 'Ca++',
 'Mg++',
 'Na+',
 'K+',
 'exchangable_bases_sum',
 'cec',
 'v',
 'conductivity',
 'soluble_sodium',
 'Min_<0,002',
 'Min_0,05-0,02',
 'Min_0,2-0,05',
 'Min_2-0,2',
 'porosity',
 'bulk_density',
 'sample_depth',
 'sample_id_morpho',
 'horizon_layer',
 'upper_depth',
 'lower_depth',
 'moisture_degree',
 'root_quantity',
 'root_diameter',
 'texture',
 'structure_type',
 'structure_class',
 'structure_degree',
 'pore_diameter',
 'pore_quantity',
 'pore_shape',
 'dry_color_name',
 'dry_hue',
 'dry_value',
 'dry_chroma',
 'moist_color_name',
 'moist_hue',
 'moist_value',
 'moist_chroma',
 'compaction',
 'durability',
 'friability',
 'thick_contents_count',
 'thick_contents_nature',
 'X_co

In [6]:
merged_final_df.columns

Index(['lab_sample_id', 'analysis_id', 'morpho_id', 'sample_id', 'profile',
       'soil_biology_id', 'eg', 'thick_sand', 'fine_sand', 'silt',
       ...
       'compaction', 'durability', 'friability', 'thick_contents_count',
       'thick_contents_nature', 'X_coord', 'Y_coord', 'site_info_id',
       'district', 'year'],
      dtype='object', length=107)

In [3]:
import rasterio
import geopandas as gpd
import pandas as pd
import glob
import os
import numpy as np
from shapely.geometry import Point

# --- Use only coordinates from merged_final_gdf ---
points = pd.DataFrame({
    "X_coord": merged_final_gdf["X_coord"],
    "Y_coord": merged_final_gdf["Y_coord"],
    "site_info_id": merged_final_gdf["site_info_id"],
    "profile": merged_final_gdf["profile"],
    "district": merged_final_gdf["district"]
})

# --- Convert to GeoDataFrame ---
points_gdf = gpd.GeoDataFrame(
    points,
    geometry=gpd.points_from_xy(points["X_coord"], points["Y_coord"]),
    crs=merged_final_gdf.crs
)

In [4]:
points_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 6880 entries, 0 to 6879
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   X_coord       6880 non-null   float64 
 1   Y_coord       6880 non-null   float64 
 2   site_info_id  6880 non-null   object  
 3   profile       6880 non-null   object  
 4   district      6880 non-null   object  
 5   geometry      6880 non-null   geometry
dtypes: float64(2), geometry(1), object(3)
memory usage: 322.6+ KB


In [5]:
# Keep only one row per site_info_id
points_unique = points_gdf.drop_duplicates(subset='site_info_id').copy()  # <-- copy()!!

# Optional: keep only the columns you care about, and copy again to be safe
points_unique = points_unique[['site_info_id', 'X_coord', 'Y_coord', 'profile', 'district', 'geometry']].copy()

print(points_unique.shape)


(1470, 6)


In [6]:
# -----------------------------
# Soil polygon join
# -----------------------------
soil_gpkg = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/angola_soil_gpkg_stuff/angola_soil_data2.gpkg"

# Load soil polygons
soil = gpd.read_file(soil_gpkg)[["faosoil_id", "geometry"]]

# Keep only one row per site in points_gdf
points_unique = points_gdf.drop_duplicates(subset='site_info_id')

# Ensure CRS is defined for both
if points_unique.crs is None:
    points_unique = points_unique.set_crs(epsg=32733)  # adjust if needed
if soil.crs is None:
    soil = soil.set_crs(epsg=32733)

# Reproject soil to match points
if points_unique.crs != soil.crs:
    soil = soil.to_crs(points_unique.crs)

# Spatial join
points_soil = gpd.sjoin(points_unique, soil, how="left", predicate="intersects")

# Keep only the columns you need
points_soil_clean = points_soil[['site_info_id', 'faosoil_id']].copy()

# Save to CSV
points_soil_clean.to_csv(
    "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soil_samples_from_vector_clean.csv",
    index=False
)


In [7]:
points_soil_clean.head()

Unnamed: 0,site_info_id,faosoil_id
0,2400,41.0
1,2045,1.0
2,2523,110.0
3,33,112.0
4,1060,18.0


In [9]:
import rasterio

# -----------------------------
# 2️⃣ Landsurface extraction
landsurface_raster = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/landsurfaceforms_1km.tif"

with rasterio.open(landsurface_raster) as src:
    if points_unique.crs != src.crs:
        points_unique = points_unique.to_crs(src.crs)
    
    coords = [(x, y) for x, y in zip(points_unique.geometry.x, points_unique.geometry.y)]
    
    # Use .loc to assign column safely
    points_unique.loc[:, 'landsurface_value'] = [val[0] for val in src.sample(coords)]

# -----------------------------
# Optional: map numeric codes to labels
# -----------------------------
# landsurface_lookup = {
#     1: "smooth_plains",
#     2: "irregular_plains",
#     3: "escarpments",
#     4: "hills",
#     5: "breaks",
#     6: "low_mountains",
#     7: "high_mountains/deep_canyons"
# }

# points_unique.loc[:, 'landsurface_label'] = points_unique['landsurface_value'].map(landsurface_lookup)

# -----------------------------
# Save clean CSV
# -----------------------------
landsurface_points_clean = points_unique[['site_info_id', 'landsurface_value']].copy()
landsurface_points_clean.to_csv(
    "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/landsurface_sample_points.csv",
    index=False
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [12]:
# -----------------------------
# 3️⃣ Lithology raster extraction
# -----------------------------
litho_raster = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/lithology_1km.tif"
with rasterio.open(litho_raster) as src:
    if points_unique.crs != src.crs:
        points_unique = points_unique.to_crs(src.crs)
    coords = [(x, y) for x, y in zip(points_unique.geometry.x, points_unique.geometry.y)]
    # Use .loc to assign column safely
    points_unique.loc[:, 'litho_value'] = [val[0] for val in src.sample(coords)]

# litho_lookup = {
#     1: "Carbonate", 2: "Karst", 3: "Non-Carbonate",
#     4: "Metasedimentary", 5: "Alkaline Intrusive Volcanic",
#     6: "Silicic", 7: "Metaigneous"
# }
# points_unique.loc['litho_label'] = points_unique['litho_value'].map(litho_lookup)

#save site_info_id and litho value csv
litho_points_clean = points_unique[['site_info_id', 'litho_value']].copy()
litho_points_clean.to_csv(
    "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/litho_sample_points.csv",
    index=False
)

# points_gdf[['site_info_id', 'litho_value', 'litho_label']].to_csv(
#     "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/litho_sample_points.csv",
#     index=False
# )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [14]:
points_unique.head()

Unnamed: 0,X_coord,Y_coord,site_info_id,profile,district,geometry,landsurface_value,litho_value
0,459336.209497,8527991.0,2400,113_57,Benguela,POINT (459336.209 8527990.763),2,2
1,500180.692732,8635661.0,2045,253_57,Benguela,POINT (500180.693 8635661.383),6,5
2,692160.845254,8462997.0,2523,109_62,Bie,POINT (692160.845 8462996.974),2,2
3,228631.694806,9473161.0,33,20_59,Cabinda,POINT (228631.695 9473160.686),2,2
4,734354.85048,8938585.0,1060,27_63,Malanje,POINT (734354.85 8938585.159),1,1


In [18]:
# 4️⃣ Ecosystem raster extraction
import rasterio
import geopandas as gpd
import numpy as np

ecosystem_raster = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/labelled_ecosystems32733_1km.tif"

points_unique = points_unique.copy()

with rasterio.open(ecosystem_raster) as src:
    if points_unique.crs != src.crs:
        points_unique = points_unique.to_crs(src.crs)

    # # Compute raster bounds
    # left, bottom, right, top = src.bounds
    
    # # Keep only points inside raster extent
    # points_inside = points_unique.cx[left:right, bottom:top].copy()

    # Extract raster values
    coords = [(x, y) for x, y in zip(points_unique.geometry.x, points_unique.geometry.y)]
    points_unique.loc[:, 'formation'] = [val[0] for val in src.sample(coords)]

# Save CSV
ecoformation_points_clean = points_unique[['site_info_id', 'formation']].copy()
ecoformation_points_clean.to_csv(
    "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/ecoformation_sample_points.csv",
    index=False
)


In [19]:
ecoformation_points_clean.describe

<bound method NDFrame.describe of      site_info_id  formation
0            2400        113
1            2045         96
2            2523        113
3              33          1
4            1060         96
...           ...        ...
6843          179          3
6854          139          3
6861          128        113
6867          176          1
6874          348        113

[1470 rows x 2 columns]>

In [20]:
points_unique.head()

Unnamed: 0,X_coord,Y_coord,site_info_id,profile,district,geometry,landsurface_value,litho_value,formation
0,459336.209497,8527991.0,2400,113_57,Benguela,POINT (459336.209 8527990.763),2,2,113
1,500180.692732,8635661.0,2045,253_57,Benguela,POINT (500180.693 8635661.383),6,5,96
2,692160.845254,8462997.0,2523,109_62,Bie,POINT (692160.845 8462996.974),2,2,113
3,228631.694806,9473161.0,33,20_59,Cabinda,POINT (228631.695 9473160.686),2,2,1
4,734354.85048,8938585.0,1060,27_63,Malanje,POINT (734354.85 8938585.159),1,1,96


In [22]:
import os, glob
import rasterio

# -----------------------------
# 5️⃣ Bioclimatic rasters extraction (multiple rasters)
# -----------------------------
bioclim_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/bioclimatic32733_cleaned/"
bioclim_files = sorted(glob.glob(os.path.join(bioclim_folder, "*.tif")))

# Extract base names for columns
bioclim_cols = [os.path.splitext(os.path.basename(f))[0] for f in bioclim_files]

# # Work on a minimal copy of points_unique
# points_for_bioclim = points_unique[['site_info_id', 'X_coord', 'Y_coord', 'profile', 'district', 'geometry']].copy()

# Loop through rasters and extract values
for raster_path, colname in zip(bioclim_files, bioclim_cols):
    with rasterio.open(raster_path) as src:
        # Reproject points if needed
        if points_unique.crs != src.crs:
            points_unique = points_unique.to_crs(src.crs)

        #extract values
        coords = [(x, y) for x, y in zip(points_unique.geometry.x, points_unique.geometry.y)]
        
        # Extract values as floats
        values = [float(val[0]) if val[0] is not None else float('nan') for val in src.sample(coords)]
        points_unique[colname] = values

# ✅ Export only site_info_id plus bioclim columns
bioclim_points_clean = points_unique[['site_info_id'] + bioclim_cols].copy()
bioclim_points_clean.to_csv(
    "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soil_samples_with_bioclim1.csv",
    index=False
)


In [23]:
bioclim_points_clean.head()

Unnamed: 0,site_info_id,2mean_temp_coldest_quarter32733,annual_mean_temp,annual_precip2,isothermality_32733,max_temp_warmest_month32733,mean_temp_coldest_quarter32733,mean_temp_driest_quarter32733,mean_temp_warmest_quarter32733,mean_temp_wettest_quarter32733,min_temp_coldest_month32733,precip_coldest_quarter32733,precip_driest_month32733,precip_driest_quarter32733,precip_seasonality2,precip_warmest_quarter32733,precip_wettest_month32733,precip_wettest_quarter32733,temp_annual_range32733,temp_seasonality32733
0,2400,18.200001,20.195724,1239.0,67.431435,28.630302,-3.4028230000000003e+38,18.17577,21.258272,68.204468,9.9,3.0,0.0,1.0,91.0,298.0,297.0,598.0,18.728491,12.426725
1,2045,17.700001,19.793007,1246.0,66.914627,28.080009,-3.4028230000000003e+38,17.69533,20.801792,66.398239,9.6,1.0,0.0,1.0,83.0,356.0,233.0,563.0,18.484959,13.45796
2,2523,16.6,19.878946,1094.0,60.992401,30.474224,-3.4028230000000003e+38,16.667294,21.873129,60.989052,6.2,11.0,0.0,1.0,91.0,214.0,217.0,570.0,24.277786,21.55159
3,33,22.200001,24.979763,1190.0,55.958679,31.16983,-3.4028230000000003e+38,22.184814,26.679775,56.000671,17.9,3.0,0.0,3.0,78.0,541.0,202.0,541.0,13.290055,18.010233
4,1060,20.4,21.02907,1329.0,72.0,30.091702,-3.4028230000000003e+38,20.379465,21.370792,72.16349,10.8,8.0,0.0,8.0,76.0,241.0,225.0,569.0,19.310223,4.925413


In [25]:
import os, glob
import rasterio

# -----------------------------
# 6️⃣ DEM / terrain rasters
# -----------------------------
dem_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/dem_1km/"
dem_files = sorted(glob.glob(os.path.join(dem_folder, "*.tif")))

# Extract base names once for column selection
dem_cols = [os.path.splitext(os.path.basename(f))[0] for f in dem_files]

for raster_path, colname in zip(dem_files, dem_cols):
    with rasterio.open(raster_path) as src:
        if points_unique.crs != src.crs:
            points_unique = points_unique.to_crs(src.crs)
        coords = [(x, y) for x, y in zip(points_unique.geometry.x, points_unique.geometry.y)]
        points_unique[colname] = [val[0] for val in src.sample(coords)]
    print(f"✅ Extracted {colname}")

# Optional: aspect class labels
# if "aspect_classes" in points_gdf.columns:
#     aspect_lookup = {1:"N",2:"NE",3:"E",4:"SE",5:"S",6:"SW",7:"W",8:"NW"}
#     points_gdf["aspect_label"] = points_gdf["aspect_classes"].map(aspect_lookup)

# ✅ Export only site_info_id plus all DEM-derived columns
dem_points_clean = points_unique[['site_info_id'] + dem_cols].copy()
dem_points_clean.to_csv(
    "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soil_samples_with_DEM.csv",
    index=False
)

print("✅ All feature extractions completed")


✅ Extracted MRVBF_1km
✅ Extracted RLD_1km
✅ Extracted aspect_1km
✅ Extracted aspect_cos_1km
✅ Extracted aspect_sin_1km
✅ Extracted dem_filledfiltered_1km
✅ Extracted flow_accumulation_1km
✅ Extracted relief_1km
✅ Extracted ridge_levels_1km
✅ Extracted roughness_1km
✅ Extracted slope_1km
✅ Extracted twi_300m_1km
✅ Extracted valleydepth2_1km
✅ All feature extractions completed


In [26]:
dem_points_clean.head()

Unnamed: 0,site_info_id,MRVBF_1km,RLD_1km,aspect_1km,aspect_cos_1km,aspect_sin_1km,dem_filledfiltered_1km,flow_accumulation_1km,relief_1km,ridge_levels_1km,roughness_1km,slope_1km,twi_300m_1km,valleydepth2_1km
0,2400,3.559784,187.669235,215.393768,0.438118,-0.288529,1116.900635,170716.2,9.05576,1897.40918,9.05576,2.67782,21.252899,780.508606
1,2045,0.38707,500.754242,229.070862,-0.492347,-0.64404,1773.384277,440724.8,140.375885,2150.668945,140.375885,13.268339,20.468124,377.284607
2,2523,8.861734,60.752499,132.237305,-0.13054,0.380688,1583.868164,184287.8,18.363409,1622.848999,18.363409,3.495776,21.380291,38.980824
3,33,0.400248,291.799225,187.118362,-0.224682,-0.106604,-21.849752,1319044.0,136.305099,594.587646,136.305099,4.931882,23.071821,616.437439
4,1060,2.034931,76.119286,208.759354,-0.085453,-0.153432,1163.915771,251142.6,21.164112,1236.079346,21.164112,2.98364,21.475948,72.163498


In [30]:
points_unique.head()

Unnamed: 0,X_coord,Y_coord,site_info_id,profile,district,geometry,landsurface_value,litho_value,formation,2mean_temp_coldest_quarter32733,...,aspect_cos_1km,aspect_sin_1km,dem_filledfiltered_1km,flow_accumulation_1km,relief_1km,ridge_levels_1km,roughness_1km,slope_1km,twi_300m_1km,valleydepth2_1km
0,459336.209497,8527991.0,2400,113_57,Benguela,POINT (459336.209 8527990.763),2,2,113,18.200001,...,0.438118,-0.288529,1116.900635,170716.2,9.05576,1897.40918,9.05576,2.67782,21.252899,780.508606
1,500180.692732,8635661.0,2045,253_57,Benguela,POINT (500180.693 8635661.383),6,5,96,17.700001,...,-0.492347,-0.64404,1773.384277,440724.8,140.375885,2150.668945,140.375885,13.268339,20.468124,377.284607
2,692160.845254,8462997.0,2523,109_62,Bie,POINT (692160.845 8462996.974),2,2,113,16.6,...,-0.13054,0.380688,1583.868164,184287.8,18.363409,1622.848999,18.363409,3.495776,21.380291,38.980824
3,228631.694806,9473161.0,33,20_59,Cabinda,POINT (228631.695 9473160.686),2,2,1,22.200001,...,-0.224682,-0.106604,-21.849752,1319044.0,136.305099,594.587646,136.305099,4.931882,23.071821,616.437439
4,734354.85048,8938585.0,1060,27_63,Malanje,POINT (734354.85 8938585.159),1,1,96,20.4,...,-0.085453,-0.153432,1163.915771,251142.6,21.164112,1236.079346,21.164112,2.98364,21.475948,72.163498


In [34]:
# Merge faosoil_id from points_soil_clean into points_unique
points_unique = points_unique.merge(
    points_soil_clean[['site_info_id', 'faosoil_id']],
    on='site_info_id',
    how='inner'  
)


In [35]:
points_unique.columns

Index(['X_coord', 'Y_coord', 'site_info_id', 'profile', 'district', 'geometry',
       'landsurface_value', 'litho_value', 'formation',
       '2mean_temp_coldest_quarter32733', 'annual_mean_temp', 'annual_precip2',
       'isothermality_32733', 'max_temp_warmest_month32733',
       'mean_temp_coldest_quarter32733', 'mean_temp_driest_quarter32733',
       'mean_temp_warmest_quarter32733', 'mean_temp_wettest_quarter32733',
       'min_temp_coldest_month32733', 'precip_coldest_quarter32733',
       'precip_driest_month32733', 'precip_driest_quarter32733',
       'precip_seasonality2', 'precip_warmest_quarter32733',
       'precip_wettest_month32733', 'precip_wettest_quarter32733',
       'temp_annual_range32733', 'temp_seasonality32733', 'MRVBF_1km',
       'RLD_1km', 'aspect_1km', 'aspect_cos_1km', 'aspect_sin_1km',
       'dem_filledfiltered_1km', 'flow_accumulation_1km', 'relief_1km',
       'ridge_levels_1km', 'roughness_1km', 'slope_1km', 'twi_300m_1km',
       'valleydepth2_1km', '

SAVE points_unique

In [36]:
import pandas as pd
from functools import reduce


# --- Save final table ---
output_csv = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/training_data_table_final.csv"
points_unique.to_csv(output_csv, index=False)

print(f"✅ Covariate dataset created: {points_unique.shape[0]} rows x {points_unique.shape[1]} columns")

✅ Covariate dataset created: 1470 rows x 42 columns


ADD harmonized SOC dataset

In [37]:
import pandas as pd

# Load SOC stock CSV
soc_stock = pd.read_csv("/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/harmonized_soc_with_log.csv")

# Keep only profile and log_soc_stock, drop other columns
soc_stock_clean = soc_stock[['profile', 'log_soc_stock']]

# Load your training dataset
train_df_final = pd.read_csv("/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/training_data_table_final.csv")

# Merge training dataset with SOC stock by profile
train_with_soc = pd.merge(train_df_final, soc_stock_clean, on='profile', how='inner')

# Optional: save the merged dataset to CSV
train_with_soc.to_csv("/Users/inesschwartz/Desktop/training_data_with_log_soc.csv", index=False)

print("Merged training dataset created with log-transformed SOC stock as response variable.")
train_with_soc.head()


Merged training dataset created with log-transformed SOC stock as response variable.


Unnamed: 0,X_coord,Y_coord,site_info_id,profile,district,geometry,landsurface_value,litho_value,formation,2mean_temp_coldest_quarter32733,...,dem_filledfiltered_1km,flow_accumulation_1km,relief_1km,ridge_levels_1km,roughness_1km,slope_1km,twi_300m_1km,valleydepth2_1km,faosoil_id,log_soc_stock
0,459336.209497,8527991.0,2400,113_57,Benguela,POINT (459336.209497001 8527990.762870297),2,2,113,18.200001,...,1116.9006,170716.2,9.05576,1897.4092,9.05576,2.67782,21.2529,780.5086,41.0,1.643861
1,692160.845254,8462997.0,2523,109_62,Bie,POINT (692160.845254297 8462996.974415973),2,2,113,16.6,...,1583.8682,184287.77,18.363409,1622.849,18.363409,3.495776,21.380291,38.980824,110.0,1.06804
2,604679.723785,8395856.0,2615,410_55,Huila,POINT (604679.7237846216 8395856.279717674),2,2,113,16.5,...,1419.5433,8284751.0,127.71323,1573.6846,127.71323,2.879179,23.242464,154.14119,47.0,0.440282
3,736205.31619,8452868.0,2537,118_62,Bie,POINT (736205.316190272 8452867.693218533),1,1,113,16.299999,...,1506.2876,-49909.29,14.437837,1602.216,14.437837,2.014339,21.308989,95.92831,104.0,0.986052
4,785664.442674,8728324.0,1714,111C_63,Malanje,POINT (785664.4426735085 8728324.346077582),1,1,97,18.700001,...,1232.9146,238993.12,25.479649,1413.0155,25.479649,2.751681,21.013489,180.10092,40.0,1.154745


### Adjust column names

In [38]:
columns_renamed = {
    "aspect_1km": "aspect",
    "aspect_cos_1km": "aspect_cos",
    "aspect_sin_1km": "aspect_sin",
    "MRVBF_1km": "MRVBF",
    "RLD_1km": "RLD",
    "dem_filledfiltered_1km": "DEM",
    "flow_accumulation_1km": "flow_accumulation",
    "relief_1km": "relief",
    "ridge_levels_1km": "ridge_levels",
    "roughness_1km": "roughness",  # corrected key
    "slope_1km": "slope",
    "twi_300m_1km": "TWI",
    "valleydepth2_1km": "valleydepth",

    "2mean_temp_coldest_quarter32733": "mean_temp_coldest_quarter",
    "annual_precip2": "annual_precip",
    "isothermality_32733": "isothermality",
    "max_temp_warmest_month32733": "max_temp_warmest_month",
    # duplicate dropped: "mean_temp_coldest_quarter32733"
    "mean_temp_driest_quarter32733": "mean_temp_driest_quarter",
    "mean_temp_warmest_quarter32733": "mean_temp_warmest_quarter",
    "mean_temp_wettest_quarter32733": "mean_temp_wettest_quarter",
    "min_temp_coldest_month32733": "min_temp_coldest_month",
    "precip_coldest_quarter32733": "precip_coldest_quarter",
    "precip_driest_month32733": "precip_driest_month",
    "precip_warmest_quarter32733": "precip_warmest_quarter",
    "precip_driest_quarter32733": "precip_driest_quarter",
    "precip_wettest_month32733": "precip_wettest_month",
    "precip_seasonality2": "precip_seasonality",
    "temp_annual_range32733": "temp_annual_range",
    "temp_seasonality32733": "temp_seasonality",
    "precip_wettest_quarter32733": "precip_wettest_quarter"
}

columns_to_drop = [
    "mean_temp_coldest_quarter32733",  # duplicate column
    "landsurface_label",
    "litho_label",
]

# Rename + drop
train_with_soc = (
    train_with_soc
        .rename(columns=columns_renamed)
        .drop(columns=columns_to_drop, errors="ignore")
)

# Save to CSV
output_path = "/Users/inesschwartz/Desktop/training_data.csv"
train_with_soc.to_csv(output_path, index=False)

print(f"Cleaned DataFrame saved to: {output_path}")


Cleaned DataFrame saved to: /Users/inesschwartz/Desktop/training_data.csv
