### Making training dataset

0. Check out data I am working with pre 0-30cm profile harmonization

1. (not doing) bring on chemical and physical soil variables (*harmonized to 0-30cm per profile via mass preserving spline, mode, and categorical majority*)

2. extract vectors (soil type) per profile/sample site
3. extract rasters (bioclimatic, precipitation, temp max and min, ecosystem, lithology, landsurface type, land cover % and DEM/terrain) per profile/sample site
4. merge into one training dataset by profile (or site_id)


## actual table prep

In [1]:
# === 1. Imports ===
import pandas as pd
import geopandas as gpd
from sqlalchemy import create_engine

# === 2. Connect to PostgreSQL ===
USER = "inesschwartz"
PASSWORD = "aa4862aa"
HOST = "localhost"
PORT = "5432"
DB = "soils_angola"

engine = create_engine(f"postgresql://{USER}:{PASSWORD}@{HOST}:{PORT}/{DB}")

# === 3. Load tables from Postgres ===
analyses = pd.read_sql("SELECT * FROM analyses", engine)
morpho = pd.read_sql("SELECT * FROM morpho", engine)
samples = pd.read_sql("SELECT * FROM samples", engine)

# === 4. Load usable_sites from GeoPackage ===
gpkg_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/usable_site_info_epsg32733_clean.gpkg"
usable_sites = gpd.read_file(gpkg_path, layer="usable_sites_clean")

print("Original usable_sites CRS:", usable_sites.crs)
print("Bounds:", usable_sites.total_bounds)

# --- Keep only relevant columns ---
usable_sites_subset = (
    usable_sites[['site_info_id', 'profile', 'X_coord', 'Y_coord', 'district']]
    .dropna()
    .drop_duplicates()
    .copy()
)

# --- Step 1: Set correct CRS (degrees) without changing coordinates ---
usable_sites_subset = gpd.GeoDataFrame(
    usable_sites_subset,
    geometry=gpd.points_from_xy(usable_sites_subset['X_coord'], usable_sites_subset['Y_coord']),
    crs="EPSG:4326"  # coordinates are in degrees
)

# --- Step 2: Reproject to UTM 33S (meters) ---
usable_sites_subset = usable_sites_subset.to_crs("EPSG:32733")

# --- Step 3: Update X_coord/Y_coord in meters from geometry ---
usable_sites_subset['X_coord'] = usable_sites_subset.geometry.x
usable_sites_subset['Y_coord'] = usable_sites_subset.geometry.y

# --- Check results ---
print("✅ Reprojected usable_sites_subset CRS:", usable_sites_subset.crs)
print("Bounds (meters):", usable_sites_subset.total_bounds)
print("Columns:", usable_sites_subset.columns.tolist())
print("Number of rows:", len(usable_sites_subset))
print(usable_sites_subset.head())


Original usable_sites CRS: EPSG:32733
Bounds: [ 174544.65507126 7998325.21414117 1482544.10707881 9508186.76580929]
✅ Reprojected usable_sites_subset CRS: EPSG:32733
Bounds (meters): [ 174544.60741366 7998325.19151826 1482544.14618381 9508186.76347845]
Columns: ['site_info_id', 'profile', 'X_coord', 'Y_coord', 'district', 'geometry']
Number of rows: 1474
  site_info_id profile        X_coord       Y_coord district  \
0         2770    1_57  195025.879923  8.315069e+06   Namibe   
1           48    1_59  231130.898920  9.461556e+06  Cabinda   
2          881    1_63  728708.842390  8.974120e+06  Malanje   
3         2675   10_54  440331.375297  8.350151e+06    Huila   
4         2698   10_55  333872.178710  8.343614e+06    Huila   

                         geometry  
0   POINT (195025.88 8315068.551)  
1   POINT (231130.899 9461556.11)  
2  POINT (728708.842 8974120.108)  
3  POINT (440331.375 8350150.631)  
4  POINT (333872.179 8343613.753)  


In [2]:
usable_sites_subset.head

<bound method NDFrame.head of      site_info_id profile        X_coord       Y_coord      district  \
0            2770    1_57  195025.879923  8.315069e+06        Namibe   
1              48    1_59  231130.898920  9.461556e+06       Cabinda   
2             881    1_63  728708.842390  8.974120e+06       Malanje   
3            2675   10_54  440331.375297  8.350151e+06         Huila   
4            2698   10_55  333872.178710  8.343614e+06         Huila   
...           ...     ...            ...           ...           ...   
1469         1689   99_63  845428.695068  8.739847e+06       Malanje   
1470         1246  99C_62  496081.085524  8.886510e+06    Cuanza Sul   
1471         1485  99C_63  777833.841368  8.794932e+06       Malanje   
1472          348   9C_60  576475.085504  9.208450e+06  Uige e Zaire   
1473         1213   9C_63  648834.637380  8.895791e+06       Malanje   

                            geometry  
0      POINT (195025.88 8315068.551)  
1      POINT (231130.899 94

In [3]:
# --- Merge 'year' from samples if present ---
if {'profile', 'year'}.issubset(samples.columns):
    # merge 'year' from samples by 'profile' (many samples can share one profile)
    usable_sites_subset = pd.merge(
        usable_sites_subset,
        samples[['profile', 'sample_id', 'year']],
        on='profile',
        how='left'  # keep all from usable_sites_subset
    )

# === 6. Convert to GeoDataFrame (ensure UTM 33S CRS) ===
usable_sites_subset = gpd.GeoDataFrame(
    usable_sites_subset,
    geometry=gpd.points_from_xy(usable_sites_subset['X_coord'], usable_sites_subset['Y_coord']),
    crs="EPSG:32733"
)

# === 7. Keep only the needed columns ===
final_cols = ["site_info_id", "profile", "sample_id", "X_coord", "Y_coord", "district", "year"]
usable_sites_subset = usable_sites_subset[final_cols + ['geometry']]

# === 8. Check results ===
print("CRS:", usable_sites_subset.crs)
print("Bounds:", usable_sites_subset.total_bounds)
print("Columns:", usable_sites_subset.columns)
print("Rows:", usable_sites_subset.shape[0])

# === 9. (Optional) Save output ===
# out_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/merged_sites_samples.gpkg"
# usable_sites_subset.to_file(out_path, layer="merged_sites_samples", driver="GPKG")
# print(f"Saved merged data to: {out_path}")

points = usable_sites_subset ## rename back to points bc that what old codes use

CRS: EPSG:32733
Bounds: [ 174544.60741366 7998325.19151826 1482544.14618381 9508186.76347845]
Columns: Index(['site_info_id', 'profile', 'sample_id', 'X_coord', 'Y_coord',
       'district', 'year', 'geometry'],
      dtype='object')
Rows: 5983


In [4]:
usable_sites_subset[usable_sites_subset['year'].isna()]


Unnamed: 0,site_info_id,profile,sample_id,X_coord,Y_coord,district,year,geometry
15,2675,10_54,,440331.375297,8.350151e+06,Huila,,POINT (440331.375 8350150.631)
16,2698,10_55,,333872.178710,8.343614e+06,Huila,,POINT (333872.179 8343613.753)
17,2139,100_56,,637881.888723,8.608926e+06,Huambo,,POINT (637881.889 8608925.636)
41,2138,101_56,,640772.919472,8.610186e+06,Huambo,,POINT (640772.919 8610186.221)
59,2486,102_55,,514720.344969,8.489733e+06,Huila,,POINT (514720.345 8489733.445)
...,...,...,...,...,...,...,...,...
5889,2212,95_69,,307014.272888,8.590995e+06,Benguela,,POINT (307014.273 8590994.702)
5905,2077,96_69,,347759.665112,8.628293e+06,Benguela,,POINT (347759.665 8628292.814)
5925,620,97_70,,546552.333930,9.058962e+06,Cuanza Norte,,POINT (546552.334 9058962.387)
5950,2319,98_69,,355545.240756,8.553962e+06,Benguela,,POINT (355545.241 8553961.948)


In [5]:
## drop rows w null/bad coordinates

import geopandas as gpd
import pandas as pd

# # --- Use the existing GeoDataFrame ---
# points = merged_final_gdf.copy()  # no read_file needed

# --- View the first few rows ---
points.head()

# --- Identify problematic rows where X_coord or Y_coord is NaN or 0.0 ---
mask_problematic = (
    points['X_coord'].isnull() |
    (points['X_coord'] == 0.0) |
    points['Y_coord'].isnull() |
    (points['Y_coord'] == 0.0)
)

# --- Drop problematic rows ---
points_clean = points[~mask_problematic].copy()

# --- Optional: reset index ---
points_clean.reset_index(drop=True, inplace=True)

# --- Report ---
print(f"Dropped {mask_problematic.sum()} problematic rows.")
print(f"Remaining rows: {len(points_clean)}")

# --- Save cleaned GeoPackage (optional) ---
# output_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/usable_site_info_epsg32733_clean.gpkg"
# points_clean.to_file(output_path, layer="usable_sites_clean", driver="GPKG")


Dropped 0 problematic rows.
Remaining rows: 5983


In [6]:
# merged_final_gdf contains repeated sample_id rows
# We want one row per profile with a single year

# First, check that each profile has a consistent year
profile_year_counts = points.groupby('profile')['year'].nunique()
if (profile_year_counts > 1).any():
    print("⚠️ Some profiles have multiple years. You may need to choose one or check your data.")
else:
    print("✅ Each profile has a single unique year.")
    
# # Collapse to one row per profile: take the first year if repeated
# profile_year = points.groupby('profile')['year'].first().reset_index()


⚠️ Some profiles have multiple years. You may need to choose one or check your data.


In [7]:
# --- Identify profiles with more than one year ---
profile_year_counts = points.groupby('profile')['year'].nunique()

# Filter profiles that have multiple years
profiles_multi_year = profile_year_counts[profile_year_counts > 1].index.tolist()

if profiles_multi_year:
    print(f"⚠️ Found {len(profiles_multi_year)} profiles with multiple sampling years.\n")
    
    # Show detailed info: profile, sample_id, year
    multi_year_details = points[points['profile'].isin(profiles_multi_year)]
    multi_year_summary = multi_year_details.groupby('profile')[['sample_id', 'year']].apply(lambda x: x.drop_duplicates().values.tolist())
    
    # Print nicely
    for profile, values in multi_year_summary.items():
        print(f"Profile: {profile}")
        for sample_id, year in values:
            print(f"    Sample ID: {sample_id}, Year: {year}")
        print()
else:
    print("✅ All profiles have a single unique year.")


⚠️ Found 4 profiles with multiple sampling years.

Profile: 120_60
    Sample ID: 6901, Year: 1959.0
    Sample ID: 7900, Year: 1960.0
    Sample ID: 7901, Year: 1960.0
    Sample ID: 7902, Year: 1960.0
    Sample ID: 7903, Year: 1960.0
    Sample ID: 7904, Year: 1960.0
    Sample ID: 7905, Year: 1960.0

Profile: 20_60
    Sample ID: 7717, Year: 1959.0
    Sample ID: 7718, Year: 1959.0
    Sample ID: 7719, Year: 1959.0
    Sample ID: 7720, Year: 1959.0
    Sample ID: 7721, Year: 1959.0
    Sample ID: 7845, Year: 1960.0
    Sample ID: 7846, Year: 1960.0
    Sample ID: 7847, Year: 1960.0
    Sample ID: 7848, Year: 1960.0
    Sample ID: 7849, Year: 1960.0
    Sample ID: 7850, Year: 1960.0

Profile: 80_60
    Sample ID: 7722, Year: 1959.0
    Sample ID: 7723, Year: 1959.0
    Sample ID: 7724, Year: 1959.0
    Sample ID: 7725, Year: 1959.0
    Sample ID: 7726, Year: 1959.0
    Sample ID: 7966, Year: 1960.0
    Sample ID: 7967, Year: 1960.0
    Sample ID: 7968, Year: 1960.0
    Sample ID: 79

In [8]:
# Select only the columns we need: profile and year (plus site info) and adds 4 repeated profiles (for dif years sampled)
profile_year_df = points[['site_info_id', 'profile', 'X_coord', 'Y_coord', 'district', 'geometry','year']].copy()

# Remove duplicates in case sample_id repeats within the same year
profile_year_df = profile_year_df.drop_duplicates(subset=['profile', 'year'])

# Optional: sort for readability
profile_year_df = profile_year_df.sort_values(['profile', 'year']).reset_index(drop=True)

# Inspect
print(profile_year_df.head(20))
print(f"Number of rows (profile-year combinations): {len(profile_year_df)}")


   site_info_id  profile       X_coord       Y_coord      district  \
0          2139   100_56  6.378819e+05  8.608926e+06        Huambo   
1          1927   100_58  3.857257e+05  8.669325e+06      Benguela   
2            17   100_59  2.485386e+05  9.488118e+06       Cabinda   
3          1392   100_61  5.452975e+05  8.833673e+06    Cuanza Sul   
4          1701   100_63  8.400081e+05  8.731220e+06       Malanje   
5          1934  101A_58  3.841355e+05  8.666721e+06      Benguela   
6          1532  101C_63  7.788994e+05  8.782837e+06       Malanje   
7          2138   101_56  6.407729e+05  8.610186e+06        Huambo   
8          2481   101_62  6.851300e+05  8.494643e+06           Bie   
9          2486   102_55  5.147203e+05  8.489733e+06         Huila   
10         3773  102_67H  5.978276e+05  8.462901e+06         Huila   
11         2483   103_55  5.150974e+05  8.492484e+06         Huila   
12         1559   103_63  8.962854e+05  8.769753e+06       Malanje   
13         2215   10

In [9]:
points2 = profile_year_df

In [10]:
# View the all columns
points2.head

<bound method NDFrame.head of      site_info_id profile        X_coord       Y_coord      district  \
0            2139  100_56  637881.888723  8.608926e+06        Huambo   
1            1927  100_58  385725.693290  8.669325e+06      Benguela   
2              17  100_59  248538.636350  9.488118e+06       Cabinda   
3            1392  100_61  545297.500131  8.833673e+06    Cuanza Sul   
4            1701  100_63  840008.131292  8.731220e+06       Malanje   
...           ...     ...            ...           ...           ...   
1473         1689   99_63  845428.695068  8.739847e+06       Malanje   
1474          348   9C_60  576475.085504  9.208450e+06  Uige e Zaire   
1475         1213   9C_63  648834.637380  8.895791e+06       Malanje   
1476         2928    9_62  673979.168557  8.495066e+06           Bie   
1477          960    9_63  726015.143834  8.959163e+06       Malanje   

                            geometry    year  
0     POINT (637881.889 8608925.636)     NaN  
1     POINT

In [11]:
points2[points2['year'].isna()]


Unnamed: 0,site_info_id,profile,X_coord,Y_coord,district,geometry,year
0,2139,100_56,637881.888723,8.608926e+06,Huambo,POINT (637881.889 8608925.636),
7,2138,101_56,640772.919472,8.610186e+06,Huambo,POINT (640772.919 8610186.221),
9,2486,102_55,514720.344969,8.489733e+06,Huila,POINT (514720.345 8489733.445),
10,3773,102_67H,597827.560280,8.462901e+06,Huila,POINT (597827.56 8462901.23),
11,2483,103_55,515097.359840,8.492484e+06,Huila,POINT (515097.36 8492483.88),
...,...,...,...,...,...,...,...
1454,2212,95_69,307014.272888,8.590995e+06,Benguela,POINT (307014.273 8590994.702),
1457,2077,96_69,347759.665112,8.628293e+06,Benguela,POINT (347759.665 8628292.814),
1463,620,97_70,546552.333930,9.058962e+06,Cuanza Norte,POINT (546552.334 9058962.387),
1467,2319,98_69,355545.240756,8.553962e+06,Benguela,POINT (355545.241 8553961.948),


##### infer years w missing data based on profile (_X)tells year sampled

What this does:

- Looks for an underscore followed by two digits at the end of the profile string (e.g., _56).

- Converts those two digits into a four-digit year, assuming 19XX for numbers ≥50 and 20XX for numbers <50 (flexible if future samples exist).

- Fills year_inferred only where year was previously NaN.

In [12]:
## infer years w missing data based on profile (_X)tells year sampled (and handles trailing letters)

import pandas as pd
import re

## infer years w missing data based on profile (_XX) tells year sampled

import pandas as pd
import re

# --- Function to extract year from profile name ---
def infer_year_from_profile(profile):
    """
    Extracts the year from profile strings like:
        '101_56'  → 1956
        '102_67H' → 1967
        '105_04A' → 2004
    Returns None if no valid pattern is found.
    """
    if isinstance(profile, str):
        # Match underscore + two digits, optionally followed by letters
        match = re.search(r'_(\d{2})[A-Za-z]*$', profile)
        if match:
            two_digit = int(match.group(1))
            # Convert to full year (assume 1900s for 50–99, 2000s for 00–49 if applicable)
            return 1900 + two_digit if two_digit >= 50 else 2000 + two_digit
    return None

# --- Count missing before ---
n_missing_before = points2['year'].isnull().sum()

# --- Infer years only for missing entries ---
mask_missing = points2['year'].isnull()
points2.loc[mask_missing, 'year'] = points2.loc[mask_missing, 'profile'].apply(infer_year_from_profile)

# --- Count missing after ---
n_missing_after = points2['year'].isnull().sum()
n_filled = n_missing_before - n_missing_after

# --- Report summary ---
print(f"✅ Inferred {n_filled} missing year values from profile names.")
print(f"Remaining missing years: {n_missing_after}")
print("\nExamples of inferred years:")
print(points2.loc[points2['profile'].str.contains('_', na=False), ['profile', 'year']].head(10))


✅ Inferred 384 missing year values from profile names.
Remaining missing years: 0

Examples of inferred years:
   profile    year
0   100_56  1956.0
1   100_58  1958.0
2   100_59  1959.0
3   100_61  1961.0
4   100_63  1963.0
5  101A_58  1958.0
6  101C_63  1963.0
7   101_56  1956.0
8   101_62  1962.0
9   102_55  1955.0


In [13]:
points2[points2['year'].isna()]


Unnamed: 0,site_info_id,profile,X_coord,Y_coord,district,geometry,year


In [14]:
points2.head()

Unnamed: 0,site_info_id,profile,X_coord,Y_coord,district,geometry,year
0,2139,100_56,637881.888723,8608926.0,Huambo,POINT (637881.889 8608925.636),1956.0
1,1927,100_58,385725.69329,8669325.0,Benguela,POINT (385725.693 8669324.638),1958.0
2,17,100_59,248538.63635,9488118.0,Cabinda,POINT (248538.636 9488118.432),1959.0
3,1392,100_61,545297.500131,8833673.0,Cuanza Sul,POINT (545297.5 8833673.056),1961.0
4,1701,100_63,840008.131292,8731220.0,Malanje,POINT (840008.131 8731220.482),1963.0


## Extract soil data from .tif to csv per profile

In [15]:
# Count unique profiles
n_profiles1 = points2['profile'].nunique()
print(f"Unique profiles: {n_profiles1}")

Unique profiles: 1474


In [16]:
## extract Soil Data raster per point
import rasterio
import geopandas as gpd
import pandas as pd

# --- Copy points from previous steps ---
points_gdf = points2.copy()  # must have 'geometry' in EPSG:32733

# --- Load FAO soil raster ---
raster_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/angola_soil_gpkg_stuff/angola_soil_data_raster.tif"
with rasterio.open(raster_path) as raster:
    # Ensure CRS match
    if points_gdf.crs != raster.crs:
        points_gdf = points_gdf.to_crs(raster.crs)
    
    # Extract raster values at each point location
    coords = [(geom.x, geom.y) for geom in points_gdf.geometry]
    points_gdf["faosoil_id"] = [val[0] for val in raster.sample(coords)]

# --- Drop unnecessary columns ---
drop_cols = ["geo_features_id", "climate_id", "topo_id", "soil_type_id"]
points_clean = points_gdf.drop(columns=drop_cols, errors="ignore")

# --- Save to CSV ---
out_csv = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/faosoil_id_sample_points.csv"
points_clean.drop(columns="geometry").to_csv(out_csv, index=False)

print(f"✅ faosoil_id values extracted and saved to: {out_csv}")


✅ faosoil_id values extracted and saved to: /Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/faosoil_id_sample_points.csv


## Extract the landsurfaceforms raster values per profile

In [17]:
import rasterio
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

# --- Use points directly ---

# --- Create geometry from X_coord / Y_coord if not already present ---
if points_clean.geometry.is_empty.all() or points_clean.geometry.isnull().all():
    points_clean = gpd.GeoDataFrame(
        points_clean,
        geometry=gpd.points_from_xy(points_clean['X_coord'], points_clean['Y_coord']),
        crs=points_clean.crs
    )

# --- Load your raster ---
raster_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/landsurfaceforms_1km.tif"
raster = rasterio.open(raster_path)

# --- Ensure points are in the same CRS as the raster ---
if points_clean.crs != raster.crs:
    points_clean = points_clean.to_crs(raster.crs)

# --- Extract raster values at point locations ---
coords = [(x, y) for x, y in zip(points_clean.geometry.x, points_clean.geometry.y)]
points_clean['landsurface_value'] = [val[0] for val in raster.sample(coords)]

# # --- Optional: convert codes to descriptive categories ---
# landsurface_lookup = {
#     1: "smooth_plains",
#     2: "irregular_plains",
#     3: "escarpments",
#     4: "hills",
#     5: "breaks",
#     6: "low_mountains",
#     7: "high_mountains/deep_canyons"
# }
# points3['landsurface_label'] = points3['landsurface_value'].map(landsurface_lookup)

# --- Drop unnecessary columns ---
drop_cols = ["geo_features_id", "climate_id", "topo_id", "soil_type_id"]
points_clean = points_clean.drop(columns=drop_cols, errors="ignore")

# --- Save to CSV ---
output_csv = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/landsurface_sample_points.csv"
points_clean.drop(columns='geometry').to_csv(output_csv, index=False)

print("CSV saved at:", output_csv)


CSV saved at: /Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/landsurface_sample_points.csv


In [18]:
print(f"Shape: {points_clean.shape} (profiles x variables)")
points_clean.head()

Shape: (1478, 9) (profiles x variables)


Unnamed: 0,site_info_id,profile,X_coord,Y_coord,district,geometry,year,faosoil_id,landsurface_value
0,2139,100_56,637881.888723,8608926.0,Huambo,POINT (637881.889 8608925.636),1956.0,43,2
1,1927,100_58,385725.69329,8669325.0,Benguela,POINT (385725.693 8669324.638),1958.0,120,2
2,17,100_59,248538.63635,9488118.0,Cabinda,POINT (248538.636 9488118.432),1959.0,8,2
3,1392,100_61,545297.500131,8833673.0,Cuanza Sul,POINT (545297.5 8833673.056),1961.0,18,2
4,1701,100_63,840008.131292,8731220.0,Malanje,POINT (840008.131 8731220.482),1963.0,40,1


## Extract lithology value

In [19]:
import rasterio
import geopandas as gpd
import pandas as pd

# --- Load your raster ---
litho_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/lithology_1km.tif"
with rasterio.open(litho_path) as raster:

    # --- Ensure points are in the same CRS as the raster ---
    if points_clean.crs != raster.crs:
        points_clean = points_clean.to_crs(raster.crs)

    # --- Extract raster values at point locations ---
    coords = [(geom.x, geom.y) for geom in points_clean.geometry]
    points_clean['litho_value'] = [val[0] for val in raster.sample(coords)]

# # --- Optional: convert codes to descriptive categories ---
# litho_lookup = {
#     1: "Carbonate",
#     2: "Karst",
#     3: "Non-Carbonate",
#     4: "Metasedimentary",
#     5: "Alkaline Intrusive Volcanic",
#     6: "Silicic",
#     7: "Metaigneous"
# }
# points3['litho_label'] = points3['litho_value'].map(litho_lookup)

# --- Drop unnecessary columns ---
drop_cols = ["geo_features_id", "climate_id", "topo_id", "soil_type_id"]
points_clean = points_clean.drop(columns=drop_cols, errors="ignore")

# --- Save to CSV ---
output_csv = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/litho_sample_points.csv"
points_clean.drop(columns='geometry').to_csv(output_csv, index=False)

print("✅ CSV saved at:", output_csv)

✅ CSV saved at: /Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/litho_sample_points.csv


In [20]:
print(f"Shape: {points_clean.shape} (profiles x variables)")
points_clean.head()

Shape: (1478, 10) (profiles x variables)


Unnamed: 0,site_info_id,profile,X_coord,Y_coord,district,geometry,year,faosoil_id,landsurface_value,litho_value
0,2139,100_56,637881.888723,8608926.0,Huambo,POINT (637881.889 8608925.636),1956.0,43,2,2
1,1927,100_58,385725.69329,8669325.0,Benguela,POINT (385725.693 8669324.638),1958.0,120,2,2
2,17,100_59,248538.63635,9488118.0,Cabinda,POINT (248538.636 9488118.432),1959.0,8,2,2
3,1392,100_61,545297.500131,8833673.0,Cuanza Sul,POINT (545297.5 8833673.056),1961.0,18,2,2
4,1701,100_63,840008.131292,8731220.0,Malanje,POINT (840008.131 8731220.482),1963.0,40,1,1


## extract the ecosystem raster values per profile

In [21]:
import rasterio
import geopandas as gpd


# --- Load raster (ecosystem.tif) ---
raster_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/labelled_ecosystems32733_1km.tif"
with rasterio.open(raster_path) as raster:

    # --- Ensure points are in the same CRS as the raster ---
    if points_clean.crs != raster.crs:
        points_clean = points_clean.to_crs(raster.crs)

    # --- Extract raster values at point locations ---
    coords = [(geom.x, geom.y) for geom in points_clean.geometry]
    points_clean['formation'] = [val[0] for val in raster.sample(coords)]


# --- Drop unnecessary columns ---
drop_cols = ["geo_features_id", "climate_id", "topo_id", "soil_type_id"]
points_clean = points_clean.drop(columns=drop_cols, errors="ignore")


# --- Save or use the results ---
out_csv = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/ecoformation_sample_points.csv"
points_clean.drop(columns="geometry").to_csv(out_csv, index=False)

print(f"Eco Formation values extracted and saved to: {out_csv}")


Eco Formation values extracted and saved to: /Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/ecoformation_sample_points.csv


In [22]:
print(f"Shape: {points_clean.shape} (profiles x variables)")
points_clean.head()

Shape: (1478, 11) (profiles x variables)


Unnamed: 0,site_info_id,profile,X_coord,Y_coord,district,geometry,year,faosoil_id,landsurface_value,litho_value,formation
0,2139,100_56,637881.888723,8608926.0,Huambo,POINT (637881.889 8608925.636),1956.0,43,2,2,97
1,1927,100_58,385725.69329,8669325.0,Benguela,POINT (385725.693 8669324.638),1958.0,120,2,2,123
2,17,100_59,248538.63635,9488118.0,Cabinda,POINT (248538.636 9488118.432),1959.0,8,2,2,66
3,1392,100_61,545297.500131,8833673.0,Cuanza Sul,POINT (545297.5 8833673.056),1961.0,18,2,2,96
4,1701,100_63,840008.131292,8731220.0,Malanje,POINT (840008.131 8731220.482),1963.0,40,1,1,96


## extract Land Use Percent rasters

In [23]:
import os
import rasterio
import geopandas as gpd
import pandas as pd
import re

# ----------------------------
# Paths
points_clean
# Load soil sampling points
# points_clean = points_clean.copy()
lu_dir = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/hyde_lu/hyde_lu_percent"
out_csv = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/landuse_percent_sample_points.csv"

# ----------------------------
# Prepare coordinate list for sampling
# ----------------------------
coords = [(geom.x, geom.y) for geom in points_clean.geometry]

# ----------------------------
# Helper: extract class and year from file name
# ----------------------------
def parse_filename(filename):
    match = re.match(r"([a-zA-Z_]+)_(\d{4})_percent\.tif$", filename)
    if not match:
        return None, None
    land_class, year = match.groups()
    return land_class.lower(), int(year)

# ----------------------------
# Loop through all land use percent rasters
# ----------------------------
for f in sorted(os.listdir(lu_dir)):
    if not f.endswith(".tif") or f.startswith("._"):
        continue

    land_class, year = parse_filename(f)
    if land_class is None:
        print(f"⚠️ Skipping unrecognized file: {f}")
        continue

    raster_path = os.path.join(lu_dir, f)
    print(f"Extracting from {f} ...")

    with rasterio.open(raster_path) as src:
        # Ensure CRS matches
        if points_clean.crs != src.crs:
            points_clean = points_clean.to_crs(src.crs)

        # Extract values
        values = [val[0] if val[0] != src.nodata else None for val in src.sample(coords)]

    # Add column, e.g. "grazing_1950"
    col_name = f"{land_class}_{year}"
    points_clean[col_name] = values

print("✅ Extraction complete for all rasters.")

# ----------------------------
# Drop unused columns and export
# ----------------------------

drop_cols = ["geo_features_id", "climate_id", "topo_id", "soil_type_id"]
points_clean = points_clean.drop(columns=drop_cols, errors="ignore")

# Export to CSV (geometry removed)
points_clean.drop(columns="geometry").to_csv(out_csv, index=False)

print(f"✅ Land use percent values extracted and saved to:\n{out_csv}")


Extracting from conv_rangeland_1950_percent.tif ...
Extracting from conv_rangeland_1960_percent.tif ...
Extracting from cropland_1950_percent.tif ...
Extracting from cropland_1960_percent.tif ...
Extracting from grazing_1950_percent.tif ...
Extracting from grazing_1960_percent.tif ...
Extracting from ir_norice_1950_percent.tif ...
Extracting from ir_norice_1960_percent.tif ...
Extracting from ir_rice_1950_percent.tif ...
Extracting from ir_rice_1960_percent.tif ...
Extracting from pasture_1950_percent.tif ...
Extracting from pasture_1960_percent.tif ...
Extracting from rangeland_1950_percent.tif ...
Extracting from rangeland_1960_percent.tif ...
Extracting from rf_norice_1950_percent.tif ...
Extracting from rf_norice_1960_percent.tif ...
Extracting from rf_rice_1950_percent.tif ...
Extracting from rf_rice_1960_percent.tif ...
Extracting from tot_irri_1950_percent.tif ...
Extracting from tot_irri_1960_percent.tif ...
Extracting from tot_rainfed_1950_percent.tif ...
Extracting from tot_r

In [24]:
print(f"Shape: {points_clean.shape} (profiles x variables)")
points_clean.head()

Shape: (1478, 35) (profiles x variables)


Unnamed: 0,site_info_id,profile,X_coord,Y_coord,district,geometry,year,faosoil_id,landsurface_value,litho_value,...,rf_norice_1950,rf_norice_1960,rf_rice_1950,rf_rice_1960,tot_irri_1950,tot_irri_1960,tot_rainfed_1950,tot_rainfed_1960,tot_rice_1950,tot_rice_1960
0,2139,100_56,637881.888723,8608926.0,Huambo,POINT (637881.889 8608925.636),1956.0,43,2,2,...,13.921248,13.009153,0.0,0.0,0.0,0.0,13.921248,13.009153,0.0,0.0
1,1927,100_58,385725.69329,8669325.0,Benguela,POINT (385725.693 8669324.638),1958.0,120,2,2,...,8.803409,10.747362,0.0,0.0,0.0,0.0,8.803409,10.747362,0.0,0.0
2,17,100_59,248538.63635,9488118.0,Cabinda,POINT (248538.636 9488118.432),1959.0,8,2,2,...,1.639789,1.990139,0.0,0.0,0.0,0.0,1.639789,1.990139,0.0,0.0
3,1392,100_61,545297.500131,8833673.0,Cuanza Sul,POINT (545297.5 8833673.056),1961.0,18,2,2,...,0.481127,0.499965,0.0,0.0,0.0,0.0,0.481127,0.499965,0.0,0.0
4,1701,100_63,840008.131292,8731220.0,Malanje,POINT (840008.131 8731220.482),1963.0,40,1,1,...,0.266934,0.270036,0.0,0.0,0.0,0.0,0.266934,0.270036,0.0,0.0


## bioclimatic rasters to csv for each sample point

In [25]:
#### Bioclimatic rasters to CSV for each sample point

import rasterio
import geopandas as gpd
import pandas as pd
import glob
import os
import numpy as np

# --- Copy points GeoDataFrame ---
points_clean

# --- Paths ---
bioclimraster_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/bioclimatic32733_cleaned/"
output_csv = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soil_samples_with_bioclim1.csv"

# --- Points are already a GeoDataFrame ---
bioclim_perpoint = points_clean.copy()

# --- Get list of raster files ---
bioclimraster_files = sorted(glob.glob(os.path.join(bioclimraster_folder, "*.tif")))

# --- Reproject points once (using first raster as reference) ---
if bioclimraster_files:
    with rasterio.open(bioclimraster_files[0]) as src_ref:
        if bioclim_perpoint.crs != src_ref.crs:
            bioclim_perpoint = bioclim_perpoint.to_crs(src_ref.crs)

# --- Prepare coordinates for sampling ---
coords = [(geom.x, geom.y) for geom in bioclim_perpoint.geometry]

# --- Extract raster values for each point ---
for raster_path in bioclimraster_files:
    name = os.path.splitext(os.path.basename(raster_path))[0]  # e.g. bio1.tif → "bio1"
    with rasterio.open(raster_path) as src:
        # Sample raster at point locations
        values = [val[0] if val[0] != src.nodata else np.nan for val in src.sample(coords)]
        bioclim_perpoint[name] = values

# --- Drop unnecessary columns ---
drop_cols = ["geo_features_id", "climate_id", "topo_id", "soil_type_id"]
points_clean = bioclim_perpoint.drop(columns=drop_cols, errors="ignore")

# --- Save to CSV ---
points_clean.to_csv(output_csv, index=False)
print("✅ CSV saved at:", output_csv)


✅ CSV saved at: /Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soil_samples_with_bioclim1.csv


In [26]:
print(f"Shape: {points_clean.shape} (profiles x variables)")
points_clean.head()

Shape: (1478, 54) (profiles x variables)


Unnamed: 0,site_info_id,profile,X_coord,Y_coord,district,geometry,year,faosoil_id,landsurface_value,litho_value,...,min_temp_coldest_month32733,precip_coldest_quarter32733,precip_driest_month32733,precip_driest_quarter32733,precip_seasonality2,precip_warmest_quarter32733,precip_wettest_month32733,precip_wettest_quarter32733,temp_annual_range32733,temp_seasonality32733
0,2139,100_56,637881.888723,8608926.0,Huambo,POINT (637881.889 8608925.636),1956.0,43,2,2,...,7.1,17.0,0.0,2.0,85.0,322.0,250.0,654.0,21.457123,15.427565
1,1927,100_58,385725.69329,8669325.0,Benguela,POINT (385725.693 8669324.638),1958.0,120,2,2,...,15.8,0.0,0.0,0.0,108.0,335.0,176.0,335.0,14.64056,21.184771
2,17,100_59,248538.63635,9488118.0,Cabinda,POINT (248538.636 9488118.432),1959.0,8,2,2,...,16.9,2.0,0.0,2.0,79.0,554.0,217.0,554.0,13.390113,17.503922
3,1392,100_61,545297.500131,8833673.0,Cuanza Sul,POINT (545297.5 8833673.056),1961.0,18,2,2,...,10.9,5.0,0.0,5.0,76.0,425.0,206.0,520.0,16.760679,10.756773
4,1701,100_63,840008.131292,8731220.0,Malanje,POINT (840008.131 8731220.482),1963.0,40,1,1,...,7.2,3.0,0.0,3.0,84.0,348.0,227.0,628.0,23.813589,15.8832


## add temp min, temp max, and precipitation covariates

In [27]:
#### Tmax annual mean per sample point based on year

import rasterio
import geopandas as gpd
import pandas as pd
import glob
import os
import numpy as np
import re

# --- Copy points GeoDataFrame (dataset with points and 'year' column) ---
points_clean1 = points_clean.copy()

# --- Paths ---
tmax_mean_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/annual_tmax_mean"
output_csv = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soilsamples_tmaxmean.csv"

# --- Get list of raster files ---
tmax_files = sorted(glob.glob(os.path.join(tmax_mean_folder, "*.tif")))

# --- Helper: extract year from raster filename ---
def extract_year_from_filename(fname):
    """
    Example: tmax_1951_annual_mean.tif -> 1951
    """
    match = re.search(r"tmax_(\d{4})_annual_mean\.tif$", fname)
    if match:
        return int(match.group(1))
    return None

# --- Reproject points once (using first raster as reference) ---
if tmax_files:
    with rasterio.open(tmax_files[0]) as src_ref:
        if points_clean1.crs != src_ref.crs:
            points_clean1 = points_clean1.to_crs(src_ref.crs)

# --- Initialize new column ---
points_clean1["tmax_mean"] = np.nan

# --- Loop through raster files and assign tmax based on year ---
for raster_path in tmax_files:
    raster_year = extract_year_from_filename(raster_path)
    if raster_year is None:
        print(f"⚠️ Skipping unrecognized raster: {raster_path}")
        continue

    # Filter points with this year
    mask_year = points_clean1['year'] == raster_year
    if not mask_year.any():
        continue

    points_year = points_clean1.loc[mask_year].copy()
    coords = [(geom.x, geom.y) for geom in points_year.geometry]

    with rasterio.open(raster_path) as src:
        values = [val[0] if val[0] != src.nodata else np.nan for val in src.sample(coords)]

    # Assign values to original DataFrame
    points_clean1.loc[mask_year, "tmax_mean"] = values
    print(f"Extracted tmax for year {raster_year} ({mask_year.sum()} points)")

# --- Drop unnecessary columns ---
drop_cols = ["geo_features_id", "climate_id", "topo_id", "soil_type_id"]
points_clean1 = points_clean1.drop(columns=drop_cols, errors="ignore")

# --- Save to CSV ---
points_clean1.to_csv(output_csv, index=False)
print("✅ CSV saved at:", output_csv)


Extracted tmax for year 1952 (1 points)
Extracted tmax for year 1954 (55 points)
Extracted tmax for year 1955 (90 points)
Extracted tmax for year 1956 (96 points)
Extracted tmax for year 1957 (81 points)
Extracted tmax for year 1958 (73 points)
Extracted tmax for year 1959 (104 points)
Extracted tmax for year 1960 (171 points)
Extracted tmax for year 1961 (66 points)
Extracted tmax for year 1962 (157 points)
Extracted tmax for year 1963 (304 points)
Extracted tmax for year 1964 (40 points)
Extracted tmax for year 1965 (63 points)
Extracted tmax for year 1966 (46 points)
Extracted tmax for year 1967 (36 points)
Extracted tmax for year 1968 (33 points)
Extracted tmax for year 1969 (14 points)
✅ CSV saved at: /Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soilsamples_tmaxmean.csv


In [28]:
#### Tmin annual mean per sample point based on year

import rasterio
import geopandas as gpd
import pandas as pd
import glob
import os
import numpy as np
import re

# --- Copy points GeoDataFrame ---
points_clean1

# --- Paths ---
tmin_mean_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/annual_tmin_mean"
output_csv = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soilsamples_tminmean.csv"

# --- Get list of raster files ---
tmin_files = sorted(glob.glob(os.path.join(tmin_mean_folder, "*.tif")))

# --- Helper: extract year from raster filename ---
def extract_year_from_filename(fname):
    """
    Example: tmin_1951_annual_mean.tif -> 1951
    """
    match = re.search(r"tmin_(\d{4})_annual_mean\.tif$", os.path.basename(fname))
    if match:
        return int(match.group(1))
    else:
        return None

# --- Reproject points once (using first raster as reference) ---
if tmin_files:
    with rasterio.open(tmin_files[0]) as src_ref:
        if points_clean1.crs != src_ref.crs:
            points_clean1 = points_clean1.to_crs(src_ref.crs)

# --- Initialize new column ---
points_clean1["tmin_mean"] = np.nan

# --- Loop through raster files and assign tmin based on year ---
for raster_path in tmin_files:
    raster_year = extract_year_from_filename(raster_path)
    if raster_year is None:
        print(f"⚠️ Skipping unrecognized raster: {raster_path}")
        continue

    # Filter points with this year
    mask_year = points_clean1['year'] == raster_year
    if not mask_year.any():
        continue

    points_year = points_clean1.loc[mask_year].copy()
    coords = [(geom.x, geom.y) for geom in points_year.geometry]

    with rasterio.open(raster_path) as src:
        values = [val[0] if val[0] != src.nodata else np.nan for val in src.sample(coords)]

    # Assign values to original DataFrame
    points_clean1.loc[mask_year, "tmin_mean"] = values
    print(f"Extracted tmin for year {raster_year} ({mask_year.sum()} points)")

# --- Drop unnecessary columns ---
drop_cols = ["geo_features_id", "climate_id", "topo_id", "soil_type_id"]
points_clean1 = points_clean1.drop(columns=drop_cols, errors="ignore")

# --- Save to CSV ---
points_clean1.to_csv(output_csv, index=False)
print("✅ CSV saved at:", output_csv)


Extracted tmin for year 1952 (1 points)
Extracted tmin for year 1954 (55 points)
Extracted tmin for year 1955 (90 points)
Extracted tmin for year 1956 (96 points)
Extracted tmin for year 1957 (81 points)
Extracted tmin for year 1958 (73 points)
Extracted tmin for year 1959 (104 points)
Extracted tmin for year 1960 (171 points)
Extracted tmin for year 1961 (66 points)
Extracted tmin for year 1962 (157 points)
Extracted tmin for year 1963 (304 points)
Extracted tmin for year 1964 (40 points)
Extracted tmin for year 1965 (63 points)
Extracted tmin for year 1966 (46 points)
Extracted tmin for year 1967 (36 points)
Extracted tmin for year 1968 (33 points)
Extracted tmin for year 1969 (14 points)
✅ CSV saved at: /Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soilsamples_tminmean.csv


In [29]:
#### Annual precipitation sum per sample point based on year

import rasterio
import geopandas as gpd
import pandas as pd
import glob
import os
import numpy as np
import re

# --- Paths ---
precip_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/annual_precip_sum/precip"
output_csv = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soilsamples_precipsum.csv"

# --- Get list of raster files ---
precip_files = sorted(glob.glob(os.path.join(precip_folder, "*.tif")))

# --- Helper: extract year from raster filename ---
def extract_year_from_filename(fname):
    """
    Example: precip_1951_annual.tif -> 1951
    """
    match = re.search(r"precip_(\d{4})_annual\.tif$", os.path.basename(fname))
    if match:
        return int(match.group(1))
    else:
        return None

# --- Reproject points once (using first raster as reference) ---
if precip_files:
    with rasterio.open(precip_files[0]) as src_ref:
        if points_clean1.crs != src_ref.crs:
            points_clean1 = points_clean1.to_crs(src_ref.crs)

# --- Initialize new column ---
points_clean1["precip_sum"] = np.nan

# --- Loop through raster files and assign precip sum based on year ---
for raster_path in precip_files:
    raster_year = extract_year_from_filename(raster_path)
    if raster_year is None:
        print(f"⚠️ Skipping unrecognized raster: {raster_path}")
        continue

    # Filter points with this year
    mask_year = points_clean1['year'] == raster_year
    if not mask_year.any():
        continue

    points_year = points_clean1.loc[mask_year].copy()
    coords = [(geom.x, geom.y) for geom in points_year.geometry]

    with rasterio.open(raster_path) as src:
        values = [val[0] if val[0] != src.nodata else np.nan for val in src.sample(coords)]

    # Assign values to original DataFrame
    points_clean1.loc[mask_year, "precip_sum"] = values
    print(f"Extracted precipitation for year {raster_year} ({mask_year.sum()} points)")

# --- Drop unnecessary columns ---
drop_cols = ["geo_features_id", "climate_id", "topo_id", "soil_type_id"]
points_clean1 = points_clean1.drop(columns=drop_cols, errors="ignore")

# --- Save to CSV ---
points_clean1.to_csv(output_csv, index=False)
print("✅ CSV saved at:", output_csv)


Extracted precipitation for year 1952 (1 points)
Extracted precipitation for year 1954 (55 points)
Extracted precipitation for year 1955 (90 points)
Extracted precipitation for year 1956 (96 points)
Extracted precipitation for year 1957 (81 points)
Extracted precipitation for year 1958 (73 points)
Extracted precipitation for year 1959 (104 points)
Extracted precipitation for year 1960 (171 points)
Extracted precipitation for year 1961 (66 points)
Extracted precipitation for year 1962 (157 points)
Extracted precipitation for year 1963 (304 points)
Extracted precipitation for year 1964 (40 points)
Extracted precipitation for year 1965 (63 points)
Extracted precipitation for year 1966 (46 points)
Extracted precipitation for year 1967 (36 points)
Extracted precipitation for year 1968 (33 points)
Extracted precipitation for year 1969 (14 points)
✅ CSV saved at: /Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soilsamples_precipsum.csv


In [30]:
points_clean1.columns

Index(['site_info_id', 'profile', 'X_coord', 'Y_coord', 'district', 'geometry',
       'year', 'faosoil_id', 'landsurface_value', 'litho_value', 'formation',
       'conv_rangeland_1950', 'conv_rangeland_1960', 'cropland_1950',
       'cropland_1960', 'grazing_1950', 'grazing_1960', 'ir_norice_1950',
       'ir_norice_1960', 'ir_rice_1950', 'ir_rice_1960', 'pasture_1950',
       'pasture_1960', 'rangeland_1950', 'rangeland_1960', 'rf_norice_1950',
       'rf_norice_1960', 'rf_rice_1950', 'rf_rice_1960', 'tot_irri_1950',
       'tot_irri_1960', 'tot_rainfed_1950', 'tot_rainfed_1960',
       'tot_rice_1950', 'tot_rice_1960', '2mean_temp_coldest_quarter32733',
       'annual_mean_temp', 'annual_precip2', 'isothermality_32733',
       'max_temp_warmest_month32733', 'mean_temp_coldest_quarter32733',
       'mean_temp_driest_quarter32733', 'mean_temp_warmest_quarter32733',
       'mean_temp_wettest_quarter32733', 'min_temp_coldest_month32733',
       'precip_coldest_quarter32733', 'precip_dr

## terrain / DEM features to csv for each sample point

In [31]:
#### Terrain / DEM covariates per sample point

import rasterio
import geopandas as gpd
import pandas as pd
import glob
import os
import numpy as np

# --- Paths ---
terrain_folder = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/covariates_rasters/terraincovs"
output_csv = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soil_samples_with_DEM.csv"

# --- Load points ---
# Ensure points_clean1 is a GeoDataFrame
points_clean1 = gpd.GeoDataFrame(points_clean1, geometry='geometry', crs="EPSG:32733")

# --- Get list of terrain rasters ---
terrain_files = sorted(glob.glob(os.path.join(terrain_folder, "*.tif")))

# --- Reproject points once (using first raster as reference) ---
if terrain_files:
    with rasterio.open(terrain_files[0]) as src_ref:
        if points_clean1.crs != src_ref.crs:
            points_clean1 = points_clean1.to_crs(src_ref.crs)

# --- Prepare coordinates for sampling ---
coords = [(geom.x, geom.y) for geom in points_clean1.geometry]

# --- Extract raster values for each point ---
for raster_path in terrain_files:
    var_name = os.path.splitext(os.path.basename(raster_path))[0]  # e.g., slope.tif -> "slope"
    with rasterio.open(raster_path) as src:
        values = [val[0] if val[0] != src.nodata else np.nan for val in src.sample(coords)]
    points_clean1[var_name] = values
    print(f"✅ Extracted {var_name}")

# # --- Optional: Map aspect categorical labels if present ---
# if "aspect_classes" in DEM_perpoint.columns:
#     aspect_lookup = {
#         1: "N", 2: "NE", 3: "E", 4: "SE",
#         5: "S", 6: "SW", 7: "W", 8: "NW"
#     }
#     DEM_perpoint["aspect_label"] = DEM_perpoint["aspect_classes"].map(aspect_lookup)

# --- Drop unnecessary columns ---
drop_cols = ["geo_features_id", "climate_id", "topo_id", "soil_type_id"]
points_clean2 = points_clean1.drop(columns=drop_cols, errors="ignore")

# --- Save to CSV ---
points_clean2.to_csv(output_csv, index=False)
print(f"✅ CSV saved at: {output_csv}")


✅ Extracted MRRTF
✅ Extracted MRVBF
✅ Extracted aspect
✅ Extracted aspect_cos
✅ Extracted aspect_sin
✅ Extracted dem_1km_utm33s
✅ Extracted flow_accumulation
✅ Extracted flow_directions
✅ Extracted flowline_curve
✅ Extracted general_curve
✅ Extracted hill_height
✅ Extracted hillshade
✅ Extracted hillslope_index
✅ Extracted length_slope_factor
✅ Extracted max_curve
✅ Extracted midslope_position
✅ Extracted min_curve
✅ Extracted normalized_height
✅ Extracted plan_curve
✅ Extracted profile_curve
✅ Extracted relief_TRI
✅ Extracted slope
✅ Extracted slope_height
✅ Extracted slope_length
✅ Extracted standardized_height
✅ Extracted terrain_surf_convexity
✅ Extracted terrain_surf_texture
✅ Extracted total_curve
✅ Extracted twi
✅ Extracted valley_depth
✅ Extracted valley_index
✅ Extracted watershed_basins
✅ CSV saved at: /Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soil_samples_with_DEM.csv


In [32]:
print(f"Shape: {points_clean2.shape} (profiles x variables)")
points_clean2.head()

Shape: (1478, 89) (profiles x variables)


Unnamed: 0,site_info_id,profile,X_coord,Y_coord,district,geometry,year,faosoil_id,landsurface_value,litho_value,...,slope_height,slope_length,standardized_height,terrain_surf_convexity,terrain_surf_texture,total_curve,twi,valley_depth,valley_index,watershed_basins
0,2139,100_56,637881.888723,8608926.0,Huambo,POINT (637881.889 8608925.636),1956.0,43,2,2,...,79.74369,3000.0,952.486816,49.512749,77.167595,1.178243e-09,5.82737,64.605621,0.0,4340.0
1,1927,100_58,385725.69329,8669325.0,Benguela,POINT (385725.693 8669324.638),1958.0,120,2,2,...,209.757782,3414.213623,50.189114,43.61615,72.120461,8.122644e-10,8.748525,559.24823,0.0,3570.0
2,17,100_59,248538.63635,9488118.0,Cabinda,POINT (248538.636 9488118.432),1959.0,8,2,2,...,167.649002,0.0,72.041985,39.723015,71.185776,4.263759e-10,5.255396,260.212341,0.0,7755.0
3,1392,100_61,545297.500131,8833673.0,Cuanza Sul,POINT (545297.5 8833673.056),1961.0,18,2,2,...,141.969437,0.0,656.140442,46.931858,79.724144,2.749215e-11,-inf,128.83374,0.0,4340.0
4,1701,100_63,840008.131292,8731220.0,Malanje,POINT (840008.131 8731220.482),1963.0,40,1,1,...,92.101753,4828.427246,509.570862,44.910591,72.663414,2.168373e-10,7.329942,111.684311,0.0,5267.0


## Joining all files for training data table

In [33]:
# join all files for training data table BY site_info_id

# DEM_perpoint_clean
# bioclim_perpoint_clean
# landsurface_points_clean
# ecosystems_points_clean
# points_soil_lith_clean
# profile_0_30cm (info about soil phys and chem properties) NO LONGER USING


In [34]:
# import pandas as pd
# from functools import reduce
# import geopandas as gpd

# # --- Load all CSVs ---
# DEM_perpoint_clean = pd.read_csv("/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soil_samples_with_DEM.csv")
# bioclim_perpoint_clean = pd.read_csv("/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soil_samples_with_bioclim1.csv")
# landsurface_points_clean = pd.read_csv("/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/landsurface_sample_points.csv")
# ecosystems_points_clean = pd.read_csv("/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/ecoformation_sample_points.csv")
# points_soil_clean = pd.read_csv("/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/soil_samples_from_vector_clean.csv")
# points_litho_clean = pd.read_csv("/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/litho_sample_points.csv")

# #profile_0_30cm_clean = pd.read_csv("/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/profile_0_30cm_clean.csv")

# # --- Load reference GeoDataFrame for shared columns ---
# points_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/data_processed/usable_site_info_epsg32733_clean.gpkg"
# points = gpd.read_file(
#     points_path,
#     layer="usable_sites_clean"   # 
# )

# reference_cols = points_clean[['site_info_id', 'X_coord', 'Y_coord', 'profile', 'district']].copy()

# # --- Ensure site_info_id is string in reference ---
# reference_cols['site_info_id'] = reference_cols['site_info_id'].astype(str)

# # --- List of dataframes to merge, drop shared columns to avoid duplicates ---
# dfs = []
# for df in [
#     DEM_perpoint_clean,
#     bioclim_perpoint_clean,
#     landsurface_points_clean,
#     ecosystems_points_clean,
#     points_soil_clean,
#     points_litho_clean
#     #profile_0_30cm_clean
# ]:
#     df_copy = df.copy()
#     # Drop columns that will be added back from reference
#     df_copy = df_copy.drop(columns=['X_coord','Y_coord','profile','district'], errors='ignore')
#     # Ensure site_info_id is string for merging
#     df_copy['site_info_id'] = df_copy['site_info_id'].astype(str)
#     dfs.append(df_copy)

# # --- Inner merge all tables on site_info_id ---
# train_df = reduce(lambda left, right: pd.merge(left, right, on='site_info_id', how='inner'), dfs)

# # --- Merge back the shared columns ---
# train_df_final = pd.merge(reference_cols, train_df, on='site_info_id', how='inner')

# # --- Save the final training table ---
# output_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/training_data_table_final.csv"
# train_df_final.to_csv(output_path, index=False)

# print(f"Training table created: {train_df_final.shape[0]} rows, {train_df_final.shape[1]} columns")


In [35]:
train_df_final = points_clean2
# --- Save the final training table ---
output_path = "/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/training_data_table_final1.csv"
train_df_final.to_csv(output_path, index=False)

In [36]:
train_df_final.columns

Index(['site_info_id', 'profile', 'X_coord', 'Y_coord', 'district', 'geometry',
       'year', 'faosoil_id', 'landsurface_value', 'litho_value', 'formation',
       'conv_rangeland_1950', 'conv_rangeland_1960', 'cropland_1950',
       'cropland_1960', 'grazing_1950', 'grazing_1960', 'ir_norice_1950',
       'ir_norice_1960', 'ir_rice_1950', 'ir_rice_1960', 'pasture_1950',
       'pasture_1960', 'rangeland_1950', 'rangeland_1960', 'rf_norice_1950',
       'rf_norice_1960', 'rf_rice_1950', 'rf_rice_1960', 'tot_irri_1950',
       'tot_irri_1960', 'tot_rainfed_1950', 'tot_rainfed_1960',
       'tot_rice_1950', 'tot_rice_1960', '2mean_temp_coldest_quarter32733',
       'annual_mean_temp', 'annual_precip2', 'isothermality_32733',
       'max_temp_warmest_month32733', 'mean_temp_coldest_quarter32733',
       'mean_temp_driest_quarter32733', 'mean_temp_warmest_quarter32733',
       'mean_temp_wettest_quarter32733', 'min_temp_coldest_month32733',
       'precip_coldest_quarter32733', 'precip_dr

ADD HARMONIZED SOC STOCK TO TRAINING DATA SET

In [37]:
import pandas as pd

# Load SOC stock CSV
soc_stock = pd.read_csv("/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/harmonized_soc_with_log.csv")

# Keep only profile and log_soc_stock, drop other columns
soc_stock_clean = soc_stock[['profile', 'log_soc_stock']]

# Load your training dataset
train_df_final = pd.read_csv("/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/training_data_table_final1.csv")

# Merge training dataset with SOC stock by profile
train_with_soc = pd.merge(train_df_final, soc_stock_clean, on='profile', how='inner')

# Optional: save the merged dataset to CSV
train_with_soc.to_csv("/Users/inesschwartz/Desktop/training_data_with_log_soc.csv", index=False)

print("Merged training dataset created with log-transformed SOC stock as response variable.")
train_with_soc.head()


Merged training dataset created with log-transformed SOC stock as response variable.


Unnamed: 0,site_info_id,profile,X_coord,Y_coord,district,geometry,year,faosoil_id,landsurface_value,litho_value,...,slope_length,standardized_height,terrain_surf_convexity,terrain_surf_texture,total_curve,twi,valley_depth,valley_index,watershed_basins,log_soc_stock
0,2139,100_56,637881.888723,8608926.0,Huambo,POINT (637881.8887233642 8608925.63645552),1956.0,43,2,2,...,3000.0,952.486816,49.512749,77.167595,1.178243e-09,5.82737,64.605621,0.0,4340.0,1.260593
1,1927,100_58,385725.69329,8669325.0,Benguela,POINT (385725.6932897064 8669324.63827212),1958.0,120,2,2,...,3414.213623,50.189114,43.61615,72.120461,8.122644e-10,8.748525,559.24823,0.0,3570.0,1.659299
2,17,100_59,248538.63635,9488118.0,Cabinda,POINT (248538.63635045075 9488118.432205807),1959.0,8,2,2,...,0.0,72.041985,39.723015,71.185776,4.263759e-10,5.255396,260.212341,0.0,7755.0,0.0
3,1701,100_63,840008.131292,8731220.0,Malanje,POINT (840008.131292473 8731220.482084274),1963.0,40,1,1,...,4828.427246,509.570862,44.910591,72.663414,2.168373e-10,7.329942,111.684311,0.0,5267.0,0.972043
4,1934,101A_58,384135.495798,8666721.0,Benguela,POINT (384135.4957978715 8666721.19004675),1958.0,120,2,2,...,0.0,69.800308,45.813786,71.682632,2.845996e-10,-inf,535.184631,0.0,3570.0,1.333861


In [38]:
train_with_soc.to_csv("/Volumes/One_Touch/angola_soils_thesis/GIS_Angola/tables/training_data_table_final2.csv", index=False)


In [39]:
train_with_soc.to_csv("/Users/inesschwartz/Desktop/training_data2.csv", index=False)


In [40]:
train_with_soc.shape  # number of rows and columns

train_with_soc.columns

Index(['site_info_id', 'profile', 'X_coord', 'Y_coord', 'district', 'geometry',
       'year', 'faosoil_id', 'landsurface_value', 'litho_value', 'formation',
       'conv_rangeland_1950', 'conv_rangeland_1960', 'cropland_1950',
       'cropland_1960', 'grazing_1950', 'grazing_1960', 'ir_norice_1950',
       'ir_norice_1960', 'ir_rice_1950', 'ir_rice_1960', 'pasture_1950',
       'pasture_1960', 'rangeland_1950', 'rangeland_1960', 'rf_norice_1950',
       'rf_norice_1960', 'rf_rice_1950', 'rf_rice_1960', 'tot_irri_1950',
       'tot_irri_1960', 'tot_rainfed_1950', 'tot_rainfed_1960',
       'tot_rice_1950', 'tot_rice_1960', '2mean_temp_coldest_quarter32733',
       'annual_mean_temp', 'annual_precip2', 'isothermality_32733',
       'max_temp_warmest_month32733', 'mean_temp_coldest_quarter32733',
       'mean_temp_driest_quarter32733', 'mean_temp_warmest_quarter32733',
       'mean_temp_wettest_quarter32733', 'min_temp_coldest_month32733',
       'precip_coldest_quarter32733', 'precip_dr

Adjust column names

In [42]:
columns_renamed = {
    "aspect_1km": "aspect",
    "aspect_cos_1km": "aspect_cos",
    "aspect_sin_1km": "aspect_sin",
    "MRVBF_1km": "MRVBF",
    "RLD_1km": "RLD",
    "dem_filledfiltered_1km": "DEM",
    "flow_accumulation_1km": "flow_accumulation",
    "relief_1km": "relief",
    "ridge_levels_1km": "ridge_levels",
    "roughness_1km": "roughness",  # corrected key
    "slope_1km": "slope",
    "twi_300m_1km": "TWI",
    "valleydepth2_1km": "valleydepth",

    "2mean_temp_coldest_quarter32733": "mean_temp_coldest_quarter",
    "annual_precip2": "annual_precip",
    "isothermality_32733": "isothermality",
    "max_temp_warmest_month32733": "max_temp_warmest_month",
    # duplicate dropped: "mean_temp_coldest_quarter32733"
    "mean_temp_driest_quarter32733": "mean_temp_driest_quarter",
    "mean_temp_warmest_quarter32733": "mean_temp_warmest_quarter",
    "mean_temp_wettest_quarter32733": "mean_temp_wettest_quarter",
    "min_temp_coldest_month32733": "min_temp_coldest_month",
    "precip_coldest_quarter32733": "precip_coldest_quarter",
    "precip_driest_month32733": "precip_driest_month",
    "precip_warmest_quarter32733": "precip_warmest_quarter",
    "precip_driest_quarter32733": "precip_driest_quarter",
    "precip_wettest_month32733": "precip_wettest_month",
    "precip_seasonality2": "precip_seasonality",
    "temp_annual_range32733": "temp_annual_range",
    "temp_seasonality32733": "temp_seasonality",
    "precip_wettest_quarter32733": "precip_wettest_quarter"
}

columns_to_drop = [
    "mean_temp_coldest_quarter32733",  # duplicate column
    "landsurface_label",
    "litho_label",
]

# Rename + drop
train_with_soc = (
    train_with_soc
        .rename(columns=columns_renamed)
        .drop(columns=columns_to_drop, errors="ignore")
)

# Save to CSV
output_path = "/Users/inesschwartz/Desktop/training_data.csv"
train_with_soc.to_csv(output_path, index=False)

print(f"Cleaned DataFrame saved to: {output_path}")


Cleaned DataFrame saved to: /Users/inesschwartz/Desktop/training_data.csv
