In [None]:
# !pip install geopandas

In [None]:
import pandas as pd
import numpy as np

## Load Rivers and Dams data

In [None]:
import geopandas as gpd
# Read the rivers shapefile into a GeoDataFrame and preview its first rows.
rivers = gpd.read_file("Rivers_Data/Rivers.shp")
rivers.head()

In [None]:
# Show the coordinate reference system of the rivers layer (it is reprojected before the join).
rivers.crs

In [None]:
import geopandas as gpd
# Read the dams shapefile into a GeoDataFrame and preview its first rows.
dams = gpd.read_file("Dams_Data/South_Africa_Dams.shp")
dams.head()

## Load the data we want to join

In [None]:
# Load the table of points that the river/dam attributes will be joined onto.
test_df = pd.read_csv("submission_template.csv")
test_df.head()

In [None]:
# Names of the longitude and latitude columns in the input table
lon_col, lat_col = "Longitude", "Latitude"

# Wrap the table in a GeoDataFrame with one Point per row.
# EPSG:4326 = geographic longitude/latitude in degrees.
test_gdf = gpd.GeoDataFrame(
    test_df,
    geometry=gpd.points_from_xy(x=test_df[lon_col], y=test_df[lat_col]),
    crs="EPSG:4326",
)

print(test_gdf.crs)
test_gdf.head()

In [None]:
# Sanity-check the coordinates: summary statistics expose out-of-range values,
# and the per-column NaN counts expose missing coordinates.
print(test_gdf[[lon_col, lat_col]].describe())
print(test_gdf[[lon_col, lat_col]].isna().sum())

In [None]:
# Project both layers to a metric CRS so that distances come out in meters.
# EPSG:3857 (Web Mercator) is avoided on purpose: it stretches lengths by roughly
# 1/cos(latitude), so distances measured in it are inflated away from the equator.
# The UTM zone estimated from the points keeps the error far smaller.
metric_crs = test_gdf.estimate_utm_crs()
test_m = test_gdf.to_crs(metric_crs)
river_m = rivers.to_crs(metric_crs)

In [None]:
# List the attribute columns available on the projected rivers layer.
river_m.columns

In [None]:
# For every point, attach the attributes of its nearest river and record the
# distance to it (in the units of the projected CRS) in "dist_to_river_m".
matched_df = test_m.sjoin_nearest(
    river_m,
    how="left",
    distance_col="dist_to_river_m",
)

matched_df.head()

In [None]:
# Summarise the point-to-river distances, including the 50th/90th/95th/99th percentiles,
# to help choose a sensible maximum-distance cut-off.
matched_df["dist_to_river_m"].describe(percentiles=[0.5,0.9,0.95,0.99])

In [None]:
# Treat matches farther than 5 km as "no river nearby".
# Only the attributes that came from the river layer are blanked; the original
# point columns, the geometry and the measured distance are kept, so no input
# row loses its own data.
max_dist = 5000  # 5 km
too_far = matched_df["dist_to_river_m"] > max_dist

# Names the river attributes carry in the joined frame: sjoin appends "_right"
# to any river column whose name also exists on the points.
river_cols = [
    c if c not in test_m.columns else f"{c}_right"
    for c in river_m.columns
    if c != river_m.geometry.name
]
matched_df.loc[too_far, river_cols] = None

In [None]:
# Drop the geometry column (not useful in a flat CSV) and save the joined table.
# Uses `matched_df`, the join result built above; `matched` is not defined at this point.
out = matched_df.drop(columns="geometry")
out.to_csv("water_with_river_features.csv", index=False)


## Functions for joining the data

In [None]:
import pandas as pd
import geopandas as gpd

def to_points_gdf(
    df: pd.DataFrame,
    lon_col: str = "Longitude",
    lat_col: str = "Latitude",
    crs: str = "EPSG:4326",
    drop_invalid: bool = True
) -> gpd.GeoDataFrame:
    """
    Convert a DataFrame with x/y (lon/lat) columns into a GeoDataFrame of Point geometry.

    Parameters
    ----------
    df : input table; it is copied, never modified in place.
    lon_col, lat_col : names of the x (longitude) and y (latitude) columns.
        Values are coerced to numeric; anything unparsable becomes NaN.
    crs : CRS the coordinates are expressed in.
    drop_invalid : if True, rows with missing, non-finite or out-of-range
        coordinates are removed and a warning reports how many were dropped.
        If False, those rows are kept as they are.

    Returns
    -------
    GeoDataFrame with the original columns (coordinates coerced to numeric),
    the original index labels, and a point geometry in `crs`.

    Raises
    ------
    ValueError : if `lon_col` or `lat_col` is not a column of `df`.

    Notes
    -----
    The +/-180 and +/-90 range check is applied only when `crs` is geographic
    (degrees) or unset. Projected coordinates, e.g. meters, legitimately
    exceed those bounds and are not rejected.
    """
    import warnings

    # basic checks
    if lon_col not in df.columns or lat_col not in df.columns:
        raise ValueError(f"Missing lon/lat columns: '{lon_col}', '{lat_col}'")

    df = df.copy()

    # numeric coercion
    df[lon_col] = pd.to_numeric(df[lon_col], errors="coerce")
    df[lat_col] = pd.to_numeric(df[lat_col], errors="coerce")

    # missing or infinite coordinates are invalid in any CRS
    infinities = [float("inf"), float("-inf")]
    invalid = (
        df[lon_col].isna() |
        df[lat_col].isna() |
        df[lon_col].isin(infinities) |
        df[lat_col].isin(infinities)
    )

    # Resolve the CRS through an empty GeoSeries so that any CRS input geopandas
    # accepts can be inspected without importing pyproj directly.
    resolved_crs = gpd.GeoSeries([], crs=crs).crs
    if resolved_crs is None or resolved_crs.is_geographic:
        # degree bounds only make sense for geographic coordinates
        invalid |= ~df[lon_col].between(-180, 180) | ~df[lat_col].between(-90, 90)

    n_invalid = int(invalid.sum())
    if drop_invalid and n_invalid:
        # Make the row loss visible: downstream outputs may need one row per input row.
        warnings.warn(
            f"to_points_gdf: dropping {n_invalid} of {len(df)} rows with "
            f"missing or out-of-range coordinates",
            stacklevel=2,
        )
        df = df.loc[~invalid].copy()

    gdf = gpd.GeoDataFrame(
        df,
        geometry=gpd.points_from_xy(df[lon_col], df[lat_col]),
        crs=crs
    )
    return gdf


In [None]:
def join_nearest_layer(
    points_gdf: gpd.GeoDataFrame,
    layer_gdf: gpd.GeoDataFrame,
    *,
    layer_keep_cols: list[str] | None = None,
    dist_col: str = "dist_to_layer_m",
    max_dist_m: float | None = None,
    metric_crs: str = "EPSG:3857",
    how: str = "left"
) -> gpd.GeoDataFrame:
    """
    Spatially join points to the nearest feature in a layer (river lines, dam polygons, etc.).

    Parameters
    ----------
    points_gdf : points to enrich. An existing "index_right" column (left over
        from a previous join) is dropped so that joins can be chained.
    layer_gdf : layer whose nearest feature is looked up for every point.
    layer_keep_cols : attribute columns to bring back from the layer. The
        geometry column is handled automatically. None keeps every
        non-geometry column.
    dist_col : name of the output column holding the distance to the nearest
        feature, in the units of `metric_crs`.
    max_dist_m : if given, matches farther than this have their joined layer
        attributes set to NA. The point's own columns and `dist_col` are kept.
    metric_crs : projected CRS both inputs are reprojected to before measuring.
        The default EPSG:3857 (Web Mercator) is not distance-preserving and
        inflates lengths away from the equator. Pass a local projected CRS
        (e.g. `points_gdf.estimate_utm_crs()`) when accurate meters matter.
    how : join type forwarded to `geopandas.sjoin_nearest`.

    Returns
    -------
    GeoDataFrame in `metric_crs` containing the point columns, the selected
    layer attributes and `dist_col`. A layer column whose name also exists on
    the points is returned with a "_right" suffix, and the point column with
    "_left" (standard sjoin behaviour).

    Raises
    ------
    ValueError : if `layer_keep_cols` names a column missing from `layer_gdf`.

    NOTE(review): sjoin_nearest can return more than one row for a point when
    several features are equidistant - confirm whether de-duplication is needed.
    """
    # Use the layer's active geometry column rather than assuming it is called "geometry".
    layer_geom = layer_gdf.geometry.name

    # Decide which layer columns to keep
    if layer_keep_cols is None:
        # default: bring back all columns
        cols = [c for c in layer_gdf.columns if c != layer_geom]
    else:
        missing = [c for c in layer_keep_cols if c not in layer_gdf.columns]
        if missing:
            raise ValueError(f"layer_keep_cols not found in layer_gdf: {missing}")
        # de-duplicate (order preserved); geometry is appended separately below
        cols = [c for c in dict.fromkeys(layer_keep_cols) if c != layer_geom]

    # Project both inputs to the same projected CRS so distances share its units
    pts_m = points_gdf.to_crs(metric_crs)
    lyr_m = layer_gdf.to_crs(metric_crs)

    # drop the "index_right" column to avoid a name conflict when joining again
    pts_m = pts_m.drop(columns=["index_right"], errors="ignore")

    # join
    matched = gpd.sjoin_nearest(
        pts_m,
        lyr_m[cols + [layer_geom]],
        how=how,
        distance_col=dist_col
    )

    # apply max distance filter only to joined columns (keep original point columns)
    if max_dist_m is not None:
        too_far = matched[dist_col] > max_dist_m

        # Names the layer columns carry in the joined frame: sjoin appends
        # "_right" to a layer column that clashes with a point column. Using
        # the un-suffixed name would miss those columns.
        joined_cols = [
            c if c not in pts_m.columns else f"{c}_right"
            for c in cols
        ]

        if joined_cols:
            matched.loc[too_far, joined_cols] = pd.NA

    return matched

In [None]:
import geopandas as gpd
import pandas as pd

# Load the two reference layers (rivers and dams) from their shapefiles.
rivers = gpd.read_file("Rivers_Data/Rivers.shp")
dams = gpd.read_file("Dams_Data/South_Africa_Dams.shp") 

test_df = pd.read_csv("submission_template.csv") # the table of points we want to join the layers onto

In [None]:
# Transform the csv file to gdf
# Convert the loaded CSV rows into a GeoDataFrame of points built from the
# Longitude/Latitude columns (with the default drop_invalid=True, rows with
# invalid coordinates are dropped).
test_gdf = to_points_gdf(test_df, lon_col="Longitude", lat_col="Latitude")

In [None]:
# Optionally restrict which layer attributes are joined; by default all columns are kept.
# river_cols = []
# dam_cols = []

# Measure distances in the UTM zone estimated from the points.
# EPSG:3857 (Web Mercator) inflates lengths by roughly 1/cos(latitude), so with it
# the thresholds below would not correspond to true meters on the ground.
metric_crs = test_gdf.estimate_utm_crs()

test_with_river = join_nearest_layer(
    test_gdf,
    rivers,
    # layer_keep_cols=river_cols,
    dist_col="dist_to_river_m",
    max_dist_m=500,  # max accepted distance to the nearest river; 500 meters for now
    metric_crs=metric_crs
)

test_with_dam = join_nearest_layer(
    test_with_river,
    dams,
    # layer_keep_cols=dam_cols,
    dist_col="dist_to_dam_m",
    max_dist_m=5000,  # dams may reasonably be farther from a point than rivers
    metric_crs=metric_crs
)


In [None]:
# Preview the points after both the river join and the dam join.
test_with_dam.head()