# Extract river segments from HydroRIVERS

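The following code takes a continental-scale rivers shapefile from the HydroATLAS "HydroRIVERS" database, which is a large polyline file containing segments of all large rivers in the world, and extracts the segments of a chosen river from it. The HydroRIVERS shapefiles should already be downloaded to your working directory, with each zone's shapefile stored in its own folder at HydroATLAS/HydroRIVERS/HydroRIVERS_v10_<zone>_shp inside the working directory.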

The user should first open the HydroRIVERS shapefile for the desired continent in a GIS and find the desired river to extract. The user should then choose the most upstream segment they want for the extraction and record the 'HYRIV_ID' for that segment. 'HYRIV_ID' is a unique numeric identifier for each segment in the dataset. Using a standard "RivMapper" .csv input sheet, the user then enters the desired river name (river_name), the working directory (working_directory), the HydroRIVERS zone code that appears in the shapefile folder name (hydroatlas_zone), the upstream and downstream segments via their HYRIV_IDs (us_hyriv_id and ds_hyriv_id), the ending condition, either "specific_segment" or "length" (end_condition), and the maximum river channel length to extract if the end condition is "length" (max_length_km). Then, within this notebook, the user simply enters the path to their RivMapper input sheet and runs the code. Executing the code extracts all river segments downstream of and including the given upstream 'HYRIV_ID' segment. Extraction stops at the downstream segment (ds_hyriv_id) or once the maximum length has been reached, depending on the end condition; if neither is reached, it terminates at the ocean or depositional basin. The result is written as a shapefile named after the river to an output folder inside the working directory (HydroATLAS/HydroRIVERS/Extracted_Rivers/<river_name>).

HydroRIVERS database: https://www.hydrosheds.org/page/hydroatlas

Citation: Lehner, B., Verdin, K., Jarvis, A., 2008. New Global Hydrography Derived From Spaceborne Elevation Data. Eos, Transactions American Geophysical Union 89, 93–94. https://doi.org/10.1029/2008EO100001

Author: James (Huck) Rees; PhD Student, UCSB Geography

Date: January 7, 2025

## Import libraries

In [1]:
import os
import geopandas as gpd
import pandas as pd

## Initialize functions

In [2]:
def load_shapefile_from_local_folder(local_folder):
    """
    Load the shapefile found in a given local folder.

    The folder is scanned (non-recursively) for entries whose name ends in
    ".shp", matched case-insensitively so that ".SHP" is accepted as well.
    If several entries match, the one whose name sorts first is loaded, so
    the same file is picked on every run.

    Parameters:
        local_folder (str): Path to the folder containing the shapefile.

    Returns:
        gpd.GeoDataFrame: The loaded shapefile as a GeoDataFrame.

    Raises:
        FileNotFoundError: If the folder contains no ".shp" entry (os.listdir
            raises the same exception type when the folder itself is missing).
    """
    # os.listdir returns entries in arbitrary order; sort for a stable choice.
    for file_name in sorted(os.listdir(local_folder)):
        if file_name.lower().endswith(".shp"):
            return gpd.read_file(os.path.join(local_folder, file_name))

    raise FileNotFoundError(f"No shapefile (.shp) found in the folder: {local_folder}")

def extract_downstream_river(gdf, start_hyriv_id, method="specific_segment", terminal_hyriv_id=0, threshold_length_km=None):
    """
    Extract downstream river segments from a GeoDataFrame.

    Starting at ``start_hyriv_id``, the function follows each segment's
    'NEXT_DOWN' value to the next segment and collects every segment visited,
    in upstream-to-downstream order. The walk ends when:

    * the next ID is 0 or is not present in ``gdf``,
    * a segment would be visited twice (guards against cyclic links),
    * ``method`` is "specific_segment" and ``terminal_hyriv_id`` has just
      been collected, or
    * ``method`` is "length", ``threshold_length_km`` is not None, and the
      summed 'LENGTH_KM' of the collected segments has reached it (the
      segment that crosses the threshold is included).

    Any other ``method`` value applies no extra stopping rule.

    Parameters:
        gdf (gpd.GeoDataFrame): GeoDataFrame with river segments. Must have
            'HYRIV_ID', 'NEXT_DOWN' and 'LENGTH_KM' columns.
        start_hyriv_id (int): Starting HYRIV_ID.
        method (str): Extraction method ('specific_segment' or 'length').
        terminal_hyriv_id (int): Terminal HYRIV_ID (for 'specific_segment' method).
        threshold_length_km (float): Length threshold (for 'length' method).

    Returns:
        gpd.GeoDataFrame: Extracted downstream river segments with all of the
        input columns (including 'HYRIV_ID'), a fresh 0..n-1 index, and the
        input's CRS. Empty when the starting segment is 0 or not found.

    Raises:
        ValueError: If 'HYRIV_ID' values are not unique.
    """
    # Work on plain Python lists plus an ID -> row-position lookup instead of
    # converting every row (geometry included) into a dict; only the three
    # attribute columns are needed to trace the path.
    hyriv_ids = gdf['HYRIV_ID'].tolist()
    next_down = gdf['NEXT_DOWN'].tolist()
    lengths_km = gdf['LENGTH_KM'].tolist()

    row_of = {hyriv_id: position for position, hyriv_id in enumerate(hyriv_ids)}
    if len(row_of) != len(hyriv_ids):
        raise ValueError("HYRIV_ID values must be unique")

    stop_at_segment = method == "specific_segment"
    stop_at_length = method == "length" and threshold_length_km is not None

    path_rows = []
    total_length_km = 0
    visited = set()
    current_id = start_hyriv_id

    # An ID of 0 ends the walk (presumably the dataset's "no downstream
    # connection" marker; confirm against the HydroRIVERS documentation).
    while current_id != 0 and current_id not in visited:
        visited.add(current_id)
        position = row_of.get(current_id)
        if position is None:
            # The ID points outside this table; nothing more to follow.
            break

        path_rows.append(position)
        total_length_km += lengths_km[position]

        if stop_at_segment and current_id == terminal_hyriv_id:
            break
        if stop_at_length and total_length_km >= threshold_length_km:
            break

        current_id = next_down[position]

    # Positional selection keeps every column (HYRIV_ID too), the dtypes and
    # the CRS, and yields an empty frame with the same schema if nothing matched.
    return gdf.iloc[path_rows].reset_index(drop=True)

def extract_and_save_segments_from_csv(gdf, df):
    """
    Extract one river per row of the criteria table and write each to a shapefile.

    For every row, the river is traced with ``extract_downstream_river`` and
    saved as
    ``<working_directory>/HydroATLAS/HydroRIVERS/Extracted_Rivers/<river_name>/<river_name>.shp``.
    The output folder is created if it does not exist yet.

    Parameters:
        gdf (gpd.GeoDataFrame): GeoDataFrame with river segments.
        df (pd.DataFrame): DataFrame with extraction criteria. Columns
            'river_name', 'working_directory', 'us_hyriv_id' and
            'end_condition' are required; 'ds_hyriv_id' and 'max_length_km'
            are optional and fall back to 0 and None respectively.
    """
    required_fields = ('river_name', 'working_directory', 'us_hyriv_id', 'end_condition')

    for _, record in df.iterrows():
        # Read every field before touching the filesystem.
        name, base_dir, upstream_id, end_condition = (record[field] for field in required_fields)
        downstream_id = record.get('ds_hyriv_id', 0)
        length_cap_km = record.get('max_length_km')

        # Each river gets its own sub-folder named after it.
        river_dir = os.path.join(base_dir, "HydroATLAS", "HydroRIVERS", "Extracted_Rivers", name)
        os.makedirs(river_dir, exist_ok=True)
        target_path = os.path.join(river_dir, f"{name}.shp")

        river_segments = extract_downstream_river(
            gdf,
            start_hyriv_id=upstream_id,
            method=end_condition,
            terminal_hyriv_id=downstream_id,
            threshold_length_km=length_cap_km,
        )
        river_segments.to_file(target_path)

def main(csv_path):
    """
    Read the input sheet and extract every listed river, one HydroRIVERS zone at a time.

    The HydroRIVERS shapefiles are looked up under the 'working_directory' of
    the sheet's first row, in
    ``HydroATLAS/HydroRIVERS/HydroRIVERS_v10_<zone>_shp``. Each zone's
    shapefile is loaded once and shared by all rows of that zone.

    Parameters:
        csv_path (str): Path to the input CSV file.
    """
    sheet = pd.read_csv(csv_path)
    base_dir = sheet.loc[0, 'working_directory']
    zone_column = sheet['hydroatlas_zone']

    for zone_code in zone_column.unique():
        # Folder holding this zone's downloaded HydroRIVERS shapefile.
        shapefile_folder = os.path.join(
            base_dir, "HydroATLAS", "HydroRIVERS", f"HydroRIVERS_v10_{zone_code}_shp"
        )
        zone_rivers = load_shapefile_from_local_folder(shapefile_folder)

        # Only the sheet rows that belong to the zone just loaded.
        rows_in_zone = sheet[sheet['hydroatlas_zone'] == zone_code]
        extract_and_save_segments_from_csv(zone_rivers, rows_in_zone)

## Input variables and run

In [3]:
# Path to the RivMapper input sheet (.csv); edit this to point at your own file.
csv_path = r"C:\Users\huckr\Desktop\UCSB\Dissertation\Data\RiverMapping\Bermejo_river_datasheet.csv"
# Run the extraction for every river listed in the sheet.
main(csv_path)