# Extract river segments from HydroRIVERS

The following code takes a continental-scale rivers shapefile from the HydroATLAS "HydroRIVERS" database, which is a large polyline file containing segments of all large rivers in the world. The user should first open the shapefile in a GIS and find the desired river to extract. The user should then choose the most upstream segment they want for the extraction and record the 'HYRIV_ID' for that segment. 'HYRIV_ID' is a unique numeric identifier for each segment in the dataset. The user then enters the desired dataset name, path, the 'HYRIV_ID', a river name, and an output path (optional). Executing the code extracts all river segments downstream of and including the given 'HYRIV_ID' segment, terminating at the ocean or depositional basin. This is written as a shapefile to the desired output folder and named after the river.

Author: James (Huck) Rees; PhD Student, UCSB Geography

Date: June 25th, 2024

## Import libraries

In [1]:
import os
import requests
import zipfile
from urllib.parse import urlparse
import geopandas as gpd
import pandas as pd

## Initialize functions

In [2]:
def download_and_extract_shapefile(url, extract_to='shapefile_data', timeout=60):
    """
    Download a zipped shapefile, unpack it, and load it into a GeoDataFrame.

    The archive is expected to unpack into a subfolder named after the zip
    file in the URL (e.g. ".../HydroRIVERS_v10_na_shp.zip" ->
    "<extract_to>/HydroRIVERS_v10_na_shp/"). The first .shp file found inside
    that subfolder (in sorted walk order) is loaded.

    Parameters:
        url (str): URL of the .zip archive to download.
        extract_to (str): Directory the archive is extracted into.
        timeout (float or tuple): Timeout in seconds passed to requests.get, so a
                                  stalled server cannot hang the run forever.

    Returns:
        gdf (GeoDataFrame): The contents of the shapefile found in the archive.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        FileNotFoundError: If the expected subfolder or a .shp file is missing.
    """
    import tempfile  # local import: only this function needs it

    # Step 1: Download the zip file to a uniquely named temporary file so that
    # nothing is left behind in (or clobbered within) the working directory.
    fd, local_zip = tempfile.mkstemp(suffix='.zip')
    try:
        with os.fdopen(fd, 'wb') as f:
            with requests.get(url, stream=True, timeout=timeout) as r:
                r.raise_for_status()
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

        # Step 2: Extract the zip file
        with zipfile.ZipFile(local_zip, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
    finally:
        # The archive is only needed for extraction; always clean it up.
        os.remove(local_zip)

    # Step 3: Derive the subfolder name from the URL by dropping only a
    # trailing ".zip" (not every occurrence of ".zip" inside the name).
    subfolder_name = os.path.basename(urlparse(url).path)
    if subfolder_name.lower().endswith('.zip'):
        subfolder_name = subfolder_name[:-len('.zip')]

    # Construct the full path to the subfolder
    subfolder_path = os.path.join(extract_to, subfolder_name)

    # Ensure the subfolder exists
    if not os.path.exists(subfolder_path):
        raise FileNotFoundError(f"Expected subfolder {subfolder_name} not found in {extract_to}.")

    # Step 4: Find the shapefile (.shp) within the subfolder. Directories and
    # files are visited in sorted order so the choice is deterministic, and the
    # extension is matched case-insensitively (".SHP" is common on Windows).
    shapefile_path = None
    for root, dirs, files in os.walk(subfolder_path):
        dirs.sort()
        for file in sorted(files):
            if file.lower().endswith('.shp'):
                shapefile_path = os.path.join(root, file)
                break
        if shapefile_path:
            break

    # Step 5: Load the shapefile into a GeoDataFrame
    if shapefile_path is None:
        raise FileNotFoundError(f"No .shp file found in the subfolder {subfolder_name}.")
    return gpd.read_file(shapefile_path)
        
def extract_downstream_river(gdf, start_hyriv_id, method="specific_segment", terminal_hyriv_id=0, threshold_length_km=None):
    """
    Extracts downstream segments of a river starting from a given HYRIV_ID.

    The network is followed through each segment's NEXT_DOWN value until one of
    the following happens: NEXT_DOWN is 0, an ID is not present in gdf, a
    segment would be visited twice, or the stopping rule selected by `method`
    is met. The segment that triggers the stopping rule is included.

    Parameters:
        gdf (GeoDataFrame): The GeoDataFrame containing river segments. Must have
                            'HYRIV_ID', 'NEXT_DOWN' and 'LENGTH_KM' columns.
        start_hyriv_id (int): The HYRIV_ID to start the extraction from.
        method (str): The method to determine when to stop extraction.
                      Options are "specific_segment" (default) or "length".
                      Any other value applies no extra stopping rule.
        terminal_hyriv_id (int): The HYRIV_ID to stop at when using "specific_segment" method. Default is 0 (ocean).
        threshold_length_km (float): The cumulative distance in km to stop at when using "length" method.

    Returns:
        river (GeoDataFrame): The rows of gdf for the extracted segments, in
                              upstream-to-downstream order, with all columns
                              of gdf (including HYRIV_ID) and a fresh 0..n-1 index.
                              Empty if start_hyriv_id is not found.

    Raises:
        ValueError: If HYRIV_ID values in gdf are not unique.
    """
    # Lightweight lookups instead of converting every row to a dict: only the
    # three columns needed for the traversal are pulled out of the frame.
    hyriv_ids = gdf['HYRIV_ID'].tolist()
    next_down = gdf['NEXT_DOWN'].tolist()
    lengths_km = gdf['LENGTH_KM'].tolist()

    # Map each HYRIV_ID to its row position for quick access
    row_of = {hyriv_id: pos for pos, hyriv_id in enumerate(hyriv_ids)}
    if len(row_of) != len(hyriv_ids):
        raise ValueError("HYRIV_ID values must be unique.")

    stop_at_segment = method == "specific_segment"
    stop_at_length = method == "length" and threshold_length_km is not None

    # Row positions of the extracted segments, in downstream order
    positions = []
    total_length_km = 0

    # Track visited segments to avoid loops
    visited = set()

    # Start with the given HYRIV_ID
    current_id = start_hyriv_id

    while current_id != 0 and current_id not in visited:
        visited.add(current_id)

        pos = row_of.get(current_id)
        if pos is None:
            # The ID does not exist in this dataset; nothing more to follow.
            break

        positions.append(pos)
        total_length_km += lengths_km[pos]

        # Check if we've reached the stopping condition
        if stop_at_segment and current_id == terminal_hyriv_id:
            break
        if stop_at_length and total_length_km >= threshold_length_km:
            break

        # Move to the next downstream segment
        current_id = next_down[pos]

    # Selecting rows from gdf itself keeps every column (HYRIV_ID included),
    # the geometry column and the CRS, and yields an empty frame with the
    # same schema when nothing was found.
    river = gdf.iloc[positions].reset_index(drop=True)

    return river

def extract_and_save_segments_from_csv(gdf, df):
    """
    Extract one river per row of df from gdf and write each to its own shapefile.

    Each row supplies the river name, the upstream HYRIV_ID to start from, the
    stopping method with its terminal HYRIV_ID / maximum length, and a local
    base directory. The result for a row is written to
    <local_shapefile_directory>/Extracted_Rivers/<river_name>/<river_name>.shp.

    Parameters:
        gdf (GeoDataFrame): The GeoDataFrame containing river segments.
        df (DataFrame): One extraction request per row, with the columns
                        'river_name', 'us_hyriv_id', 'end_condition',
                        'max_length_km', 'ds_hyriv_id' and
                        'local_shapefile_directory'.
    """
    # Columns read from every row, in the order they are unpacked below
    fields = (
        'river_name',
        'us_hyriv_id',
        'end_condition',
        'max_length_km',
        'ds_hyriv_id',
        'local_shapefile_directory',
    )

    for _, record in df.iterrows():
        name, upstream_id, stop_rule, length_cap_km, downstream_id, base_dir = (
            record[field] for field in fields
        )

        # Every river gets its own folder beneath "Extracted_Rivers"
        target_dir = os.path.join(base_dir, "Extracted_Rivers", name)
        os.makedirs(target_dir, exist_ok=True)
        target_file = os.path.join(target_dir, f"{name}.shp")

        # Walk downstream from the requested segment using the row's settings
        river = extract_downstream_river(
            gdf,
            start_hyriv_id=upstream_id,
            method=stop_rule,
            terminal_hyriv_id=downstream_id,
            threshold_length_km=length_cap_km,
        )

        # Write the result next to the other files for this river
        river.to_file(target_file)
        
def main(csv_path):
    """
    Run the full workflow described by a CSV datasheet.

    The CSV is read once; for every distinct value in its 'online_url' column
    the dataset at that URL is downloaded and loaded, and all rows that refer
    to that URL are then extracted and saved.

    Parameters:
        csv_path (str): The path to the input CSV file.
    """
    table = pd.read_csv(csv_path)
    url_column = table['online_url']

    # Handle each dataset once, however many rivers are requested from it
    for source_url in url_column.unique():
        rivers = download_and_extract_shapefile(source_url)

        # Only the rows that point at this dataset
        matching_rows = table[url_column == source_url]

        extract_and_save_segments_from_csv(rivers, matching_rows)

## Input variables and run

In [3]:
# Path to the CSV datasheet that drives the run. main() reads it with pandas,
# downloads the dataset behind each distinct 'online_url' value, and writes one
# shapefile per row.
csv_path = r"C:\Users\huckr\Desktop\UCSB\Dissertation\Data\RiverMapping\Combo_river_datasheet.csv" # Replace with the actual path to your CSV file
# Executing this cell starts the downloads and writes the output files.
main(csv_path)

  _to_file_fiona(df, filename, driver, schema, crs, mode, **kwargs)
  _to_file_fiona(df, filename, driver, schema, crs, mode, **kwargs)
  _to_file_fiona(df, filename, driver, schema, crs, mode, **kwargs)
  _to_file_fiona(df, filename, driver, schema, crs, mode, **kwargs)
  _to_file_fiona(df, filename, driver, schema, crs, mode, **kwargs)
  _to_file_fiona(df, filename, driver, schema, crs, mode, **kwargs)
