# Create reach polygons

The following code takes a river polyline shapefile extracted from the HydroAtlas HydroRIVERS database (see Extract_River_Segments_from_HydroRIVERS.ipynb), and splits it into reaches that are scaled based on a linear relationship between upstream drainage area and reach length, meaning that reaches become progressively longer downstream. This relationship was deduced by examining upstream drainage area estimates at specific points along the Ganges River in India and manually measuring the width of an ideal reach at that point. Finally, buffers are applied to each reach line to create a polygon encompassing the entire reach, which is used in a subsequent script to extract river masks from that area. The script is meant to take a path to a standardized RivMapper input sheet .csv as an input, and iterate through each row to create reach polygons for the given rivers. Reach polygon buffers are output to a specified local folder.

HydroRIVERS database: https://www.hydrosheds.org/page/hydroatlas

Citation: Lehner, B., Verdin, K., Jarvis, A., 2008. New Global Hydrography Derived From Spaceborne Elevation Data. Eos, Transactions American Geophysical Union 89, 93–94. https://doi.org/10.1029/2008EO100001

Logic for downstream increases in reach length, scaling with upstream drainage area: "A reach is a continuous length of river corridor with consistent geometry and linear downstream increases in drainage area, which may be tens of meters long in a small river and tens of kilometers long in a major river." - Professor Ellen Wohl

Author: James (Huck) Rees; PhD Student, UCSB Geography

Date: January 7, 2025

## Import packages

In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.ops import unary_union, split
from shapely.geometry import LineString, MultiLineString, Point
import os

## Initialize functions

In [2]:
def read_input_csv(csv_path):
    """
    Load the run-configuration CSV and verify its mandatory columns.

    Parameters:
    csv_path (str): Path to the input CSV file.

    Returns:
    DataFrame: The parsed CSV contents, one row per river to process.

    Raises:
    FileNotFoundError: If nothing exists at csv_path.
    ValueError: If 'river_name', 'buffer_width' or 'working_directory' is
        absent; the first missing column (checked in that order) is named.
    """
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV file not found: {csv_path}")

    table = pd.read_csv(csv_path)

    mandatory = ('river_name', 'buffer_width', 'working_directory')
    first_absent = next(
        (name for name in mandatory if name not in table.columns),
        None,
    )
    if first_absent is not None:
        raise ValueError(f"Missing required column in CSV: {first_absent}")

    return table

def reimport_river_segments(shapefile_path):
    """
    Load previously extracted river segments from disk.

    Parameters:
    shapefile_path (str): Path to the shapefile.

    Returns:
    GeoDataFrame: The river segments read from the shapefile.

    Raises:
    FileNotFoundError: If shapefile_path does not exist.
    ValueError: If any feature in the shapefile has an empty geometry.
    """
    if not os.path.exists(shapefile_path):
        raise FileNotFoundError(f"Shapefile not found: {shapefile_path}")

    segments = gpd.read_file(shapefile_path)

    # A single empty geometry is enough to reject the whole file.
    has_empty_geometry = segments.is_empty.any()
    if has_empty_geometry:
        raise ValueError("Input shapefile contains empty geometries.")

    return segments

def transform_to_crs(gdf, epsg_code=3395):
    """
    Reproject a GeoDataFrame to the CRS identified by an EPSG code.

    Parameters:
    gdf (GeoDataFrame): GeoDataFrame to reproject.
    epsg_code (int): EPSG code of the target CRS (defaults to 3395).

    Returns:
    GeoDataFrame: The result of gdf.to_crs for the requested EPSG code.
    """
    reprojected = gdf.to_crs(epsg=epsg_code)
    return reprojected

def merge_river_segments(river_segs):
    """
    Merge river segments into a single LineString.

    The segments are unioned first. If that yields a MultiLineString, the
    parts are stitched together end-to-end with shapely's linemerge, which
    joins parts by their shared endpoints instead of relying on the order in
    which the union happens to return them. Only if the parts still cannot be
    joined into one line (e.g. gaps between segments) are their coordinates
    concatenated in part order as a last resort.

    Parameters:
    river_segs (GeoDataFrame): GeoDataFrame containing river segments.

    Returns:
    GeoDataFrame: A one-row GeoDataFrame holding the merged line, in the same
        CRS as river_segs.
    """
    # Local import keeps this function self-contained with respect to the
    # file-level import block.
    from shapely.ops import linemerge

    merged_line = unary_union(river_segs.geometry)

    if isinstance(merged_line, MultiLineString):
        # NOTE(review): linemerge is expected to keep the direction of the
        # input segments when they all run the same way - confirm that the
        # merged line still starts at the upstream end for your data.
        merged_line = linemerge(merged_line)

    if isinstance(merged_line, MultiLineString):
        # Fallback: parts are not contiguous, so join their vertices in the
        # order the parts are stored.
        all_coords = []
        for line in merged_line.geoms:
            all_coords.extend(line.coords)
        merged_line = LineString(all_coords)

    river_merged = gpd.GeoDataFrame(geometry=[merged_line], crs=river_segs.crs)
    return river_merged

def calculate_reach_len(upland_skm, modulator):
    """
    Convert an upland area into a reach length with a linear relationship.

    The slope is the base coefficient 0.0668 (the original value from the
    Excel-derived relationship) scaled by modulator; the intercept is 14316.

    Parameters:
    upland_skm (float): Upland area in square kilometers.
    modulator (float): Multiplier applied to the 0.0668 coefficient.

    Returns:
    float: Reach length in meters.
    """
    slope = modulator * 0.0668
    intercept = 14316
    return slope * upland_skm + intercept

def get_upland_skm_at_distance(river_segs, distance):
    """
    Look up the UPLAND_SKM of the segment containing a downstream distance.

    Segments are walked in row order while their LENGTH_KM values (converted
    to meters) are accumulated; the first segment whose end lies at or beyond
    the requested distance supplies the value.

    Parameters:
    river_segs (GeoDataFrame): GeoDataFrame containing river segments, with
        'LENGTH_KM' and 'UPLAND_SKM' columns.
    distance (float): Distance downstream in meters.

    Returns:
    float: UPLAND_SKM value at the specified distance; the last segment's
        value when the distance exceeds the total length.
    """
    travelled_m = 0
    for _, row in river_segs.iterrows():
        segment_end_m = travelled_m + row['LENGTH_KM'] * 1000  # km -> m
        if segment_end_m >= distance:
            return row['UPLAND_SKM']
        travelled_m = segment_end_m

    # Past the end of the river: fall back to the final segment.
    last_segment = river_segs.iloc[-1]
    return last_segment['UPLAND_SKM']

def create_reach_lengths(river_segs, modulator):
    """
    Create reach lengths iteratively, walking downstream from distance 0.

    At each step the UPLAND_SKM value at the current downstream distance is
    sampled, converted to a reach length, and the walk advances by that
    length. The walk stops once the accumulated distance reaches the total
    river length (sum of LENGTH_KM, in meters), so the final reach may extend
    past the end of the river.

    Parameters:
    river_segs (GeoDataFrame): GeoDataFrame containing river segments, with
        'LENGTH_KM' and 'UPLAND_SKM' columns.
    modulator (float): Multiplier forwarded to calculate_reach_len.

    Returns:
    list: List of reach lengths in meters.

    Raises:
    ValueError: If a computed reach length is not a positive number. Without
        this check a zero or negative length would keep the loop from ever
        reaching the end of the river, and a NaN length would silently
        produce an unusable result.
    """
    reach_lengths = []
    current_distance = 0
    total_length = river_segs['LENGTH_KM'].sum() * 1000  # Total length in meters

    while current_distance < total_length:
        upland_skm = get_upland_skm_at_distance(river_segs, current_distance)
        reach_length = calculate_reach_len(upland_skm, modulator)

        # `not (x > 0)` is True for zero, negative and NaN alike.
        if not reach_length > 0:
            raise ValueError(
                f"Computed reach length {reach_length!r} at downstream distance "
                f"{current_distance} m is not positive; check the reach length "
                f"modulator ({modulator!r}) and the UPLAND_SKM values."
            )

        reach_lengths.append(reach_length)
        current_distance += reach_length

    return reach_lengths

def cut(line, distance):
    """
    Cut a line at a specified distance from its starting point.

    Vertex positions along the line are obtained by accumulating the 2D
    lengths of the line's own segments, so each vertex is measured where it
    actually sits in the vertex sequence. (Projecting each vertex back onto
    the line instead returns the first matching location, which misplaces the
    cut on lines that touch or cross themselves, and costs a full scan of the
    line per vertex.)

    Parameters:
    line (LineString): The line to be cut.
    distance (float): The distance from the start of the line where the cut should be made.

    Returns:
    tuple: A tuple containing two LineStrings. The first LineString is the portion of the original
           line from the start to the cut point, and the second LineString is the portion from the
           cut point to the end of the original line. If distance is <= 0 or >= the line length,
           the original line and an empty LineString are returned.
    """
    import math  # local import: only needed here

    # Nothing to cut: return the original line and an empty LineString.
    if distance <= 0.0 or distance >= line.length:
        return line, LineString()

    coords = list(line.coords)
    travelled = 0.0
    for i in range(1, len(coords)):
        # Only x/y are used for the running length.
        x0, y0 = coords[i - 1][:2]
        x1, y1 = coords[i][:2]
        travelled += math.hypot(x1 - x0, y1 - y0)

        if travelled == distance:
            # The cut falls exactly on vertex i; it ends one part and starts the other.
            return LineString(coords[:i + 1]), LineString(coords[i:])
        if travelled > distance:
            # The cut falls inside the segment ending at vertex i.
            cp = line.interpolate(distance)
            return (
                LineString(coords[:i] + [(cp.x, cp.y)]),
                LineString([(cp.x, cp.y)] + coords[i:])
            )

    # Only reachable through floating-point rounding when distance is within
    # a hair of the line length; treat it like a cut at the very end so the
    # caller always receives a 2-tuple.
    return line, LineString()

def split_line_with_points(line, points):
    """
    Split a LineString with a list of points.

    The points are consumed in order: each one is projected onto what is left
    of the line, the remainder is cut there, and the leading piece is stored.
    Points are therefore expected to be ordered from the start of the line
    towards its end.

    Parameters:
    line (LineString): The LineString to split.
    points (list): List of Points at which to split the line.

    Returns:
    list: List of LineStrings resulting from the split. The final element is
        whatever remains after the last cut and may be an empty LineString.
    """
    segments = []
    current_line = line
    for p in points:
        # Once a cut has consumed the whole line there is nothing left to
        # project onto; continuing would locate points on an empty geometry.
        if current_line.is_empty:
            break
        d = current_line.project(p)
        seg, current_line = cut(current_line, d)
        segments.append(seg)
    segments.append(current_line)
    return segments

def process_river_reaches(folder_name, modulator, output_reaches_partial_path, shapefile_partial_path):
    """
    Split one river into reaches and save the reach lines to a shapefile.

    Parameters:
    folder_name (str): River name; used as both the sub-folder name and the
        file stem for input and output.
    modulator (float): Reach length multiplier forwarded to
        create_reach_lengths.
    output_reaches_partial_path (str): Directory under which
        "<folder_name>/<folder_name>_reaches.shp" is written.
    shapefile_partial_path (str): Directory under which
        "<folder_name>/<folder_name>.shp" is read.

    Returns:
    GeoDataFrame: The reach lines with 'ds_order', 'reach_len' and 'ds_dist'
        columns, ordered by descending 'ds_order'.
    """
    shapefile_path = f"{shapefile_partial_path}/{folder_name}/{folder_name}.shp"
    river_segs = reimport_river_segments(shapefile_path)
    river_segs = transform_to_crs(river_segs)
    river_merged_gdf = merge_river_segments(river_segs)
    river_merged = river_merged_gdf.geometry[0]  # Convert to shapely LineString

    # Create reach lengths and the points (running totals along the line)
    # at which the merged line is split.
    reach_lengths = create_reach_lengths(river_segs, modulator)
    cumulative_distances = np.cumsum(reach_lengths).tolist()
    points = [river_merged.interpolate(distance) for distance in cumulative_distances]

    # Split the river_merged line at the points
    split_geometries = split_line_with_points(river_merged, points)

    # Create a GeoDataFrame for the split reaches
    river_reaches = gpd.GeoDataFrame(geometry=split_geometries, crs=river_merged_gdf.crs)

    # Add downstream order, reach length, and distance downstream fields
    river_reaches['ds_order'] = range(1, len(river_reaches) + 1)
    river_reaches['reach_len'] = [segment.length for segment in river_reaches.geometry]
    river_reaches['ds_dist'] = river_reaches['reach_len'].cumsum()

    # Delete artifact reaches with length that equals 0. The copy makes the
    # result an independent frame so later column assignments are safe.
    river_reaches = river_reaches[river_reaches['reach_len'] != 0].copy()

    # Delete the most downstream reach if it's too small
    river_reaches = river_reaches.sort_values(by='ds_order', ascending=False)

    # The comparison needs two reaches; a river that yields fewer is kept as is.
    if len(river_reaches) >= 2:
        record_largest_ds_order = river_reaches.iloc[0]
        record_second_largest_ds_order = river_reaches.iloc[1]

        # Compare their 'reach_len' values
        if record_largest_ds_order['reach_len'] < 0.8 * record_second_largest_ds_order['reach_len']:
            # Delete the record with the largest 'ds_order' value
            river_reaches = river_reaches.drop(record_largest_ds_order.name)

    # Reorder columns to move 'geometry' to the end
    cols = [col for col in river_reaches.columns if col != 'geometry'] + ['geometry']
    river_reaches = river_reaches[cols]

    output_file_name = f"{output_reaches_partial_path}/{folder_name}/{folder_name}_reaches.shp"

    # Make sure the destination folder exists, then save the reach lines.
    os.makedirs(os.path.dirname(output_file_name), exist_ok=True)
    river_reaches.to_file(output_file_name)

    return river_reaches

def create_buffered_reaches(folder_name, output_buffers_partial_path, river_reaches, buffer_width):
    """
    Buffer every reach into a polygon and write the polygons to a shapefile.

    The GeoDataFrame passed in is modified in place: 'bufwid_m' and
    'bufar_m2' columns are added and its geometry column is replaced by the
    buffers.

    Parameters:
    folder_name (str): Name of the folder to store outputs; also the file stem.
    output_buffers_partial_path (str): Directory under which
        "<folder_name>/<folder_name>.shp" is written.
    river_reaches (GeoDataFrame): GeoDataFrame containing river reaches, with
        a 'reach_len' column.
    buffer_width (float): Multiplier applied to each reach length to obtain
        that reach's buffer distance.

    Returns:
    None
    """
    # The buffer distance scales with each reach's own length.
    buffer_distances = river_reaches['reach_len'] * buffer_width
    river_reaches['bufwid_m'] = buffer_distances

    buffered = river_reaches.geometry.buffer(river_reaches['bufwid_m'])
    river_reaches['geometry'] = buffered
    river_reaches['bufar_m2'] = river_reaches.geometry.area

    # Each river gets its own sub-folder, created on demand.
    target_dir = os.path.join(output_buffers_partial_path, folder_name)
    os.makedirs(target_dir, exist_ok=True)

    target_file = os.path.join(target_dir, f"{folder_name}.shp")
    river_reaches.to_file(target_file)

def main(csv_path):
    """
    Main function to process river data based on input from a CSV file.

    Each CSV row describes one river. A row is skipped, with a printed
    message, when 'river_name', 'working_directory' or 'buffer_width' is
    empty, or when 'reach_length_modulator' is absent or empty. Errors raised
    while processing a river are printed and the loop moves on to the next
    row.

    Parameters:
        csv_path (str): The path to the input CSV file.
    """
    # Read the input CSV
    input_df = read_input_csv(csv_path)

    # Iterate over rows and process each river
    for _, row in input_df.iterrows():
        folder_name = row.get('river_name', 'Unknown River')
        working_directory = row.get('working_directory')
        modulator = row.get('reach_length_modulator')
        buffer_width = row.get('buffer_width')

        try:
            # Ensure all necessary data is present. Empty CSV cells arrive as
            # NaN, which is truthy, so they must be tested with pd.isna
            # rather than by truthiness alone. A modulator of 0 is a usable
            # value, so it is only rejected when it is missing.
            required = (folder_name, working_directory, buffer_width)
            if any(pd.isna(value) or not value for value in required) or pd.isna(modulator):
                print(f"Skipping {folder_name} due to missing data.")
                continue

            shapefile_path = os.path.join(working_directory, "HydroATLAS", "HydroRIVERS", "Extracted_Rivers")
            output_buffers_path = os.path.join(working_directory, "RiverMapping", "Reaches")
            os.makedirs(output_buffers_path, exist_ok=True)

            # Process the river reaches and create buffered reaches.
            # NOTE(review): the same directory is passed as both the reach
            # output location and the input location, so the reach lines are
            # written next to the extracted river shapefile - confirm this is
            # intended.
            river_reaches = process_river_reaches(folder_name, modulator, shapefile_path, shapefile_path)
            create_buffered_reaches(folder_name, output_buffers_path, river_reaches, buffer_width)

        except (FileNotFoundError, ValueError) as e:
            print(f"Skipping {folder_name}: {e}")
        except Exception as e:
            # Top-level boundary: report the failure and continue with the next river.
            print(f"An unexpected error occurred while processing {folder_name}: {e}")

## Enter input variables and run

The user only needs to enter the path to the .csv file they wish to process. A formatted example of the input datasheet may be found here: (need to add this in later)

In [5]:
# Path to the RivMapper input sheet; replace with the actual path to the CSV file.
csv_path = r"D:\Dissertation\Data\Geyman_river_datasheet.csv"
# Process every river listed in the sheet; per-river problems are printed by main().
main(csv_path)

  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
