# Mobility and Storage Time Calculator

The following code takes processed and "cleaned" water masks from a specified working directory and performs a series of operations to calculate the: 1) area-based floodplain reworking timescales (TR) and distribution of channel areas (AW); 2) the sediment storage time distributions (tstor) using the deterministic (TCB) and probabilistic (TFP) approaches; 3) the reach transit times for both tstor approaches; 4) the total sediment transit time (ttot) for both tstor approaches.

Author: James (Huck) Rees; PhD Student, UCSB Geography

Date: April 9, 2025

## Import packages

In [1]:
import os
import numpy as np
import pandas as pd
from natsort import natsorted
from glob import glob
import math
import geopandas as gpd
import ast

import re
import fiona
import rasterio
from rasterio.mask import mask
from rasterio import warp
from rasterio.warp import transform_geom
from rasterio.enums import Resampling
from pyproj import CRS, Geod

from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation, PillowWriter

import geemap
import ee
from geopy.distance import geodesic

# Authenticate with Google Earth Engine
ee.Initialize()

## Initialize functions to produce mobility dataframes and CSVs

In [2]:
def get_mobility_yearly(images, mask, scale=30):

    A = len(np.where(mask == 1)[1])

    year_range = list(images.keys())
    ranges = [year_range[i:] for i, yr in enumerate(year_range)]
    river_dfs = {}
    for yrange in ranges:
        data = {
            'year': [],
            'i': [],
            'O_avg': [],
            'O_wd': [],
            'O_dw': [],
            'O_wick': [],
            'fR': [],
            'fR_wick': [],
            'w_b': [],
            'd_b': [],
        }
        length = images[yrange[0]].shape[0]
        width = images[yrange[0]].shape[1]
        long = len(yrange)
        all_images = np.empty((length, width, long))
        years = []
        for j, year in enumerate(yrange):
            years.append(year)
            im = images[str(year)].astype(int)
            filt = np.where(~np.array(mask) + 2)
            im[filt] = 0
            all_images[:, :, j] = im

        baseline = all_images[:, :, 0]
        w_b = len(np.where(baseline == 1)[0])
        fb = mask - baseline
        fw_b = w_b / A
        fd_b = np.sum(fb) / A
        Na = A * fd_b

        for j in range(all_images.shape[2]):
            im = all_images[:, :, j]

            kb = (
                np.sum(all_images[:, :, :j + 1], axis=(2))
                + mask
            )
            kb[np.where(kb != 1)] = 0
            Nb = np.sum(kb)
            # fR = (Na / w_b) - (Nb / w_b)
            fR = (Na - Nb)
            fR_wick = 1 - (Nb / Na)

            # Calculate D - EQ. (1)
            D = np.subtract(baseline, im)
            # 1 - wet -> dry
            d_wd = len((np.where(D == 1))[0])
            # -1 - dry -> wet
            d_dw = len((np.where(D == -1))[0])

            # Calculate Phi
            w_t = len(np.where(im == 1)[0])
            fw_t = w_t / A
            fd_t = (A - w_t) / A

            # Calculate O_Phi
            PHI = (fw_b * fd_t) + (fd_b * fw_t)
            o_wick = 1 - (np.sum(np.abs(D)) / (A * PHI))
            o_avg = w_b - np.mean([d_wd, d_dw])
            o_wd = w_b - d_wd
            o_dw = w_b - d_dw

            data['i'].append(j)
            data['O_avg'].append(o_avg * (scale**2))
            data['O_wd'].append(o_wd * (scale**2))
            data['O_dw'].append(o_dw * (scale**2))
            data['O_wick'].append(o_wick)
            data['fR'].append(fR * (scale**2))
            data['fR_wick'].append(fR_wick)
            data['w_b'].append(w_b * (scale**2))
            data['d_b'].append(Na * (scale**2))

        data['year'] = years
        data['i'] = np.array(years).astype(int) - int(years[0])
        river_dfs[yrange[0]] = pd.DataFrame(data=data)

    return river_dfs

def get_mobility_rivers(folder_path, river, mob_storage, reach_range="All"):
    """
    Processes mobility metrics for specified reaches of a river and saves
    the results to separate CSV files with the reach number in the filename.

    Parameters:
        folder_path (str): The root directory that contains subfolders for each reach.
        river (str): Name of the river.
        mob_storage (str): Path to store the output CSV files.
        reach_range (str/int/tuple): Specifies which reaches to process. Can be:
            - An integer for a single reach (e.g., 3).
            - A tuple for a range of reaches (e.g., (1, 4)).
            - "All" to process all reaches.

    Returns:
        str: Name of the river (for confirmation or chaining).
    """

    def create_mask_shape(river, fps):
        """
        Reads the first raster file in fps and applies a mask processing step to return a binary raster
        where pixel values are transformed as follows:
            - Values < 10 are set to 0
            - Values >= 10 are set to 1

        Parameters:
        river (str): Unused in this version, kept for consistency with the original function signature.
        fps (list): List of file paths to raster images, with the first path being used for processing.

        Returns:
        numpy.ndarray: A 2D array with the processed binary mask.
        """
        import rasterio

        # Use the first file path in fps to open the raster file
        image = fps[0]
        with rasterio.open(image) as ds:
            # Read the data as a 2D array (assuming single-band raster)
            out_image = ds.read(1).astype('int64')
            
            # Mask processing
            out_image += 11  # Offset values as in the original function
            out_image[np.where(out_image < 10)] = 0
            out_image[np.where(out_image > 10)] = 1

        return out_image

    def clean(river, fps):
        """
        Processes a set of raster images by creating binary water masks without using any polygon for masking.
        Each raster file in `fps` is read, transformed into a binary water mask, and stored in a dictionary
        by year (derived from filenames).

        Parameters:
        river (str): Unused in this version, kept for compatibility with the original function signature.
        fps (list): List of file paths to raster images.

        Returns:
        tuple: A dictionary of images (binary masks by year) and their respective metadata.
        """
        import rasterio
        from rasterio.enums import Resampling
        import re

        images = {}
        metas = {}

        # Process each file path in fps
        for fp in fps:
            # Extract year from filename using regex
            year_match = re.findall(r"[0-9]{4,7}", fp)
            if year_match:
                year = year_match[-1]  # Take the last match as the year
            else:
                continue  # Skip files without a year identifier

            # Open the raster file
            with rasterio.open(fp) as ds:
                # Read the data and apply threshold to create a binary water mask
                image = ds.read(1, resampling=Resampling.nearest) > 0  # Binary mask where pixel > 0 is water

                # Skip images with no water (all values are 0)
                if not np.any(image):
                    continue

                # Update metadata for the binary water mask
                meta = ds.meta
                meta.update(
                    width=image.shape[1],
                    height=image.shape[0],
                    count=1,
                    dtype=rasterio.int8
                )

                # Save the binary water mask and its metadata
                images[year] = image
                metas[year] = meta

        return images, metas

    # Ensure the storage directory exists
    if not os.path.exists(mob_storage):
        os.makedirs(mob_storage)

    print(f"Processing river: {river}")

    # Generate the dictionary of paths for each reach
    paths = {}
    river_folder = os.path.join(folder_path, river)
    for reach_folder in os.listdir(river_folder):
        reach_path = os.path.join(river_folder, reach_folder, 'Cleaned')
        if os.path.isdir(reach_path):
            tif_files = glob(os.path.join(reach_path, "*.tif"))
            if tif_files:
                paths[reach_folder] = tif_files

    # Filter reaches based on reach_range
    if isinstance(reach_range, int):
        # Single reach
        reach_keys = [f"reach_{reach_range}"]
    elif isinstance(reach_range, tuple):
        # Range of reaches
        start, end = reach_range
        reach_keys = [f"reach_{i}" for i in range(start, end + 1)]
    elif reach_range == "All":
        # All reaches
        reach_keys = list(paths.keys())
    else:
        raise ValueError("Invalid reach_range format. Must be an integer, tuple, or 'All'.")

    # Iterate through specified reaches
    for reach_key in reach_keys:
        if reach_key not in paths:
            print(f"Reach {reach_key} not found in paths. Skipping.")
            continue

        path_list = paths[reach_key]

        # Extract the reach number from the reach_key (assumes format "reach_x")
        reach_number = reach_key.split('_')[1]  # Gets the 'x' part of "reach_x"

        # Sort paths naturally
        path_list = natsorted(path_list)

        # Generate the mask for the reach
        mask = create_mask_shape(river, path_list)

        # Clean and retrieve images and metadata
        images, metas = clean(river, path_list)

        # Set a fixed scale for processing
        scale = 30

        # Calculate yearly mobility metrics
        river_dfs = get_mobility_yearly(images, mask, scale=scale)

        # Combine data into a full DataFrame for the reach
        full_df = pd.DataFrame()
        for year, df in river_dfs.items():
            rnge = f"{year}_{df.iloc[-1]['year']}"
            df['dt'] = pd.to_datetime(df['year'], format='%Y')
            df['range'] = rnge

            # Append data to the final DataFrame
            full_df = pd.concat([full_df, df], ignore_index=True)

        # Define the output path with reach number in the filename
        out_path = os.path.join(mob_storage, f'{river}_reach_{reach_number}_yearly_mobility.csv')
        full_df.to_csv(out_path, index=False)
        print(f"Saved mobility metrics for {river} reach {reach_number} to {out_path}")

def get_mobility_dfs(csv_path):
    """
    Wrapper function to process multiple rivers and reaches based on a CSV file.

    Parameters:
        csv_path (str): Path to the CSV file containing river and reach information.

    Returns:
        None
    """

    # Read the CSV file into a DataFrame
    river_data = pd.read_csv(csv_path)

    # Iterate over each row in the DataFrame
    for index, row in river_data.iterrows():
        river_name = row['river_name']
        working_directory = row['working_directory']
        reach_range = row['reach_range']

        # Parse reach_range
        if isinstance(reach_range, str) and reach_range != "All":
            if reach_range.startswith("(") and reach_range.endswith(")"):
                # Convert tuple-like string to actual tuple
                reach_range = tuple(map(int, reach_range.strip("() ").split(",")))
            else:
                # Single integer as string
                reach_range = int(reach_range)

        # Construct input and output paths
        folder_path = f"{working_directory}/RiverMapping/RiverMasks"
        mob_storage = os.path.join(working_directory, "RiverMapping", "Mobility", river_name, "Mobility_dfs")
        os.makedirs(mob_storage, exist_ok=True)

        # Call the get_mobility_dfs function for the current river
        print(f"Processing {river_name} with reach range {reach_range}...")
        get_mobility_rivers(folder_path, river_name, mob_storage, reach_range)

    print("All rivers processed.")

## Initialize functions to calculate floodplain reworking timescale (TR) and channel area (AW) distributions from mobility sheets

In [3]:
def get_TR(csv_path):
    """
    Processes TR values for reaches based on a CSV file and generates aw distributions.

    Parameters:
        csv_path (str): Path to the CSV file containing river and reach information.

    Returns:
        None
    """

    def calculate_median_fit_with_TR_and_uncertainty(data):
        """
        A helper function that computes TR from a DataFrame containing channel mobility data.
        """
        data['AR_over_AW'] = data['fR'] / data['w_b']
        grouped = data.groupby('i')
        median_values = []
        i_values = []
        for i, group in grouped:
            i_values.append(i)
            median_values.append(np.median(group['AR_over_AW']))

        i_values = np.array(i_values)
        median_values = np.array(median_values)

        def exp_decay_asymptote(i, PR_over_AW, CR):
            return -PR_over_AW * np.exp(-CR * i) + PR_over_AW

        initial_guess = [1, 0.1]
        params, _ = curve_fit(exp_decay_asymptote, i_values, median_values, p0=initial_guess, maxfev=10000)
        PR_over_AW, CR = params
        Aw = 1
        return (1 / CR) * (Aw / PR_over_AW)
    
    def get_utm_epsg(lon, lat):
        zone_number = int((lon + 180) / 6) + 1
        if lat >= 0:
            return 32600 + zone_number  # Northern Hemisphere
        else:
            return 32700 + zone_number  # Southern Hemisphere

    def get_aw_dist(base_directory, output_directory, reach_range=None):
        """
        Calculates total wetted area (1 = wet) from binary water mask rasters stored in EPSG:4326,
        reprojects to UTM, and outputs a CSV with one value per raster (in m^2).

        Parameters:
            base_directory (str): Base directory containing reach subfolders (e.g., reach_1, reach_2, etc.).
            output_directory (str): Directory to save the output CSV files.
            reach_range (int, tuple, or None):
                - An integer for a single reach (e.g., 3).
                - A tuple for a range of reaches (e.g., (1, 4)).
                - None to process all reach folders.

        Returns:
            None
        """
        os.makedirs(output_directory, exist_ok=True)

        reach_dirs = [d for d in os.listdir(base_directory) if d.startswith("reach_") and os.path.isdir(os.path.join(base_directory, d))]

        for reach_dir in reach_dirs:
            try:
                reach_number = int(reach_dir.split('_')[1])

                if isinstance(reach_range, int) and reach_number != reach_range:
                    continue
                elif isinstance(reach_range, tuple) and not (reach_range[0] <= reach_number <= reach_range[1]):
                    continue

                cleaned_dir = os.path.join(base_directory, reach_dir, "Cleaned")
                if not os.path.exists(cleaned_dir):
                    print(f"Cleaned folder not found for Reach {reach_number}.")
                    continue

                tif_files = [f for f in os.listdir(cleaned_dir) if f.endswith(".tif")]
                aw_values = []

                for tif_file in tif_files:
                    with rasterio.open(os.path.join(cleaned_dir, tif_file)) as src:
                        data = src.read(1)
                        transform = src.transform
                        bounds = src.bounds
                        centroid_lon = (bounds.left + bounds.right) / 2
                        centroid_lat = (bounds.top + bounds.bottom) / 2
                        utm_epsg = get_utm_epsg(centroid_lon, centroid_lat)

                        dst_crs = CRS.from_epsg(utm_epsg)
                        transform_utm, width, height = calculate_default_transform(
                            src.crs, dst_crs, src.width, src.height, *src.bounds)

                        reprojected = np.empty((height, width), dtype=data.dtype)

                        reproject(
                            source=data,
                            destination=reprojected,
                            src_transform=transform,
                            src_crs=src.crs,
                            dst_transform=transform_utm,
                            dst_crs=dst_crs,
                            resampling=Resampling.nearest
                        )

                        pixel_area = abs(transform_utm.a * transform_utm.e)
                        wet_pixel_count = np.sum(reprojected == 1)
                        total_area_m2 = wet_pixel_count * pixel_area
                        aw_values.append(total_area_m2)

                output_df = pd.DataFrame({'a_w': aw_values})
                output_csv = os.path.join(output_directory, f"Reach_{reach_number}_aw_dist.csv")
                output_df.to_csv(output_csv, index=False)

                print(f"Saved corrected a_w totals for Reach {reach_number} to {output_csv}")

            except Exception as e:
                print(f"Error processing reach folder {reach_dir}: {e}")

    river_data = pd.read_csv(csv_path)
    for index, row in river_data.iterrows():
        river_name = row['river_name']
        working_directory = row['working_directory']
        reach_range = row['reach_range']

        # Parse reach_range
        if isinstance(reach_range, str) and reach_range != "All":
            if reach_range.startswith("(") and reach_range.endswith(")"):
                reach_range = tuple(map(int, reach_range.strip("() ").split(",")))
            else:
                reach_range = int(reach_range)

        input_directory = f"{working_directory}/RiverMapping/Mobility/{river_name}/Mobility_DFs"
        mask_directory = f"{working_directory}/RiverMapping/RiverMasks/{river_name}"
        tr_output_directory = f"{working_directory}/RiverMapping/Mobility/{river_name}"
        aw_output_directory = f"{working_directory}/RiverMapping/Mobility/{river_name}/AW_Distributions"

        os.makedirs(tr_output_directory, exist_ok=True)
        os.makedirs(aw_output_directory, exist_ok=True)

        csv_files = [f for f in os.listdir(input_directory) if f.endswith("_yearly_mobility.csv")]
        results = []

        for csv_file in csv_files:
            reach_number = int(csv_file.split('_')[2])

            # Filter based on reach_range
            if isinstance(reach_range, int) and reach_number != reach_range:
                continue
            elif isinstance(reach_range, tuple) and not (reach_range[0] <= reach_number <= reach_range[1]):
                continue

            data = pd.read_csv(os.path.join(input_directory, csv_file))
            try:
                TR = calculate_median_fit_with_TR_and_uncertainty(data)
                results.append((reach_number, TR))
            except Exception as e:
                print(f"Error processing {csv_file}: {e}")

        results_df = pd.DataFrame(results, columns=['ds_order', 'TR'])
        results_df.sort_values(by='ds_order', inplace=True)

        output_file = os.path.join(tr_output_directory, f"{river_name}_TR_values.csv")
        results_df.to_csv(output_file, index=False)
        print(f"TR values saved to {output_file}")

        # Generate AW distributions for the specified reaches
        get_aw_dist(mask_directory, aw_output_directory, reach_range)


## Initialize functions to calculate channel-belt turnover timescale (TCB; deterministic tstor estimate) 

In [4]:
def import_aw_distribution(river_name, reach_number, working_directory):
    """
    Imports the AW distribution for a specified reach.

    Args:
        river_name (str): Name of the river.
        reach_number (int): Reach number to import AW distribution.
        working_directory (str): Base working directory containing the river data.

    Returns:
        DataFrame: A DataFrame containing the AW distribution for the specified reach.
    """
    # Define base directory for AW distributions
    aw_dir = os.path.join(working_directory, 'RiverMapping', 'Mobility', river_name, 'AW_Distributions')

    # Ensure the directory exists
    if not os.path.exists(aw_dir):
        raise FileNotFoundError(f"AW distribution directory not found: {aw_dir}")

    # Load AW distribution for the specified reach
    aw_file = os.path.join(aw_dir, f"Reach_{reach_number}_aw_dist.csv")
    if not os.path.isfile(aw_file):
        raise FileNotFoundError(f"AW file not found: {aw_file}")

    aw_distribution = pd.read_csv(aw_file)

    return aw_distribution

def calculate_tcb_distribution(tr_value, channel_belt_area, aw_distribution):
    """
    Calculates the TCB distribution for a single reach and returns the result.

    Args:
        tr_value (float): TR value for the reach.
        channel_belt_area (float): Channel belt area (in square km) for the reach.
        aw_distribution (DataFrame): AW distribution for the reach.

    Returns:
        DataFrame: A DataFrame containing the TCB distribution for the specified reach.
    """
    # Ensure the AW distribution is not empty
    if aw_distribution.empty:
        raise ValueError("AW distribution is empty.")

    # Generate 10,000 random draws from the AW distribution
    aw_random_draws = np.random.choice(aw_distribution['a_w'], size=10000, replace=True)

    # Calculate TCB for each random draw, converting channel belt area from square km to square m
    tcb_values = tr_value * (channel_belt_area * 1000000 / aw_random_draws)

    return pd.DataFrame({'TCB': tcb_values})

def get_tcb_distributions(csv_path):
    """
    Processes a range of reaches from a CSV file and calculates TCB distributions for each.

    Args:
        csv_path (str): Path to the CSV file containing river name and reach range.

    Outputs:
        CSV files containing TCB distributions for each processed reach.
    """
    # Load the configuration CSV
    config_data = pd.read_csv(csv_path)

    # Extract river name, reach range, and working directory
    river_name = config_data['river_name'].iloc[0]
    reach_range = config_data['reach_range'].iloc[0]
    working_directory = config_data['working_directory'].iloc[0]

    # Define directories for required inputs
    tr_file = os.path.join(working_directory, 'RiverMapping', 'Mobility', river_name, f"{river_name}_TR_values.csv")
    channel_belt_file = os.path.join(working_directory, 'ChannelBelts', 'Extracted_ChannelBelts', river_name, f"{river_name}_channelbelt_areas.csv")

    # Check if the required files exist
    if not os.path.isfile(tr_file):
        raise FileNotFoundError(f"TR file not found: {tr_file}")
    if not os.path.isfile(channel_belt_file):
        raise FileNotFoundError(f"Channel belt areas file not found: {channel_belt_file}")

    # Load the TR and channel belt areas data
    tr_data = pd.read_csv(tr_file)
    channel_belt_data = pd.read_csv(channel_belt_file)

    # Parse reach_range
    if isinstance(reach_range, str) and reach_range != "All":
        if reach_range.startswith("(") and reach_range.endswith(")"):
            # Convert tuple-like string to actual tuple
            reach_range = tuple(map(int, reach_range.strip("() ").split(",")))
        else:
            # Single integer as string
            reach_range = reach_range.astype(int)
    
    # Determine the range of reaches to process
    if reach_range == "All":
        reaches = tr_data['ds_order'].unique()
    elif isinstance(reach_range, tuple):
        reaches = range(reach_range[0], reach_range[1] + 1)
    else:
        reaches = [reach_range]

    # Iterate through the range of reaches and calculate TCB for each
    for reach_number in reaches:
        # Get TR value for the reach
        tr_value = tr_data.loc[tr_data['ds_order'] == reach_number, 'TR'].values[0]

        # Get channel belt area for the reach
        channel_belt_area = channel_belt_data.loc[channel_belt_data['ds_order'] == reach_number, 'area_sq_km'].values[0]

        # Import AW distribution for the reach
        aw_distribution = import_aw_distribution(river_name, reach_number, working_directory)

        # Calculate the TCB distribution for the reach
        tcb_distribution = calculate_tcb_distribution(tr_value, channel_belt_area, aw_distribution)

        # Save TCB distribution to a CSV
        output_file = os.path.join(working_directory, 'RiverMapping', 'Mobility', river_name, 'TCB_Distributions', f"Reach_{reach_number}_TCB_distribution.csv")
        os.makedirs(os.path.dirname(output_file), exist_ok=True)
        tcb_distribution.to_csv(output_file, index=False)

        print(f"TCB distribution for Reach {reach_number} saved to {output_file}")

## Initialize functions to calculate first-passage time distributions of sediment storage time (TFP; probablistic tstor estimate)

In [5]:
def calculate_tfp_distribution(tr_value, channel_belt_area, aw_distribution):
    """
    Calculates the TFP distribution for a single reach using the random walk model and returns the result.

    Args:
        tr_value (float): TR value for the reach.
        channel_belt_area (float): Channel belt area (in square km) for the reach.
        aw_distribution (DataFrame): AW distribution (in square m) for the reach.

    Returns:
        DataFrame: A DataFrame containing the TFP distribution for the specified reach.
    """
    # Convert channel belt area from square km to square m
    channel_belt_area_m2 = channel_belt_area * 1000000

    # Ensure the AW distribution is not empty
    if aw_distribution.empty:
        raise ValueError("AW distribution is empty.")

    # Extract the 'a_w' values from the distribution
    aw_values = aw_distribution['a_w'].values

    # Number of iterations for the Monte Carlo simulation
    num_iterations = 10000

    # Maximum number of timesteps per iteration
    max_timesteps = 10000

    # Initialize storage for the first passage times
    tfp_times = []

    for _ in range(num_iterations):
        # Randomly initialize the channel's starting position within the domain
        x0 = np.random.uniform(0, channel_belt_area_m2)
        x = x0

        total_time = 0
        timestep_count = 0

        while timestep_count < max_timesteps:
            # Draw a random step size from the AW distribution
            aw_step = np.random.choice(aw_values)

            # Randomly determine the direction (+ or -)
            direction = np.random.choice([-1, 1])

            # Move the channel
            x_new = x + direction * aw_step

            # Reflect at boundaries if needed
            if x_new < 0:
                x_new = -x_new
            elif x_new > channel_belt_area_m2:
                x_new = 2 * channel_belt_area_m2 - x_new

            # Check if the channel has reached or passed the starting position
            if (x_new >= x0 and x < x0) or (x_new <= x0 and x > x0):
                # Calculate fractional time for the overshoot
                remaining_distance = abs(x0 - x)
                fractional_tr = (remaining_distance / aw_step) * tr_value
                total_time += fractional_tr
                break

            # Increment total time for this full step
            total_time += tr_value
            timestep_count += 1
            x = x_new

        # If the loop exited due to timestep limit, skip this iteration
        if timestep_count >= max_timesteps:
            continue

        # Store the total time for this iteration
        tfp_times.append(total_time)

    # Return the TFP distribution as a DataFrame
    return pd.DataFrame({'TFP': tfp_times})

def get_tfp_distributions(csv_path):
    """
    Processes a range of reaches from a CSV file and calculates TFP distributions for each.

    Args:
        csv_path (str): Path to the CSV file containing river name and reach range.

    Outputs:
        CSV files containing TFP distributions for each processed reach.
    """
    # Load the configuration CSV
    config_data = pd.read_csv(csv_path)

    for index, row in config_data.iterrows():
        # Extract river name, reach range, and working directory for each row
        river_name = row['river_name']
        reach_range = row['reach_range']
        working_directory = row['working_directory']

        # Define directories for required inputs
        tr_file = os.path.join(working_directory, 'RiverMapping', 'Mobility', river_name, f"{river_name}_TR_values.csv")
        channel_belt_file = os.path.join(working_directory, 'ChannelBelts', 'Extracted_ChannelBelts', river_name, f"{river_name}_channelbelt_areas.csv")

        # Check if the required files exist
        if not os.path.isfile(tr_file):
            raise FileNotFoundError(f"TR file not found: {tr_file}")
        if not os.path.isfile(channel_belt_file):
            raise FileNotFoundError(f"Channel belt areas file not found: {channel_belt_file}")

        # Load the TR and channel belt areas data
        tr_data = pd.read_csv(tr_file)
        channel_belt_data = pd.read_csv(channel_belt_file)

        # Determine the reach range
        if isinstance(reach_range, str):
            reach_range = reach_range.strip()  # Remove any extra spaces

            if reach_range == "All":
                reach_start = tr_data['ds_order'].min()
                reach_end = tr_data['ds_order'].max()
            elif reach_range.isdigit():
                # Convert a numeric string to an integer
                reach_range = int(reach_range)
                reach_start = reach_range
                reach_end = reach_range
            elif re.match(r'^\(\d{1,4}, \d{1,4}\)$', reach_range):  # Match (XX, YY) with 1 to 4 digits
                try:
                    # Convert the string to a tuple of integers
                    reach_range = ast.literal_eval(reach_range)
                    reach_start, reach_end = reach_range
                except (ValueError, SyntaxError):
                    raise ValueError(f"Invalid reach range format: {reach_range}")
            else:
                raise ValueError(f"Invalid string format for reach_range: {reach_range}")
        elif isinstance(reach_range, (int, float)) and float(reach_range).is_integer():
            # Convert float-like integers (e.g., 7.0) to int
            reach_range = int(reach_range)
            reach_start = reach_range
            reach_end = reach_range
        elif isinstance(reach_range, tuple) and len(reach_range) == 2:
            reach_start, reach_end = reach_range
        else:
            raise ValueError("reach_range must be 'All', an int, or a tuple (start, end).")

        # Generate range of reaches to process
        reaches = range(reach_start, reach_end + 1)

        # Iterate through the range of reaches and calculate TFP for each
        for reach_number in reaches:
            # Get TR value for the reach
            tr_value = tr_data.loc[tr_data['ds_order'] == reach_number, 'TR'].values[0]

            # Get channel belt area for the reach
            channel_belt_area = channel_belt_data.loc[channel_belt_data['ds_order'] == reach_number, 'area_sq_km'].values[0]

            # Import AW distribution for the reach
            aw_distribution = import_aw_distribution(river_name, reach_number, working_directory)

            # Calculate the TFP distribution for the reach
            tfp_distribution = calculate_tfp_distribution(tr_value, channel_belt_area, aw_distribution)

            # Save TFP distribution to a CSV
            output_file = os.path.join(working_directory, 'RiverMapping', 'Mobility', river_name, 'TFP_Distributions', f"Reach_{reach_number}_TFP_distribution.csv")
            os.makedirs(os.path.dirname(output_file), exist_ok=True)
            tfp_distribution.to_csv(output_file, index=False)

            print(f"TFP distribution for Reach {reach_number} saved to {output_file}")


## Initialize functions to run Monte Carlo simulation to calculate total transit time from the number of storage events and storage time distributions

In [6]:
def monte_carlo_reach_transit_time(tstor_df: pd.DataFrame, transit_df: pd.DataFrame, reach_number: int, num_iterations: int = 10_000) -> pd.DataFrame:
    """
    Runs a Monte Carlo Simulation to compute reach transit time values.
    
    Parameters:
        tstor_df (pd.DataFrame): DataFrame containing storage time values for a specific reach.
        transit_df (pd.DataFrame): DataFrame containing number of storage events "n" data for multiple reaches.
        reach_number (int): The reach number for which to calculate transit time.
        num_iterations (int): Number of iterations for the Monte Carlo simulation (default is 10,000).

    Returns:
        pd.DataFrame: DataFrame containing the reach transit time distribution.
    """
    # Ensure column names are properly formatted
    transit_df.columns = transit_df.columns.str.strip()
    
    n = transit_df.loc[transit_df["ds_order"] == reach_number, "n_stor"].values
    if len(n) == 0:
        raise ValueError(f"No transit length data found for Reach {reach_number}.")
    n = n[0]
    
    # Randomly sample tstor values from the distribution
    random_tstor_samples = np.random.choice(tstor_df.iloc[:, 0], size=num_iterations, replace=True)
    
    # Compute reach transit time
    reach_transit_time = random_tstor_samples * n
    
    # Create DataFrame for results
    return pd.DataFrame({"reach_transit_time_yr": reach_transit_time})

def process_all_reaches(work_dir: str, river_name: str, tstor_method: str):
    """
    Processes all reach transit time distributions for a given river based on the specified transit storage method.

    Parameters:
        work_dir (str): Path to the working directory containing relevant data files.
        river_name (str): Name of the river to process.
        tstor_method (str): Three-letter storage method code (e.g., "tfp", "tcb").
    """

    # Path to transit length (storage) values
    nstor_path = os.path.join(work_dir, "RiverMapping", "Mobility", river_name, f"{river_name}_transit_lengths.csv")
    nstor_vals = pd.read_csv(nstor_path)

    # Path where method-specific files are stored
    tstor_dir = os.path.join(work_dir, "RiverMapping", "Mobility", river_name, f"{tstor_method.upper()}_Distributions")
    tstor_files = {
        file: pd.read_csv(os.path.join(tstor_dir, file))
        for file in os.listdir(tstor_dir)
        if file.endswith(".csv") and f"{tstor_method.upper()}_" in file
    }

    # Prepare output directory
    output_subfolder = f'RTT_from{tstor_method.upper()}_Distributions'
    output_dir = os.path.join(work_dir, "RiverMapping", "Mobility", river_name, output_subfolder)
    os.makedirs(output_dir, exist_ok=True)

    # Process each reach
    for filename, tstor_df in tstor_files.items():
        try:
            reach_number = int(filename.split("_")[1].split(".")[0])
            reach_transit_time_df = monte_carlo_reach_transit_time(tstor_df, nstor_vals, reach_number)

            output_path = os.path.join(output_dir, f"Reach_{reach_number}_RTT_from{tstor_method.upper()}_distribution.csv")
            reach_transit_time_df.to_csv(output_path, index=False)
            print(f"Saved: {output_path}")
        except Exception as e:
            print(f"Error processing {filename}: {e}")

## Calculate distributions for total alluvial transit time

In [7]:
def calculate_ttt_statistics(directory: str):
    """
    Calculates and saves statistics for all total transit time distribution CSV files
    found in the given directory.

    Parameters:
        directory (str): Directory containing TTT distribution CSV files.
    """
    files = [f for f in os.listdir(directory) if f.endswith("_distribution.csv")]

    for file in files:
        file_path = os.path.join(directory, file)
        ttt_df = pd.read_csv(file_path)

        if "total_transit_time_yr" not in ttt_df.columns:
            print(f"Skipping {file} — missing 'total_transit_time_yr' column.")
            continue

        # Compute statistics for all columns
        stats_list = []
        for column in ttt_df.columns:
            stats_list.append({
                "Variable": column,
                "Mean": np.mean(ttt_df[column]),
                "Standard Deviation": np.std(ttt_df[column]),
                "Min": np.min(ttt_df[column]),
                "1st Quartile": np.percentile(ttt_df[column], 25),
                "Median": np.median(ttt_df[column]),
                "3rd Quartile": np.percentile(ttt_df[column], 75),
                "Max": np.max(ttt_df[column])
            })

        stats_df = pd.DataFrame(stats_list)

        # Build output file name
        base_name = os.path.splitext(file)[0]
        stats_file = f"{base_name}_stats.csv"
        stats_path = os.path.join(directory, stats_file)
        stats_df.to_csv(stats_path, index=False)
        print(f"Saved stats: {stats_path}")

def monte_carlo_total_transit_time(working_dir: str, river_name: str, tstor_method: str, num_iterations: int = 10_000, reach_start: int = 1, reach_end: int = None):
    """
    Runs a Monte Carlo simulation to compute the total river transit time distribution,
    and includes the sampled reach-level transit times for each iteration.

    Parameters:
        working_dir (str): Root directory containing the data folder structure.
        river_name (str): Name of the river for output file naming.
        tstor_method (str): Three-letter storage method code (e.g., "tfp", "tcb").
        num_iterations (int): Number of iterations for the Monte Carlo simulation (default is 10,000).
        reach_start (int): Index of the first reach to include (1-based, inclusive).
        reach_end (int): Index of the last reach to include (1-based, inclusive). If None, includes all reaches to the end.

    Returns:
        pd.DataFrame: DataFrame containing the total river transit time and individual reach samples.
    """

    rtt_dir = os.path.join(working_dir, 'RiverMapping', 'Mobility', river_name, f'RTT_from{tstor_method.upper()}_Distributions')
    all_rtt_files = os.listdir(rtt_dir)

    # Determine reach range
    reach_end = reach_end if reach_end is not None else 100
    selected_reach_dfs = []
    actual_reaches = []

    for reach_num in range(reach_start, reach_end + 1):
        expected_filename = f"Reach_{reach_num}_RTT_from{tstor_method.upper()}_distribution.csv"
        file_path = os.path.join(rtt_dir, expected_filename)
        if os.path.exists(file_path):
            df = pd.read_csv(file_path)
            if "reach_transit_time_yr" not in df.columns:
                raise KeyError(f"Missing 'reach_transit_time_yr' column in file: {expected_filename}")
            selected_reach_dfs.append(df)
            actual_reaches.append(reach_num)
        else:
            raise FileNotFoundError(f"Expected file not found: {expected_filename}")

    simulation_results = []

    for _ in range(num_iterations):
        sampled_reach_times = [np.random.choice(df["reach_transit_time_yr"], 1)[0] for df in selected_reach_dfs]
        total_time = sum(sampled_reach_times)
        simulation_results.append(sampled_reach_times + [total_time])

    # Build DataFrame with individual reach samples and total time
    columns = [f"reach_{reach}_tt" for reach in actual_reaches] + ["total_transit_time_yr"]
    simulation_df = pd.DataFrame(simulation_results, columns=columns)

    # Create output filename reflecting reach range and storage method
    reach_range_str = f"R{reach_start}toR{reach_end}"
    output_filename = f"{river_name}_{reach_range_str}_TTT_from{tstor_method.upper()}_distribution.csv"
    output_path = os.path.join(working_dir, 'RiverMapping', 'Mobility', river_name, output_filename)
    simulation_df.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")
    stats_directory = os.path.join(working_dir, 'RiverMapping', 'Mobility', river_name)
    calculate_ttt_statistics(stats_directory)

In [8]:
csv_path = r"D:\Dissertation\Data\Geyman_river_datasheet.csv"

In [9]:
get_mobility_dfs(csv_path)

Processing Yukon_Beaver with reach range All...
Processing river: Yukon_Beaver
Saved mobility metrics for Yukon_Beaver reach 1 to D:\Dissertation\Data\RiverMapping\Mobility\Yukon_Beaver\Mobility_dfs\Yukon_Beaver_reach_1_yearly_mobility.csv
All rivers processed.


In [33]:
get_TR(csv_path)

TR values saved to D:\Dissertation\Data/RiverMapping/Mobility/Yukon_Beaver\Yukon_Beaver_TR_values.csv
Error processing reach folder reach_1: name 'calculate_default_transform' is not defined
TR values saved to D:\Dissertation\Data/RiverMapping/Mobility/Koyukuk_Huslia\Koyukuk_Huslia_TR_values.csv
Error processing reach folder reach_1: name 'calculate_default_transform' is not defined


In [15]:
get_tcb_distributions(csv_path)

TCB distribution for Reach 1 saved to D:\Dissertation\Data\RiverMapping\Mobility\Yukon_Beaver\TCB_Distributions\Reach_1_TCB_distribution.csv


In [16]:
get_tfp_distributions(csv_path)

TFP distribution for Reach 1 saved to D:\Dissertation\Data\RiverMapping\Mobility\Yukon_Beaver\TFP_Distributions\Reach_1_TFP_distribution.csv


In [45]:
wd = r'D:\Dissertation\Data'
process_all_reaches(wd, "Bermejo", 'tcb')

Saved: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\RTT_fromTCB_Distributions\Reach_1_RTT_fromTCB_distribution.csv
Saved: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\RTT_fromTCB_Distributions\Reach_2_RTT_fromTCB_distribution.csv
Saved: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\RTT_fromTCB_Distributions\Reach_3_RTT_fromTCB_distribution.csv
Saved: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\RTT_fromTCB_Distributions\Reach_4_RTT_fromTCB_distribution.csv
Saved: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\RTT_fromTCB_Distributions\Reach_5_RTT_fromTCB_distribution.csv
Saved: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\RTT_fromTCB_Distributions\Reach_6_RTT_fromTCB_distribution.csv
Saved: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\RTT_fromTCB_Distributions\Reach_7_RTT_fromTCB_distribution.csv
Saved: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\RTT_fromTCB_Distributions\Reach_8_RTT_fromTCB_distribution.csv
Saved: D:\Dissertation\Data\Rive

In [57]:
monte_carlo_total_transit_time(r'D:\Dissertation\Data', 'Bermejo', 'tcb', 10000, 1, 20)

Saved: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\Bermejo_R1toR20_TTT_fromTCB_distribution.csv
Saved stats: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\Bermejo_R1toR20_TTT_fromTFP_distribution_stats.csv
Saved stats: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\Bermejo_R1toR4_TTT_fromTFP_distribution_stats.csv
Saved stats: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\Bermejo_R5toR9_TTT_fromTFP_distribution_stats.csv
Saved stats: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\Bermejo_R10toR16_TTT_fromTFP_distribution_stats.csv
Saved stats: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\Bermejo_R17toR20_TTT_fromTFP_distribution_stats.csv
Saved stats: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\Bermejo_R1toR4_TTT_fromTCB_distribution_stats.csv
Saved stats: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\Bermejo_R5toR9_TTT_fromTCB_distribution_stats.csv
Saved stats: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\Bermejo_R10toR16_TTT_fromTCB_d

In [53]:
calculate_ttt_statistics(r"D:\Dissertation\Data\RiverMapping\Mobility\Bermejo")

Saved stats: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\Bermejo_R1toR20_TTT_fromTFP_distribution_stats.csv
Saved stats: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\Bermejo_R1toR4_TTT_fromTFP_distribution_stats.csv
Saved stats: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\Bermejo_R5toR9_TTT_fromTFP_distribution_stats.csv
Saved stats: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\Bermejo_R10toR16_TTT_fromTFP_distribution_stats.csv
Saved stats: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\Bermejo_R17toR20_TTT_fromTFP_distribution_stats.csv
Saved stats: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\Bermejo_R1toR4_TTT_fromTCB_distribution_stats.csv
Saved stats: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\Bermejo_R5toR9_TTT_fromTCB_distribution_stats.csv
Saved stats: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\Bermejo_R10toR16_TTT_fromTCB_distribution_stats.csv
Saved stats: D:\Dissertation\Data\RiverMapping\Mobility\Bermejo\Bermejo_R17toR20_