# River mask cleaner

The following code takes DSWE-derived watermasks from a specified local path. It also finds the associated "main_channel" vector produced from the Calculate_CFI notebook. Then, it overlays the main channel vector with the channel mask and selects the wetted portions of the mask intersecting the main channel, thus eliminating detected surface water not connected to the river. The processed mask is placed in a "Processed" folder. If no "Processed" folder exists, a new folder with that name is generated. 

Author: James (Huck) Rees; PhD Student, UCSB Geography

Date: July 12th, 2024

## Import packages

In [1]:
import numpy as np
import rasterio
from scipy.ndimage import label as ndimage_label
import geopandas as gpd
from shapely.geometry import box
from shapely.geometry import Polygon, MultiPoint
import os
from rasterio.transform import from_origin
import glob

## Initialize functions

In [17]:
def import_raster(file_path):
    """
    Imports a raster file from a specified local path.

    Parameters:
    file_path (str): The path to the raster file.

    Returns:
    dataset: A rasterio dataset object.
    """
    try:
        dataset = rasterio.open(file_path)
        return dataset
    except rasterio.errors.RasterioIOError as e:
        print(f"Error opening raster file: {e}")
        return None

def classify_wet_components(raster, min_pixels=200):
    """
    Classify connected components of wet pixels in a 2-D water mask raster and eliminate small components.

    Parameters:
    raster: A rasterio dataset object with pixel values 0 (dry) and 1 (wet).
    min_pixels (int): Minimum number of pixels for a component to be retained. Default is 200.

    Returns:
    classified_raster: A 2-D numpy array with unique integer values for each component.
    """
    # Read the raster data into a numpy array
    water_mask = raster.read(1)

    # Label the connected components
    structure = np.ones((3, 3), dtype=int)  # 8-connectivity
    labeled_array, num_features = ndimage_label(water_mask, structure=structure)

    # Eliminate components smaller than min_pixels
    component_sizes = np.bincount(labeled_array.ravel())
    for component_label, size in enumerate(component_sizes):
        if size < min_pixels:
            labeled_array[labeled_array == component_label] = 0

    # Relabel the remaining components to have consecutive integer labels
    labeled_array, num_features = ndimage_label(labeled_array > 0, structure=structure)

    return labeled_array

def import_and_reproject_shapefile(shapefile_path, raster_crs):
    """
    Import a shapefile and reproject it to match the CRS of the raster.

    Parameters:
    shapefile_path (str): The path to the shapefile.
    raster_crs (rasterio.crs.CRS): The CRS of the raster to match.

    Returns:
    gdf: A GeoDataFrame with the reprojected shapefile.
    """
    # Import the shapefile
    gdf = gpd.read_file(shapefile_path)

    # Reproject the GeoDataFrame to match the raster CRS
    gdf = gdf.to_crs(raster_crs)

    return gdf

def raster_to_gdf(labeled_array, transform):
    """
    Convert a labeled raster array to a GeoDataFrame with polygons for each pixel.

    Parameters:
    labeled_array: A 2-D numpy array with labeled components.
    transform: Affine transform for converting array coordinates to geographic coordinates.

    Returns:
    gdf: A GeoDataFrame with polygons for each pixel.
    """
    rows, cols = labeled_array.shape
    polygons = []
    labels = []

    for row in range(rows):
        for col in range(cols):
            value = labeled_array[row, col]
            if value != 0:  # Skip background pixels
                # Create a polygon for each pixel
                minx, miny = transform * (col, row)
                maxx, maxy = transform * (col + 1, row + 1)
                polygons.append(box(minx, miny, maxx, maxy))
                labels.append(value)

    # Create a GeoDataFrame
    gdf = gpd.GeoDataFrame({'geometry': polygons, 'label': labels})

    return gdf

def find_intersecting_pixels(raster_gdf, line_gdf):
    """
    Find unique pixel values that a line intersects with.

    Parameters:
    raster_gdf: GeoDataFrame with raster pixels as polygons.
    line_gdf: GeoDataFrame with polyline geometry.

    Returns:
    unique_labels: A set of unique pixel values that the line intersects.
    """
    # Perform a spatial join to find intersecting pixels
    intersections = gpd.sjoin(raster_gdf, line_gdf, how='inner', op='intersects')
    unique_labels = set(intersections['label'])
    
    return unique_labels

def filter_and_reclassify_raster(classified_raster, unique_labels, min_pixels=200):
    """
    Filter the classified raster to include only specified components and reclassify pixels.
    Eliminate components smaller than min_pixels.

    Parameters:
    classified_raster: A 2-D numpy array with labeled components.
    unique_labels: A set of unique pixel values to retain as wet.
    min_pixels (int): Minimum number of pixels for a component to be retained. Default is 200.

    Returns:
    final_raster: A 2-D numpy array with reclassified pixels.
    """
    # Create a new array for the final raster
    final_raster = np.zeros_like(classified_raster, dtype=np.int32)

    # Set pixels that are in unique_labels to 1 (wet)
    for unique_label in unique_labels:
        final_raster[classified_raster == unique_label] = 1

    # Label the connected components in the final raster
    structure = np.ones((3, 3), dtype=int)  # 8-connectivity
    labeled_array, num_features = ndimage_label(final_raster, structure=structure)

    # Eliminate components smaller than min_pixels
    component_sizes = np.bincount(labeled_array.ravel())
    for component_label, size in enumerate(component_sizes):
        if size < min_pixels:
            final_raster[labeled_array == component_label] = 0

    return final_raster

def export_raster(raster, transform, crs, output_path):
    """
    Export a raster array to a GeoTIFF file.

    Parameters:
    raster: A 2-D numpy array to export.
    transform: Affine transform for the raster.
    crs: Coordinate reference system of the raster.
    output_path (str): The path to save the GeoTIFF file.
    """
    with rasterio.open(
        output_path,
        'w',
        driver='GTiff',
        height=raster.shape[0],
        width=raster.shape[1],
        count=1,
        dtype=raster.dtype,
        crs=crs,
        transform=transform,
    ) as dst:
        dst.write(raster, 1)

def main(river_name, reach_range, year_range, base_raster_path, base_shapefile_path, min_pixels=200):
    """
    Main function to process rasters and shapefile, and export the final rasters.
    """
    river_folder = os.path.join(base_raster_path, river_name)
    
    if isinstance(reach_range, int):
        reach_folders = [f"reach_{reach_range}"]
    elif isinstance(reach_range, tuple):
        reach_folders = [f"reach_{i}" for i in range(reach_range[0], reach_range[1] + 1)]
    elif reach_range == "All":
        reach_folders = [d for d in os.listdir(river_folder) if d.startswith("reach_")]
    else:
        raise ValueError("Invalid reach_range input")
    
    for reach_folder in reach_folders:
        reach_number = int(reach_folder.split('_')[1])
        reach_path = os.path.join(river_folder, reach_folder, "Raw")
        
        if isinstance(year_range, int):
            years = [year_range]
        elif isinstance(year_range, tuple):
            years = range(year_range[0], year_range[1] + 1)
        elif year_range == "All":
            years = None  # Process all years
        else:
            raise ValueError("Invalid year_range input")
        
        raster_files = glob.glob(os.path.join(reach_path, "*.tif"))
        
        for raster_path in raster_files:
            raster_filename = os.path.basename(raster_path)
            try:
                year = int(raster_filename.split('_')[-4])
                dswe_level = raster_filename.split('_')[-1][0]
            except ValueError:
                continue
            
            if years is not None and year not in years:
                continue
            
            shapefile_path = os.path.join(base_shapefile_path, river_name, f"reach_{reach_number}", str(year), "main_channel.shp")
            
            raster = import_raster(raster_path)
            classified_raster = classify_wet_components(raster, min_pixels=min_pixels)
            main_channel = import_and_reproject_shapefile(shapefile_path, raster.crs)
            raster_gdf = raster_to_gdf(classified_raster, raster.transform)
            unique_labels = find_intersecting_pixels(raster_gdf, main_channel)
            final_raster = filter_and_reclassify_raster(classified_raster, unique_labels, min_pixels=min_pixels)
            
            # Construct the output folder path
            output_folder = os.path.join(base_raster_path, river_name, f"reach_{reach_number}", "Processed")
            os.makedirs(output_folder, exist_ok=True)
            
            final_raster_filename = f"{river_name}_reach_{reach_number}_{year}_DSWE_level_{dswe_level}_processed.tif"
            final_raster_path = os.path.join(output_folder, final_raster_filename)
            
            export_raster(final_raster, raster.transform, raster.crs, final_raster_path)

In [18]:
# Input arguments
river_name = 'Brahmaputra'  # e.g., 'Rakaia'
reach_range = 25  # e.g., single int, tuple, or 'All'
year_range = (1996, 2003)  # Can be a single year, a tuple, or 'All'
base_raster_path = r"C:\Users\huckr\Desktop\UCSB\Dissertation\Data\RiverMapping\RiverMasks"
base_shapefile_path = r"C:\Users\huckr\Desktop\UCSB\Dissertation\Data\RiverMapping\Channels"
min_pixels = 200  # Minimum number of pixels for a component to be retained

# Run the main function
main(river_name, reach_range, year_range, base_raster_path, base_shapefile_path, min_pixels)

  main(river_name, reach_range, year_range, base_raster_path, base_shapefile_path, min_pixels)
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  intersections = gpd.sjoin(raster_gdf, line_gdf, how='inner', op='intersects')
  main(river_name, reach_range, year_range, base_raster_path, base_shapefile_path, min_pixels)
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  intersections = gpd.sjoin(raster_gdf, line_gdf, how='inner', op='intersects')
  main(river_name, reach_range, year_range, base_raster_path, base_shapefile_path, min_pixels)
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:4326

  intersections = gpd.sjoin(raster_gdf, line_gdf, how='inner', op='intersects')
  main(river_name, reach_range, year_range, base_raster_path, base_shapefile_path, min_pixels)
Use `