In [9]:
import arcpy, pandas as pd, os, multiprocessing as mp, psutil
from arcpy import env
from arcpy.sa import *
from otherfunctions import folders_exist

In [None]:
# Locations of the input datasets and of the model outputs
root_folder = r"Z:\PhD_Datasets&Analysis\Info_Inputs"
tam_out_dir = r"Z:\PhD_Datasets&Analysis\Outputs\T&M_WBM"
tc_ds = f"{root_folder}\\TerraClimate"
out_geotiff = f"{tc_ds}\\GeoTIFF"

# Variable naming: GEE band names (kept for comparison with the GEE
# TerraClimate dataset) and the names used by TerraClimate itself
bands_gee = ["pr", "pet", "ro"]
tc_vars = ["ppt", "pet", "q"]

# Attribute field holding the station identifier in the drainage-area layer
serial_id = "grdcno_int"

# arcpy environment configuration
env.overwriteOutput = True
arcpy.CheckOutExtension("spatial")
# env.cellSize = "MINOF" # Avoided to prevent huge files
# A TerraClimate raster is used as the cell-size reference instead
env.cellSize = f"{out_geotiff}\\ppt_2023_1.tif"
env.workspace = r"Z:\PhD_Datasets&Analysis\_ProcessingCache"
# WGS 1984 (4326)
env.outputCoordinateSystem = arcpy.SpatialReference("WGS 1984")

In [None]:
# Read back the output coordinate system configured on the arcpy environment
spatial_ref = env.outputCoordinateSystem

# Report its name and WKID, or state that nothing is configured
if not spatial_ref:
    print("No spatial reference is set in the current environment.")
else:
    print(f"Spatial Reference Name: {spatial_ref.name}")
    print(f"Spatial Reference WKID: {spatial_ref.factoryCode}")

In [None]:
# Shapefile with the processed drainage areas.
# The relative part is a raw string: the previous non-raw literal contained
# "\S", "\G" and "\C", which are invalid escape sequences (they only worked
# because Python currently keeps the backslash, and they emit a warning).
drain_areas = root_folder + r"\Streamflow_Sts_Drainage_Areas\GRDC_Watersheds\CSS-WATERSHEDS_FINAL_SELECTION.shp"

# Create a feature layer object
arcpy.MakeFeatureLayer_management(drain_areas, "drain_areas_lyr")

# Collect the station ID of every row of the feature layer
with arcpy.da.SearchCursor("drain_areas_lyr", [serial_id]) as cursor:
    sts_ids = [row[0] for row in cursor]

sts_ids

In [None]:
######################################################
### Starting values for the water balance model - T&M
######################################################

# Simulation period: years 1958 through 1967 inclusive. Narrow this range to
# execute the notebook for specific years across multiple runs.
years = range(1958, 1968)

# Calendar months 1 through 12 inclusive
months = range(1, 13)

In [None]:
# Water-yield output folder of the T&M model; handed to folders_exist
# (presumably creates it when missing - confirm in otherfunctions)
wyield_dir = f"{tam_out_dir}\\wyield"
folders_exist([wyield_dir])

# Folder with the baseflow rasters produced by the model
bflow_dir = f"{tam_out_dir}\\bflow"

In [None]:
def zonal_stastics_iteratively(year):
    """
    Calculate zonal statistics of monthly water yield for every station ID.

    For each station in ``sts_ids`` and each month in ``months``, the station's
    drainage area is selected in a per-process feature layer and
    ``ZonalStatisticsAsTable`` is run against the raster
    ``wyield_<year>_<month>.tif`` found in ``wyield_dir``. All rows are
    gathered and written to ``wyield_zonal_statistics_<year>.csv`` in
    ``wyield_dir``.

    Relies on the module-level names ``drain_areas``, ``sts_ids``, ``months``,
    ``wyield_dir`` and ``serial_id``.

    Parameters
    ----------
    year : int
        Year whose monthly water-yield rasters are processed.

    Returns
    -------
    int
        The same ``year``, returned only to confirm completion.

    Raises
    ------
    Any exception raised by the arcpy geoprocessing calls propagates to the
    caller; the per-process feature layer is still removed in that case.
    """
    pid = os.getpid()
    # The layer name carries the PID so concurrent workers do not collide
    layer_name = f"drain_areas_lyr_{pid}"
    stat_fields = ["COUNT", "AREA", "MIN", "MAX", "RANGE", "MEAN", "STD", "SUM", "MEDIAN", "PCT90"]
    out_columns = [serial_id, "YEAR", "MONTH"] + stat_fields

    print(f"\t[Process {pid}] Calculating zonal statistics of water yield for year {year}......")

    # Frames are collected and concatenated once at the end; concatenating
    # inside the loop copies the accumulated data on every iteration.
    monthly_frames = []

    # Create a feature layer for this process
    arcpy.MakeFeatureLayer_management(drain_areas, layer_name)
    try:
        for st in sts_ids:
            print(f"\t\t[Process {pid}] Station ID: {st}")

            # Select the current station ID in the feature layer.
            # NOTE(review): stations are presumably processed one at a time
            # because drainage areas overlap - confirm.
            arcpy.SelectLayerByAttribute_management(layer_name, "NEW_SELECTION", f"{serial_id} = {st}")

            for month in months:
                wyield = wyield_dir + "\\wyield_" + str(year) + "_" + str(month) + ".tif"
                out_table = f"zonal_wyield_{st}_{year}_{month}_{pid}.dbf"

                arcpy.sa.ZonalStatisticsAsTable(layer_name, serial_id, wyield, out_table, "DATA", "ALL")

                try:
                    # Convert the output table to a NumPy array
                    array = arcpy.da.TableToNumPyArray(out_table, [serial_id] + stat_fields)
                finally:
                    # Delete the output table to save space, even if reading failed
                    arcpy.Delete_management(out_table)

                # Convert the NumPy array to a pandas DataFrame
                df_sim = pd.DataFrame(array)
                df_sim["YEAR"] = year  # Assign the year of simulation
                df_sim["MONTH"] = month  # Assign the month of simulation

                monthly_frames.append(df_sim[out_columns])  # Reorder columns
    finally:
        # Clean up the feature layer, also when a geoprocessing call raised
        arcpy.Delete_management(layer_name)

    if monthly_frames:
        sts_flows_sim = pd.concat(monthly_frames, ignore_index=True)
    else:
        # No stations or months: still write a header-only CSV
        sts_flows_sim = pd.DataFrame(columns=out_columns)

    # Save the results to a CSV file for this year
    csv_path = wyield_dir + "\\wyield_zonal_statistics_" + str(year) + ".csv"
    sts_flows_sim.to_csv(csv_path, index=False)
    print(f"\t[Process {pid}] Saved results for year {year} to {csv_path}")

    return year  # Just return the year to confirm completion

In [None]:
def init_worker():
    """Pool initializer: check out the Spatial Analyst extension in the worker and log its PID."""
    arcpy.CheckOutExtension("spatial")
    worker_pid = os.getpid()
    print(f"Worker process {worker_pid} initialized with Spatial Analyst extension")

def run_parallel_processing(years_to_process, memory_per_process_gb=4, max_core_fraction=0.75):
    """
    Run the zonal statistics calculations in parallel for multiple years,
    with safeguards to prevent machine overload.

    The worker count is the smallest of: ``max_core_fraction`` of the CPU
    cores, the number of workers that fit in the currently available memory
    at ``memory_per_process_gb`` each, and the number of years. It is never
    below one.

    NOTE(review): with the "spawn" start method (the default on Windows) the
    worker function and the globals it reads must be importable by the child
    processes; functions defined only in an interactive session may not be.
    Confirm this works in the target environment.

    Parameters
    ----------
    years_to_process : iterable of int
        Years handed one by one to ``zonal_stastics_iteratively``.
    memory_per_process_gb : float, optional
        Estimated memory footprint of one worker in GB (adjust this based on
        your observation). Must be positive. Default 4.
    max_core_fraction : float, optional
        Upper bound on the share of CPU cores to use. Default 0.75.

    Returns
    -------
    list
        Values returned by the workers, in the order of ``years_to_process``;
        an empty list when there is nothing to process.

    Raises
    ------
    ValueError
        If ``memory_per_process_gb`` is not positive.
    """
    if memory_per_process_gb <= 0:
        raise ValueError("memory_per_process_gb must be positive")

    # Materialise once so iterators/generators are accepted and len() is valid
    years_to_process = list(years_to_process)

    # Nothing to do: a pool cannot be created with zero processes
    if not years_to_process:
        print("\nNo years to process")
        return []

    # Cap the core count to avoid overloading the system
    total_cores = mp.cpu_count()
    max_cores = max(1, int(total_cores * max_core_fraction))

    # Further limit cores based on available memory
    available_memory_gb = psutil.virtual_memory().available / (1024 * 1024 * 1024)
    memory_limited_cores = max(1, int(available_memory_gb / memory_per_process_gb))

    # Use the more conservative limit
    num_processes = min(max_cores, memory_limited_cores, len(years_to_process))

    print(f"\nSystem has {total_cores} cores, using {num_processes} for parallel processing")
    print(f"Available memory: {available_memory_gb:.2f} GB, estimated usage: {memory_per_process_gb * num_processes:.2f} GB")

    # Create a pool of worker processes
    with mp.Pool(processes=num_processes, initializer=init_worker) as pool:
        # Map the processing function to each year
        results = pool.map(zonal_stastics_iteratively, years_to_process)

    print(f"\nCompleted processing for years: {results}")
    return results

In [None]:
# Guarding the pool launch keeps it from being re-executed when child
# processes import the main module (required for the "spawn" start method).
if __name__ == "__main__":
    print('\n############################################################')
    print('\t\tINITIAL VARIABLES')
    print('\tPeriod to be executed: ' + str(years[0]) + '-' + str(years[-1]))
    print('############################################################')

    try:
        # Run the parallel processing
        run_parallel_processing(years)
    finally:
        # Check in the extension when all processing is done, also on failure
        arcpy.CheckInExtension("spatial")

        # Clear the workspace environment
        arcpy.ClearEnvironment("workspace")

    print("\nDONE!!")