#### This workflow was originally written 09/09/2021 by Aaron Hurst to automate ordering dams from upstream to downstream using the NHDPlus Flowline dataset and NID dams that were previously filtered and snapped to the flowlines using the 01DamNet_FilteringAndSnapping iPython notebook. Modifications have been made since then.

In [1]:
# import packages
import pandas as pd
import numpy as np
import re
from datetime import datetime

pd.set_option('display.max_columns',None)

In [2]:
# Read the input tables, name the output file, and order the dams

# Dated output path: the stamp is today's date formatted as YYYYMMDD.
today = f'{datetime.today():%Y%m%d}'
outputFile = f'Outputs/ResNet_{today}.csv'

# Dam table. These are the NID dams that were filtered with 01DamNet_FilteringAndSnapping and intersected
# with the NHD Flowlines to pick up flowline attributes for each dam. The file is written to the output
# folder of the filtering and snapping step as 'NID_filtered_snapped_nodupl.csv' and must contain the
# COMID and Hydrosequence data.
NID = pd.read_csv("Outputs/NID_filtered_snapped_nodupl.csv", low_memory=False)

# NHDPlus Flowline dataset exported as a csv. It can carry a CountryOut code that tells which coast the
# flowline exits at.
NHD = pd.read_csv('Inputs/NHD_Flowline_Network_countriesOut.csv', low_memory=False)

# Working copy of the dams, ordered by descending Hydrosequence with a fresh 0..n-1 index
dam = NID.sort_values('Hydroseq', ascending=False).reset_index(drop=True)

NameError: name 'NID' is not defined

In [6]:
# If multiple dams snapped to the same flowline, pick one. The order of operations for choosing the correct dam is to first
# pick a GRanD dam, then a dam flagged as Moved. Otherwise, dams without a completion year are dropped when at least one other
# dam has one, and the remaining ties are broken in turn by maximum storage, largest dam height, oldest completion year,
# largest surface area, largest dam length, non-private ownership, and finally the largest ShortID arbitrarily.

# Flag list with one slot per dam; every slot starts out empty (None)
flags = [None for _ in range(len(dam['ShortID']))]

hydroseq_col = dam['Hydroseq']

# Positions of the first occurrence of each distinct Hydroseq
_, uniqueIdx = np.unique(hydroseq_col, return_index=True)

# Every position that is NOT a first occurrence is a repeat of an earlier value
all_positions = np.arange(len(hydroseq_col))
duplicate_indices = np.setdiff1d(all_positions, uniqueIdx)
duplicate_values = hydroseq_col.iloc[duplicate_indices]

# Boolean mask over all rows whose Hydroseq occurs more than once (first occurrences included)
dupeIdx = np.isin(hydroseq_col, duplicate_values)
dupes = hydroseq_col[dupeIdx]
dupeLoc = np.flatnonzero(dupeIdx)

# Flag 5 marks a dam that shares its Hydroseq with at least one other dam
for loc in dupeLoc:
    flags[loc] = [5]

In [7]:
# Row indices that survive de-duplication; filled in by the selection loop that follows
dupind = []

# One dict per dam row (column -> value), giving struct-like access to each record
dupl_ordered_dict = dam.to_dict(orient='records')

# Hydroseq of every dam, in row order
Hydro_dupe = [record['Hydroseq'] for record in dupl_ordered_dict]

# Distinct Hydroseq values, largest first, with their occurrence counts in the same order
uniquevals, ia = np.unique(Hydro_dupe, return_inverse=True)
uniquevals = uniquevals[::-1]
bincounts = np.bincount(ia)[::-1]

# A Hydroseq that occurs only once is not a duplicate: replace it with 0
singles = uniquevals[bincounts <= 1]
singleidx = [pos for pos, hydroseq in enumerate(Hydro_dupe) if hydroseq in singles]
for pos in singleidx:
    Hydro_dupe[pos] = 0

# A repeated Hydroseq is replaced by its 1-based rank among the repeated values,
# so every member of one duplicate group carries the same positive group number
repeats = uniquevals[bincounts > 1]
renumbered = []
for hydroseq in Hydro_dupe:
    if hydroseq in repeats:
        renumbered.append(np.where(repeats == hydroseq)[0][0] + 1)
    else:
        renumbered.append(hydroseq)
Hydro_dupe = np.array(renumbered)


In [8]:
def _pick_duplicate(dam_table, candidates):
    """Return the row index to keep among dams snapped to the same flowline.

    The candidates are narrowed one criterion at a time until a single dam is
    left: largest MaxStor_m3, largest DamH_m, smallest yrc (oldest), largest
    SA_m2, largest Dam_Len_m, then any dam whose OwnerTypes is not 'Private',
    and finally the largest ShortID.

    Parameters
    ----------
    dam_table : DataFrame holding the columns named above; its index labels
        must be the row positions used in ``candidates``.
    candidates : list of row indices that share one Hydroseq.

    Returns
    -------
    The single row index that was selected.
    """
    tie_breakers = (
        ('MaxStor_m3', np.max),
        ('DamH_m', np.max),
        ('yrc', np.min),
        ('SA_m2', np.max),
        ('Dam_Len_m', np.max),
    )
    for column, best in tie_breakers:
        values = dam_table[column][candidates]
        winners = np.where(values == best(values))[0]
        # When every candidate is NaN for this criterion nothing equals the
        # extreme value; keep all candidates and let the next criterion decide
        # instead of ending up with an empty list.
        if len(winners) > 0:
            candidates = [candidates[k] for k in winners]
        if len(candidates) == 1:
            return candidates[0]

    # Prefer dams that are not privately owned, unless all of them are private
    owner = dam_table['OwnerTypes'][candidates]
    if not (owner == 'Private').all():
        non_private = set(owner.index[owner != 'Private'])
        candidates = [idx for idx in candidates if idx in non_private]

    # Last resort: take the largest ShortID arbitrarily
    if len(candidates) > 1:
        short_ids = dam_table['ShortID'][candidates]
        winners = np.where(short_ids == np.max(short_ids))[0]
        if len(winners) > 0:
            return candidates[winners[0]]
    return candidates[0]


YEAR = [item['yrc'] for item in dupl_ordered_dict]
GRAND = [item['IsGRanD'] for item in dupl_ordered_dict]
Move = [item['Moved'] for item in dupl_ordered_dict]

# Group numbers that already had their dam selected. Tracking the groups
# explicitly (rather than counting rows to skip) does not depend on the
# members of a group sitting next to each other in the table.
handled_groups = set()

for i, group in enumerate(Hydro_dupe):
    if group == 0:  # Not a duplicate: keep the row
        dupind.append(i)
        continue
    if group in handled_groups:  # A dam was already chosen for this flowline
        continue
    handled_groups.add(group)

    # Row indices of every dam that snapped to this flowline
    dup = np.flatnonzero(Hydro_dupe == group).tolist()

    grand_members = [idx for idx in dup if GRAND[idx] == 1]
    moved_members = [idx for idx in dup if Move[idx] == 1]
    dated_members = [idx for idx in dup if YEAR[idx] > 0]

    if grand_members:  # A GRanD dam always wins
        dupind.append(grand_members[0])
    elif moved_members:  # Then a dam flagged as moved
        dupind.append(moved_members[0])
    else:
        # If only some of the dams have a completion year, drop the ones without
        if 0 < len(dated_members) < len(dup):
            dup = dated_members
        dupind.append(_pick_duplicate(dam, dup))

# Update dam to only keep the selected non-duplicate flowline snaps
dam = dam.iloc[dupind]
dam = dam.reset_index(drop=True)
flags = [flags[i] for i in dupind]

print('Length after removing duplicate snaps to same hydrosequence:',len(dam))

Length after removing duplicate snaps to same hydrosequence: 57454


In [7]:
# Allocate the per-dam result containers
n_dams = len(dam['ShortID'])

# ShortID of the next dam downstream (NaN until one is found)
ToDam = np.full(n_dams, np.nan)
# List of ShortIDs draining directly into each dam (None until one is found)
FromDam = [None] * n_dams
DA = np.full(len(ToDam), np.nan)

# Integer columns, all starting at 0; each name gets its own independent array
flagDA, flagTerm, flagHW, flagCAP, countryOut = (
    np.zeros(len(ToDam), dtype=int) for _ in range(5)
)


In [8]:
def _add_flag(flag_lists, position, code):
    """Append ``code`` to the flag list at ``position``, creating the list if the slot is still None."""
    if flag_lists[position] is None:
        flag_lists[position] = [code]
    else:
        flag_lists[position].append(code)


def _first_row_lookup(values):
    """Map each value to the position of its first occurrence in ``values``."""
    lookup = {}
    for position, value in enumerate(values):
        lookup.setdefault(value, position)
    return lookup


dam_hydroseq_array = dam.Hydroseq.to_numpy()

# Build the Hydroseq -> row lookups once. Walking a river then costs one
# dictionary lookup per flowline instead of a scan of the whole table.
nhd_row_of = _first_row_lookup(NHD.Hydroseq.tolist())
dam_row_of = _first_row_lookup(dam_hydroseq_array.tolist())
nhd_downstream = NHD.DnHydroseq.tolist()
nhd_fcode = NHD.FCODE.to_numpy()
nhd_country = NHD.Country_out.to_numpy()
nhd_rows = len(nhd_downstream)

# Dam columns read inside the loop (dam has a 0..n-1 index, so position == label)
dam_short_id = dam.ShortID.to_numpy()
dam_terminal_path = dam.TerminalPa.to_numpy()
dam_drainage = dam.DivDASqKM.to_numpy()
dam_storage = dam.MaxStor_m3.to_numpy()

for i in range(len(dam_hydroseq_array)):

    # Reset the per-dam state up front. The flag must be cleared here and not
    # inside the walk, otherwise a dam whose very next flowline already holds a
    # dam would inherit the flag left over from the previous dam.
    flag = 0
    count = 0
    exhausted = False

    # Coast/country code of the flowline the dam's terminal path ends at
    coast_row = nhd_row_of.get(dam_terminal_path[i])
    if coast_row is None:
        countryOut[i] = -999 # If there wasn't a match between the Hydroseq and terminal path
    else:
        countryOut[i] = nhd_country[coast_row]

    start_row = nhd_row_of.get(dam_hydroseq_array[i]) # Find the location of the dam in the NHD table
    if start_row is None:
        flag = 4 # The dam's own hydrosequence is missing from NHD
    else:
        d_s = nhd_downstream[start_row] # Look to see what the downstream flowpath is

        while d_s not in dam_row_of: # While the current flowpath does not have a dam on it
            if count > nhd_rows: # Walked more flowlines than the NHD table holds: give up on this dam
                exhausted = True
                break
            if d_s == 0.0: # No downstream flowpath
                flag = 2
                break
            if np.isnan(d_s):
                flag = 3
                break

            row = nhd_row_of.get(d_s) # Update the current flowpath
            if row is None: # The d_s hydrosequence is missing from NHD
                flag = 4
                break
            if nhd_fcode[row] == 56600: # If you made it to a coastline, add a flag and break.
                flag = 10
                break

            d_s = nhd_downstream[row]
            count += 1

    if exhausted:
        # This is one way to ID terminal dams, but the dam could be not actually
        # terminal if an NHD flowline is missing. ToDam stays NaN for this dam.
        flagTerm[i] = 1
        continue

    if flag > 0: # The walk ended without reaching a dam: record why and move on
        _add_flag(flags, i, flag)
        continue

    # Otherwise the flag is 0 and the walk stopped on a flowline that holds a dam
    damloc = dam_row_of[d_s] # Find which dam we reached

    ToDam[i] = dam_short_id[damloc] # Update the ToDam column

    if FromDam[damloc] is None: # Update FromDam
        FromDam[damloc] = [dam_short_id[i]]
    else:
        FromDam[damloc].append(dam_short_id[i])

    _add_flag(flags, i, flag)

    # Add flag if DA d/s is smaller than (6) or equal to (7) the DA of the u/s dam
    da_code = None
    if dam_drainage[damloc] < dam_drainage[i]:
        da_code = 6
    elif dam_drainage[damloc] == dam_drainage[i]:
        da_code = 7
    if da_code is not None:
        flagDA[i] = da_code
        flagDA[damloc] = da_code
        _add_flag(flags, i, da_code)
        _add_flag(flags, damloc, da_code)

    if dam_storage[damloc] == dam_storage[i]: # Flag if capacity the same at two subsequent dams
        flagCAP[i] = 8
        flagCAP[damloc] = 8
        _add_flag(flags, i, 8)
        _add_flag(flags, damloc, 8)
                


In [9]:
# Mark headwater dams: no other dam drains into them, so FromDam is still None
hw = [i for i, value in enumerate(FromDam) if value is None]
flagHW[hw] = 1

# Mark terminal dams: no downstream dam was found, so ToDam is still NaN
term = np.where(np.isnan(ToDam))[0]
flagTerm[term] = 1

# Record the codes in the per-dam flag lists: 9 for headwater, 1 for terminal.
# Every entry is kept as a list so the 'flag' column has one consistent type.
for positions, code in ((hw, 9), (term, 1)):
    for i in positions:
        if flags[i] is None:
            flags[i] = [code]
        else:
            flags[i].append(code)

In [10]:
# Attach the computed network fields to the dam table, in this column order
new_columns = {
    'FromDam': FromDam,
    'ToDam': ToDam,
    'flag': flags,
    'flagDA': flagDA,
    'flagCAP': flagCAP,
    'flagTerm': flagTerm,
    'flagHW': flagHW,
    'countryOut': countryOut,
}
for column_name, column_values in new_columns.items():
    dam[column_name] = column_values


In [11]:
#Define site tags function
def assignSiteTags(siteNow, allDams, upstreamDamID, tag_field):
    """Tag every dam upstream of a site with the given ID.

    Starting from the ``FromDam`` list of the first row of ``siteNow``, each
    listed ShortID is looked up in ``allDams``; the matching row gets
    ``upstreamDamID`` written to ``tag_field`` and its own ``FromDam`` list is
    followed in turn, until no further upstream dams are listed.

    The walk uses an explicit stack instead of recursion, so a long chain of
    dams cannot exceed Python's recursion limit.

    Parameters
    ----------
    siteNow : one-row DataFrame with a ``FromDam`` column; the cell holds a
        list of ShortIDs, or None/NaN when nothing drains into the site.
    allDams : DataFrame with ``ShortID`` and ``FromDam`` columns. It is
        modified in place.
    upstreamDamID : value written to ``tag_field`` for every upstream dam.
    tag_field : name of the column that receives the tag.

    Returns
    -------
    ``allDams``, with ``tag_field`` updated.
    """
    pending = [siteNow.FromDam.iloc[0]]
    visited = set()  # ShortIDs already handled, so a cyclic reference cannot loop forever

    while pending:
        upstream_ids = pending.pop()

        # None (or NaN, which is a float) means no dam drains into this one
        if upstream_ids is None or isinstance(upstream_ids, float):
            continue

        for dam_id in upstream_ids:
            if dam_id in visited:
                continue
            visited.add(dam_id)

            idx = allDams.index[allDams['ShortID'] == dam_id].tolist()
            if idx:
                allDams.loc[idx[0], tag_field] = upstreamDamID
                # Queue the dams that drain into the one just tagged
                pending.append(allDams.loc[idx, 'FromDam'].iloc[0])

    return allDams

In [12]:
# Site tags: every dam upstream of a site receives that site's ShortID

# Order the dam table by ascending Hydrosequence
dam = dam.sort_values('Hydroseq')

# Sites are rows with IsSite == 1 that are not river mouths
is_site = dam.IsSite.eq(1) & dam.IsRiverMth.eq(0)
sites = dam[is_site].reset_index(drop=True)

# Tag column starts at 0 for every dam
dam['SiteTag'] = np.zeros(len(ToDam), dtype=int)

for i in range(len(sites)):
    siteNow = sites.iloc[[i]]  # one-row frame, as assignSiteTags expects
    tag = siteNow.ShortID.iloc[0]
    dam = assignSiteTags(siteNow, dam, tag, 'SiteTag')

In [13]:
# GRanD tags: every dam upstream of a GRanD dam receives that dam's ShortID

# GRanD dams are the rows with IsGRanD == 1
GRanD = dam[dam.IsGRanD.eq(1)].reset_index(drop=True)

# Tag column starts at 0 for every dam
dam['GRanDTag'] = np.zeros(len(ToDam), dtype=int)

for i in range(len(GRanD)):
    GRanDNow = GRanD.iloc[[i]]  # one-row frame, as assignSiteTags expects
    tag = GRanDNow.ShortID.iloc[0]
    dam = assignSiteTags(GRanDNow, dam, tag, 'GRanDTag')

In [14]:
# River tags: every dam upstream of a river mouth receives that row's ShortID

# River mouths are the rows with IsRiverMth == 1
river = dam[dam.IsRiverMth.eq(1)].reset_index(drop=True)

# Tag column starts at 0 for every dam
dam['RiverTag'] = np.zeros(len(ToDam), dtype=int)

for i in range(len(river)):
    riverNow = river.iloc[[i]]  # one-row frame, as assignSiteTags expects
    tag = riverNow.ShortID.iloc[0]
    dam = assignSiteTags(riverNow, dam, tag, 'RiverTag')

In [15]:
# Delta tags: every dam upstream of a delta row receives that row's delta value

# Delta rows are those with a positive value in the delta column
delta = dam[dam.delta.gt(0)].reset_index(drop=True)

# Tag column starts at 0 for every dam
dam['DeltaTag'] = np.zeros(len(ToDam), dtype=int)

for i in range(len(delta)):
    deltaNow = delta.iloc[[i]]  # one-row frame, as assignSiteTags expects
    tag = deltaNow.delta.iloc[0]  # the tag is the delta value itself, not the ShortID
    dam = assignSiteTags(deltaNow, dam, tag, 'DeltaTag')

In [16]:
# Trim the table to the published columns, rename, and write the result

# Columns that are not part of the exported table
unused_columns = [
    'FCODE', 'DnHydroseq', 'Country_ou', 'TerminalPa', 'LENGTHKM', 'MaxQ_m3s', 'DA_km2',
    'flagDA', 'flagCAP', 'Pathlength', 'WBCOMID', 'SA_m2', 'PrimDamTyp', 'Reservoir',
]
# Final names for the coordinate and GRanD identifier columns
final_names = {'NrX_Final': 'Longitude', 'NrY_Final': 'Latitude', 'GRAND_ID': 'GRanD_ID'}

dam = dam.drop(columns=unused_columns).rename(columns=final_names)

# Save as a csv, without the index column
dam.to_csv(outputFile, index=False)