# Code for Tiantian - Selecting outlet gages

In [16]:
# IMPORTS
import os
import numpy as np
import pandas as pd
import seaborn as sns
import geopandas as gpd
import matplotlib.pyplot as plt

from matplotlib.dates import MonthLocator, DateFormatter # for plotting month on x-axis

# USGS Data retreival tool
from datetime import datetime, timedelta
from dataretrieval import nwis, utils, codes

# Custom modules are imported in multiple locations to faciliate easy reloading when edits are made to their respective files
# import Src.classes as cl
# import Src.func_ko as fn
# reload(cl)
# reload(fn)

In [2]:
# Import excel files with all gages 
#### CHANGE FOLDER TO YOUR UNIQUE PATH ####
data_paths = {
    '30_90': 'Prelim_Data/National_Metrics/National_Metrics_30_90.xlsx',
    '50_90': 'Prelim_Data/National_Metrics/National_Metrics_50_90.xlsx',
    '30_95': 'Prelim_Data/National_Metrics/National_Metrics_30_95.xlsx',
    '50_95': 'Prelim_Data/National_Metrics/National_Metrics_50_95.xlsx'    
}

dfs_metrics = {key: pd.read_excel(path, sheet_name='site_metrics') for key, path in data_paths.items()}

In [3]:
# Converts site_no to strings
date_ranges = ['30', '50']
percentiles = ['90', '95']
for date_range in date_ranges:
    for percentile in percentiles: 
        # Assuming df is your DataFrame and 'column_name' is the name of the column with numbers
        dfs_metrics[f'{date_range}_{percentile}']['site_no'] = dfs_metrics[f'{date_range}_{percentile}']['site_no'].astype(str)  # Convert numbers to strings

        # Add leading '0' to numbers with 7 digits
        dfs_metrics[f'{date_range}_{percentile}']['site_no'] = dfs_metrics[f'{date_range}_{percentile}']['site_no'].apply(lambda x: '0' + x if len(x) == 7 else x)

In [4]:
# Select valid gages (at least 30 or 50 years of data with less than 10% of data missing over record)
dfs_valid = {}
for date_range in date_ranges:
    for percentile in percentiles: 
        dfs_valid[f'{date_range}_{percentile}'] = dfs_metrics[f'{date_range}_{percentile}'][dfs_metrics[f'{date_range}_{percentile}']['valid'] == True]

### Generate list of all gages

In [42]:
# Get lists of gage numbers in each df
site_nos = {}
date_ranges = ['30', '50']
percentiles = ['90', '95']
for date_range in date_ranges:
    for percentile in percentiles:
        df = dfs_valid[f'{date_range}_{percentile}']
        site_nos[f'{date_range}_{percentile}'] = df['site_no'].unique().tolist()
print(len(site_nos['30_90']))
print(len(site_nos['50_90']))

4242
3314


### Generate dictionary of gages by state to evaulate each state individually

In [43]:
# Select a subset of gages for calculating the total volume in 
dfs_valid_subset = {}
for date_range in date_ranges:
    for percentile in percentiles: 
        key = f'{date_range}_{percentile}'
        dfs_valid_subset[key] = {}

        for state in dfs_valid['30_90']['state'].unique().tolist():
            dfs_valid_subset[key][state] = dfs_valid[key][dfs_valid[key]['state'] == state]

In [28]:
# Create shorter list of site_nos to test code
# site_nos_test = {}
# site_nos_test['30_90'] = ['02342500',
#  '02361000',
#  '02369800',
#  '02371500',
#  '02372250',
#  '02373000',
#  '02374500',
#  '02376500',
#  '02399200',
#  '02401390',
#  '02412000',
#  '02414500',
#  '02419000']

# site_nos_test['30_95'] = ['02342500',
#  '02361000',
#  '02369800',
#  '02371500',
#  '02372250',
#  '02373000',
#  '02374500',
#  '02376500',
#  '02399200',
#  '02401390',
#  '02412000',
#  '02414500',
#  '02419000']

# site_nos_test['50_90'] = ['02342500',
#  '02361000',
#  '02369800',
#  '02371500',
#  '02372250',
#  '02373000',
#  '02374500',
#  '02376500',
#  '02399200',
#  '02401390',
#  '02412000',
#  '02414500',
#  '02419000']
# site_nos_test['50_95'] = ['02342500',
#  '02361000',
#  '02369800',
#  '02371500',
#  '02372250',
#  '02373000',
#  '02374500',
#  '02376500',
#  '02399200',
#  '02401390',
#  '02412000',
#  '02414500',
#  '02419000']

In [13]:
def merge_tidal(df_combined):
    """This function merges, if necessary, tidal data with streamflow data returning a dataframe
       with only the necessary columns for analysis. If no data is present, an empty dataframe is returned."""
    keep_cols = ['datetime', '00060_Mean', 'site_no']
    
    # If we have both stream and tidal data, merge them, prioritizing tidal data, and rename the column to '00060_Mean'
    if '72137_Mean' in df_combined.columns and '00060_Mean' in df_combined.columns:
        df_combined['00060_Mean'] = df_combined['72137_Mean'].combine_first(df_combined['00060_Mean'])
        df_combined = df_combined.drop(columns=[col for col in df_combined.columns if col not in keep_cols])
        return df_combined
    
    # If we only have stream data use it as is, drop any unnecessary columns
    if '00060_Mean' in df_combined.columns:
        df_combined = df_combined.drop(columns=[col for col in df_combined.columns if col not in keep_cols])
        return df_combined
    
    # If we only have tidal data we'll rename it to stream data and use it as is
    if '72137_Mean' in df_combined.columns:
        df_combined.rename(columns={'72137_Mean': '00060_Mean'}, inplace=True)
        df_combined = df_combined.drop(columns=[col for col in df_combined.columns if col not in keep_cols])
        return df_combined
    
    # Catch-all
    return df_combined

In [None]:
### Create dictionary with site no and total streamflow volume over record period for each date range

In [47]:
# Calculate total volume for each gage over the selected record period
date_ranges = ['30', '50']
percentile = ['90'] # NOTE: You only need to evaluate the 90th percentile since 90th versus 95th percentile doesn't apply to the total volume caluclation
tot_vol_dict = {}

cfs_to_km3 = 60 * 60 * 24 / 43560 / 1e6 * 1.233489 # (cfs × 86,400 sec/day × 365 days/year) × (0.0000000283168 km³/ft³)

for date_range in date_ranges:
    print('DATE RANGE:', date_range)
    for percentile in percentiles: 
    print('PERCENTILE:', percentile)

        data_range = int(date_range)

        site_no_list = []
        tot_vol_list = []

        for site_no in site_nos[f'{date_range}_{percentile}']:
        #for site_no in site_nos_test[f'{date_range}_{percentile}']:
            print('SITE NO:', site_no)

            # Notes: '00060' = parameter code for daily avg discharge; '72137'= parameter code for tidal data
            # Notes: Adjust start and end as necessary; we get the data for the full record then snip the data to the specified "date_threshold" below (e.g., 30 or 50-year period)
            df = nwis.get_record(sites=site_no, service='dv', parameterCD=['00060', '72137'], start='1900-10-01', end='2020-09-30') 
            df = df.reset_index()
            #print(site_no)
            
            if df.empty:
                #print(f'IGNORED: No data for site {site_no}')
                # Append NaN for the tot_vol and the site_no
                tot_vol_list.append(np.nan)
                site_no_list.append(site_no)
                continue
                
            if '00060_radar sensor_Mean' in df.columns and '00060_Mean' not in df.columns:
                df.rename(columns={'00060_radar sensor_Mean': '00060_Mean'}, inplace=True)

            if '00060_2_Mean' in df.columns and '00060_Mean' not in df.columns:
                df.rename(columns={'00060_2_Mean': '00060_Mean'}, inplace=True)

            if '00060_3_Mean' in df.columns and '00060_Mean' not in df.columns:
                df.rename(columns={'00060_3_Mean': '00060_Mean'}, inplace=True)
                
            df = merge_tidal(df)
            
            df_copy = df.copy()
            print(df_copy.head())

            # Data is snipped to the 30- or 50-year record here
            date_threshold = pd.to_datetime('2020-09-30').date() - timedelta(days=365.25 * data_range)
            df_copy = df_copy[df_copy['datetime'].dt.date >= date_threshold]
            
            column = '00060_2_Mean' if site_no == '01118500' else '00060_Mean'
            
            df_copy = df_copy.rename(columns={'00060_Mean': 'flow_cfs'})
            print(df_copy.head())
                
            tot_vol = sum(df_copy['flow_cfs']) * cfs_to_km3
            
            # Append the calculated pct_hmf and site_no
            tot_vol_list.append(tot_vol)
            site_no_list.append(site_no)
        
        # Create a DataFrame from the lists
        df_tot_vol = pd.DataFrame({
            'site_no': site_no_list,
            'tot_vol': tot_vol_list
        })
        
        tot_vol_dict[f'{date_range}_{percentile}'] = df_tot_vol

DATE RANGE: 30
PERCENTILE: 90
SITE NO: 02342500
                   datetime   site_no  00060_Mean
0 1946-10-01 00:00:00+00:00  02342500        85.0
1 1946-10-02 00:00:00+00:00  02342500        99.0
2 1946-10-03 00:00:00+00:00  02342500        95.0
3 1946-10-04 00:00:00+00:00  02342500        92.0
4 1946-10-05 00:00:00+00:00  02342500        87.0
                       datetime   site_no  flow_cfs
16071 1990-10-01 00:00:00+00:00  02342500      24.0
16072 1990-10-02 00:00:00+00:00  02342500      23.0
16073 1990-10-03 00:00:00+00:00  02342500      23.0
16074 1990-10-04 00:00:00+00:00  02342500      22.0
16075 1990-10-05 00:00:00+00:00  02342500      22.0
SITE NO: 02361000
                   datetime   site_no  00060_Mean
0 1921-12-01 00:00:00+00:00  02361000       383.0
1 1921-12-02 00:00:00+00:00  02361000       482.0
2 1921-12-03 00:00:00+00:00  02361000       535.0
3 1921-12-04 00:00:00+00:00  02361000       514.0
4 1921-12-05 00:00:00+00:00  02361000       493.0
                      

KeyboardInterrupt: 

In [30]:
tot_vol_dict

{'30_90':      site_no    tot_vol
 0   02342500  10.008041
 1   02361000  22.774448
 2   02363000  14.761109
 3   02369800   3.663977
 4   02371500  15.782313
 5   02372250  14.524861
 6   02373000  17.204078
 7   02374500   6.985936
 8   02376500  20.645369
 9   02377570  11.501550
 10  02378500   3.074583
 11  02398300  16.788632
 12  02399200  11.653661
 13  02400100   9.842780,
 '30_95':      site_no    tot_vol
 0   02342500  10.008041
 1   02361000  22.774448
 2   02363000  14.761109
 3   02369800   3.663977
 4   02371500  15.782313
 5   02372250  14.524861
 6   02373000  17.204078
 7   02374500   6.985936
 8   02376500  20.645369
 9   02377570  11.501550
 10  02378500   3.074583
 11  02398300  16.788632
 12  02399200  11.653661
 13  02400100   9.842780,
 '50_90':      site_no     tot_vol
 0   02342500   18.015513
 1   02361000   40.464046
 2   02369800    6.387802
 3   02371500   28.172384
 4   02372250   23.425545
 5   02373000   27.322864
 6   02374500   12.651125
 7   02376500