**Running DataRetrieval for a large amount of data, or for all of NYS** 11/10/24

In [2]:
from scipy import stats
import pandas as pd
import numpy as np

from dataretrieval import codes, nwis, utils


In [3]:
# Load the NYS gage table. STAID is parsed as text (and cast to str once more)
# so the leading zero in front of every station number is preserved.
gages_df = (
    pd.read_csv('NYS_Gage_LatLong.csv', dtype={'STAID': str})
    .assign(STAID=lambda frame: frame['STAID'].astype(str))
)
# Confirm STAID came through as object (string) and the other columns as float.
print(gages_df.dtypes)
gages_df

STAID          object
LAT_GAGE      float64
LNG_GAGE      float64
DRAIN_SQKM    float64
dtype: object


Unnamed: 0,STAID,LAT_GAGE,LNG_GAGE,DRAIN_SQKM
0,01200000,41.658983,-73.528458,532.58490
1,01300000,40.983709,-73.687351,24.10380
2,01300500,40.955932,-73.718186,11.72430
3,01301000,40.954543,-73.734575,58.20592
4,01301500,40.911210,-73.813467,14.50440
...,...,...,...,...
286,04276500,44.358383,-73.396797,694.32840
287,04276842,43.942557,-73.463734,134.73900
288,04278300,43.663398,-73.603454,57.70620
289,04279000,43.843949,-73.432064,670.71960


In [None]:
# The original `gages_df[]` is a SyntaxError (a subscript needs a key), so this
# cell could never run. Which column was meant is not recorded; STAID is assumed
# because the next cell inspects that column -- TODO confirm.
print(gages_df['STAID'])

In [6]:
# Spot-check: pull the station ID from the first row and show it as a string
# to confirm the leading zero survived the CSV load.
i = 0
str(gages_df.loc[i, 'STAID'])

'01200000'

In [11]:
# Normalized peak flow for every NYS gage: the mean of the `peak_va` values
# returned by the NWIS 'peaks' service from 2013-01-01 onward, divided by the
# gage's drainage area (units: cfs/sqkm). Results are stored positionally.
#
# The array starts as all-NaN, so a site whose request or computation fails
# simply keeps NaN.
normalized_peak_flow = np.full(len(gages_df), np.nan)

# Walk the STAID / DRAIN_SQKM values of each row together. This avoids both the
# per-iteration boolean-mask scan of the whole frame and label-based lookups
# (`gages_df['STAID'][i]`) that only work with a default 0..n-1 index.
station_rows = zip(gages_df['STAID'], gages_df['DRAIN_SQKM'])
for i, (site_id, cur_drainage_area) in enumerate(station_rows):
    try:
        # Attempt to retrieve peak flow data for the current site
        get_peaks = nwis.get_record(sites=str(site_id), service='peaks', start='2013-01-01')
        peaks_mean = np.mean(get_peaks['peak_va'])
        normalized_peak_flow[i] = peaks_mean / cur_drainage_area  # units: cfs/sqkm
    except Exception as e:
        # Deliberately broad: one bad site must not abort the whole run.
        # The entry for this site stays NaN.
        print(f"Error for site {site_id}: {e}")

Error for site 01300000: No sites/data found using the selection criteria specified in url: https://nwis.waterdata.usgs.gov/nwis/peaks?format=rdb&site_no=01300000&begin_date=2013-01-01
Error for site 01300500: No sites/data found using the selection criteria specified in url: https://nwis.waterdata.usgs.gov/nwis/peaks?format=rdb&site_no=01300500&begin_date=2013-01-01
Error for site 01301500: No sites/data found using the selection criteria specified in url: https://nwis.waterdata.usgs.gov/nwis/peaks?format=rdb&site_no=01301500&begin_date=2013-01-01
Error for site 01302000: No sites/data found using the selection criteria specified in url: https://nwis.waterdata.usgs.gov/nwis/peaks?format=rdb&site_no=01302000&begin_date=2013-01-01
Error for site 01302500: No sites/data found using the selection criteria specified in url: https://nwis.waterdata.usgs.gov/nwis/peaks?format=rdb&site_no=01302500&begin_date=2013-01-01
Error for site 01306000: No sites/data found using the selection criteria s

In [None]:
# Build a per-station table of normalized peak flow: the mean of the `peak_va`
# values returned by the NWIS 'peaks' service from 2013-01-01 onward, divided by
# the station's drainage area (units: cfs/sqkm).
results = []

# Iterate the STAID / DRAIN_SQKM values of each row together. This replaces the
# per-iteration boolean-mask scan of the whole frame and the label-based lookup
# `gages_df['STAID'][i]`, which only works with a default 0..n-1 index.
for site_id, cur_drainage_area in zip(gages_df['STAID'], gages_df['DRAIN_SQKM']):
    try:
        # Attempt to retrieve peak flow data for the current site
        get_peaks = nwis.get_record(sites=str(site_id), service='peaks', start='2013-01-01')
        peaks_mean = np.mean(get_peaks['peak_va'])
        normalized_peak = peaks_mean / cur_drainage_area
    except Exception as e:
        # Deliberately broad: one bad site must not abort the whole run.
        # Record NaN for this site and carry on.
        print(f"No data found or an error occurred for site {site_id}: {e}")
        normalized_peak = np.nan

    # One record per station, including the ones that failed (as NaN).
    results.append({
        'STAID': site_id,
        'normalized_peak_flow': normalized_peak
    })

# Build the DataFrame once from the list of records.
results_df = pd.DataFrame(results)

# Display the results DataFrame
print(results_df)


No data found or an error occurred for site 01300000: No sites/data found using the selection criteria specified in url: https://nwis.waterdata.usgs.gov/nwis/peaks?format=rdb&site_no=01300000&begin_date=2013-01-01
No data found or an error occurred for site 01300500: No sites/data found using the selection criteria specified in url: https://nwis.waterdata.usgs.gov/nwis/peaks?format=rdb&site_no=01300500&begin_date=2013-01-01
No data found or an error occurred for site 01301500: No sites/data found using the selection criteria specified in url: https://nwis.waterdata.usgs.gov/nwis/peaks?format=rdb&site_no=01301500&begin_date=2013-01-01
No data found or an error occurred for site 01302000: No sites/data found using the selection criteria specified in url: https://nwis.waterdata.usgs.gov/nwis/peaks?format=rdb&site_no=01302000&begin_date=2013-01-01
No data found or an error occurred for site 01302500: No sites/data found using the selection criteria specified in url: https://nwis.waterdata.

In [19]:
# Show the per-station table; NaN marks stations for which no value was computed.
results_df

Unnamed: 0,STAID,normalized_peak_flow
0,01200000,5.395640
1,01300000,
2,01300500,
3,01301000,28.240255
4,01301500,
...,...,...
286,04276500,7.309876
287,04276842,7.785422
288,04278300,
289,04279000,1.848761


In [20]:
# Convert normalized peak flow from cfs per sq km to mm/day.
#   86400     seconds per day
#   304.8     millimetres per foot
#   1.0764E7  square feet per square kilometre
cfs_per_sqkm_to_mm_per_day = 86400 * 304.8 / 1.0764E7
results_df['Norm_Peak_Flow_mmd'] = results_df['normalized_peak_flow'] * cfs_per_sqkm_to_mm_per_day

# Output table holding only the station ID and the converted value.
results_df_mmd = results_df[['STAID', 'Norm_Peak_Flow_mmd']].copy()

# Display the DataFrame
print(results_df_mmd)

# Look up a single station by its ID and report its value.
desired_staid = '04260500'
is_desired = results_df_mmd['STAID'] == desired_staid
result = results_df_mmd.loc[is_desired, 'Norm_Peak_Flow_mmd']
if result.empty:
    print(f'Station {desired_staid} not found in the DataFrame.')
else:
    print(f'Station {desired_staid} has {result.iloc[0]:.2f} mm/day of normalized peak flow.')


        STAID  Norm_Peak_Flow_mmd
0    01200000           13.200731
1    01300000                 NaN
2    01300500                 NaN
3    01301000           69.091341
4    01301500                 NaN
..        ...                 ...
286  04276500           17.884015
287  04276842           19.047465
288  04278300                 NaN
289  04279000            4.523095
290  04280450           20.774551

[291 rows x 2 columns]
Station 04260500 has 11.90 mm/day of normalized peak flow.


In [None]:
# Keep only stations that have a value, rounded to 2 decimals.
# DataFrame.round leaves the non-numeric STAID column untouched.
nys_gauge_mmd = results_df_mmd.dropna(subset=['Norm_Peak_Flow_mmd']).round(2)

# Display the new DataFrame with only valid data & turn into a csv
print(nys_gauge_mmd)

# Count the dropped stations instead of hard-coding the number, so the message
# stays correct when the gage list or the retrieved data changes.
n_missing = len(results_df_mmd) - len(nys_gauge_mmd)
print(f'There are {n_missing} stations in NYS that do not have data')
nys_gauge_mmd.to_csv('nys_gauge_mmd.csv', index=False)

        STAID  Norm_Peak_Flow_mmd
0    01200000               13.20
3    01301000               69.09
6    01302020               39.58
8    01303000                6.27
9    01303500                4.55
..        ...                 ...
285  04275500               26.61
286  04276500               17.88
287  04276842               19.05
289  04279000                4.52
290  04280450               20.77

[225 rows x 2 columns]
There are 66 stations in NYS that do not have data


In [22]:
# Report the converted value (mm/day) for one station, selected by ID.
desired_staid = '04234000'
staid_matches = results_df_mmd['STAID'] == desired_staid
result = results_df_mmd.loc[staid_matches, 'Norm_Peak_Flow_mmd']
print(
    f'Station {desired_staid} not found in the DataFrame.'
    if result.empty
    else f'Station {desired_staid} has {result.iloc[0]:.2f} mm/day of normalized peak flow.'
)

Station 04234000 has 22.08 mm/day of normalized peak flow.
