### Download USGS streamflow data for reference watersheds

Identify reference watersheds using gages-ii that fall within the blue ridge ecoregion 

Check the streamflow availability for each gage

In [3]:
import geopandas as gpd
import numpy as np
import pandas as pd 
import dataretrieval.nwis as nwis
from datetime import date, timedelta
import rasterio
from rasterstats import zonal_stats
import fiona

In [4]:
def check_streamflow(site):
    
    # quality control 
    # A = Approved for publication 
    # P = Provisional data subject to revision 
    # e = Value has been estimated 

    streamflow_df = nwis.get_record(sites=site, service = 'dv', start = '1900-01-01', parameterCd = '00060')
    if streamflow_df.shape[0] == 0:
        return np.nan
    date_index = streamflow_df.index.to_series().between('1984-01-01', '2021-12-31')
    streamflow_subset = streamflow_df[date_index]
    streamflow_subset = streamflow_subset[streamflow_subset['00060_Mean_cd'] == 'A']
    streamflow_subset.dropna(subset=['00060_Mean'])
    streamflow = streamflow_subset.reset_index(drop=True)
    # I want to use the estimated values because we can't have gaps 

    sdate = date(1984, 1, 1)
    edate = date(2021, 12, 31)
    delta = edate-sdate
    total_days = delta.days + 1
    
    pct_missing = (streamflow_subset.shape[0]/total_days)*100
    
    return pct_missing

In [5]:
# subset the gages-ii reference watersheds to only those intersecting the blue ridge ecoregion 
home = "/Volumes/GoogleDrive/My Drive/Chapter2_mechanisms_forest_water_cycling"
roi = gpd.read_file(os.path.join(home, "Data", "ROI", "blue_ridge.shp"))
gages_reference = gpd.read_file(os.path.join(home, "Data", "Catchments", "Reference", "gages_ii", "boundaries-shapefiles-by-aggeco", "bas_ref_all.shp"))

roi_match = roi.to_crs(gages_reference.crs)
roi_match = roi_match[["NA_L3NAME", "geometry"]]

gages_reference_join = gpd.tools.sjoin(gages_reference, roi_match, predicate="intersects", how="left")
ref_roi = gages_reference_join.dropna(subset=['index_right'])

In [18]:
pct_miss= []
for gage_id in ref_roi['GAGE_ID']:
    pct_miss.append(check_streamflow(gage_id))
ref_roi['pct_streamflow'] = pct_miss


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super().__setitem__(key, value)


In [22]:
# get rid of the reference watersheds with less than 75% of data from 1984 - 2021 
ref_roi_keep = ref_roi[ref_roi['pct_streamflow'] >=75]
ref_roi_keep.shape

(28, 7)

In [None]:
# make a polygon that combines the ROI and the reference catchments for the anlaysis 
ref_roi_keep.to_file(os.path.join(home, "Data", "Catchments", "Reference", "gages_ii",  "reference_keep.shp"))
sbr_ref_combo = gpd.overlay(roi_match, ref_roi_keep, how = 'union')
sbr_ref_combo['group'] = ['group']*sbr_ref_combo.shape[0]
sbr_ref_combo_diss = sbr_ref_combo.dissolve(by = "group")
sbr_ref_combo_diss.to_file(os.path.join(home, "Data", "ROI", "blue_ridge_plus_reference.shp"))