# Preprocessing CONUS404 and reference data

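This notebook demonstrates how to retrieve the CONUS404 dataset, subset it to the variables of interest, clip it to the Delaware River Basin, and resample it to a monthly time step. It then applies comparable preprocessing to the reference datasets (GHCN, USCRN, and PRISM) so they can be compared against CONUS404.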

<details>
  <summary>Guide to pre-requisites and learning outcomes...&lt;click to expand&gt;</summary>
  
  <table>
    <tr>
      <td>Pre-Requisites
      <td>To get the most out of this notebook, you should already have an understanding of these topics: 
        <ul>
        <li>pre-req one
        <li>pre-req two
        </ul>
    <tr>
      <td>Expected Results
      <td>At the end of this notebook, you should be able to: 
        <ul>
        <li>outcome one
        <li>outcome two
        </ul>
  </table>
</details>

In [None]:
# library imports
import fsspec
import hvplot.xarray
import intake
import os
import warnings
import requests
import rioxarray
import dask
import metpy

from shapely.geometry import Polygon
from dask.distributed import LocalCluster, Client
from pygeohydro import pygeohydro
from fsspec.implementations.ftp import FTPFileSystem
from holoviews.streams import PolyEdit, PolyDraw
from datetime import date

import xarray as xr
import geopandas as gpd
import pandas as pd
import geoviews as gv
import dask.dataframe as dd
import cartopy.crs as ccrs #testing
import numpy as np #testing

warnings.filterwarnings('ignore')

## Retrieving data from HPC or the Cloud
#### The process varies based on where the notebook is being run but generally looks like this:
1. (Done already) Connect to workspace (local, HPC, or QHUB) and open notebook
2. Start Dask client 
3. Pull in data from source
4. Process the data into usable file format, size, and extent

# Update to helper function after repo consolidation
## **Start a Dask client using an appropriate Dask Cluster** 
This is an optional step, but can speed up data loading significantly, especially when accessing data from the Cloud.

In [None]:
def configure_cluster(machine):
    """Start a Dask cluster for the given workspace and connect a client to it.

    Parameters
    ----------
    machine : str
        Name of the workspace the notebook is running on. Supported values
        are ``'denali'``, ``'tallgrass'``, ``'local'`` and
        ``'esip-qhub-gateway-v0.4'``.

    Returns
    -------
    tuple
        ``(client, cluster)`` for the cluster that was started.

    Raises
    ------
    ValueError
        If ``machine`` is not one of the supported workspace names.
    """
    # Imports are kept inside each branch so that only the packages needed
    # for the selected workspace have to be installed.
    if machine == 'denali':
        from dask.distributed import LocalCluster, Client
        cluster = LocalCluster(threads_per_worker=1)
        client = Client(cluster)

    elif machine == 'tallgrass':
        from dask.distributed import Client
        from dask_jobqueue import SLURMCluster
        cluster = SLURMCluster(queue='cpu', cores=1, interface='ib0',
                               job_extra=['--nodes=1', '--ntasks-per-node=1', '--cpus-per-task=1'],
                               memory='6GB')
        # scale the number of SLURM jobs with the workload, up to 30 jobs
        cluster.adapt(maximum_jobs=30)
        client = Client(cluster)

    elif machine == 'local':
        import os
        import warnings
        from dask.distributed import LocalCluster, Client
        warnings.warn("Running locally can result in costly data transfers!\n")
        n_cores = os.cpu_count() # set to match your machine
        cluster = LocalCluster(threads_per_worker=n_cores)
        client = Client(cluster)

    elif machine in ['esip-qhub-gateway-v0.4']:
        import sys, os
        # ebdpy is loaded from the shared users' library directory under $HOME
        sys.path.append(os.path.join(os.environ['HOME'],'shared','users','lib'))
        import ebdpy as ebd
        aws_profile = 'esip-qhub'
        ebd.set_credentials(profile=aws_profile)

        aws_region = 'us-west-2'
        endpoint = f's3.{aws_region}.amazonaws.com'
        ebd.set_credentials(profile=aws_profile, region=aws_region, endpoint=endpoint)
        worker_max = 30
        client,cluster = ebd.start_dask_cluster(profile=aws_profile, worker_max=worker_max,
                                              region=aws_region, use_existing_cluster=True,
                                              adaptive_scaling=False, wait_for_cluster=False,
                                              worker_profile='Medium Worker', propagate_env=True)

    else:
        # Without this branch an unknown name fell through to the return
        # statement and failed with an opaque UnboundLocalError.
        raise ValueError(
            f"Unknown machine '{machine}'. Expected one of: 'denali', "
            "'tallgrass', 'local', 'esip-qhub-gateway-v0.4'."
        )

    return client, cluster

### Select workspace from the following list:
'denali', 'tallgrass', 'local', or 'esip-qhub-gateway-v0.4'

### View available datasets from the Intake Catalog and choose which to use
Note: Select datasets that end in "onprem" if running on Denali/Tallgrass HPC or cloud data if working on QHub or local.

In [None]:
url = 'https://raw.githubusercontent.com/hytest-org/hytest/main/dataset_catalog/hytest_intake_catalog.yml'
cat = intake.open_catalog(url)
list(cat)

### You can setup your client and dataset on QHub like this:
Uncomment single commented spaces (#) to run

In [None]:
# set machine
machine = 'esip-qhub-gateway-v0.4'

# use configure cluster helper function to setup dask
client, cluster = configure_cluster(machine)

# set dataset
dataset = 'conus404-hourly-cloud'

### Or if on the HPC you can setup your client and dataset like this:
Uncomment the lines that begin with a single `#` to run (lines that begin with `##` are explanatory comments).

In [None]:
## set machine
# machine = os.environ['SLURM_CLUSTER_NAME']

## use configure_cluster helper function to setup dask
# client, cluster = configure_cluster(machine)

## set dataset
# dataset = 'conus404-hourly-onprem'

## **Retrieve CONUS404 from source and transform it to a Dask array**

In [None]:
# double check that the dataset selected during workspace setup is in the
# catalog (cat); the selection is not overwritten here so that HPC users keep
# their "onprem" dataset
if dataset not in list(cat):
    raise KeyError(f"'{dataset}' is not an entry in the intake catalog")
cat[dataset]

Transform data to dask array

In [None]:
ds = cat[dataset].to_dask()

View dataset metadata. To view variables, expand the "Data variables" section. 

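For this tutorial, we will be working with accumulated precipitation (PREC_ACC_NC), air temperature (TK), and surface net radiation (RNET) variables. RNET requires additional processing: it is calculated later in this notebook from the accumulated shortwave and longwave radiation variables.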

In [None]:
ds

## **Explore the variable** 
(sometimes called exploratory data analysis (EDA) or exploratory spatial data analysis (ESDA) when it contains cartographic data)

### Let's look at the accumulated precipitation variable by first subsetting the larger dataset. 
Notice the information in the array and chunk columns as well as the coordinates (in particular *time*) and the units.

In [None]:
# variable PREC_ACC_NC
prec = ds.PREC_ACC_NC
prec

### Next, let's visualize a map of the data at a specific time step.

In [None]:
prec_time = prec.sel(time='2014-03-01 00:00').load()

In the previous cell, the .sel() method filters the dataset by the *time* coordinate through "time=" and then uses the .load() method to load the dataset into memory.

Now, let's visualize the dataset using the [QuadMesh](https://holoviews.org/reference/elements/bokeh/QuadMesh.html) plot from Holoviews. For a more in-depth tutorial for visualizing gridded data in Holoviews, go to [Gridded Datasets](http://holoviews.org/getting_started/Gridded_Datasets.html).

In [None]:
prec_time.hvplot.quadmesh(x='lon', y='lat', rasterize=True, geo=True, tiles='OSM', alpha=0.7, cmap='turbo')

### We can also look at a time-series for a specific grid cell

In [None]:
prec_point = prec.isel(y=600,x=600).sel(time=slice('2015-02-11 00:00','2015-04-28 00:00')).load()

Note the previous cell uses the .isel() method, which returns the dataset from where the **x** and **y** indexes equal 600 prior to filtering by **time** and loading the data into memory.

Lets plot the dataset.

In [None]:
prec_point.hvplot(x='time', grid=True)

## Importing geographic extents
Sometimes the data that is brought in is not analyzed but is used to clip a larger dataset to an area of interest (AOI).  <br>
Let's look at two ways this can be done: a user-defined polygon or using the pyNHD package. Data can also be brought in other ways such as a local file or an API request. These are covered in other tutorials. <br>
We'll show how to use geometries to limit datasets later in this notebook.

### The first method will use the Holoviews and Geoviews libraries to let a user draw a polygon and then add its vertices to a geopandas GeoDataFrame.

When the next code block is run, a map will open with the PolyDraw tool automatically selected. Double tap to add the first vertex, then tap to add each subsequent vertex. To finalize the draw action, double tap to insert the final vertex or press the ESC key to stop drawing.

In [None]:
#use CartoLight basemap
basemap = gv.tile_sources.CartoLight()

# x and y limits for CONUS
xlim = (-135, -50)
ylim = (22, 50)

#create blank polygon to draw
## redim.range works with Bokeh backend to set default map extent
blank_poly = gv.Polygons([]).redim.range(Longitude=xlim, Latitude=ylim)

# set PolyDraw for creation and PolyEdit for editing polygon, num_objects keeps to single object at a time
user_poly = PolyDraw(source=blank_poly, show_vertices=True, num_objects=1)
user_poly_edit = PolyEdit(source=blank_poly)

# create plots
## active_tools set to allow instant polygon drawing
basemap.options(width=700, height=400) * blank_poly.options(
    active_tools=['poly_draw'], fill_alpha=0.2, line_color='black')

The next code block pulls the latitude and longitude coordinates for the polygon vertices that were just drawn and creates a polygon GeoDataFrame.

In [None]:
# extract lists of long/lat coordinates for the polygon drawn above
drawn = user_poly.data
drawn_xs = drawn.get('xs') if drawn else None
if drawn_xs is None or len(drawn_xs) == 0:
    # nothing was drawn yet, so there is no first polygon to read
    raise ValueError("No polygon found: draw a polygon on the map above before running this cell.")

long = drawn['xs'][0]
lat = drawn['ys'][0]
if len(long) < 3:
    raise ValueError("A polygon needs at least three vertices; please redraw it on the map above.")

# create list of polygon vertices (a list, so it can be inspected or reused
# after the Polygon has been built)
vertices = list(zip(long, lat))

# construct polygon in GDF
polygon = gpd.GeoDataFrame(
    index=[0], crs="EPSG:4326", geometry=[Polygon(vertices)])

In [None]:
# plot polygon to confirm the shape matches what was drawn
polygon.plot()

### The second method imports the HUC6 boundaries using the PyGeoHydro library. PyGeoHydro is a part of the HyRiver library and is documented [here](https://docs.hyriver.io/autoapi/pygeohydro/index.html).

The following cell queries the Watershed Boundary Dataset HUC6 layer and returns a GeoDataFrame from the .byids() function by examining the "huc6" field for the list of HUC6 IDs. 

In [None]:
drb = pygeohydro.WBD("huc6", outfields=["huc6", "name"]).byids("huc6", ["020401", "020402"])
drb

You can see there are two polygons in the GeoDataFrame, and plotting them confirms this.

In [None]:
drb.plot()

When you want to use geometries to refine datasets to an AOI, it is best to have a single, concise geometry. We'll combine them in the next code cell.

In [None]:
# create a column where all entries have the same value
drb["name"] = "DRB"

#dissolve by that column
drb = drb.dissolve(by="name")

Check to make sure it worked by examining the tabular and spatial data.

In [None]:
# tabular 
drb

In [None]:
# spatial
drb.plot()

## **Putting it together: Preprocess CONUS404 to variable and research spatial extent**
In this section we are going to put together some skills we have learned so far: bring in CONUS404, select our variables and time extent, then clip to our spatial extent.

Variables: Accumulated precipitation (PREC_ACC_NC), air temperature (TK), and surface net radiation (RNET) <br>
Time period: 01/01/1990 - 12/31/1999 <br>
Spatial extent: Delaware River Basin <br>

In [None]:
# set up conus404 dataset name: reuse the dataset chosen during workspace
# setup so HPC users keep the "onprem" copy rather than the cloud copy
conus404 = dataset

# create dask array from dataset
ds = cat[conus404].to_dask()

# parse spatial information from CF conventions
ds = ds.metpy.parse_cf()

Get CRS from CONUS404 dataset 

In [None]:
crs = ds.TK.metpy.cartopy_crs
# crs

Other datasets that are brought in might need to be sliced to the same time period as the CONUS404 dataset. And, since the final data will be resampled to a monthly time step, data can be sliced to a YYYY-MM format.

In [None]:
# read the time coordinate values once and reuse them for both bounds
time_values = ds.coords["time"].values

# earliest timestamp, converted to datetime and formatted as "YYYY-MM"
start_time = pd.to_datetime(time_values.min()).strftime("%Y-%m")

# latest timestamp, converted to datetime and formatted as "YYYY-MM"
end_time = pd.to_datetime(time_values.max()).strftime("%Y-%m")

print("Start time:", start_time, "\nEnd time:", end_time)

In [None]:
# bring in boundaries of DRB and create single polygon
drb = pygeohydro.WBD("huc6", outfields=["huc6", "name"]).byids("huc6", ["020401", "020402"])
# create a column where all entries have the same value
drb["name"] = "DRB"

# dissolve by that column
drb = drb.dissolve(by="name")

# set CRS to match ds
drb = drb.iloc[[0]].to_crs(crs)

#visualize
# drb.plot()

In [None]:
#subset data variables
c404_variables = ["PREC_ACC_NC", "TK", "ACSWDNB", "I_ACSWDNB", "ACSWUPB", "I_ACSWUPB", "ACLWDNB", "I_ACLWDNB", "ACLWUPB", "I_ACLWUPB"]
c404 = ds[c404_variables]

# write CRS
c404.rio.write_crs(crs, inplace=True)

# perform clip
c404_drb = c404.rio.clip(drb.geometry, crs=crs, drop=True, invert=False)

Visualize the results

In [None]:
c404_drb["ACSWDNB"].isel(time=-1).hvplot(x='x', y='y', crs=crs, rasterize=True, cmap='turbo', tiles='OSM')

We have a little more processing to do before the dataset is ready for analysis. We need to:
1. Calculate RNET using the radiation columns
2. Resample and aggregate the data to the desired time step (monthly)

RNET is calculated using the equation <br>
RNET = SWDN + LWDN - SWUP - LWUP

Let's break down the components. First, let's tackle SWDN, which is calculated like this: <br>
SWDN = ((ACSWDNB[i] + (1e9 x I_ACSWDNB[i])) - (ACSWDNB[i-1] + (1e9 x I_ACSWDNB[i-1]))) / 3600

We want all values for ACSWDNB and I_ACSWDNB starting at time index = 1 (h)....

In [None]:
ACSWDNB = c404_drb["ACSWDNB"][1:]

I_ACSWDNB = c404_drb["I_ACSWDNB"][1:]

and we want all values for ACSWDNB and I_ACSWDNB up to, but not including, the last time step (h-1), relabeled so that they have the same time index as the other arrays.

In [None]:
ACSWDNB1 = c404_drb["ACSWDNB"][:-1]
ACSWDNB1.coords["time"] = ACSWDNB.coords["time"]

I_ACSWDNB1 = c404_drb["I_ACSWDNB"][:-1]
I_ACSWDNB1.coords["time"] = I_ACSWDNB.coords["time"]

Confirm both time coords are the same length.

In [None]:
len(ACSWDNB.coords["time"].values) == len(ACSWDNB1.coords["time"].values)

Multiply the *I_* arrays by 1e9

In [None]:
# h
I_ACSWDNB_1e9 = 1e9*I_ACSWDNB
# h-1
I_ACSWDNB1_1e9 = 1e9*I_ACSWDNB1

Now calculate SWDN

In [None]:
SWDN = ((ACSWDNB + I_ACSWDNB_1e9) - (ACSWDNB1 + I_ACSWDNB1_1e9)) / 3600

Let's visualize some of the results.

In [None]:
# SWDN.sel(time="2000-06-01 10:00").hvplot(x='x', y='y', crs=crs, rasterize=True, cmap='turbo', tiles='OSM')

In [None]:
# SWDN.sel(time="2000-06-01 23:00").hvplot(x='x', y='y', crs=crs, rasterize=True, cmap='turbo', tiles='OSM')

Next, pad a NaN to the beginning of the time dimension to match the original dataset's dimension length, and then reset the time coordinate to the original values.

In [None]:
SWDN = SWDN.pad({"time":(1,0)})
SWDN.coords["time"] = c404_drb["ACSWDNB"].coords["time"]

Now, we will do the same steps to calculate the other three building blocks of *RNET*.

SWUP

In [None]:
# current hour (h): every time step except the first
ACSWUPB = c404_drb["ACSWUPB"][1:]
I_ACSWUPB = c404_drb["I_ACSWUPB"][1:]

# previous hour (h-1): every time step except the last, relabeled with the
# (h) time coordinate so the two sets line up for the subtraction
ACSWUPB1 = c404_drb["ACSWUPB"][:-1].assign_coords(time=ACSWUPB.coords["time"])
I_ACSWUPB1 = c404_drb["I_ACSWUPB"][:-1].assign_coords(time=I_ACSWUPB.coords["time"])

# scale the bucket variables by 1e9
I_ACSWUPB_1e9 = 1e9*I_ACSWUPB
I_ACSWUPB1_1e9 = 1e9*I_ACSWUPB1

# difference between the (h) and (h-1) totals, divided by 3600
swup_now = ACSWUPB + I_ACSWUPB_1e9
swup_before = ACSWUPB1 + I_ACSWUPB1_1e9
SWUP = (swup_now - swup_before) / 3600

# pad one step at the start of time and restore the c404_drb time coordinate
SWUP = SWUP.pad({"time":(1,0)}).assign_coords(time=c404_drb["ACSWUPB"].coords["time"])

LWDN

In [None]:
# current hour (h): every time step except the first
ACLWDNB = c404_drb["ACLWDNB"][1:]
I_ACLWDNB = c404_drb["I_ACLWDNB"][1:]

# previous hour (h-1): every time step except the last, relabeled with the
# (h) time coordinate so the two sets line up for the subtraction
ACLWDNB1 = c404_drb["ACLWDNB"][:-1].assign_coords(time=ACLWDNB.coords["time"])
I_ACLWDNB1 = c404_drb["I_ACLWDNB"][:-1].assign_coords(time=I_ACLWDNB.coords["time"])

# scale the bucket variables by 1e9
I_ACLWDNB_1e9 = 1e9*I_ACLWDNB
I_ACLWDNB1_1e9 = 1e9*I_ACLWDNB1

# difference between the (h) and (h-1) totals, divided by 3600
lwdn_now = ACLWDNB + I_ACLWDNB_1e9
lwdn_before = ACLWDNB1 + I_ACLWDNB1_1e9
LWDN = (lwdn_now - lwdn_before) / 3600

# pad one step at the start of time and restore the c404_drb time coordinate
LWDN = LWDN.pad({"time":(1,0)}).assign_coords(time=c404_drb["ACLWDNB"].coords["time"])

LWUP

In [None]:
# current hour (h): every time step except the first
ACLWUPB = c404_drb["ACLWUPB"][1:]
I_ACLWUPB = c404_drb["I_ACLWUPB"][1:]

# previous hour (h-1): every time step except the last, relabeled with the
# (h) time coordinate so the two sets line up for the subtraction
ACLWUPB1 = c404_drb["ACLWUPB"][:-1].assign_coords(time=ACLWUPB.coords["time"])
I_ACLWUPB1 = c404_drb["I_ACLWUPB"][:-1].assign_coords(time=I_ACLWUPB.coords["time"])

# scale the bucket variables by 1e9
I_ACLWUPB_1e9 = 1e9*I_ACLWUPB
I_ACLWUPB1_1e9 = 1e9*I_ACLWUPB1

# difference between the (h) and (h-1) totals, divided by 3600
lwup_now = ACLWUPB + I_ACLWUPB_1e9
lwup_before = ACLWUPB1 + I_ACLWUPB1_1e9
LWUP = (lwup_now - lwup_before) / 3600

# pad one step at the start of time and restore the c404_drb time coordinate
LWUP = LWUP.pad({"time":(1,0)}).assign_coords(time=c404_drb["ACLWUPB"].coords["time"])

With all the parts, calculate RNET..

In [None]:
# calculate
RNET = SWDN + LWDN - SWUP - LWUP
# RNET

...assign its attributes...

In [None]:
# dictionary of attributes
RNET_attrs = {'description': 'NET RADIATION FROM PAST HOUR FOR BUCKET',
 'grid_mapping': 'crs',
 'long_name': 'Bucket net radiation',
 'units': 'W m-2'
}

# assign attributes
RNET = RNET.assign_attrs(RNET_attrs)
# RNET

 and assign it back to CONUS404

In [None]:
c404_drb = c404_drb.assign(RNET=RNET)

Now drop the extra radiation variables

In [None]:
c404_variables_drop = ["ACSWDNB", "I_ACSWDNB", "ACSWUPB", "I_ACSWUPB", "ACLWDNB", "I_ACLWDNB", "ACLWUPB", "I_ACLWUPB"]
c404_drb = c404_drb.drop_vars(c404_variables_drop)

Visualize RNET

In [None]:
# c404_drb["RNET"].sel(time="2000-06-01 23:00").hvplot(x='x', y='y', crs=crs, rasterize=True, cmap='turbo', tiles='OSM')

Our dataset has been clipped to the area of interest and all the needed variables calculated. The final bit of engineering is resampling the data from hourly to monthly. Xarray has a built in method `resample()` to do this but it only allows a single aggregation method for all the DataArrays in the DataSet. Unfortunately, the DataArrays need different aggregation techniques: sum for *PREC_ACC_NC* and mean for *RNET* and *TK*. We'll accomplish this by splitting *PREC_ACC_NC* from the dataset, resampling it and the dataset separately, then merging them back together.

In [None]:
# copy data
PREC_ACC_NC = c404_drb["PREC_ACC_NC"]

# resample to 1 month by summing; min_count=1 keeps grid cells that are NaN
# for the whole month (e.g. cells masked out by the clip) as NaN instead of
# reporting them as 0 precipitation
PREC_ACC_NC = PREC_ACC_NC.resample(time="1M").sum(min_count=1)

# copy attributes from original
PREC_ACC_NC.attrs = c404_drb["PREC_ACC_NC"].attrs

# drop from c404_drb
c404_drb = c404_drb.drop_vars("PREC_ACC_NC")

Resample the dataset and aggregate by mean.

In [None]:
c404_drb = c404_drb.resample(time="1M").mean()

Add back the resampled *PREC_ACC_NC*

In [None]:
c404_drb["PREC_ACC_NC"] = PREC_ACC_NC
# c404_drb

Correct attributes as needed

In [None]:
# metadata lives in each variable's .attrs dictionary; indexing the DataArray
# itself with ["description"] does not update the attribute
c404_drb.PREC_ACC_NC.attrs["integration_length"] = 'accumulated over prior month'
c404_drb.RNET.attrs["description"] = "MEAN RADIATION FROM PAST MONTH FOR BUCKET"
c404_drb.TK.attrs["description"] = "MEAN AIR TEMPERATURE AT THE LOWEST MODEL LEVEL OVER THE PREVIOUS MONTH"

Review the final preprocessed dataset

In [None]:
c404_drb

## test out huc6 zonal statistics

In [None]:
# bring in HUC12 boundaries of DRB
# huc12 = pygeohydro.WBD("huc").bygeom(drb.geometry[0], crs)

# set CRS to match ds
# huc12 = huc12.to_crs(crs)
# huc12

In [None]:
# print(drb.geometry)
drb.bounds.minx.values

In [None]:
import requests

In [None]:
import json

# bounding box of the DRB in EPSG:4326 as [minx, miny, maxx, maxy]
env = drb.to_crs(4326).bounds.values.tolist()[0]
url = "https://hydro.nationalmap.gov/arcgis/rest/services/wbd/MapServer/6/query?"

# envelope geometry for the spatial filter
envelope = {"xmin":env[0],"ymin":env[1],"xmax":env[2],"ymax":env[3], "spatialReference": {"wkid":4326}}

# requests flattens a nested dict into repeated "geometry=<key>" pairs, so the
# envelope has to be sent as a JSON string
payload = {"where":"OBJECTID=1",  "f":"pjson", "geometryType":"esriGeometryEnvelope",
           "geometry":json.dumps(envelope)
}

# timeout keeps the notebook from hanging if the service does not respond
r = requests.get(url, params=payload, timeout=60)

In [None]:
print(env)

In [None]:
r.status_code

In [None]:
r.json()

In [None]:
df2 = pd.DataFrame.from_dict(r.json())
df2

In [None]:
r?

In [None]:
from geocube.api.core import make_geocube

### Preprocess reference data

Now that the CONUS404 dataset has been preprocessed, it is also important to do the same with the reference data used in the comparative analysis. In this section, data will be brought in from several sources and preprocessed in data-type-appropriate ways.

#### NOAA's Global Historical Climatology Network - Daily (GHCN-Daily) Dataset
It is always important to review any readme or metadata files for the data you wish to bring in. The [GHCN readme](https://noaa-ghcn-pds.s3.amazonaws.com/readme.txt) is useful because it explains what is in the S3 bucket, the various columns in the datasets, and other information. When we later call in the observational data, the [by station readme](https://noaa-ghcn-pds.s3.amazonaws.com/readme-by_station.txt) provides a more detailed explanation of the data there.

After reading the metadata for the file, it can be seen that only the first three columns are needed to map the stations: the station ID, latitude, and longitude. However, we want to make sure that we are only using HCN stations so we need to also use the HCN/CRN Flag column to filter to HCN sites. 

Start by getting a list of stations from the AWS S3 bucket where the daily data is housed.

In [None]:
ghcn_all = pd.read_csv('s3://noaa-ghcn-pds/ghcnd-stations.txt', sep="\t", header=None)
# ghcn_all.head(2)

As you can see, the file reads in as all the contents from one line ending up in one column. 

So we have to split the column into other columns and retain only the needed columns.

In [None]:
ghcn_all = ghcn_all[0].str.split(" +",expand = True)
# ghcn_all.head(2)

As you can see, columns 0-3 look as we'd expect. However, column 4 is where it starts to get messy as the method for expanding the columns has split up the station names at the spaces between. This means that the HCN flag, which we would expect to be in column 6, could be in columns 6-13. Thankfully, the pandas ```loc``` function makes this filtering easy.

In [None]:
# a station is an HCN site when the "HCN" flag appears in any split column
# from 6 onward (its position shifts with the number of words in the station
# name); checking every trailing column avoids assuming the split stopped at 13
hcn_mask = ghcn_all.loc[:, 6:].eq("HCN").any(axis=1)
ushcn = ghcn_all.loc[hcn_mask].copy()

# after the search, trim the columns and rename to get the data to what is needed to map
ushcn = ushcn.iloc[:, 0:3].rename(columns={0:"station", 1:"lat", 2:"lon"}).copy()
# ushcn.head()

We now need to clip the points to only those in the DRB. We do that by using the latitude and longitude to create a GeoDataFrame...

In [None]:
ushcn_gdf = gpd.GeoDataFrame(ushcn, geometry=gpd.points_from_xy(ushcn['lon'], ushcn['lat'], crs="EPSG:4326"))

# convert to same crs as drb
ushcn_gdf = ushcn_gdf.to_crs(crs)

# ushcn_gdf.plot()

followed by clipping using the *drb* geodataframe above

In [None]:
hcn_drb_gdf = gpd.clip(ushcn_gdf, drb)
# hcn_drb_gdf.plot()

Now we want to pull in the tabular data for all of the DRB stations. These are stored on AWS in an individual CSV for each station named *station.csv*. So, we need to get all of the station IDs from our dataset and use them to create a list of URLs for these.

In [None]:
# build one S3 URL per unique station ID found in the DRB
hcn_drb_data_url = [f"s3://noaa-ghcn-pds/csv/by_station/{station}.csv" for station in hcn_drb_gdf["station"].unique().tolist()]
# print(hcn_drb_data_url[0:3])

In [None]:
# len(hcn_drb_data_url)

We'll now pass that list of URLs to *dask.dataframe.read_csv*, which will read the data in parallel. We'll then refine the entries to those dated 1 October 1979 or later.

In [None]:
hcn_drb_data = dd.read_csv(hcn_drb_data_url, parse_dates=["DATE"], usecols=["ID", "DATE", "ELEMENT", "DATA_VALUE"])

hcn_drb_data = hcn_drb_data.loc[hcn_drb_data["DATE"] >= "1979-10-01"]

Next, we'll refine the dataframe by a list of elements and then compute it.

##### Note: We are using TMAX and TMIN rather than TAVG as TAVG has no records prior to 1998.

In [None]:
# list of elements we are interested in
element_list = ["PRCP", "TMAX", "TMIN"]

hcn_drb_data = hcn_drb_data.loc[hcn_drb_data["ELEMENT"].isin(element_list)]

In [None]:
# check shape
# hcn_drb_data.compute().shape

In [None]:
# how much memory does it take up?
# hcn_drb_data.compute().memory_usage()

The Dask DataFrame is about 21 MB in size.

Similar to the CONUS404 data, we have a little more engineering to do with the data. We need to calculate the average temperature using TMIN and TMAX (in Kelvin) as well as resample the data to a 1 month interval. We'll convert the Dask Dataframe into a Pandas Dataframe to do this.

In [None]:
hcn_drb_data_df = hcn_drb_data.compute()

We start by whittling down to our two temperature elements, dropping the *ELEMENT* column, and grouping our data by *ID* and *DATE* in order to take the mean of *TMIN* and *TMAX* and convert this to degrees Kelvin.

In [None]:
# rows holding either of the two temperature elements
is_temperature = hcn_drb_data_df["ELEMENT"].isin(["TMAX","TMIN"])

# keep those rows, drop ELEMENT, average the remaining values per station and
# date, and name the resulting column TK
hcn_drb_tk = (
    hcn_drb_data_df.loc[is_temperature]
    .drop(columns="ELEMENT")
    .groupby(["ID", "DATE"])
    .mean()
    .rename(columns={"DATA_VALUE":"TK"})
)

# convert from tenths of degrees Celsius to degrees Kelvin
hcn_drb_tk["TK"] = hcn_drb_tk["TK"] * 0.1 + 273.15

# move ID and DATE from the index back to regular columns
hcn_drb_tk = hcn_drb_tk.reset_index()

Isolate the *PRCP* element and rename like TK

In [None]:
# rows holding the precipitation element
is_precipitation = hcn_drb_data_df["ELEMENT"] == "PRCP"

# keep those rows, drop ELEMENT, and rename DATA_VALUE to PREC_ACC_NC
hcn_drb_prcp = (
    hcn_drb_data_df.loc[is_precipitation]
    .drop(columns="ELEMENT")
    .rename(columns={"DATA_VALUE":"PREC_ACC_NC"})
)

# convert from tenths of mm to mm
hcn_drb_prcp["PREC_ACC_NC"] = hcn_drb_prcp["PREC_ACC_NC"] * 0.1

# replace the index with a fresh 0..n-1 range
hcn_drb_prcp = hcn_drb_prcp.reset_index(drop=True)

# hcn_drb_prcp

Combine *TK* and *PRCP* DataFrames

In [None]:
hcn_drb = hcn_drb_tk.merge(hcn_drb_prcp, how="inner", on=["ID", "DATE"])
# hcn_drb

And then resample to 1 month and aggregate

In [None]:
hcn_drb = hcn_drb.groupby("ID").resample("1M", on="DATE").agg({"TK":"mean", "PREC_ACC_NC":"sum"}).reset_index(drop=False)
# hcn_drb

Now add the latitude and longitude coordinates 

In [None]:
# plain (non-geo) table of station coordinates, with columns renamed so the
# station column matches the ID column of hcn_drb
hcn_drb_coords = pd.DataFrame(hcn_drb_gdf.drop(columns="geometry")).rename(
    columns={"station": "ID", "lon":"LONGITUDE", "lat":"LATITUDE"}
)

# attach the coordinates to every monthly record of the matching station
hcn_drb = hcn_drb.merge(hcn_drb_coords, on="ID", how="left")

hcn_drb

#### NOAA's U.S. Climate Reference Network (USCRN) Dataset

Use *fsspec* to make FTP call to NOAA for CRN data <br>
First, create file system

In [None]:
fs = FTPFileSystem("ftp.ncei.noaa.gov")

Since the file type is a *tsv*, we will use the *pd.read_table* function to create a Dataframe

In [None]:
uscrn_all = pd.read_table(fs.open("/pub/data/uscrn/products/stations.tsv")) 
uscrn_all.head()

Now turn into GDF

In [None]:
uscrn_gdf = gpd.GeoDataFrame(uscrn_all, geometry=gpd.points_from_xy(uscrn_all["LONGITUDE"], uscrn_all["LATITUDE"]), crs="EPSG:4326")

# convert to same crs as drb
uscrn_gdf = uscrn_gdf.to_crs(crs)

# uscrn_gdf.plot()

Find which USCRN sites are in DRB

In [None]:
crn_drb_gdf = gpd.clip(uscrn_gdf, drb)
# crn_drb_gdf.plot()

In [None]:
crn_drb_gdf.head()

We now know what CRN sites are in the Delaware River Basin. We must now retrieve the data for this site from the FTP server.

First, we'll get the location name.

In [None]:
crn_stat_name = crn_drb_gdf["LOCATION"].values.tolist()[0]
print(crn_stat_name)

Initialize the FTP connection again

In [None]:
fs = FTPFileSystem("ftp.ncei.noaa.gov")

In [None]:
file_list_glob = fs.glob(
    f"/pub/data/uscrn/products/daily01/**/*{crn_stat_name}*")

In [None]:
# read each station file into its own DataFrame, closing the FTP file handle
# as soon as it has been parsed
station_frames = []
for file in file_list_glob:
    with fs.open(file) as station_file:
        station_frames.append(pd.read_csv(station_file, header=None, sep="\t"))

# concatenate once at the end instead of re-copying the growing frame on every
# iteration; fall back to an empty DataFrame when no files matched
crn_drb = pd.concat(station_frames) if station_frames else pd.DataFrame()

In [None]:
crn_drb = crn_drb[0].str.split(" +",expand = True)
# crn_drb.head()

Now bring in the headers for the station data

In [None]:
crn_headers = fs.open("/pub/data/uscrn/products/daily01/headers.txt")
crn_data_headers = pd.read_csv(crn_headers, sep="\t", header=None).iloc[1,:].str.split(" +").values.tolist()[0][0:28]
# crn_data_headers

Check that the number of headers equals the number of columns in our data

In [None]:
print(len(crn_drb.columns) == len(crn_data_headers))

Now rename the column headers

In [None]:
crn_drb.columns = crn_data_headers
# crn_drb

Convert the *LST_DATE* column to datetime and refine columns to those of interest.

In [None]:
crn_drb["DATE"] = pd.to_datetime(crn_drb["LST_DATE"])
crn_drb = crn_drb[["DATE", "P_DAILY_CALC", "T_DAILY_AVG", "SOLARAD_DAILY", "LONGITUDE", "LATITUDE"]]

If you examine the data types, you'll see that the five numeric columns are actually data type *object* when we need them as numeric

In [None]:
crn_drb.dtypes

Lets rectify that by applying the `pd.to_numeric` function to the columns.

In [None]:
cols = crn_drb.columns.drop("DATE")
crn_drb[cols] = crn_drb[cols].apply(pd.to_numeric, errors='coerce')
crn_drb.dtypes

The CRN dataset has many values of -9999.0, which is where a record was not recorded due to data quality or other issues. 

In [None]:
crn_drb.head(2)

However, when it comes to performing mathematical operations, these values will obviously skew the results. So, we will set these values to NaN, which will be ignored during calculations.

In [None]:
# set to NaN
crn_drb = crn_drb.replace(-9999.0, np.nan)

# add a station ID column
crn_drb["ID"] = crn_stat_name

In [None]:
crn_drb

Now to convert columns to the correct units

In [None]:
# Celsius to Kelvin
crn_drb["TK"] = crn_drb["T_DAILY_AVG"] + 273.15
crn_drb = crn_drb.drop("T_DAILY_AVG", axis=1)

# rename column
crn_drb = crn_drb.rename({"SOLARAD_DAILY": "RNET", "P_DAILY_CALC": "PREC_ACC_NC"}, axis=1)

In [None]:
crn_drb.columns

In [None]:
crn_drb = crn_drb.groupby("ID").resample("1M", on="DATE").agg({"TK":"mean", "PREC_ACC_NC":"sum", "RNET":"mean", "LATITUDE":"mean", "LONGITUDE":"mean"}).reset_index(drop=False)
# crn_drb

#### PRISM data
This time we will open the PRISM dataset, temporally slice it, spatially clip it, and refine the data. Many of the steps will look the same as the CONUS404 dataset so there will be less explanation of the steps.

In [None]:
fs = fsspec.filesystem("s3", anon=False, requester_pays=True, skip_instance_cache=True)

In [None]:
prism_years = range(1979, 2021, 1)
chunks={"time": 6, "lon": 703, "lat": 311}
pr = [xr.open_dataset(fs.open(f"s3://nhgf-development/thredds/prism_v2/prism_{str(year)}.nc"), chunks=chunks, decode_coords="all") for year in prism_years]
prism = xr.concat(pr, dim="time")
prism = prism.drop_vars("time_bnds")

# prism

In [None]:
# NAD83
prism_crs = 4269

# write crs to prism
prism.rio.write_crs(prism_crs, inplace=True)

Rename the dimensions to match CF conventions used by rioxarray

In [None]:
prism = prism.rename({"lon":"x", "lat":"y", "ppt": "PREC_ACC_NC"}) 

Bring in DRB boundaries and reproject to match PRISM crs.

In [None]:
# bring in boundaries of DRB and create single polygon
drb_NAD83 = pygeohydro.WBD("huc6", outfields=["huc6", "name"]).byids("huc6", ["020401", "020402"])
# create a column where all entries have the same value
drb_NAD83["name"] = "DRB"

# dissolve by that column
drb_NAD83 = drb_NAD83.dissolve(by="name")

# set CRS to match ds
drb_NAD83 = drb_NAD83.iloc[[0]].to_crs(prism_crs)

#visualize
# drb_NAD83.plot()

In [None]:
# clip to DRB
prism_drb = prism.rio.clip(drb_NAD83.geometry, crs=prism_crs, drop=True, invert=False)

#slice time
prism_drb = prism_drb.sel(time=slice(start_time, end_time))

Display the clipped data

In [None]:
prism_drb.sel(time="2000-06-01", method="nearest").hvplot(x='x', y='y', geo=True, rasterize=True, tiles='OSM', alpha=0.7, cmap='turbo')

Calculate the mean monthly temperature, convert it to Kelvin, and populate its attributes

In [None]:
# mean temperature in Kelvin
prism_drb = prism_drb.assign(TK = ((prism_drb.tmn+prism_drb.tmx)/2) + 273.15) 

# dictionary of attributes
prism_tk_attrs = {'units ': 'degrees Kelvin',
 'long_name': 'Mean monthly temperature'
}

# assign attributes
prism_drb["TK"] = prism_drb["TK"].assign_attrs(prism_tk_attrs)

# drop variables
prism_drb = prism_drb.drop_vars(["tmn", "tmx"])

prism_drb

# Update: method for exporting data to NGHF

In [None]:
# # Last code cell of the notebook
# import watermark.watermark as watermark
# print(watermark(iversions=True, python=True, machine=True, globals_=globals()))

In [None]:
client.close(); cluster.shutdown()