# Comparative analysis of CONUS404 to reference datasets

<img src='./Eval_Analysis.svg' width=600>

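With our datasets prepared in the data preparation notebooks, we can now compare CONUS404 against the reference datasets: compute weighted annual and long-term means, calculate zonal (HUC6) and areal statistics, and extract gridded values at station locations.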

In [None]:
# library imports
import fsspec #testing
import hvplot.xarray #testing
import intake #testing
import os #testing
import warnings #testing
import rioxarray #testing
import dask #testing
import metpy #testing
import calendar #testing

from shapely.geometry import Polygon #testing
from dask.distributed import LocalCluster, Client #testing
from pygeohydro import pygeohydro #testing
from fsspec.implementations.ftp import FTPFileSystem #testing
from holoviews.streams import PolyEdit, PolyDraw #testing
from geocube.api.core import make_geocube #testing

import xarray as xr #testing
import geopandas as gpd #testing
import pandas as pd #testing
import geoviews as gv #testing
import dask.dataframe as dd #testing
import numpy as np #testing  

warnings.filterwarnings('ignore')

# Update to helper function after repo consolidation
## **Start a Dask client using an appropriate Dask Cluster** 
This is an optional step, but can speed up data loading significantly, especially when accessing data from the cloud.

In [None]:
def configure_cluster(machine):
    """Start a Dask cluster suited to ``machine`` and connect a client to it.

    Parameters
    ----------
    machine : str
        Name of the compute environment. Supported values are ``'denali'``,
        ``'tallgrass'``, ``'local'`` and ``'esip-qhub-gateway-v0.4'``.

    Returns
    -------
    tuple
        ``(client, cluster)`` for the cluster that was started.

    Raises
    ------
    ValueError
        If ``machine`` is not one of the supported names. Previously this
        case fell through to the ``return`` and failed with an
        ``UnboundLocalError``.
    """
    if machine == 'denali':
        from dask.distributed import LocalCluster, Client
        cluster = LocalCluster(threads_per_worker=1)
        client = Client(cluster)

    elif machine == 'tallgrass':
        from dask.distributed import Client
        from dask_jobqueue import SLURMCluster
        cluster = SLURMCluster(queue='cpu', cores=1, interface='ib0',
                               job_extra=['--nodes=1', '--ntasks-per-node=1', '--cpus-per-task=1'],
                               memory='6GB')
        cluster.adapt(maximum_jobs=30)
        client = Client(cluster)

    elif machine == 'local':
        import os
        import warnings
        from dask.distributed import LocalCluster, Client
        warnings.warn("Running locally can result in costly data transfers!\n")
        n_cores = os.cpu_count() # set to match your machine
        cluster = LocalCluster(threads_per_worker=n_cores)
        client = Client(cluster)

    elif machine in ['esip-qhub-gateway-v0.4']:
        import sys
        import os
        # ebdpy lives in a shared directory under $HOME on this deployment
        sys.path.append(os.path.join(os.environ['HOME'],'shared','users','lib'))
        import ebdpy as ebd
        aws_profile = 'nhgf-development'
        # NOTE(review): credentials are set twice; this first call (without
        # region/endpoint) looks redundant -- confirm before removing.
        ebd.set_credentials(profile=aws_profile)

        aws_region = 'us-west-2'
        endpoint = f's3.{aws_region}.amazonaws.com'
        ebd.set_credentials(profile=aws_profile, region=aws_region, endpoint=endpoint)
        worker_max = 30
        client,cluster = ebd.start_dask_cluster(profile=aws_profile, worker_max=worker_max,
                                              region=aws_region, use_existing_cluster=True,
                                              adaptive_scaling=False, wait_for_cluster=False,
                                              worker_profile='Medium Worker', propagate_env=True)

    else:
        # Fail loudly instead of reaching the return with unbound names
        raise ValueError(
            f"Unknown machine {machine!r}; expected one of 'denali', "
            "'tallgrass', 'local', 'esip-qhub-gateway-v0.4'"
        )

    return client, cluster

### Setup your cluster

#### QHub...
Uncomment the lines that begin with a single `#` to run this cell.

In [None]:
# set machine
machine = 'esip-qhub-gateway-v0.4'

# use configure cluster helper function to setup dask
client, cluster = configure_cluster(machine)

#### or HPC
Uncomment the lines that begin with a single `#` to run this cell.

In [None]:
## set machine
# machine = os.environ['SLURM_CLUSTER_NAME']

## use configure_cluster helper function to setup dask
# client, cluster = configure_cluster(machine)

# Calculate weighted annual means

Create a function to calculate weighted annual means

Credit: [NCAR](https://ncar.github.io/esds/posts/2021/yearly-averages-xarray/)

In [None]:
def weighted_temporal_mean(ds, var):
    """Return the annual mean of ``ds[var]``, weighted by days in each month.

    Each time step is weighted by the number of days in its month divided by
    the total number of days of all time steps in the same calendar year.
    Missing values are excluded: the weighted sum is divided by the sum of
    the weights of the non-null time steps only.

    Parameters
    ----------
    ds : xarray.Dataset
        Dataset with a ``time`` coordinate that supports the ``.dt`` accessor.
    var : str
        Name of the data variable in ``ds`` to average.

    Returns
    -------
    xarray.DataArray
        ``ds[var]`` resampled to year-start frequency along ``time``.

    Raises
    ------
    AssertionError
        If the weights within any year do not sum to 1.
    """
    # Determine the month length
    month_length = ds.time.dt.days_in_month

    # Calculate the weights
    wgts = month_length.groupby("time.year") / month_length.groupby("time.year").sum()

    # Make sure the weights in each year add up to 1
    np.testing.assert_allclose(wgts.groupby("time.year").sum(xr.ALL_DIMS), 1.0)

    # Subset our dataset for our variable
    obs = ds[var]

    # Setup our masking for nan values
    cond = obs.isnull()
    ones = xr.where(cond, 0.0, 1.0)

    # Calculate the numerator.
    # "YS" (year start) is the supported spelling of the deprecated "AS" alias.
    obs_sum = (obs * wgts).resample(time="YS").sum(dim="time")

    # Calculate the denominator (sum of weights of the non-null time steps)
    ones_out = (ones * wgts).resample(time="YS").sum(dim="time")

    # Return the weighted average
    return obs_sum / ones_out

**CONUS404**

In [None]:
# url to c404_drb
c404_drb_url = 's3://nhgf-development/workspace/tutorial/CONUS404/c404_drb.nc'

fs = fsspec.filesystem("s3", anon=False, requester_pays=True, skip_instance_cache=True)

# open dataset
c404_drb = xr.open_dataset(fs.open(c404_drb_url), decode_coords="all")

# set crs
c404_crs = c404_drb.rio.crs.to_proj4()

# see what data variables ds has
print(c404_drb.data_vars)

# see what coords ds has
print(c404_drb.coords)

In [None]:
c404_RNET = weighted_temporal_mean(c404_drb, "RNET").assign_attrs(c404_drb.RNET.attrs)
c404_PREC_ACC_NC = weighted_temporal_mean(c404_drb, "PREC_ACC_NC").assign_attrs(c404_drb.PREC_ACC_NC.attrs)
c404_TK = weighted_temporal_mean(c404_drb, "TK").assign_attrs(c404_drb.TK.attrs)

c404_drb_yr = c404_RNET.to_dataset(name="RNET")
c404_drb_yr = c404_drb_yr.assign(PREC_ACC_NC = c404_PREC_ACC_NC, TK=c404_TK) \
                        .assign_attrs(c404_drb.attrs) \
                        .assign_coords({"crs":c404_drb.coords["crs"]})
c404_drb_yr

**PRISM**

In [None]:
# url to prism_drb
prism_drb_url = 's3://nhgf-development/workspace/tutorial/CONUS404/prism_drb.nc'

fs = fsspec.filesystem("s3", anon=False, requester_pays=True, skip_instance_cache=True)

# open dataset
prism_drb = xr.open_dataset(fs.open(prism_drb_url), decode_coords="all")

# see what data variables ds has
print(prism_drb.data_vars)

# see what coords ds has
print(prism_drb.coords)

In [None]:
prism_PREC_ACC_NC = weighted_temporal_mean(prism_drb, "PREC_ACC_NC").assign_attrs(prism_drb.PREC_ACC_NC.attrs)
prism_TK = weighted_temporal_mean(prism_drb, "TK").assign_attrs(prism_drb.TK.attrs)

prism_drb_yr = prism_PREC_ACC_NC.to_dataset(name="PREC_ACC_NC")
prism_drb_yr = prism_drb_yr.assign(TK=prism_TK) \
                        .assign_attrs(prism_drb.attrs) \
                        .assign_coords({"spatial_ref":prism_drb.coords["spatial_ref"]})
prism_drb_yr

**CERES-EBAF**

In [None]:
# url to ceres_drb
ceres_drb_url = 's3://nhgf-development/workspace/tutorial/CONUS404/ceres_drb.nc'

fs = fsspec.filesystem("s3", anon=False, requester_pays=True, skip_instance_cache=True)

# open dataset
ceres_drb = xr.open_dataset(fs.open(ceres_drb_url), decode_coords="all", chunks={"time":10})

# see what data variables ds has
print(ceres_drb.data_vars)

# see what coords ds has
print(ceres_drb.coords)

In [None]:
ceres_RNET = weighted_temporal_mean(ceres_drb, "RNET").assign_attrs(ceres_drb.RNET.attrs)

ceres_drb_yr = ceres_RNET.to_dataset(name="RNET")
ceres_drb_yr = ceres_drb_yr.assign_attrs(ceres_drb.attrs) \
                        .assign_coords({"spatial_ref":ceres_drb.coords["spatial_ref"]})
ceres_drb_yr

## Calculate long-term means

In [None]:
c404_drb_ltm = c404_drb_yr.mean(dim="time")
# c404_drb_ltm = c404_drb_ltm.persist()

prism_drb_ltm = prism_drb_yr.mean(dim="time")
# prism_drb_ltm = prism_drb_ltm.persist()

ceres_drb_ltm = ceres_drb_yr.mean(dim="time")
# ceres_drb_ltm = ceres_drb_ltm.persist()

## Set up geographic areas for any zonal statistics

Read in HUC6 boundaries

In [None]:
# bring in HUC6 boundaries found in the DRB
drb_gdf = pygeohydro.WBD("huc6", outfields=["huc6", "name"]).byids("huc6", ["020401", "020402"])

# set CRS to match c404_drb
drb_gdf = drb_gdf.to_crs(c404_crs)

# convert huc6 field to int as this works best for the following steps
drb_gdf["huc6"] = drb_gdf["huc6"].astype(int) #note: this may drop the # of digits from 6 to less depending on how many zeroes there were, may need to pad back to 6 digits later

#visualize
# drb_gdf.plot(edgecolor="orange", facecolor="purple", linewidth=2.5)

## Create datamask and build new dataset

In [None]:
# CONUS404

# create an output grid
c404_out_grid = make_geocube(
    vector_data = drb_gdf,
    measurements=["huc6"],
    like=c404_drb_ltm.rio.write_crs(c404_crs)
)

# add datarrays to grid
c404_out_grid["RNET"] = (c404_drb_ltm.RNET.dims, c404_drb_ltm.RNET.values, 
                         c404_drb_ltm.RNET.attrs, c404_drb_ltm.RNET.encoding)

c404_out_grid["TK"] = (c404_drb_ltm.TK.dims, c404_drb_ltm.TK.values,
                         c404_drb_ltm.TK.attrs, c404_drb_ltm.TK.encoding)

c404_out_grid["PREC_ACC_NC"] = (c404_drb_ltm.PREC_ACC_NC.dims, c404_drb_ltm.PREC_ACC_NC.values,
                         c404_drb_ltm.PREC_ACC_NC.attrs, c404_drb_ltm.PREC_ACC_NC.encoding)


# c404_grouped = c404_out_grid.drop_vars("spatial_ref").groupby(c404_out_grid.huc6)

# PRISM

# create an output grid
prism_out_grid = make_geocube(
    vector_data = drb_gdf,
    measurements=["huc6"],
    like=prism_drb_ltm.rio.write_crs(c404_crs)
)

# add datarrays to grid
prism_out_grid["TK"] = (prism_drb_ltm.TK.dims, prism_drb_ltm.TK.values,
                         prism_drb_ltm.TK.attrs, prism_drb_ltm.TK.encoding)

prism_out_grid["PREC_ACC_NC"] = (prism_drb_ltm.PREC_ACC_NC.dims, prism_drb_ltm.PREC_ACC_NC.values,
                         prism_drb_ltm.PREC_ACC_NC.attrs, prism_drb_ltm.PREC_ACC_NC.encoding)

# groupby
# prism_grouped = prism_out_grid.drop_vars("spatial_ref").groupby(prism_out_grid.huc6)

# CERES-EBAF

# create an output grid
ceres_out_grid = make_geocube(
    vector_data = drb_gdf,
    measurements=["huc6"],
    like=ceres_drb_ltm.rio.write_crs(c404_crs)
)

# add datarrays to grid
ceres_out_grid["RNET"] = (ceres_drb_ltm.RNET.dims, ceres_drb_ltm.RNET.values,
                         ceres_drb_ltm.RNET.attrs, ceres_drb_ltm.RNET.encoding)

# groupby
# ceres_grouped = ceres_out_grid.drop_vars("spatial_ref").groupby(ceres_out_grid.huc6)

In [None]:
prism_PREC_ACC_NC_bias = c404_out_grid.PREC_ACC_NC - prism_out_grid.PREC_ACC_NC

In [None]:
prism_PREC_ACC_NC_bias["huc6"] = c404_out_grid.huc6

In [None]:
prism_PREC_ACC_NC_bias.groupby("huc6").mean()

Areal statistics

In [None]:
%run ../../Metrics_NWM_StdSuite_v1.ipynb

In [None]:
def mae(obs, sim):
    """Return the mean absolute error between ``obs`` and ``sim``.

    The element-wise difference ``obs - sim`` is taken first, so the two
    inputs must support subtraction with each other.
    """
    abs_error = np.abs(obs - sim)
    return np.mean(abs_error)

In [None]:
print("prism:", prism_drb_ltm.data_vars, "\nceres", ceres_drb_ltm.data_vars)

In [None]:
# (variable, reference label, reference long-term-mean dataset) for every
# comparison made against CONUS404
comparisons = (
    ("PREC_ACC_NC", "prism", prism_drb_ltm),
    ("TK", "prism", prism_drb_ltm),
    ("RNET", "ceres", ceres_drb_ltm),
)

areal_stats = {}

# descriptive statistics: for each variable and statistic, CONUS404 first
# and then the reference dataset
for var, ref_name, ref_ltm in comparisons:
    for suffix, reducer in (("mean", "mean"), ("med", "median"), ("std", "std")):
        for label, ltm in (("c404", c404_drb_ltm), (ref_name, ref_ltm)):
            computed = ltm[var].compute()
            areal_stats[f"{label}_{var}_{suffix}"] = getattr(computed, reducer)().data.tolist()

# pairwise metrics between CONUS404 and each reference, keyed by reference name
pairwise_metrics = (
    ("corr", lambda sim, ref: xr.corr(sim, ref)),
    ("mae", lambda sim, ref: mae(sim, ref)),
    ("rmsd", lambda sim, ref: np.sqrt(MSE(sim, ref))),
)
for suffix, metric in pairwise_metrics:
    for var, ref_name, ref_ltm in comparisons:
        result = metric(c404_drb_ltm[var], ref_ltm[var])
        areal_stats[f"{ref_name}_{var}_{suffix}"] = result.compute().data.tolist()

areal_stats

In [None]:
c404_drb_ltm.PREC_ACC_NC.median().data.tolist()

In [None]:
print(c404_drb_ltm.PREC_ACC_NC.min(), c404_drb_ltm.PREC_ACC_NC.max())

In [None]:
print(c404_drb_ltm.PREC_ACC_NC.std().data.tolist(), prism_drb_ltm.PREC_ACC_NC.std().data.tolist())

## **Extract gridded values to points**

The goal of this section is to extract values from CONUS404 where they intersect with station data. This process is described in the article about the ESRI tool [Extract Values to Points](https://pro.arcgis.com/en/pro-app/latest/tool-reference/spatial-analyst/extract-values-to-points.htm). This tabular data will then be exported for use in the next notebook, **CONUS404 Analysis**.

Dataset outline:
1. Read in the prepared dataset
2. Extract data from overlapping pixel at same time step as point
<br>

**Climate Reference Network point extraction**

In [None]:
fs = fsspec.filesystem("s3", anon=False, requester_pays=True, skip_instance_cache=True)

crn_drb_df = pd.read_parquet(fs.open("s3://nhgf-development/workspace/tutorial/CONUS404/crn_drb.parquet"))

# create geodataframe
crn_drb = gpd.GeoDataFrame(crn_drb_df, crs=4326,
                       geometry=gpd.points_from_xy(crn_drb_df.LONGITUDE, 
                                                         crn_drb_df.LATITUDE))

# modify date field
crn_drb["DATE"] = crn_drb["DATE"].astype(str).str[:-3]

crn_drb.rename({"DATE": "time",
                "TK": "crn_TK", 
                "RNET": "crn_RNET", 
                "PREC_ACC_NC": "crn_PREC_ACC_NC"},
                  axis=1, inplace=True)

crn_drb.head()

Get coordinates from crn_drb to index c404_drb by

In [None]:
# isolate single row and transform to c404_drb crs
crn_coords_gdf = crn_drb.iloc[[0]].to_crs(c404_crs)

# extract lat/long values (projected y/x in the c404_drb crs)
crn_lat = crn_coords_gdf.iloc[0]["geometry"].y
crn_lon = crn_coords_gdf.iloc[0]["geometry"].x

# time range of the station record; read from crn_drb because the
# DATE -> time rename was applied to it in place, not to crn_drb_df
crn_time_min = crn_drb["time"].min()
crn_time_max = crn_drb["time"].max()

# subset c404_drb to lat/long using nearest
c404_crn_sub = c404_drb.sel(x=crn_lon, y=crn_lat, method="nearest")

# slice to time-steps of crn_drb
c404_crn_sub = c404_crn_sub.sel(time=slice(crn_time_min, crn_time_max))

c404_crn_sub

Convert subset to dataframe and reorganize columns

In [None]:
c404_sub_crn_df = c404_crn_sub.to_dataframe().reset_index(drop=False)

# trim columns
c404_sub_crn_df = c404_sub_crn_df[["time", "TK", "RNET", "PREC_ACC_NC"]]

# rename columns
c404_sub_crn_df.rename({"TK": "c404_TK", 
                    "RNET": "c404_RNET", 
                    "PREC_ACC_NC": "c404_PREC_ACC_NC"},
                  axis=1, inplace=True)

# trim time
c404_sub_crn_df["time"] = c404_sub_crn_df["time"].astype(str).str[:-3]

c404_sub_crn_df

Combine CONUS404 subset with CRN data

In [None]:
crn_c404_point = crn_drb.merge(c404_sub_crn_df, on="time").reset_index(drop=False)

# drop columns
crn_c404_point.drop(["index", "LATITUDE", "LONGITUDE", "ID", "geometry"], axis=1, inplace=True)

crn_c404_point.head()

Export dataset

In [None]:
crn_c404_point.to_parquet("s3://nhgf-development/workspace/tutorial/CONUS404/crn_c404_point.parquet")

**Historical Climatology Network (HCN) point extraction**

The HCN data is different than the CRN data as the HCN data comes from multiple stations whereas the CRN data was from a single station. This will involve using multiple sets of geographic coordinates to extract data from CONUS404.

In [None]:
# read in the HCN dataset
fs = fsspec.filesystem("s3", anon=False, requester_pays=True, skip_instance_cache=True)

hcn_drb_df = pd.read_parquet(fs.open("s3://nhgf-development/workspace/tutorial/CONUS404/hcn_drb.parquet"))

#rename columns
hcn_drb_df.rename({"DATE": "time",
                "TK": "hcn_TK",  
                "PREC_ACC_NC": "hcn_PREC_ACC_NC"},
                  axis=1, inplace=True)

# trim the last three characters from the time strings
hcn_drb_df["time"] = hcn_drb_df["time"].astype(str).str[:-3]

hcn_drb_df.head()

Get a DataFrame of the station IDs, lats, and longs to use for extracting data

In [None]:
hcn_stations = hcn_drb_df.copy().drop(["time", "hcn_TK", "hcn_PREC_ACC_NC"], axis=1)
hcn_stations["LONGITUDE"] = pd.to_numeric(hcn_stations["LONGITUDE"])
hcn_stations["LATITUDE"] = pd.to_numeric(hcn_stations["LATITUDE"])

hcn_stations = hcn_stations.groupby('ID').mean().reset_index(drop=False)
# hcn_stations

Create a GeoDataFrame to convert the lat and long to the coordinate system of CONUS404

In [None]:
hcn_stations_gdf = gpd.GeoDataFrame(hcn_stations, crs=4326,
                       geometry=gpd.points_from_xy(hcn_stations.LONGITUDE, 
                                                         hcn_stations.LATITUDE))

# transform to c404_drb crs
hcn_stations_gdf = hcn_stations_gdf.to_crs(c404_crs)

# extract lat/long values
hcn_stations_gdf["y"] = hcn_stations_gdf["geometry"].y
hcn_stations_gdf["x"] = hcn_stations_gdf["geometry"].x

#drop lat/lon/geo
hcn_stations_df = hcn_stations_gdf.drop(["LATITUDE", "LONGITUDE", "geometry"], axis=1)

Subset c404_drb to time period of HCN

In [None]:
# time min/max
hcn_time_min = hcn_drb_df["time"].min()
hcn_time_max = hcn_drb_df["time"].max()

# slice c404 to HCN time
c404_hcn_timesub = c404_drb.sel(time=slice(hcn_time_min, hcn_time_max))

Use Dataframe rows to extract data from c404_drb

In [None]:
# one DataFrame of CONUS404 values per HCN station, tagged with the station ID
c404_hcn_subs = []

station_columns = zip(hcn_stations_df["ID"], hcn_stations_df["x"], hcn_stations_df["y"])
for station_id, station_x, station_y in station_columns:
    # nearest CONUS404 grid cell to the station location
    nearest_cell = c404_hcn_timesub.sel(x=station_x, y=station_y, method="nearest")
    station_frame = nearest_cell.to_dataframe()
    station_frame["ID"] = station_id
    c404_hcn_subs.append(station_frame)

# stack the stations, bring the index back as columns, then drop the columns
# that are not needed and prefix the CONUS404 variables
c404_hcn_sub = (
    pd.concat(c404_hcn_subs)
    .reset_index(drop=False)
    .drop(columns=["RNET", "lon", "lat", "y", "x", "crs"])
    .rename(columns={"TK": "c404_TK", "PREC_ACC_NC": "c404_PREC_ACC_NC"})
)

# trim time
c404_hcn_sub["time"] = c404_hcn_sub["time"].astype(str).str[:-3]

# c404_hcn_sub

Merge CONUS404 observations to HCN observations using the station ID and time

In [None]:
hcn_c404_point = hcn_drb_df.merge(c404_hcn_sub, left_on=["ID", "time"], right_on=["ID", "time"])

# drop columns
hcn_c404_point.drop(["LATITUDE", "LONGITUDE"], axis=1, inplace=True)

hcn_c404_point.head()

Export the dataset

In [None]:
hcn_c404_point.to_parquet("s3://nhgf-development/workspace/tutorial/CONUS404/hcn_c404_point.parquet")

Check all of the files that have been created in the data preparation notebooks

In [None]:
fs = fsspec.filesystem("s3", anon=False, requester_pays=True, skip_instance_cache=True)

fs.ls("s3://nhgf-development/workspace/tutorial/CONUS404")

Shut down the client and cluster

In [None]:
client.close(); cluster.shutdown()
del client; del cluster

# Next: CONUS404 Analysis notebook

Now that we have moved through our data preparation and calculated zonal and point statistics, we can move on to analyzing the differences between CONUS404 and the reference data in the CONUS404 Analysis notebook.

In [None]:
# # Last code cell of the notebook
# import watermark.watermark as watermark
# print(watermark(iversions=True, python=True, machine=True, globals_=globals()))