## Finalize the output files

Edit variable names, metadata, etc.

### Imports

In [None]:
# --- Import Modules --- #

# Import Python Core Modules
import sys
import os
import time
import datetime

# Import Additional Modules
import numpy as np
import xarray as xr
import pandas as pd
import geopandas as gpd

# Record the wall-clock start time; the final cell uses `tic` to report the total runtime
tic = time.time()
print('Process initiated at {0}'.format(time.ctime()))
# --- End Import Modules --- #

In [None]:
# Input netCDF file to be finalized (opened below with xarray)
in_nc = r'/path/to/outputs/agg_out/CONUS_HUC12_WB_combined_19791001_20220930.nc'

# Output directory
outDir = r'/path/to/outputs/agg_out/'

# Output files
# NOTE(review): the output name says WY2011_2013 while the input name spans
# 19791001-20220930 -- confirm the output name reflects the intended period.
out_nc = os.path.join(outDir, 'huc12_monthly_wb_iwaa_wrfhydro_WY2011_2013.nc')
#out_csv = os.path.join(outDir, 'huc12_monthly_wb_iwaa_wrfhydro_WY2011_2013_2.csv')

# Select output formats
write_NC = True      # Output netCDF file
#write_CSV = True     # Output CSV file

# Name the zone dimension (as it is named in the input file)
zone_name = 'WBDHU12'

# Name the time dimension
time_coord = 'time'

### Dictionaries to rename variables and set attributes

In [None]:
# Dictionary to rename variables (input name -> output name). Also used to subset
# the dataset by variable name, so every variable to keep must be listed here,
# including those whose name does not change.
#
# Each input name may appear only once: a repeated key in a dict literal is
# silently overridden by its last occurrence. The input variable 'Precip' keeps
# its name, while 'totPRECIP' becomes 'PrecipLand'.
var_rename_dict = {'totPRECIP':'PrecipLand',
                   'deltaACSNOW':'Snowfall',
                   'totqSfcLatRunoff':'Surfaceflow',
                   'totqBucket':'Baseflow',
                   'deltaACCET':'ET',
                   'avgSNEQV':'SWE',
                   'avgSOILM':'SoilWater',
                   'avgSOILSAT':'SoilSat',
                   'deltaUGDRNOFF':'Recharge',
                   'bucket_depth':'GWStore',
                   'Area_sqkm':'CatchmentArea',
                   'Precip':'Precip',
                   'landmask':'LandFraction',
                   'total_gridded_area': 'total_gridded_area',
                   'avgSOILM_wltadj_depthmean': 'avgSOILM_wltadj_depthmean',
                   'avgSOILSAT_wltadj_top1': 'avgSOILSAT_wltadj_top1',}

# Rename dimensions: maps the zone dimension (`zone_name`) to its output name 'huc_id'
rename_dim_dict = {zone_name:'huc_id'}

# Variable attributes dictionary, keyed by the output (renamed) variable name.
# Each entry holds the attributes assigned to that variable further below.
var_atts_dict = {
    'Precip': dict(
        units='mm',
        long_name="Total monthly precipitation (land & water)"),
    'PrecipLand': dict(
        units='mm',
        long_name="Total monthly precipitation (land only)"),
    'Snowfall': dict(
        units='mm',
        long_name="Total monthly snowfall (land only)"),
    'Surfaceflow': dict(
        units='mm',
        long_name="Total monthly surface flow"),
    'Baseflow': dict(
        units='mm',
        long_name="Total monthly baseflow"),
    'ET': dict(
        units='mm',
        long_name="Total monthly evapotranspiration (land only)"),
    'SWE': dict(
        units='mm',
        long_name="Average monthly snow water equivalent (land only)"),
    'SoilWater': dict(
        units='mm',
        long_name="Average monthly soil moisture in 2m soil column (land only)"),
    'SoilSat': dict(
        units='fraction',
        long_name="Average monthly fractional soil saturation in 2m soil column (land only)"),
    'Recharge': dict(
        units='mm',
        long_name="Total monthly recharge (land only)"),
    'GWStore': dict(
        units='mm',
        long_name="Average monthly groundwater store"),
    'LandFraction': dict(
        units='fraction',
        standard_name='area_fraction',
        long_name="Land fraction of HUC12 from gridded data"),
    'CatchmentArea': dict(
        units='square kilometers',
        long_name="Total NWM catchment area (square kilometers)"),
}

# Global attributes dictionary; 'history' is stamped with the time this cell runs
out_global_atts = dict(
    title="HUC12 monthly WRF-Hydro modeling application",
    institution="USGS",
    history="Author, {}".format(time.ctime()),
)

In [None]:
# Open the input netCDF file and display the resulting dataset
ds = xr.open_dataset(in_nc)
ds

### Add the character (string) HUC12 IDs back in from the source file

In [None]:
%%time

# Read only the attribute table of the HUC12 polygon layer (geometry is skipped
# via ignore_geometry=True). No reprojection is performed in this cell.
# NOTE(review): this is an absolute, site-specific path -- consider defining it
# alongside the other input paths in the configuration cell.
HUC_gpkg = r'/caldera/hovenweep/projects/usgs/water/impd/hytest/niwaa_wrfhydro_monthly_huc12_aggregations_sample_data/HUC12_grids/HUC12.gpkg'
HUC_gdf = gpd.read_file(HUC_gpkg, layer='WBDHU12_CONUS', ignore_geometry=True)

### Assign the coordinates to match the string-type HUC12 IDs from the input feature class

In [None]:
%%time

# Create a new dataframe with just the HUC12 data
HUC_df = HUC_gdf[['HUC12', 'AREASQKM', 'STATES', 'TOHUC']].copy()

# Integer version of the string HUC12 ID, used as the join key against the
# zone coordinate of the input file
# (assumes the input file stores the IDs as integers -- TODO confirm)
HUC_df['HUC12_int'] = HUC_df['HUC12'].astype(np.int64)

# Export the netCDF coordinate to a dataframe and replace its index with a
# plain 0..n-1 range so that 'WBDHU12' is available as a regular merge column
nc_df = ds['WBDHU12'].to_dataframe()
nc_df.index = np.arange(nc_df.shape[0])
combined_df = pd.merge(nc_df, HUC_df,  how='inner', left_on=['WBDHU12'], right_on=['HUC12_int'])

# Deal with duplicates: keep only the last row for each integer HUC12 ID
combined_df = combined_df[~combined_df.duplicated(subset=['HUC12_int'], keep='last')]

# Make sure they have the same number of values
assert combined_df['HUC12'].unique().shape == nc_df['WBDHU12'].unique().shape

# Make sure they are identical, element by element and in the same order.
# np.array_equal also returns False (rather than raising) when the lengths
# differ, so any mismatch surfaces as an AssertionError with a clear message.
assert np.array_equal(combined_df['HUC12_int'].to_numpy(), ds['WBDHU12'].data), \
    'Merged HUC12 IDs do not match the WBDHU12 coordinate of the input dataset'

# Replace the coordinate with the 12-character byte-string HUC12 IDs
da = xr.DataArray(combined_df['HUC12'].astype('S12'), coords={'WBDHU12': combined_df['HUC12'].astype('S12')},dims=['WBDHU12'])
ds['WBDHU12'] = da
#del da, HUC_df, combined_df, nc_df, HUC_gdf
ds

### Rename variables

In [None]:
# Apply the input-name -> output-name mapping defined in `var_rename_dict`
ds_out = ds.rename_vars(var_rename_dict)
ds_out

### Rename Dimensions and coordinate variables

Using `Dataset.rename` instead of `Dataset.rename_dims` ensures that any coordinate variables are also renamed.

In [None]:
# Apply the dimension renaming defined in `rename_dim_dict`
ds_out = ds_out.rename(rename_dim_dict)
ds_out

### Subset variables

In [None]:
# Keep only the variables whose (output) names are the values of `var_rename_dict`
ds_out = ds_out[list(var_rename_dict.values())]
ds_out

### Change data types

In [None]:
# Store every double-precision data variable as single precision instead
for var_name in list(ds_out.data_vars):
    if ds_out[var_name].dtype != np.float64:
        continue
    print('Found a float64 for variable {0}'.format(var_name))
    ds_out[var_name] = ds_out[var_name].astype(np.float32)
ds_out

### Re-order dimensions

In [None]:
# Desired layout for 2D variables is (zone, time). The dimension names come from
# the configuration (`time_coord`, and the renamed zone dimension) rather than
# being hard-coded, so changing the configuration cannot silently skip this step.
zone_dim = rename_dim_dict.get(zone_name, zone_name)
for variable in list(ds_out.data_vars):
    if ds_out[variable].dims == (time_coord, zone_dim):
        print('Var {0} not correct: {1}'.format(variable, ds_out[variable].dims))
        # State the target order explicitly instead of relying on a bare reversal
        ds_out[variable] = ds_out[variable].transpose(zone_dim, time_coord)

# Read all data into memory before the in-place edits made in later cells
ds_out.load()
ds_out

### Set variable and global attributes

In [None]:
# Replace the attributes of every variable that has an entry in `var_atts_dict`,
# which discards any unnecessary attributes (such as spatial metadata)
for var_name, var_attrs in var_atts_dict.items():
    if var_name in ds_out.data_vars:
        ds_out[var_name].attrs = var_attrs

# Likewise replace the global attributes wholesale
ds_out.attrs = out_global_atts

ds_out

### Add a year-month (`yrmo`) string variable along the time dimension

In [None]:
# Parse the time coordinate once. It is looked up through `time_coord` so that
# it always matches the dimension name given to the new variable below.
time_index = pd.to_datetime(ds_out[time_coord])

# Interpret times as strings - for later input to CSV files as a time index
datetime_strings = [time_index.strftime('%Y%m%d%H')]

# year-month strings
yearmo_strings = [time_index.strftime('%Y-%m')]

# Wrapping the strings in a one-element list produces a leading length-1
# dimension ('yrmo_index'); a later cell squeezes it away.
ds_out['yrmo'] = xr.DataArray(np.array(yearmo_strings, dtype='U'), dims=('yrmo_index', time_coord), name='yrmo')
ds_out

In [None]:
# Nodata (fill) value, as a float
nodata_value = -9999.0

In [None]:
# Drop any unnecessary length-1 dimension(s) on the `yrmo` variable
yrmo_sizes = ds_out['yrmo'].sizes
singleton_dims = [dim for dim in ds_out['yrmo'].dims if yrmo_sizes[dim] == 1]
if len(yrmo_sizes) > 1 and singleton_dims:
    print('Removing dimension(s) {0} from variable "yrmo".'.format(singleton_dims))
    ds_out['yrmo'] = ds_out['yrmo'].squeeze()
ds_out

### 5) Enforce a sort order on the variables in the dataset

In [None]:
# Variables that lead the output, in this fixed order
leading_vars = ['yrmo', 'CatchmentArea', 'LandFraction', 'total_gridded_area']

# Every other variable follows, ordered alphabetically without regard to case
trailing_vars = sorted(
    (name for name in ds_out.data_vars if name not in leading_vars),
    key=str.casefold,
)

# Full ordering; the count must match the number of variables in the dataset
sorted_varlist = leading_vars + trailing_vars
assert len(sorted_varlist) == len(list(ds_out.data_vars))
print('Found {0} variables. Sorted by number of dimensions and then alphabetically'.format(len(sorted_varlist)))

# Re-select the variables in the chosen order
out_ds = ds_out[sorted_varlist]
out_ds

### 6) Fix NoData Issues

A decision was made to make all NaN values consistent between the 10-year and 40-year Water Budget component files, using -9999.0 as the `_FillValue` in the netCDF files and adding descriptions to the variable attributes that identify what NaN means for each variable.

In [None]:
#### Fix variable attributes and encodings
# For every variable with a known nodata meaning: record that meaning as an
# attribute, overwrite NaN with `nodata_value`, and register `nodata_value` as
# the _FillValue encoding. Variables outside the three groups are left untouched.
output_encoding = {}
for data_var in out_ds.data_vars:
    print(data_var)

    # Fix nodata description attribute
    if data_var in ['PrecipLand', 
                    'Snowfall', 
                    'ET', 
                    'SWE', 
                    'SoilWater', 
                    'SoilSat', 
                    'Recharge', 
                    'Precip', 
                    'LandFraction', 
                    'total_gridded_area', 
                    'avgSOILM_wltadj_depthmean', 
                    'avgSOILSAT_wltadj_top1']:
        out_ds[data_var].attrs['nodata_description'] = 'HUC12 contains no land cells'
    elif data_var in ['CatchmentArea']:
        out_ds[data_var].attrs['nodata_description'] = 'HUC12 contains no WRF-Hydro catchment polygons'
    elif data_var in ['Surfaceflow', 
                      'Baseflow', 
                      'GWStore']:
        out_ds[data_var].attrs['nodata_description'] = 'HUC12 contain no WRF-Hydro flowlines.'
    else:
        # No nodata handling defined for this variable
        continue
    print('\tnodata_description: {0}'.format(out_ds[data_var].attrs['nodata_description']))

    # Change NaN to nodata value. The assignment writes into the variable's
    # existing array in place.
    # NOTE(review): this presumably relies on the data already being in memory
    # as NumPy arrays -- confirm if the loading step is ever changed.
    nodata_mask = out_ds[data_var].isnull().data
    print('\tFound {0} nodata values'.format(nodata_mask.sum()))
    out_ds[data_var].data[nodata_mask] = nodata_value

    # Variable encoding
    out_ds[data_var].encoding['_FillValue'] = nodata_value
    output_encoding[data_var] = {'_FillValue':nodata_value}

    # Remove redundant missing value encoding
    if 'missing_value' in out_ds[data_var].encoding:
        del out_ds[data_var].encoding['missing_value']

    # Report the fill value that was set (the format string previously had no
    # placeholder, so only a tab character was printed)
    print('\t_FillValue: {0}'.format(out_ds[data_var].encoding['_FillValue']))

In [None]:
# Print the encoding settings currently held by each output variable
for var_name in out_ds.data_vars:
    print(var_name)
    for enc_key, enc_value in out_ds[var_name].encoding.items():
        print(f'    {enc_key}: {enc_value}')

In [None]:
%%time

# Write the netCDF file only when requested through the `write_NC` flag
if write_NC:
    print('  Writing output to {0}'.format(out_nc))
    # load() brings the data into memory on this object; the previous compute()
    # call returned a new dataset that was discarded, so it had no effect here.
    out_ds.load()
    out_ds.to_netcdf(out_nc, encoding=output_encoding)
out_ds.close()

### Clean up

In [None]:
# Release both dataset handles, then report the total elapsed time since `tic`
for open_ds in (ds, ds_out):
    open_ds.close()
elapsed_seconds = time.time() - tic
print('Process completed in {0: 3.2f} seconds.'.format(elapsed_seconds))