## Finalize the output files

Edit variable names, metadata, etc.

### Imports

In [11]:
# --- Import Modules --- #

# Import Python Core Modules
import sys
import os
import time
import datetime

# Import Additional Modules
import numpy as np
import xarray as xr
import pandas as pd
import geopandas as gpd

tic = time.time()
print('Process initiated at {0}'.format(time.ctime()))
# --- End Import Modules --- #

Process initiated at Tue Apr  1 13:52:02 2025


In [12]:
#in_nc = r'/glade/derecho/scratch/ksampson/USGS/CONUS_Water_Budget/Water_Budget/CONUS_HUC12_WB_combined_20091001_20190930.nc'
#in_nc = r'/glade/derecho/scratch/ksampson/USGS/CONUS_Water_Budget/Water_Budget/CONUS_HUC12_WB_combined_20191001_20210930.nc'
#in_nc = r'/glade/derecho/scratch/ksampson/USGS/CONUS_Water_Budget/Water_Budget/CONUS_HUC12_WB_combined_20091001_20210930.nc'
#in_nc = r'/glade/derecho/scratch/ksampson/USGS/CONUS_Water_Budget/Water_Budget/CONUS_HUC12_WB_combined_19791001_20220930_2.nc'
in_nc = r'/caldera/hovenweep/projects/usgs/water/impd/hytest/working/niwaa_wrfhydro_monthly_huc12_aggregations/merge_out/CONUS_HUC12_WB_combined_19791001_20220930.nc'

# Output directory
# outDir = r'/glade/derecho/scratch/ksampson/USGS/CONUS_Water_Budget/Water_Budget'
outDir = r'/caldera/hovenweep/projects/usgs/water/impd/hytest/working/niwaa_wrfhydro_monthly_huc12_aggregations/final_out'

# Output files
#out_nc = os.path.join(outDir, 'huc12_monthly_wb_iwaa_wrfhydro_WY2010_2019.nc')
#out_csv = os.path.join(outDir, 'huc12_monthly_wb_iwaa_wrfhydro_WY2010_2019.csv')
#out_nc = os.path.join(outDir, 'huc12_monthly_wb_iwaa_wrfhydro_WY2020_2021.nc')
#out_csv = os.path.join(outDir, 'huc12_monthly_wb_iwaa_wrfhydro_WY2020_2021.csv')
#out_nc = os.path.join(outDir, 'huc12_monthly_wb_iwaa_wrfhydro_WY2010_2021.nc')
#out_csv = os.path.join(outDir, 'huc12_monthly_wb_iwaa_wrfhydro_WY2010_2021.csv')
out_nc = os.path.join(outDir, 'huc12_monthly_wb_iwaa_wrfhydro_WY1980_2022_2.nc')
out_csv = os.path.join(outDir, 'huc12_monthly_wb_iwaa_wrfhydro_WY1980_2022_2.csv')

# Select output formats
write_NC = True      # Output netCDF file
write_CSV = True     # Output CSV file

# Name the zone dimension
zone_name = 'WBDHU12'

# Name the time dimension
time_coord = 'time'

### Dictionaries to rename variables and set attributes

In [13]:
# Dictionary to rename variables. Also used to subset dataset by variable name.
var_rename_dict = {'totPRECIP':'Precip',
                   'totPRECIP':'PrecipLand',
                   'deltaACSNOW':'Snowfall',
                   'totqSfcLatRunoff':'Surfaceflow',
                   'totqBucket':'Baseflow',
                   'deltaACCET':'ET',
                   'avgSNEQV':'SWE',
                   'avgSOILM':'SoilWater',
                   'avgSOILSAT':'SoilSat',
                   'deltaUGDRNOFF':'Recharge',
                   'bucket_depth':'GWStore',
                   'Area_sqkm':'CatchmentArea',
                   'Precip':'Precip',
                   'landmask':'LandFraction',
                   'total_gridded_area': 'total_gridded_area',
                   'avgSOILM_wltadj_depthmean': 'avgSOILM_wltadj_depthmean',
                   'avgSOILSAT_wltadj_top1': 'avgSOILSAT_wltadj_top1',}

# Rename dimensions
rename_dim_dict = {zone_name:'huc_id'}

# Variable attributes dictionary
var_atts_dict = {'Precip':{'units':'mm',
                           'long_name':"Total monthly precipitation (land & water)"},
                'PrecipLand':{'units':'mm',
                              'long_name':"Total monthly precipitation (land only)"},
                'Snowfall':{'units':'mm',
                            'long_name':"Total monthly snowfall (land only)"},
                'Surfaceflow':{'units':'mm',
                               'long_name':"Total monthly surface flow"},
                'Baseflow':{'units':'mm',
                            'long_name':"Total monthly baseflow"},
                'ET':{'units':'mm',
                      'long_name':"Total monthly evapotranspiration (land only)"},
                'SWE':{'units':'mm',
                       'long_name':"Average monthly snow water equivalent (land only)"},
                'SoilWater':{'units':'mm',
                             'long_name':"Average monthly soil moisture in 2m soil column (land only)"},
                'SoilSat':{'units':'fraction',
                           'long_name':"Average monthly fractional soil saturation in 2m soil column (land only)"},
                'Recharge':{'units':'mm',
                            'long_name':"Total monthly recharge (land only)"},
                'GWStore':{'units':'mm',
                           'long_name':"Average monthly groundwater store"},
                'LandFraction':{'units':'fraction',
                                'standard_name':'area_fraction',
                                'long_name':"Land fraction of HUC12 from gridded data"},
                'CatchmentArea':{'units':'square kilometers',
                                 'long_name':"Total NWM catchment area (square kilometers)"},
                }

# Global attributes dictionary
out_global_atts = {'title':"HUC12 monthly water budget components from WRF-Hydro IWAA v1.0",
                   'institution':"NCAR",
                   'source1':"https://github.com/NOAA-Big-Data-Program/bdp-data-docs/blob/main/nwm/README.md",
                   'source2':"https://www.sciencebase.gov/catalog/file/get/60cb5edfd34e86b938a373f4?name=WBD_National_GDB.zip",
                   'history':"A. Dugger, Tue Mar 14 20:37:45 2023"}   

In [14]:
ds = xr.open_dataset(in_nc)
ds

### Code to add back in Char HUCIDs from source file

In [15]:
%%time

# Convert the HUC dataset (polygons) to WGS84 to match the points
# HUC_gpkg = r'/glade/derecho/scratch/ksampson/USGS/CONUS_Water_Budget/HUCs/HUC12.gpkg'
HUC_gpkg = r'/caldera/hovenweep/projects/usgs/water/impd/hytest/working/niwaa_wrfhydro_monthly_huc12_aggregations/HUC12_grids/HUC12.gpkg'
HUC_gdf = gpd.read_file(HUC_gpkg, layer='WBDHU12_CONUS', ignore_geometry=True)

CPU times: user 2.42 s, sys: 1min 5s, total: 1min 7s
Wall time: 1min 8s


### Assign the coordinates to match the string-type HUC12 IDs from the input feature class

In [16]:
%%time

# Create a new dataframe with just the HUC12 data
HUC_df = HUC_gdf[['HUC12', 'AREASQKM', 'STATES', 'TOHUC']].copy()

# Create new field that will match to the datatype in the input file
HUC_df['HUC12_int'] = HUC_df['HUC12'].astype(np.int64)

# Export the netCDF coordinate to a dataframe
nc_df = ds['WBDHU12'].to_dataframe()
nc_df.index = np.arange(nc_df.shape[0]) #reset_index()
combined_df = pd.merge(nc_df, HUC_df,  how='inner', left_on=['WBDHU12'], right_on=['HUC12_int'])

# Deal with duplicates
combined_df = combined_df[~combined_df.duplicated(subset=['HUC12_int'], keep='last')]

# Make sure they have the same number of values
assert combined_df['HUC12'].unique().shape == nc_df['WBDHU12'].unique().shape

# Make sure they are identical
assert (combined_df['HUC12_int'] == ds['WBDHU12'].data).sum() == ds['WBDHU12'].data.shape

da = xr.DataArray(combined_df['HUC12'].astype('S12'), coords={'WBDHU12': combined_df['HUC12'].astype('S12')},dims=['WBDHU12'])
ds['WBDHU12'] = da
#del da, HUC_df, combined_df, nc_df, HUC_gdf
ds

CPU times: user 161 ms, sys: 0 ns, total: 161 ms
Wall time: 160 ms


### Rename variables

In [17]:
ds_out = ds.rename_vars(var_rename_dict)
#ds_out

In [19]:
ds_out

### Rename Dimensions and coordinate variables

Using `xr.rename` instead of `xr.rename_dims` ensures that any coordinate variables are also renamed

In [20]:
ds_out = ds_out.rename(rename_dim_dict)
#ds_out

### Subset variables

In [21]:
ds_out = ds_out[list(var_rename_dict.values())]
ds_out

### Change data types

In [22]:
for variable in ds_out.data_vars:
    if ds_out[variable].dtype == np.float64:
        print('Found a float64 for variable {0}'.format(variable))
        ds_out[variable] = ds_out[variable].astype(np.float32)
ds_out

Found a float64 for variable Snowfall
Found a float64 for variable Surfaceflow
Found a float64 for variable Baseflow
Found a float64 for variable ET
Found a float64 for variable SoilWater
Found a float64 for variable Recharge
Found a float64 for variable GWStore
Found a float64 for variable CatchmentArea
Found a float64 for variable LandFraction
Found a float64 for variable total_gridded_area


### Re-order dimensions

In [23]:
for variable in ds_out.data_vars:
    #print(variable, ds_out[variable].dims)
    if ds_out[variable].dims == ('time', 'huc_id'):
        print('Var {0} not correct: {1}'.format(variable, ds_out[variable].dims))
        ds_out[variable] = ds_out[variable].transpose()
        #ds_out[variable] = ds_out[variable][['huc_id', 'time', variable]]
ds_out.load()
ds_out

Var Surfaceflow not correct: ('time', 'huc_id')
Var Baseflow not correct: ('time', 'huc_id')
Var GWStore not correct: ('time', 'huc_id')


### Set variable and global attributes

In [24]:
# Eliminate any unecessary variable attributes (such as spatial metadata)
for variable in ds_out.data_vars:
    if variable in var_atts_dict:
        ds_out[variable].attrs = var_atts_dict[variable]

# Now eliminate unnecessary global attributes 
ds_out.attrs = out_global_atts

ds_out

### Reorganize time dimension to year and month dimensions

In [25]:
# Interpret times as strings - for later input to CSV files as a time index
datetime_strings = [pd.to_datetime(ds_out['time']).strftime('%Y%m%d%H')]

# year-month strings
yearmo_strings = [pd.to_datetime(ds_out['time']).strftime('%Y-%m')]
yearmo_strings

ds_out['yrmo'] = xr.DataArray(np.array(yearmo_strings, dtype='U'), dims=('yrmo_index', time_coord), name='yrmo')
ds_out

### Export NetCDF

In [26]:
%%time

# Read into memory before writing to disk?
ds_out.compute()
    
# Write output file (netCDF)
if write_NC:
    tic1 = time.time()
    print('  Writing output to {0}'.format(out_nc))
    ds_out.to_netcdf(out_nc, 
                     mode='w', 
                     format="NETCDF4", 
                     compute=True)
    print('\tExport to netCDF completed in {0:3.2f} seconds.'.format(time.time()-tic1))

  Writing output to /caldera/hovenweep/projects/usgs/water/impd/hytest/working/niwaa_wrfhydro_monthly_huc12_aggregations/final_out/huc12_monthly_wb_iwaa_wrfhydro_WY1980_2022_2.nc
	Export to netCDF completed in 0.66 seconds.
CPU times: user 327 ms, sys: 215 ms, total: 543 ms
Wall time: 656 ms


### Export CSV

In [27]:
%%time

# Remove certain variables (non-temporal) from the output CSV file
remove_vars = ['CatchmentArea', 'yrmo']    # 'LandFraction'     
    
# Write output file (CSV)
if write_CSV:
    tic1 = time.time()    
    
    # Output to Pandas DataFrame
    df_out = ds_out.to_dataframe()
    
    # Remove variables we do not want
    if remove_vars is not None:
        df_out = df_out.drop(columns=remove_vars)
        
    df_out.index = df_out.index.droplevel(2)
    df_out = df_out.reset_index()
    
    # Add year and month columns
    df_out.insert(2, 'year',  pd.DatetimeIndex(df_out[time_coord]).year)
    df_out.insert(3, 'month',  pd.DatetimeIndex(df_out[time_coord]).month)
    #df_out['year'] = pd.DatetimeIndex(df_out[time_coord]).year
    #df_out['month'] = pd.DatetimeIndex(df_out[time_coord]).month
    df_out = df_out.drop(columns='time')

    df_out.to_csv(out_csv, index=False)
    print('\tExport to CSV completed in {0:3.2f} seconds.'.format(time.time()-tic1))
df_out

	Export to CSV completed in 26.88 seconds.
CPU times: user 26.4 s, sys: 419 ms, total: 26.8 s
Wall time: 26.9 s


Unnamed: 0,huc_id,year,month,PrecipLand,Snowfall,Surfaceflow,Baseflow,ET,SWE,SoilWater,SoilSat,Recharge,GWStore,Precip,LandFraction,total_gridded_area,avgSOILM_wltadj_depthmean,avgSOILSAT_wltadj_top1
0,b'010100020101',2011,1,,,0.365792,30.682590,,,,,,22.455330,,0.941176,64.0,,
1,b'010100020101',2011,2,,,0.000000,13.614398,,,,,,14.725545,,0.941176,64.0,,
2,b'010100020101',2011,3,,,4.637056,18.826815,,,,,,15.671555,,0.941176,64.0,,
3,b'010100020101',2011,4,,,35.563644,92.934387,,,,,,34.136120,,0.941176,64.0,,
4,b'010100020101',2011,5,,,10.025396,90.452347,,,,,,36.048641,,0.941176,64.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3031590,b'181002041600',2013,7,,,0.000000,0.051717,,,,,,0.916956,,0.062975,58.0,,
3031591,b'181002041600',2013,8,,,0.002475,0.049992,,,,,,0.887035,,0.062975,58.0,,
3031592,b'181002041600',2013,9,,,0.000000,0.047112,,,,,,0.869429,,0.062975,58.0,,
3031593,b'181002041600',2013,10,,,0.000000,0.047622,,,,,,0.861319,,0.062975,58.0,,


### Clean up

In [28]:
ds.close()
ds_out.close()
print('Process completed in {0: 3.2f} seconds.'.format(time.time()-tic))

Process completed in  402.70 seconds.
