In [None]:
%%javascript
// Ask the notebook front end to restart the kernel.
// NOTE(review): this relies on the `IPython` JavaScript global being provided by the
// front end that renders the notebook — confirm it exists in the environment in use.
IPython.notebook.kernel.restart()

In [None]:
import fsspec
import os
import tempfile
import xarray as xr

from nco import Nco
from nco.custom import Atted

# Processing netCDF files
This notebook documents two approaches to processing netCDF files 
with `pyNCO` (https://pynco.readthedocs.io/en/latest/). 
The gridmet product (https://www.climatologylab.org/gridmet.html) is used for these examples. 
Each gridmet file contains a single variable for a single year. This notebook shows examples of merging those 
single-year files into one multi-year file per variable.

In the first approach, the temporary files are managed manually. In this case, care must be taken to 
track the temporary files, avoid file collisions (e.g. accidentally overwriting temporary files), and remove 
the files once processing is done.

The second approach makes use of the `tempfile` library which automates the use and handling of the temporary 
files needed during processing. Temporary files are automatically removed at the end of the processing.

In [None]:
# Sample gridmet filename: 2020_gm_tmin_2021_03_31.nc
#     year ----------------^^^^
#     variable --------------------^^^^
#     date retrieved -------------------^^^^^^^^^^

# Gridmet variable to process; it is matched against the variable part of the filename.
var = 'tmin'

# Root of the gridmet data. Set the GRIDMET_BASE_DIR environment variable to point the
# notebook at another location without editing this cell; the default is unchanged.
base_dir = os.environ.get('GRIDMET_BASE_DIR', '/Volumes/USGS_NHM2/datasets/gridmet')
src_dir = f'{base_dir}/gridmet_raw'        # source files are read from here
output_dir = f'{base_dir}/test_output'     # output files are written here

output_filename = f'gridmet_{var}_1979-2020'    # .nc is added later

## Using pyNCO with manual temporary file management

In [None]:
# Create NCO object
# Setting debug=True will output information about the nco execution that can be useful
# when examining problems in the processing scripts.
# This one object is reused for every NCO call in the rest of the notebook.
nco = Nco(debug=False)

In [None]:
# Open a filesystem (the 'file' implementation); it is used for globbing below
fs = fsspec.filesystem('file')

# Build list of gridmet files for the selected variable, sorted by name
flist = sorted(fs.glob(f'{src_dir}/*_gm_{var}_*.nc'))

# Fail with a clear message instead of an IndexError when nothing matches
if not flist:
    raise FileNotFoundError(f'No gridmet files for variable {var!r} found in {src_dir}')

# Show the first and last file, then the number of files found
print(flist[0])
print(flist[-1])
len(flist)

NOTE (2022-01): This notebook works from the original retrieved Gridmet data (on denali). 
This data was saved in netCDF Classic format but still included the _ChunkSizes attribute which 
confuses some tools. Removing the attribute can be done in-place if you have read-write access to the files; 
otherwise you have to make a copy of the original files without the _ChunkSizes attribute. 
More recent versions of the gridmet data do not have this problem.

In [None]:
# Equivalent NCO command line (run once per file):
#   ncatted -a _ChunkSizes,,d,, ${ff}

# -h keeps ncatted from adding an entry to the global history attribute.
# No output file is given, so each input file is edited in place.
drop_chunksizes = Atted(mode='delete', att_name='_ChunkSizes')
opts = ['-h', drop_chunksizes]

for ff in flist[:2]:
    nco.ncatted(input=ff, options=opts)

### Change fixed time dimension to record dimension

The original Gridmet data has a fixed time dimension named `day`. In order to concatenate individual files 
(1 year, 1 variable per file) into a single file per variable for the period of record we need to change the 
fixed time dimension into an unlimited record dimension.

In [None]:
%%time
# ncks -O --mk_rec_dmn day ${ff} -o merged/${ff}

# Adding -h prevents adding entries into the global history
record_dim = 'day'
opts = [f'-O -h --mk_rec_dmn {record_dim}']

for ff in flist[0:2]:
    tmp_file = f'tmp_{os.path.basename(ff)}'
    print(f'Processing {tmp_file}')
    
    nco.ncks(input=ff, output=f'{output_dir}/{tmp_file}', options=opts)

### Concatenate single-year files into a single multi-year file

In [None]:
%%time
# ncrcat ${var}.nc -o gridmet_${var}_1979-2020.nc

o_flist = sorted(fs.glob(f'{output_dir}/*.nc'))

# The -h option prevents the netCDF operators tools from automatically appending to the 
# global history attribute. Otherwise the history can get rather large and messy.
opts = ['-h']

nco.ncrcat(input=o_flist, output=f'{output_dir}/{var}.nc', options=opts)

### Create simplified global history

In [None]:
# Build a short history entry that lists the inputs by file name only (no directories)
input_files = ' '.join(os.path.basename(ff) for ff in o_flist)
history_text = f'ncrcat {input_files} -o {var}.nc'

append_history = Atted(mode='append', att_name='history', var_name='global',
                       value=history_text, stype='c')
opts = ['-h', append_history]

# With only an input file given, ncatted edits that file in place.
xx = nco.ncatted(input=f'{output_dir}/{var}.nc', options=opts)

### Rechunk the concatenated file

In [None]:
%%time
# ncks -O -4 -L 2 --cnk_map=dmn --cnk_dmn day,122 --cnk_dmn lat,98 --cnk_dmn lon,231 ${ff} -o ../${ff}
time_cnk = 122
lat_cnk = 98
lon_cnk = 231

opts = ['-O', '-4', '-L 2', 
        '--cnk_map=dmn', 
        f'--cnk_dmn {record_dim},{time_cnk}',
        f'--cnk_dmn lat,{lat_cnk}', 
        f'--cnk_dmn lon,{lon_cnk}']

nco.ncks(input=f'{output_dir}/{var}.nc', output=f'{output_dir}/{output_filename}.nc', options=opts)

### Explore the merged netCDF file

In [None]:
# Open the final output file and display the dataset summary
ds = xr.open_dataset(f'{output_dir}/{output_filename}.nc', chunks='auto')
ds

In [None]:
# Examine one of the variables
# NOTE(review): this name presumably belongs to the 'tmin' files; a different `var`
# setting would need a different variable name here — confirm
ds.daily_minimum_temperature

In [None]:
# List the names of the attributes stored on the dataset
ds.attrs.keys()

## Using pyNCO with automatic temporary file management

Here we create a temporary directory to contain the intermediate files. The directory and its contents are 
automatically removed once the code block has completed. The final output file is written to the output directory.

In [None]:
%%time
# When clean_history is true the original global history attribute contents are cleared and replaced with
# only the history related to our processing.
clean_history = True

with tempfile.TemporaryDirectory() as tmp_dir:
    print(f'Working in {tmp_dir}')
    
    # Make the time dimension a record dimension
    print('    Make record dimension')
    record_dim = 'day'
    opts = [f'-O --mk_rec_dmn {record_dim}']

    for ff in flist[0:2]:
        tmp_filename = f'tmp_{os.path.basename(ff)}'
        print(f'Processing {tmp_filename}')

        nco.ncks(input=ff, output=f'{tmp_dir}/{tmp_filename}', options=opts)
        
    # Concatenate the individual files
    print('    Concatenate files')
    o_flist = sorted(fs.glob(f'{tmp_dir}/*.nc'))

    nco.ncrcat(input=o_flist, output=f'{tmp_dir}/{var}_concat.nc')   # , options=opts)        
        
    # Rechunk the data
    print('    Rechunk data')
    time_cnk = 122
    lat_cnk = 98
    lon_cnk = 231

    opts = ['-O', '-4', '-L 2', 
            '--cnk_map=dmn', 
            f'--cnk_dmn {record_dim},{time_cnk}',
            f'--cnk_dmn lat,{lat_cnk}', 
            f'--cnk_dmn lon,{lon_cnk}']

    nco.ncks(input=f'{tmp_dir}/{var}_concat.nc', output=f'{output_dir}/{output_filename}.nc', options=opts)
    
    if clean_history:
        print('    Clean history')
        # Read the final file and create a modified history
        ds = xr.open_dataset(f'{output_dir}/{output_filename}.nc', chunks='auto')

        history = ds.attrs['History']
        aa = history.split('\n')

        new_hist = []

        for io, dd in enumerate(aa):
            bb = dd.split()
            keep = True

            for ii, cc in enumerate(bb):
                if cc in ['ncatted']:
                    keep = False
                    continue
                elif '--output' in cc:
                    bb[ii] = '--output=' + os.path.basename(cc.split('=')[1])
                else:
                    if os.path.isfile(cc):
                        bb[ii] = os.path.basename(cc)
            if keep:
                new_hist.append(' '.join(bb))

        ds.close()
        
        # Remove History global attribute
        opts = ['-h', Atted(mode='delete', att_name='History', var_name='global')]
        nco.ncatted(input=f'{output_dir}/{output_filename}.nc', options=opts)        
        
        # Add new history global attribute (all lowercase)
        # NOTE: the double backslash for the value argument is needed so that \n is passed to ncatted correctly
        opts = ['-h', Atted(mode='create', att_name='history', var_name='global', 
                            value='\\n'.join(new_hist), stype='c')]
        nco.ncatted(input=f'{output_dir}/{output_filename}.nc', options=opts)        