## Extract surface layer from 3D field and save new 2D field

In [1]:
## import required packages
import json
import shutil
import subprocess
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import s3fs
import xarray as xr

In [None]:
# need to manually add key
# Both assignments are intentionally left without a value: fill them in by hand
# before running this cell (the cell does not parse until they are set), and
# do not commit the filled-in values.
# NOTE(review): presumably these are the values handed to the functions below
# as their `sassie_key` / `sassie_secret` arguments -- confirm against the caller.
sassie_key =
sassie_secret =

In [3]:
## name of the 3D variable this notebook works on
var_3d = "THETA"

## s3 locations
# root directory used when listing the source netCDF files
sassie_s3_netcdf_dir = "s3://podaac-dev-sassie/ECCO_model/N1/V1/HH/NETCDF/"
# root directory for the surface layer extracted from the 3D fields
sassie_s3_netcdf_dir_surface = "s3://podaac-dev-sassie/ECCO_model/N1/V1/HH/NETCDF_3D_SURF/"

In [4]:
def create_encoding(ecco_ds, output_array_precision = np.float32):
    """Build the NetCDF ``encoding`` dictionary for every variable of a dataset.

    Args:
        ecco_ds: Dataset-like object exposing ``data_vars``, ``coords`` and
            item access by name (the callers in this file pass an xarray
            dataset).
        output_array_precision: ``np.float32`` (default) or ``np.float64``;
            anything ``np.dtype`` maps onto one of those two is accepted as
            well (e.g. the string ``'float32'``).

    Returns:
        dict mapping each data-variable and coordinate name to its encoding
        directives. Coordinate entries override data-variable entries of the
        same name.

    Raises:
        ValueError: if ``output_array_precision`` is neither 32- nor 64-bit
            float.

    Note:
        For the ``time`` / ``time_bnds`` coordinates a ``units`` attribute is
        moved out of the coordinate's ``attrs`` and into its encoding, so
        ``ecco_ds`` is modified in place.
    """
    precision_name = np.dtype(output_array_precision).name
    if precision_name not in ('float32', 'float64'):
        raise ValueError(
            "output_array_precision must be np.float32 or np.float64, "
            f"got {output_array_precision!r}"
        )

    # Fill value written in place of NaN. This is the netCDF library default
    # for floating point data (NC_FILL_FLOAT and NC_FILL_DOUBLE are the same
    # number), i.e. the value netCDF4.default_fillvals['f4'] / ['f8'] holds.
    # Spelling it out keeps this function free of the netCDF4 import, which
    # this file never performs.
    netcdf_fill_value = 9.969209968386869e+36

    # ... data variable encoding directives (a separate dict per variable)
    dv_encoding = {
        dv: {'compression': 'zlib',
             'complevel': 5,
             'shuffle': False,
             'fletcher32': False,
             '_FillValue': netcdf_fill_value}
        for dv in ecco_ds.data_vars
    }

    # ... coordinate encoding directives
    coord_encoding = dict()
    for coord in ecco_ds.coords:
        coord_var = ecco_ds[coord]

        # coordinates carry no fill value and default to the float precision
        directives = {'_FillValue': None, 'dtype': precision_name}

        # integer coordinates are always written as 32 bit ints
        if coord_var.dtype in (np.int32, np.int64):
            directives['dtype'] = 'int32'

        if coord in ('time', 'time_bnds'):
            directives['dtype'] = 'int32'
            if 'units' in coord_var.attrs:
                # the time units are supplied through the encoding, so they
                # are removed from the attribute list
                directives['units'] = coord_var.attrs.pop('units')
        elif coord == 'time_step':
            directives['dtype'] = 'int32'

        coord_encoding[coord] = directives

    # ... combined data variable and coordinate encoding directives
    return {**dv_encoding, **coord_encoding}

In [5]:
def _surface_netcdf_filename(file_url):
    """Return the surface-layer filename (single ``.nc`` suffix) for a 3D file url."""
    name = file_url.split("/")[-1]
    # the source name already ends in ".nc"; drop it so it is not duplicated
    if name.endswith(".nc"):
        name = name[:-len(".nc")]
    prefix, marker, remainder = name.partition("day")
    if not marker:
        raise ValueError(f"expected 'day' in the netCDF filename, got {name!r}")
    return f"{prefix}SURFACE_day{remainder}.nc"


def save_surface_layer_from_3d(var_3d, sassie_s3_netcdf_dir, ec2_scratch_dir, sassie_key, sassie_secret, max_files=1):
    """Extract the surface layer of 3D netCDF files on S3 and save it locally.

    Files are listed under ``{sassie_s3_netcdf_dir}{var_3d}_AVG_DAILY/``. For
    each processed file the first index along ``k``, ``k_u`` and ``k_l`` and
    the first two indices along ``k_p1`` are kept, and the result is written
    to ``ec2_scratch_dir`` with ``SURFACE_`` inserted before ``day`` in the
    filename.

    Args:
        var_3d (str): Name of the variable; selects the ``*_AVG_DAILY`` directory.
        sassie_s3_netcdf_dir (str): Root S3 directory (with trailing slash).
        ec2_scratch_dir (str or Path): Existing local directory for the output.
        sassie_key (str): Key handed to ``s3fs.S3FileSystem``.
        sassie_secret (str): Secret handed to ``s3fs.S3FileSystem``.
        max_files (int or None): Number of files (in sorted order) to process.
            Defaults to 1, which is what the previously hard-coded ``[0:1]``
            slice did; pass ``None`` to process every file.

    Returns:
        None

    Raises:
        ValueError: if a source filename does not contain ``day``.
    """
    ## initialize s3 system
    s3 = s3fs.S3FileSystem(anon=False, key=sassie_key, secret=sassie_secret)

    ## list all files
    source_dir = f'{sassie_s3_netcdf_dir}{var_3d}_AVG_DAILY/'
    nc_file_list = sorted(s3.glob(f'{source_dir}*.nc'))

    print(f'\n> Looking for files on {source_dir}')
    print(f'... num files  : {len(nc_file_list)}')
    if not nc_file_list:
        # nothing matched: indexing the first/last entry below would fail
        print('... nothing to process!')
        return
    print(f'... first file : {nc_file_list[0]}')
    print(f'... last file  : {nc_file_list[-1]}')

    ## prepend "s3://" to create url in order to open the dataset
    print('\n> Preparing list of files to process')
    nc_file_list_urls = [f"s3://{path}" for path in nc_file_list]

    ## loop through the selected files, extract surface, and save new netCDF
    for file_url in nc_file_list_urls[:max_files]:

        print(f"\n... opening {file_url}")
        # keep both the s3 file and the dataset open until the surface layer
        # has been written, then release them
        with s3.open(file_url) as s3_file, xr.open_dataset(s3_file) as ds_3d:

            ## isolate the surface layer
            print("... extracting surface layer\n")
            tmp_surface = ds_3d.isel(k=[0], k_u=[0], k_l=[0], k_p1=slice(0,2))

            ## edit typo in metadata
            tmp_surface.k_p1.attrs['comment'] = "Top and bottom of model tracer cell."

            ## save newly generated surface layer dataset to scratch directory
            print(f"... saving surface netCDF dataset to scratch directory {ec2_scratch_dir}")

            ## filename is derived from the file being processed and marks it
            ## as a surface layer file (not 3D)
            netcdf_path = f"{ec2_scratch_dir}/{_surface_netcdf_filename(file_url)}"

            ## create encoding
            encoding_var = create_encoding(tmp_surface, output_array_precision = np.float32)

            tmp_surface.to_netcdf(netcdf_path, encoding = encoding_var)

        print(f"\n* * * * saved netcdf to {netcdf_path}* * * *\n")

In [6]:
def push_nc_dir_from_ec2(ec2_scratch_dir, root_dest_s3_name, var_name):
    """
    Pushes the netcdf files from a directory to an S3 bucket.

    The destination is ``root_dest_s3_name + var_name + "_AVG_DAILY_SURF"``.
    The upload is done with ``aws s3 cp --recursive`` restricted to ``*.nc``
    files; its output is discarded. A failing upload is reported with a
    printed warning rather than an exception.

    Args:
        ec2_scratch_dir (str or Path): The path to the directory containing the netcdf files on the EC2 instance.
        root_dest_s3_name (str): The root name of the S3 bucket where the files will be pushed.
        var_name (str): The name of the variable used to create the S3 bucket.

    Returns:
        None
    """
    # accept plain strings as well as Path objects
    ec2_scratch_dir = Path(ec2_scratch_dir)

    ## push file to s3 bucket
    mybucket = root_dest_s3_name + var_name + "_AVG_DAILY_SURF"
    nc_files = list(ec2_scratch_dir.glob('*.nc'))

    print(f'\n>pushing netcdf files in {ec2_scratch_dir} to s3 bucket : {mybucket}')
    print(f'... looking for *.nc files in {ec2_scratch_dir}')
    print(f'... found {len(nc_files)} nc files to upload')

    if not nc_files:
        print("... nothing to upload!")
        return

    # Argument list (no shell) so paths are never re-interpreted by a shell.
    # "--exclude * --include *.nc" is required to restrict the copy to nc
    # files: on its own, --include filters nothing because every file is
    # included by default.
    cmd = [
        "aws", "s3", "cp", str(ec2_scratch_dir), f"{mybucket}/",
        "--recursive", "--exclude", "*", "--include", "*.nc", "--no-progress",
    ]
    cmd_text = " ".join(cmd)
    print(f'... aws command: {cmd_text}')

    try:
        result = subprocess.run(
            cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=False,
        )
    except OSError as err:
        # e.g. the aws executable is not installed
        print(f'... WARNING: could not run aws command: {err}')
        return

    if result.returncode != 0:
        print(f'... WARNING: aws command exited with status {result.returncode}')

In [None]:
def extract_surface_layer(var_3d, sassie_s3_netcdf_dir, ec2_nvme_scratch_dir, sassie_key, sassie_secret, root_dest_s3_name, cleanup=False):
    """Extract the surface layer of one 3D variable and push it to S3.

    Creates ``{ec2_nvme_scratch_dir}/tmp_nc/{var_3d}_AVG_DAILY_SURF``, calls
    ``save_surface_layer_from_3d`` to write the surface netCDF files there,
    then calls ``push_nc_dir_from_ec2`` to upload that directory.

    Args:
        var_3d (str): Name of the 3D variable to process.
        sassie_s3_netcdf_dir (str): Root S3 directory holding the 3D netCDF files.
        ec2_nvme_scratch_dir (str or Path): Local scratch root.
        sassie_key (str): S3 key, forwarded to ``save_surface_layer_from_3d``.
        sassie_secret (str): S3 secret, forwarded to ``save_surface_layer_from_3d``.
        root_dest_s3_name (str): Root S3 destination, forwarded to ``push_nc_dir_from_ec2``.
        cleanup (bool): When True, delete the temporary scratch directory after
            the push. Defaults to False (the scratch files are kept), because
            the push step does not report whether the upload succeeded.

    Returns:
        None
    """
    ## process variable
    print(f"= = = = = processing {var_3d} = = = = =\n")

    ## create temporary scratch directory on ec2
    nc_root_dir_ec2 = Path(f"{ec2_nvme_scratch_dir}/tmp_nc/{var_3d}_AVG_DAILY_SURF")
    print(f'... temporary nc directory {nc_root_dir_ec2}\n')
    nc_root_dir_ec2.mkdir(exist_ok=True, parents=True)

    ## extract surface layer from 3D netcdfs and save to scratch directory
    save_surface_layer_from_3d(var_3d, sassie_s3_netcdf_dir, nc_root_dir_ec2, sassie_key, sassie_secret)

    ## push all local netcdfs on ec2 to the cloud
    push_nc_dir_from_ec2(nc_root_dir_ec2, root_dest_s3_name, var_3d)

    ## clean up local scratch disk (opt-in)
    if cleanup:
        print(f'... removing temporary nc directory {nc_root_dir_ec2}')
        shutil.rmtree(nc_root_dir_ec2)

In [89]:
# Names of the 3D fields, grouped on each row by shared name prefix.
# The order of the entries is unchanged.
fields_3d = [
    "SALT", "THETA",
    "UVEL", "VVEL", "WVEL",
    "KPPdiffS", "KPPdiffT", "KPPviscA",
    "PHIHYD", "PHIHYDcR",
    "RHOAnoma",
    "ADVr_SLT", "ADVr_TH",
    "ADVx_SLT", "ADVx_TH",
    "ADVy_SLT", "ADVy_TH",
    "DFrE_SLT", "DFrE_TH",
    "DFrI_SLT", "DFrI_TH",
    "DFxE_SLT", "DFxE_TH",
    "DFyE_SLT", "DFyE_TH",
    "UVELMASS", "VVELMASS", "WVELMASS",
]

In [90]:
len(fields_3d)

28