# CLOUD VERSION 


ONLY modify the data access parts of the code - ultimately, this will be merged with the latest version of osse_code. There will NOT be two versions.

* note- I use a RUN_ON_CLOUD flag; an alternative would be to detect automatically, as is done in the tutorial... https://github.com/podaac/tutorials/blob/master/notebooks/Pre-SWOT_Numerical_Simulation_Demo.ipynb

* s3fs.cp (based on the tutorial, which uses s3fs to load data directly with xarray) doesn't work - it fails with a "permission denied" error. So, try another tool?

* could try s3.download(f, "DEMO_FILES/" + os.path.basename(f)) => but is that better for local download?

In [1]:
## Imports

# Native packages
from math import radians, degrees, sin, cos, asin, acos, sqrt
import datetime
import time
import sys
import os

# Third-party packages for data manipulation
import numpy as np
import pandas as pd
import xarray as xr

# Other third-party packages
import netCDF4 as nc4

# Third-party packages for data interpolation
from scipy import interpolate
from scipy.interpolate import griddata
from xgcm import Grid
import xgcm.grid

# Third-party packages for data visualizations
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from mpl_toolkits import mplot3d
from mpl_toolkits.mplot3d import axes3d

# CLOUD
# for running on the AWS cloud:
import requests
import s3fs


# osse tools package
# del sys.modules['osse_tools_cloud']  # uncomment if troubleshooting osse_tools
from osse_tools_cloud import download_llc4320_data, compute_derived_fields, get_survey_track, survey_interp

# dask
# from dask.distributed import Client

In [None]:
# Start a dask distributed client with default settings.
# The import is done here because the notebook-level import of Client is
# commented out; without it, `Client()` raises NameError.
from dask.distributed import Client

# Alternative, hand-tuned settings (not sure how optimal these parameters are):
# client = Client(n_workers=20, threads_per_worker=4, memory_limit='20GB')
client = Client()
client

In [82]:
# --------------------------------------------------------------------
# USER INPUTS:
# --------------------------------------------------------------------

# flags:
RUN_ON_CLOUD = 1 # 0: running locally, downloading data; 1: running on the AWS cloud
SAVE_FIGURES = True # True or False


# specify region from this list:
# WesternMed  ROAM_MIZ  NewCaledonia  NWPacific  BassStrait  RockallTrough  ACC_SMST
# MarmaraSea  LabradorSea  CapeBasin
RegionName = 'ACC_SMST' 

# specify date range as start date & number of days.
start_date = datetime.date(2012,1,1)
# NOTE: ndays must be >1 
ndays = 2


# directory where data files are stored 
if not RUN_ON_CLOUD:
    datadir = '/data1/adac/mitgcm/netcdf/' + RegionName + '/'  # location of input netcdf files
    outputdir = '/data1/adac/osse_output/' + RegionName + '/'  # location of OSSE outputs
    figdir = '/data2/Dropbox/projects/adac/figures/' + RegionName + '/' # location of figures

else:
    datadir = '/home/jovyan/data/llc4320/' + RegionName + '/'  # location of input netcdf files
    outputdir = '/home/jovyan/osse_output/' + RegionName + '/'  # location of OSSE outputs
    figdir = '/home/jovyan/osse_output/figures/' + RegionName + '/' # location of figures
    
    
# optional details for sampling (if not specified, reasonable defaults will be used)
# NOTE!! mooring and sim_mooring are different:
#    sim_mooring treats the mooring datapoints like a glider, 
#    whereas mooring interpolates directly to the mooring grid and should be faster
# Sampling options; this dict is passed to get_survey_track(ds, sampling_details) later in the notebook.
sampling_details = {
#     'SAMPLING_STRATEGY' : 'sim_glider', 
   'SAMPLING_STRATEGY' : 'trajectory_file', # options: sim_glider, sim_uctd or trajectory_file.add:  ASV
#     'SAMPLING_STRATEGY' : 'mooring', # options: sim_glider, sim_uctd, sim_mooring or trajectory_file.add: ASV. 
#     'PATTERN' : 'lawnmower', # back-forth or lawnmower 
    'zrange' : [-1, -1000],  # depth range of T/S profiles (down is negative). * add U/V range? *
#     'zmooring_TS' : list(range(-10,-1000,-10)) # instrument depths for moorings. T/S and U/V are the same.
    'hspeed' : 0.25,  # platform horizontal speed in m/s (for glider, uCTD)
    'vspeed' : 0.1, # platform vertical (profile) speed in m/s  (for glider, uCTD)
   'trajectory_file' : '../data/survey_trajectory_ACC_SMST_glider.nc', # if SAMPLING_STRATEGY = 'trajectory_file', specify trajectory file
    'AT_END' : 'reverse', # behaviour at end of trajectory: 'reverse', 'repeat' or 'terminate'. (could also 'restart'?)
    'DERIVED_VARIABLES' : True # whether or not to process the derived variables (steric height, rotated velocity, vorticity) - slower, and it takes significant time to derive/save the stored variables
}


#### Download & load model data and derived fields

Based on [LLC4320](https://data.nas.nasa.gov/viz/vizdata/llc4320/index.html), the 1/48-degree global MITgcm simulation produced by the ECCO project. Ten regional cut-outs of the simulation are available on the [PO.DAAC](https://podaac.jpl.nasa.gov/datasetlist?ids=Processing+Levels&values=4+-+Gridded+Model+Output&search=Pre-SWOT+llc4320&view=list&provider=); the 4x4 degree regional domains are small enough to enable fairly easy downloads and processing. The data from the model were retrieved using download_llc4320.ipynb and saved locally.

In [None]:
# uncomment the following two lines if troubleshooting osse_tools
# del sys.modules['osse_tools']  
# from osse_tools import download_llc4320_data, compute_derived_fields, get_survey_track, survey_interp

    
if not RUN_ON_CLOUD:

    # download files:
    download_llc4320_data(RegionName, datadir, start_date, ndays)

    # derive & save new files with steric height & vorticity
    if sampling_details['DERIVED_VARIABLES']:
        compute_derived_fields(RegionName, datadir, start_date, ndays)    

In [3]:
# from this tutorial: https://github.com/podaac/tutorials/blob/master/notebooks/Pre-SWOT_Numerical_Simulation_Demo.ipynb
# => TODO: move all this to the .py
    
if RUN_ON_CLOUD:

    from netrc import netrc
    from urllib import request
    from platform import system
    from getpass import getpass
    from http.cookiejar import CookieJar
    from os.path import expanduser, join

    # Authenticate with your Earthdata Login/URS credentials by configuring a .netrc file in your home directory.
    
    def setup_earthdata_login_auth(endpoint: str='urs.earthdata.nasa.gov'):
        """Install a global urllib opener that authenticates against ``endpoint``.

        Credentials are read from the netrc file in the user's home directory
        (``_netrc`` on Windows, ``.netrc`` otherwise). If that file is missing,
        or the lookup for ``endpoint`` yields nothing to unpack, the user is
        prompted for a username and password instead.
        """
        if system()=="Windows":
            netrc_file = join(expanduser('~'), "_netrc")
        else:
            netrc_file = join(expanduser('~'), ".netrc")

        try:
            # unpacking a failed lookup raises TypeError, handled like a missing file
            login, _, secret = netrc(file=netrc_file).authenticators(endpoint)
        except (FileNotFoundError, TypeError):
            print('Please provide your Earthdata Login credentials for access.')
            print('Your info will only be passed to %s and will not be exposed in Jupyter.' % (endpoint))
            login = input('Username: ')
            secret = getpass('Password: ')

        # basic-auth handler for the endpoint, plus a cookie jar, installed process-wide
        password_manager = request.HTTPPasswordMgrWithDefaultRealm()
        password_manager.add_password(None, endpoint, login, secret)
        request.install_opener(
            request.build_opener(
                request.HTTPBasicAuthHandler(password_manager),
                request.HTTPCookieProcessor(CookieJar()),
            )
        )

    setup_earthdata_login_auth()



In [71]:
# this works
fs.glob(f"podaac-ops-cumulus-protected/{ShortName}/{target_file}")
# this throws an error
# fs.cp(f"podaac-ops-cumulus-protected/{ShortName}/{target_file}", '/home/jovyan/data/')

# this works - but is it efficient? Could I just open the file remotely instead of storing it "locally"? For now, assume it's faster to download.
# s3.download(f, "DEMO_FILES/" + os.path.basename(f))
fs.download(f"podaac-ops-cumulus-protected/{ShortName}/{target_file}", datadir)

[None]

In [72]:
%%time
d1 = xr.open_dataset(f"{datadir}/{target_file}")

CPU times: user 31 ms, sys: 325 ms, total: 356 ms
Wall time: 535 ms


In [73]:
%%time
d2 = xr.open_dataset(fs.open(f"podaac-ops-cumulus-protected/{ShortName}/{target_file}"))

CPU times: user 2.54 s, sys: 508 ms, total: 3.04 s
Wall time: 7.73 s


In [92]:
# Force a fresh import of osse_tools_cloud so that edits to the module are picked up.
# pop(..., None) instead of `del`: the module may not have been imported yet in
# this session, in which case `del sys.modules[...]` raises KeyError.
sys.modules.pop('osse_tools_cloud', None)
from osse_tools_cloud import download_llc4320_data

# download files:
download_llc4320_data(RegionName, datadir, start_date, ndays)

LLC4320_pre-SWOT_ACC_SMST_20120101.nc
LLC4320_pre-SWOT_ACC_SMST_20120102.nc
copying LLC4320_pre-SWOT_ACC_SMST_20120102.nc to local storage


NameError: name 's3fs' is not defined

In [84]:
s3path = "podaac-ops-cumulus-protected"
fs.download(f"{s3path}/{ShortName}/{target_file}", datadir)

[None]

Frozen({'j_g': 395, 'i': 480, 'i_g': 480, 'j': 395, 'k': 88, 'k_u': 88, 'k_l': 88, 'k_p1': 89, 'nb': 2, 'time': 24})

In [44]:
ShortName

NameError: name 'ShortName' is not defined

In [None]:
# # files to load:


# # 
# if RUN_ON_CLOUD:
#     # produce "short name" as listed on PO.DAAC
#     shortname = 'MITgcm_LLC4320_Pre-SWOT_JPL_L4_' + RegionName + '_v1.0/'
     
#     # list of target files:
#     remote_files = []
#     for n in range(ndays):
#         s3path = f's3://podaac-ops-cumulus-protected/{shortname}LLC4320_pre-SWOT_{RegionName}_{date_list[n].strftime("%Y%m%d")}.nc'
#         remote_files.append(s3.glob(s3path)[0])
#     print(remote_files)

### Cloud access using "earthdata"

In [None]:
start_date = datetime.date(2012,1,1)
ndays = 31
ndays = 60
ndays = 5
# print((start_date, datetime.timedelta(ndays)))
# print(start_date, start_date+datetime.timedelta(ndays))
# print(f'"{start_date}", "{start_date+datetime.timedelta(ndays)}"')
print(f'("{start_date}", "{start_date+datetime.timedelta(ndays)}")')

In [None]:
# We import the classes from earthdata
from earthdata import Auth, DataCollections, DataGranules, Store

auth = Auth()

# First we try to use a .netrc, if it's not present we use the interactive login
if not auth.login(strategy="netrc"):
    auth.login(strategy="interactive")

In [None]:
# use the DataGranules function from earthdata:



GranuleQuery = DataGranules().parameters(
    short_name = "MITgcm_LLC4320_Pre-SWOT_JPL_L4_ACC_SMST_v1.0",
    temporal = ("2012-01-01", "2012-01-06")
    # temporal = ("2012-01-01", "2012-01-05")
    # temporal = f'("{start_date}", "{start_date+datetime.timedelta(ndays)}")'
)

granules = GranuleQuery.get(1)

for granule in granules:
    # print(granule)
    # pprint(granule)
    display(granule)

In [None]:
# "target_files" points to remote files
target_files = [s3.open(file) for file in remote_files]
target_files

In [None]:
%%time
ds = xr.open_mfdataset(target_files, parallel=True, drop_variables={'U', 'V', 'oceTAUX', 'oceTAUY'})
ds

In [None]:
# 

In [None]:
# Load all model data files. Do not load U, V, oceTAUX or oceTAUY: they are
# replaced further down by the transformed versions from the derived-fields files.
date_list = [start_date + datetime.timedelta(days=x) for x in range(ndays)]
date_stamps = [day.strftime("%Y%m%d") for day in date_list]  # one YYYYMMDD stamp per daily file
target_files = [f'{datadir}LLC4320_pre-SWOT_{RegionName}_{stamp}.nc' for stamp in date_stamps] # list target files
ds = xr.open_mfdataset(target_files, parallel=True, drop_variables={'U', 'V', 'oceTAUX', 'oceTAUY'})

# load the corresponding derived fields
derivedir = datadir + 'derived/'
derived_files = [f'{derivedir}LLC4320_pre-SWOT_{RegionName}_derived-fields_{stamp}.nc' for stamp in date_stamps] # list target files
dsd = xr.open_mfdataset(derived_files)

# merge the derived and raw data
ds = ds.merge(dsd)

# rename the transformed vector variables to their original names
# !! caution, skipping this step will cause great confusion later !!
ds = ds.rename_vars({'U_transformed':'U', 'V_transformed':'V', 
                     'oceTAUX_transformed':'oceTAUX', 'oceTAUY_transformed':'oceTAUY'})
# Convert lon, lat and z to index i, j and k with f_x, f_y and f_z
# XC, YC and Z are the same at all times, so select a single time
X = ds.XC.isel(time=0) 
Y = ds.YC.isel(time=0)

In [None]:
# drop a bunch of other vars we don't actually use - can comment this out if these are wanted
ds = ds.drop_vars({'DXV','DYU', 'DXC','DXG', 'DYC','DYG', 'XC_bnds', 'YC_bnds', 'Zp1', 'Zu','Zl','Z_bnds', 'nb'})

In [None]:
sampling_details['zmooring_TS'] = list(range(-10,-1000,-100))
sampling_details

### Create & plot sampling track

In [None]:
# Force a fresh import of osse_tools so that edits to the module are picked up.
# pop(..., None) instead of `del`: `del sys.modules[...]` raises KeyError when the
# module has not been imported yet in this session.
# NOTE(review): the import cell at the top of the notebook uses osse_tools_cloud,
# while this cell uses osse_tools - confirm which module is intended here.
sys.modules.pop('osse_tools', None)
from osse_tools import download_llc4320_data, compute_derived_fields, get_survey_track, survey_interp

# sampling_details['AT_END'] = 'repeat'
# # sampling_details['AT_END'] = 'terminate'
# sampling_details['AT_END'] = 'reverse'

sampling_details['zmooring_TS'] = list(range(-10,-1000,-100))
survey_track, survey_indices, sampling_parameters = get_survey_track(ds, sampling_details)

# print specified sampling_details + any default values
sampling_parameters 

### Visualizations

In [None]:
## Visualize track over a single model snapshot:
%matplotlib inline
plt.figure(figsize=(15,5))

ax = plt.subplot(1,2,1)
ssto = plt.pcolormesh(X,Y,ds.Theta.isel(k=0, time=0).values, shading='auto')
if not (sampling_parameters['SAMPLING_STRATEGY'] == 'mooring' or sampling_parameters['SAMPLING_STRATEGY'] == 'sim_mooring'):
    tracko = plt.scatter(survey_track.lon, survey_track.lat, c=survey_track.time-survey_track.time[0], cmap='Reds', s=0.75)
    plt.colorbar(ssto).set_label('SST, $^o$C')
    plt.colorbar(tracko).set_label('days from start')
    plt.title('SST and survey track: ' + RegionName)
else:
    plt.plot(survey_track.lon, survey_track.lat, marker='*', c='r')
    plt.title('SST and mooring location: ' + RegionName)

    
ax = plt.subplot(1,2,2)
if not (sampling_parameters['SAMPLING_STRATEGY'] == 'mooring' or sampling_parameters['SAMPLING_STRATEGY'] == 'sim_mooring'):
    plt.plot(survey_track.time, survey_track.dep, marker='.')
else:
    # not quite right but good enough for now.
    # (times shouldn't increase with depth)
    plt.scatter((np.tile(survey_track['time'], int(survey_track['dep'].data.size))),
         np.tile(survey_track['dep'], int(survey_track['time'].data.size)),marker='.')             
# plt.xlim([start_date + datetime.timedelta(days=0), start_date + datetime.timedelta(days=2)])
    
plt.show()
    

### Interpolate data with the specified sampling pattern (this is where the magic happens!)

In [None]:
# Force a fresh import of osse_tools so that edits to the module are picked up.
# pop(..., None) instead of `del`: `del sys.modules[...]` raises KeyError when the
# module has not been imported yet in this session.
sys.modules.pop('osse_tools', None)
from osse_tools import survey_interp, get_survey_track

# interpolate the model fields onto the survey track and report the wall time
start_time = time.time()
subsampled_data, sgridded = survey_interp(ds, survey_track, survey_indices)
print("--- %s seconds ---" % (time.time() - start_time))


# NOTE! this breaks if we've already run "compute" on U, V.
# can try a flag/test here if that's faster.


### Visualizations

Basic plots to show the interpolated variables

In [None]:
# 3d fields
vbls3d = ['Theta','Salt','U','V','vorticity']
ylim = [min(sgridded['depth'].values), max(sgridded['depth'].values)]
# ylim = [-200, -1]

nr = len(vbls3d) # # of rows
fig,ax=plt.subplots(nr,figsize=(8,10),constrained_layout=True)


for j in range(nr):
    sgridded[vbls3d[j]].plot(ax=ax[j], ylim=ylim)
    ax[j].plot(sgridded.time.data, -sgridded.KPPhbl.data, c='k')
    ax[j].set_title(vbls3d[j])


In [None]:
## selected 2d fields
j=0
nr = 6 # # of rows
fig,ax=plt.subplots(nr,figsize=(10,8),constrained_layout=True)


# wind vectors
ax[j].quiver(sgridded.time.data,0,sgridded.oceTAUX.data, sgridded.oceTAUY.data)
ax[j].set_title('Wind stress')    
ax[j].set_ylabel('N m-2')
# SH 
j+=1
ax[j].plot(sgridded.time,sgridded.steric_height-sgridded.steric_height.mean(), 
             sgridded.time.data,sgridded.steric_height_true-sgridded.steric_height_true.mean())
ax[j].set_title('Steric height')
ax[j].legend(['subsampled','true'])
ax[j].set_ylabel('m')

# SSH
j+=1
ax[j].plot(sgridded.time,sgridded.Eta)
ax[j].set_title('SSH')
ax[j].set_ylabel('m')

# MLD
j+=1
ax[j].plot(sgridded.time,sgridded.KPPhbl)
ax[j].set_title('MLD')
ax[j].set_ylabel('m')
ax[j].invert_yaxis()

# surface heat flux
j+=1
ax[j].plot(sgridded.time,sgridded.oceQnet, sgridded.time,sgridded.oceQsw)
ax[j].set_title('Surface heat flux into the ocean')
ax[j].legend(['total','shortwave'])
ax[j].set_ylabel('W m-2')

# surface FW flux
j+=1
ax[j].plot(sgridded.time,sgridded.oceFWflx)
ax[j].set_title('Surface freshwater flux into the ocean') 
ax[j].set_ylabel('kg m-2 s-1')

# horiz line:
for j in range(nr):
    ax[j].axhline(0, color='grey', linewidth=0.8)

    

In [None]:
ds['PhiBot'].isel(time=0).plot()

In [None]:
sgridded

### Save interpolated data

For both raw and gridded subsampled data, add attributes and save


In [None]:
# add metadata to attributes
# Work on a copy: assigning the dict directly would write the extra keys below
# back into sampling_parameters as well.
attrs = dict(sampling_parameters)
attrs['start_date'] = start_date.strftime('%Y-%m-%d')
end_date = sgridded['time'].data[-1]  # last time on the gridded output
attrs['end_date'] = np.datetime_as_string(end_date,unit='D')
attrs['ndays'] = ndays

# output filename base:
# --- auto generated but can be set by user ---
filename_out_base = (f'{outputdir}OSSE_{RegionName}_{sampling_parameters["SAMPLING_STRATEGY"]}_{attrs["start_date"]}_to_{attrs["end_date"]}')
print(filename_out_base)

In [None]:
# ------ subsampled data
if sampling_parameters['SAMPLING_STRATEGY'] != 'mooring':
    filename_out = filename_out_base + '_subsampled.nc'
    print(f'saving to {filename_out}')
    subsampled_data.attrs = attrs
    netcdf_fill_value = nc4.default_fillvals['f4']
    dv_encoding={'zlib':True,  # turns compression on\
                'complevel':9,     # 1 = fastest, lowest compression; 9=slowest, highest compression \
                'shuffle':True,    # shuffle filter can significantly improve compression ratios, and is on by default \
                'dtype':'float32',\
                '_FillValue':netcdf_fill_value}
    # save to a new file
    # subsampled_data.to_netcdf(filename_out,format='netcdf4',encoding=dv_encoding)
    subsampled_data.to_netcdf(filename_out,format='netcdf4')
    !ls -ltrh {filename_out}

In [None]:
# ------ gridded:
filename_out = filename_out_base + '_gridded.nc'
print(f'saving to {filename_out}')
sgridded.attrs = attrs
netcdf_fill_value = nc4.default_fillvals['f4']
dv_encoding={'zlib':True,  # turns compression on\
            'complevel':9,     # 1 = fastest, lowest compression; 9=slowest, highest compression \
            'shuffle':True,    # shuffle filter can significantly improve compression ratios, and is on by default \
            'dtype':'float32',\
            '_FillValue':netcdf_fill_value}
# save to a new file
# subsampled_data.to_netcdf(filename_out,format='netcdf4',encoding=dv_encoding)
sgridded.to_netcdf(filename_out,format='netcdf4')
!ls -ltrh {filename_out}

### Visualize interpolated data in 3D

In [None]:
%matplotlib qt

fig = plt.figure(figsize=(12, 12))
ax = plt.axes(projection='3d')
fig.subplots_adjust(left=0.25, bottom=0.25)

ax.set_xlabel('longitude', fontsize=15, rotation=150)
ax.set_ylabel('latitude',fontsize=15)
ax.set_zlabel('depth', fontsize=15, rotation=60)

p = ax.scatter3D(subsampled_data.lon.data, subsampled_data.lat.data, subsampled_data.dep.data, c=subsampled_data.Theta.data, s=1)
fig.colorbar(p).set_label('Temperature ($^o$C)')
ax.set_title('Temperature interpolated to the survey track')

#### EXPLORE DIFFERENT INTERPOLATION METHODS 

It's re-gridding the 3-d interpolated data that is slow. (2d is trivial)

Which part?
the line this_var = subsampled_data[vbl].data.compute().copy() 
is SUPER slow (20 sec) for V, slow (9 sec) for U, moderate (2 sec) for theta and salt, 0 for SH and vorticity

I guess because making those regridding computations is slow?
- is the regridding necessary? (yes, to do it right - but could check)
- can we regrid when we derive fields? (yes - but not sure it's any faster)


In [None]:
# ORIGINAL CODE in osse_tools.py, instrumented with timers to find the slow step.
start_time = time.time()
# vbls3d = ['U','V', 'Theta','Salt','vorticity','steric_height']
# vbls2d = ['steric_height_true', 'Eta', 'KPPhbl', 'PhiBot', 'oceFWflx', 'oceQnet', 'oceQsw', 'oceSflux']

# variables to time (alternative: ['U_c','V_c', 'U', 'V'])
vbls3d = ['U', 'V']

# -----

# ------Regrid the data to depth/time (3-d fields) or subsample to time (2-d fields)
print('Gridding the interpolated data...')
# get times associated with profiles:
SAMPLING_STRATEGY = survey_track['SAMPLING_STRATEGY']
if SAMPLING_STRATEGY == 'sim_mooring':
    # - for mooring, use the subsampled time grid:
    times = np.unique(subsampled_data.time.values)
else:
    # -- for glider/uctd, take the shallowest & deepest profiles (every second value, since top/bottom get sampled twice for each profile)
    time_deepest = subsampled_data.time.where(subsampled_data.dep == subsampled_data.dep.min(), drop=True).values[0:-1:2]
    time_shallowest = subsampled_data.time.where(subsampled_data.dep == subsampled_data.dep.max(), drop=True).values[0:-1:2]
    times = np.sort(np.concatenate((time_shallowest, time_deepest)))
    # this results in a time grid that may not be uniformly spaced, but is correct
    # - for a uniform grid, use the mean time spacing - may not be perfectly accurate, but is evenly spaced
    dt = np.mean(np.diff(time_shallowest))/2 # average spacing of profiles (half of one up/down, so divide by two)
    times_uniform = np.arange(survey_track.n_profiles.values*2) * dt
    print("--- deal with times: %s seconds ---" % int(time.time() - start_time))
# nt is the number of profiles (times):
nt = len(times)  
# zgridded is the vertical grid; nz is the number of depths for each profile
# depths are negative, so sort in reverse order using flip
zgridded = np.flip(np.unique(subsampled_data.dep.data))
nz = int(len(zgridded))

# -- initialize the dataset:
sgridded = xr.Dataset(
    coords = dict(depth=(["depth"],zgridded),
              time=(["time"],times))
)
# -- 3-d fields: loop & reshape 3-d data from profiles to a 2-d (depth-time) grid:
# first, extract each variable, then reshape to a grid

# Separate timer for the whole loop: the per-step timer below is reset before
# every step, so it cannot be used to report the total.
loop_start = time.time()
for vbl in vbls3d:
    print(vbl)
    step_start = time.time()
    this_var = subsampled_data[vbl].data.compute().copy() 
    print("--- this_var = subsampled_data[vbl].data.compute().copy(): %s seconds ---" % int(time.time() - step_start))

    # reshape to nz,nt
    step_start = time.time()
    this_var_reshape = np.reshape(this_var,(nz,nt), order='F') # fortran order is important!
    print("--- this_var_reshape = np.reshape(this_var,(nz,nt), order='F'): %s seconds ---" % int(time.time() - step_start))
    # for platforms with up & down profiles (uCTD and glider),
    # every second column is upside-down (upcast data)
    # starting with the second column, flip the data upside down so that upcasts go from top to bottom
    step_start = time.time()
    if SAMPLING_STRATEGY != 'sim_mooring':
        this_var_fix = this_var_reshape.copy()
        this_var_fix[:,1::2] = this_var_fix[-1::-1,1::2]  # Starting with SECOND column
        sgridded[vbl] = (("depth","time"), this_var_fix)
    else:
        sgridded[vbl] = (("depth","time"), this_var_reshape)

    print("--- this_var_fix[:,1::2] = this_var_fix[-1::-1,1::2]: %s seconds ---" % int(time.time() - step_start))
print("--- 3d vars: TOTAL %s seconds ---" % int(time.time() - loop_start))

# # for sampled steric height, we want the value integrated from the deepest sampling depth:
# sgridded['steric_height'] = (("time"), sgridded['steric_height'].isel(depth=nz-1))
# # rename to "sampled" for clarity
# sgridded.rename_vars({'steric_height':'steric_height_sampled'})

# # for sampled steric height, we want the value integrated from the deepest sampling depth:
# sgridded['steric_height'] = (("time"), sgridded['steric_height'].isel(depth=nz-1))
# # rename to "sampled" for clarity
# sgridded.rename_vars({'steric_height':'steric_height_sampled'})


In [None]:
subsampled_data

In [None]:
## Create a new dataset to contain the interpolated data, and interpolate
subsampled_data = xr.Dataset(
    dict(
        t = xr.DataArray(survey_track.time, dims='points'), # call this time, for now, so that the interpolation works
        lon = xr.DataArray(survey_track.lon, dims='points'),
        lat = xr.DataArray(survey_track.lat, dims='points'),
        dep = xr.DataArray(survey_track.dep, dims='points'),
        points = xr.DataArray(survey_track.points, dims='points')
    )
)

print('Interpolating model fields to the sampling track...')
# loop & interpolate through 3d variables:
vbls3d = ['Theta','Salt','vorticity','steric_height']
for vbl in vbls3d:
    subsampled_data[vbl]=ds[vbl].interp(survey_indices)
# Interpolate U and V from i_g, j_g to i, j, then interpolate:
# Get u, v
grid = Grid(ds, coords={'X':{'center': 'i', 'left': 'i_g'}, 
                        'Y':{'center': 'j', 'left': 'j_g'},
                        'Z':{'center': 'k'}})
U_c = grid.interp(ds.U, 'X', boundary='extend')
V_c = grid.interp(ds.V, 'Y', boundary='extend')
subsampled_data['U'] = U_c.interp(survey_indices)
subsampled_data['V'] = V_c.interp(survey_indices)    
# # subsampled_data['U_test']=ds['U'].interp(survey_indices)
# # ds
# survey_indices
# grid = Grid(ds, coords={'X':{'center': 'i', 'left': 'i_g'}, 
#                         'Y':{'center': 'j', 'left': 'j_g'},
#                         'Z':{'center': 'k'}})
# U_c = grid.interp(ds.U, 'X', boundary='extend')
# V_c = grid.interp(ds.V, 'Y', boundary='extend')
subsampled_data['U_test']=ds['U'].interp(survey_indices)


### TEST! is it faster to interpolate mooring data to a grid rather than as "points"?


In [None]:
sampling_parameters

In [None]:
# interpolate to xi,yi,ti,zi
xi = sampling_parameters['xmooring']
yi = sampling_parameters['ymooring']
ti = ds['time']
zi = sampling_parameters['zmooring_TS']

# ---- copied ----
# Convert lon, lat and z to index i, j and k with f_x, f_y and f_z
# XC, YC and Z are the same at all times, so select a single time
X = ds.XC.isel(time=0) 
Y = ds.YC.isel(time=0)
i = ds.i
j = ds.j
z = ds.Z.isel(time=0)
k = ds.k
f_x = interpolate.interp1d(X[0,:].values, i)
f_y = interpolate.interp1d(Y[:,0].values, j)
f_z = interpolate.interp1d(z, k, bounds_error=False)
# ---- copied ----
    
# survey_indices_mooring= xr.Dataset(
#         dict(
#             i = xr.DataArray(f_x(xmooring), dims='points'),
#             j = xr.DataArray(f_y(ymooring), dims='points'),
#             k = xr.DataArray(f_z(survey_track.dep), dims='points'),
#             time = xr.DataArray(survey_track.time, dims='points'),
#         )
#     )

In [None]:
# -- initialize the dataset:
# mgridded = xr.Dataset(
#     coords = dict(depth=(["depth"],zgridded),
#               time=(["time"],times))
# # )
# vbls3d = ['Theta','Salt','vorticity','steric_height', 'U', 'V']
# vbls3d = ['Theta','Salt','vorticity','steric_height', 'U_c', 'V_c']
# for vbl in vbls3d:
#     print('interpolating ' + vbl)
#     mgridded[vbl]=ds[vbl].interp(i=f_x(xi), j=f_y(yi), k=f_z(zi), time=ti).compute()
   
ds

In [None]:
plt.pcolormesh(np.transpose(dum.data))
plt.show()

In [None]:
sgridded['Theta'].plot()

In [None]:

# try regridding U/V earlier (when deriving)

In [None]:
# sgridded.oceTAUX.plot()
plt.quiver(sgridded.time.data,0,sgridded.oceTAUX.data, sgridded.oceTAUY.data)

In [None]:
# plot to compare the two methods 
# ( not quite working )
tpl = ds['time']/24,sss[:,0,0,0]
ttt=ds['time']
# ipl = ( subsampled_data.dep.values == subsampled_data.dep.values.min() )
# plt.plot(subsampled_data.time(ipl),subsampled_data.Salt.sel(ipl).data,'-')
# plt.plot(subsampled_data.time,subsampled_data.Salt.data,'-')
plt.plot(tgr,np.transpose(dum),'-')
plt.plot(ttt[:],sss[:,0,0,0],'k--')
# plt.legend('survey_interp method','direct interpolation')

### Visualize steric height

In [None]:
%matplotlib inline
sh_anom = sh_true.values - sh_true.values.mean()
plt.figure(figsize=(7,5))
sho = plt.scatter(survey_track.lon, survey_track.lat, c=sh_anom)
plt.title('Steric height anomaly')
plt.colorbar(sho).set_label('m')
plt.show()

### Comparison of true vs sampled steric height
Plot comparing the "true" steric height along the track and the subsampled steric height, which is computed by integrating the specific volume anomaly for each subsampled profile from its deepest sampling depth

In [None]:
%matplotlib inline

# truth:
plt.plot(sh_true.time, sh_true.values - sh_true.values.mean())
# get index of the deepest sampling depths
i = ( subsampled_data.dep.values == subsampled_data.dep.values.min() )
plt.plot(subsampled_data.time.values[i], subsampled_data.steric_height.values[i] - subsampled_data.steric_height.values[i].mean(),'.-')
plt.title('Steric height anomaly along the survey track')
plt.legend(['truth','subsampled data'])
plt.xlabel('time, days')
plt.ylabel('steric height anom., m')

In [None]:
# NEW CODE  BELOW

# interpolation:
subsampled_data = xr.Dataset()  

# loop & interpolate through 3d variables:
vbls3d = ['Theta','Salt','vorticity']
# vbls3d = ['Theta']
for vbl in vbls3d:
    subsampled_data[vbl]=ds[vbl].interp(survey_indices)
# # Interpolate U and V from i_g, j_g to i, j, then interpolate:
# U_c = grid.interp(ds.U, 'X', boundary='extend')
# V_c = grid.interp(ds.V, 'Y', boundary='extend')
# subsampled_data['U'] = U_c.interp(survey_indices)
# subsampled_data['V'] = V_c.interp(survey_indices)

subsampled_data['lon']=survey_track.lon
subsampled_data['lat']=survey_track.lat
subsampled_data['dep']=survey_track.dep
subsampled_data['time']=survey_track.time 

# loop & interpolate through 2d variables:
vbls2d = ['Eta', 'KPPhbl', 'PhiBot', 'oceFWflx', 'oceQnet', 'oceQsw', 'oceSflux', 'oceTAUX', 'oceTAUY']
vbls2d = ['Eta', 'Depth']
# create 2-d survey track by removing the depth dimension
survey_indices_2d =  survey_indices.drop_vars('k')
for vbl in vbls2d:
    subsampled_data[vbl]=ds[vbl].interp(survey_indices_2d)
# survey_indices_2d.i.plot()
# ds['KPPhbl'].interp(survey_indices_2d).plot()

# interp
# this returns a value at every timestep (points) - very high resolution
# - could subsample to the model time grid 


# plot
# plt.plot(sh_true.time, sh_true.values - sh_true.values.mean())
# plt.plot(subsampled_data.time, ssh - ssh.mean())


In [None]:
subsampled_data = xr.Dataset() 

# loop & interpolate through 3d variables:
vbls3d = ['Theta','Salt','vorticity','steric_height']
for vbl in vbls3d:
    subsampled_data[vbl]=ds[vbl].interp(survey_indices)
# Interpolate U and V from i_g, j_g to i, j, then interpolate:
# Get u, v
grid = Grid(ds, coords={'X':{'center': 'i', 'left': 'i_g'}, 
                        'Y':{'center': 'j', 'left': 'j_g'},
                        'Z':{'center': 'k'}})
U_c = grid.interp(ds.U, 'X', boundary='extend')
V_c = grid.interp(ds.V, 'Y', boundary='extend')
subsampled_data['U'] = U_c.interp(survey_indices)
subsampled_data['V'] = V_c.interp(survey_indices)


# add lat/lon/time to dataset
subsampled_data['lon']=survey_track.lon
subsampled_data['lat']=survey_track.lat
subsampled_data['dep']=survey_track.dep
subsampled_data['time']=survey_track.time  

# loop & interpolate through 2d variables:
vbls2d = ['Eta', 'KPPhbl', 'PhiBot', 'oceFWflx', 'oceQnet', 'oceQsw', 'oceSflux']
# create 2-d survey track by removing the depth dimension
survey_indices_2d =  survey_indices.drop_vars('k')
for vbl in vbls2d:
    subsampled_data[vbl]=ds[vbl].interp(survey_indices_2d)   
# taux & tauy must be treated like U and V
oceTAUX_c = grid.interp(ds.oceTAUX, 'X', boundary='extend')
oceTAUY_c = grid.interp(ds.oceTAUY, 'Y', boundary='extend')
subsampled_data['oceTAUX'] = oceTAUX_c.interp(survey_indices_2d)
subsampled_data['oceTAUY'] = oceTAUY_c.interp(survey_indices_2d)


    

In [None]:
ds.U

In [None]:
plt.plot(sh_true.time, sh_true.values - sh_true.values.mean())
plt.plot(subsampled_data.time, ssh - ssh.mean())


In [None]:
# mooring test

# Mean longitude / latitude of the model grid, taken along one row / column of
# the first time step (XC mirrors the YC expression).
model_xav = ds.XC.isel(time=0, j=0).mean(dim='i').values
model_yav = ds.YC.isel(time=0, i=0).mean(dim='j').values

xmooring = model_xav # default lat/lon is the center of the domain
ymooring = model_yav
zmooring_TS = [-1, -10, -50, -100] # depth of T/S instruments
zmooring_UV = [-1, -10, -50, -100, -200, -300, -400, -500] # depth of U/V instruments


ts = ds.time
n_samples = ts.size
n_depths_TS = np.size(zmooring_TS)
n_depths_UV = np.size(zmooring_UV)
# depth sampling - different for TS and UV
# every grid below has shape (n_depths, n_samples)
zs_TS = np.transpose(np.tile(zmooring_TS,(n_samples,1)))
zs_UV = np.transpose(np.tile(zmooring_UV,(n_samples,1)))
xs_TS = xmooring * np.ones([n_depths_TS, n_samples])
ys_TS = ymooring * np.ones([n_depths_TS, n_samples])
# the U/V grids must be sized with the U/V instrument count, not the T/S one
xs_UV = xmooring * np.ones([n_depths_UV, n_samples])
ys_UV = ymooring * np.ones([n_depths_UV, n_samples])
# sample times repeated for every T/S depth, matching the shape of zs_TS
ts_TS = np.tile(ts.values, (n_depths_TS, 1))

# flatten the (depth, time) grids: the track is a 1-d list of sample points
survey_track = xr.Dataset(
    dict(
        lon = xr.DataArray(xs_TS.ravel(),dims='points'),
        lat = xr.DataArray(ys_TS.ravel(),dims='points'),
        dep = xr.DataArray(zs_TS.ravel(),dims='points'),
        time = xr.DataArray(ts_TS.ravel(),dims='points')
    )
)



In [None]:
# np.broadcast_to([-1, -10, -50, -100],(2,4))
np.broadcast_to([-1, -10, -50, -100],(2,4))

In [None]:
# np.broadcast_to(zmooring_TS,(2,4))
# np.tile(zmooring_TS.transpose,(2,1))
# np.tile(np.transpose(zmooring_TS),(1,n_samples)).shape
# xs_TS.shape
# [-1, -10, -50, -100]
# zmooring_TS

# np.transpose(np.array(zmooring_TS)).shape 
np.transpose(np.tile(zmooring_TS,(n_samples,1))).shape
xs_TS.shape

In [None]:
subsampled_data

In [None]:
plt.scatter(subsampled_data.time,subsampled_data.dep,c=subsampled_data.Theta)
plt.plot(subsampled_data.time,-subsampled_data.Depth,c='r')

In [None]:
sh_true.plot()

In [None]:

# we interpolated everything to the "points" - one datapoint per sample
# but we may also want a cleaner (gridded) output product that has been reshaped into 
# profiles (i.e., a X x Z x T array)
# (This might not make sense for glider data, since profiles are likely > one model gridpoint)

# separate these into "subsampled_points" and "subsampled_profiles"?

# a couple ways to determine the profiles:
# - one per down / one per up, with time as either the start/mean/end
# - one per down-up, with the time as the deepest point
# - maybe others? would be useful to get feedback on this...

# use "where" to determine the indices of the start/end (shallowest/deepest) of each profile:

# subsampled_data.steric_height.where(subsampled_data.dep == subsampled_data.dep.min(), drop=True)
# subsampled_deepest = subsampled_data.where(subsampled_data.dep == subsampled_data.dep.min(), drop=True)

# plt.plot(subsampled_data.dep.values[ideep])

# this is the DEEPEST point only. 
subsampled_data = subsampled_data
dum = subsampled_data.where(subsampled_data.dep == subsampled_data.dep.min(), drop=True)
shall = subsampled_data.where(subsampled_data.dep == subsampled_data.dep.max(), drop=True)
%matplotlib inline
plt.figure(figsize=(7,5))
plt.plot(dum.lon,dum.lat,'.')
plt.plot(shall.lon,shall.lat,'.')
plt.show()


In [None]:
import time  # already imported at the top of the notebook; harmless when re-run

# `dep` is compared against its own extrema: the maximum marks the shallowest
# sample of a profile and the minimum marks the deepest one.
dep_values = subsampled_data.dep.values

# Positions along "points" of every shallowest / deepest sample, as plain lists
# of ints (one entry per profile).
ishallow = np.flatnonzero(dep_values == dep_values.max()).tolist()
ideep = np.flatnonzero(dep_values == dep_values.min()).tolist()

# One timestamp per profile: the time of its shallowest sample.
t_profiles = subsampled_data['time'].isel(points=ishallow)
t_profiles.plot()
# Sorted unique sample depths.
z = np.unique(subsampled_data['dep'])

# # # initialize the dataset:
# subsampled_profile = xr.Dataset(
#     coords={
#         "time": t_profiles,
#         "depth": z
#     },
#     "Salt": (("time", "depth"), []),
# )


# Collect the Salt samples of every shallowest->deepest profile.
# - zip() stops at the shorter list, so a track that ends mid-profile (one more
#   shallow point than deep points) no longer raises IndexError.
# - The end index is inclusive, so the deepest sample belongs to its profile
#   and a profile can cover every depth level in `z`.
# - Chunks are gathered in a list and concatenated once, instead of growing an
#   array with np.append on every pass.
# Assumes each shallowest sample precedes its matching deepest sample along
# "points" -- TODO confirm for tracks that start at depth.
salt_chunks = []
for n, (i_first, i_last) in enumerate(zip(ishallow, ideep)):
    i = np.arange(i_first, i_last + 1)
    salt_chunks.append(np.ravel(subsampled_data['Salt'].isel(points=i).values))

#     # loop & interpolate through 3d variables:
#     for vbl in vbls3d:
# #         subsampled_profile[vbl] = subsampled_data[vbl].isel(points=i)
#         dum = subsampled_data[vbl].isel(points=i)

# Flat 1-D float array of all profile samples, profile after profile; the
# leading empty float64 array keeps the result well-defined when there are no
# profiles at all.
pr = np.concatenate([np.empty(0), *salt_chunks])
pr
        

    

In [None]:
n

In [None]:
np.shape(pr)

In [None]:

# # n=0
# # i = np.arange(ishallow[n],ideep[n])
# # start = time.time()
# # dum = subsampled_data.Theta.isel(points=i)
# # end = time.time()
# # print("The time of execution of above program is :", end-start)

In [None]:
# Average every variable over the samples that share the same `dep` value,
# then try to plot the result.
# NOTE(review): xarray exposes scatter plots as `Dataset.plot.scatter(x=..., y=...)`;
# a bare `.scatter()` on the grouped mean looks like it would raise
# AttributeError -- confirm and pick the x/y variables to plot.
subsampled_data.groupby("dep").mean().scatter()


In [None]:
# Prototype of the gridded "profiles" product: a (time, depth) dataset holding
# a single Salt profile. Uses `i` (the sample indices left over from the
# profile loop) and `z` (the unique depths), so this cell only works after the
# cell defining them has run, and only if the profile length matches len(z).
_salt_profile = subsampled_data.Salt.isel(points=i)
ss = xr.Dataset(
    data_vars={"Salt": (("time", "depth"), [_salt_profile])},
    coords={"time": [1], "depth": z},  # time=[1] is a placeholder value
)
ss
# "precipitation": (("lat", "lon"), np.random.rand(4).reshape(2, 2)),

In [None]:
len(z)

In [None]:
subsampled_data

In [None]:
# {**sampling_details}


def test_code(sampling_details):
    """Publish the str and list entries of ``sampling_details`` as module globals.

    Scratch helper for checking that the sampling settings can be unpacked
    into plain variables. Values are bound directly in the module namespace
    rather than through string-built ``exec`` calls, so strings containing
    quotes and lists whose ``repr`` is not a Python literal are handled.

    Args:
        sampling_details: Mapping of variable name to value. Only ``str`` and
            ``list`` values are published; every other value is ignored.
            Lists are shallow-copied so the published global is independent
            of the caller's list.

    Returns:
        The module-level ``zrange`` as it stands after publishing.

    Raises:
        NameError: If no ``zrange`` global exists once the entries have been
            published.
    """
    namespace = globals()
    for key, value in sampling_details.items():
        if isinstance(value, str):
            namespace[key] = value
        elif isinstance(value, list):
            namespace[key] = list(value)

    # zrange is typically a plain list, and list + int is a TypeError, so
    # convert to an array for the element-wise sanity print.
    print(np.asarray(zrange) + 1000)
    return zrange


# Despite its name this helper is not a unit test; keep pytest-style
# collectors from picking it up.
test_code.__test__ = False
    
# Run the helper on the notebook's sampling_details and echo what it returned.
ddd = test_code(sampling_details)
# test_code(sampling_details)
ddd
# Earlier loop-based attempt, kept for reference:
# for key in sampling_details:
#     exec(key + '=' + '"' + sampling_details[key] + '"')

# Type checks used while writing test_code:
# isinstance(sampling_details['zrange'],list)
# isinstance(sampling_details['SAMPLING_STRATEGY'],list)
# isinstance?

# type(sampling_details['zrange'])

In [None]:

#             # 3) merge transformed data with ds
#             print('merging with ds')
#             print(ds)
#             print(oceTAUX_r)
#             ds = ds.merge(U_r.to_dataset(name='U_r')).merge(V_r.to_dataset(name='V_r'))
#             ds = ds.merge(oceTAUX_r.to_dataset(name='oceTAUX_r')).merge(oceTAUY_r.to_dataset(name='oceTAUY_r'))

#             # 4) rename transformed data to original names (renaming to 'temp' first)
#             # !! caution, skipping this step will cause great confusion later !!
#             ds = ds.rename_vars({'U':'Utemp', 'V':'Vtemp'}).rename_vars({'U_r':'U', 'V_r':'V'})
#             ds = ds.rename_vars({'oceTAUX':'TAUXtemp', 'oceTAUY':'TAUYtemp'}).rename_vars({'oceTAUX_r':'oceTAUX', 'oceTAUY_r':'oceTAUY'})
           

#             # 6) get rid of all the original U/V/TAUX/TAUY variables (which have been renamed to *temp)
#             ds = ds.drop_vars({'Utemp','Vtemp','TAUXtemp','TAUYtemp'})
            
