In [3]:
import sys, os
from pyprojroot import here

# locate the project root with pyprojroot, using the ".root" marker file
root = here(project_files=[".root"])

# make the project root importable; the membership guard keeps the cell
# idempotent, so re-running it does not pile up duplicate sys.path entries
if str(root) not in sys.path:
    sys.path.append(str(root))

In [4]:
import math
from pathlib import Path

import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Altitools
from alti_tools._src.data.ssh import download_ssh_toy
from alti_tools._src.transforms import spatial, temporal
from alti_tools._src.viz import psd as psd_plots
from alti_tools._src.preprocess.swot import preprocess_karin_swot
from alti_tools._src.utils.tracking import get_current_timestamp
from alti_tools._src.utils.files import list_all_files, check_list_equal_elem
from alti_tools._src.utils.files import check_if_directory, check_if_file
from alti_tools._src.data.configs.altimetry import get_raw_altimetry_config, get_raw_altimetry_files
from alti_tools._src.data.io import load_alongtrack_parallel

import seaborn as sns
import corner

sns.set_context(context="talk", font_scale=0.7)


import xarray as xr
import pandas as pd
from scipy import ndimage
import holoviews as hv
import xrft
from dask.diagnostics import ProgressBar

import tqdm

%load_ext autoreload
%autoreload 2
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

* [X] Load data for Training scenarios
  * [X] 1 NADIR
  * [X] 4 NADIR
  * [X] 1 SWOT
  * [X] 1 SWOT + 4 NADIR
* Subset Region + Time
   1. Training
   2. Evaluation
   3. Spin-Up

## Data

### Download Data



In [5]:
# TODO: add dataset download

### Load Data

In [6]:
# Directory holding the NetCDF observation files listed by the `ls` below.
# The default is the original local path; set ALTI_TOOLS_DATASET_DIR to point
# the notebook at another location without editing this cell.
dataset_dir = os.environ.get(
    "ALTI_TOOLS_DATASET_DIR",
    "/Volumes/EMANS_HDD/data/dc20a_osse/test/preprocess/osse_2020a_natl60/",
)
!ls $dataset_dir


2020a_SSH_mapping_NATL60_envisat.nc
2020a_SSH_mapping_NATL60_geosat2.nc
2020a_SSH_mapping_NATL60_jason1.nc
2020a_SSH_mapping_NATL60_karin_swot.nc
2020a_SSH_mapping_NATL60_nadir_swot.nc
2020a_SSH_mapping_NATL60_topex-poseidon_interleaved.nc


In [7]:
from alti_tools._src.utils.files import list_all_files, list_of_files_to_dict
from alti_tools._src.data.natl60.osse import get_swot_obs_setup_files

# list the files found in `dataset_dir`
all_files = list_all_files(dataset_dir)

# keep only the files belonging to the "nadir1" observation setup
# NOTE(review): presumably the single-nadir scenario from the checklist above -- confirm
setup_files = get_swot_obs_setup_files(all_files, setup="nadir1")

In [62]:
# open every selected file as its own dataset (one entry per path, same order)
ds_files = [xr.open_dataset(ifile) for ifile in setup_files]

In [63]:
ds_files[0]

In [45]:
# names of the variables kept from every file
variables = [
    "ssh_obs",
    "ssh_model",
    "lon",
    "lat",
]


def preprocess(x):
    """Reduce a dataset to the entries named in the global ``variables``.

    Parameters
    ----------
    x : object indexable with a list of names (``x[variables]``).

    Returns
    -------
    The result of ``x[variables]``.
    """
    # TODO: spatial (region) subsetting is not implemented in this version
    # TODO: temporal subsetting is not implemented in this version
    return x[variables]

# load the selected files through `preprocess`, then order the samples by time
ds_swot = load_alongtrack_parallel(
    setup_files,
    preprocess=preprocess,
).sortby("time")

ds_swot

Unnamed: 0,Array,Chunk
Bytes,411.85 kiB,411.85 kiB
Shape,"(52717,)","(52717,)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 411.85 kiB 411.85 kiB Shape (52717,) (52717,) Count 4 Tasks 1 Chunks Type float64 numpy.ndarray",52717  1,

Unnamed: 0,Array,Chunk
Bytes,411.85 kiB,411.85 kiB
Shape,"(52717,)","(52717,)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,411.85 kiB,411.85 kiB
Shape,"(52717,)","(52717,)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 411.85 kiB 411.85 kiB Shape (52717,) (52717,) Count 4 Tasks 1 Chunks Type float64 numpy.ndarray",52717  1,

Unnamed: 0,Array,Chunk
Bytes,411.85 kiB,411.85 kiB
Shape,"(52717,)","(52717,)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,411.85 kiB,411.85 kiB
Shape,"(52717,)","(52717,)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 411.85 kiB 411.85 kiB Shape (52717,) (52717,) Count 4 Tasks 1 Chunks Type float64 numpy.ndarray",52717  1,

Unnamed: 0,Array,Chunk
Bytes,411.85 kiB,411.85 kiB
Shape,"(52717,)","(52717,)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,411.85 kiB,411.85 kiB
Shape,"(52717,)","(52717,)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 411.85 kiB 411.85 kiB Shape (52717,) (52717,) Count 4 Tasks 1 Chunks Type float64 numpy.ndarray",52717  1,

Unnamed: 0,Array,Chunk
Bytes,411.85 kiB,411.85 kiB
Shape,"(52717,)","(52717,)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [55]:
# Report the temporal and spatial extent of the loaded observations.
# lon/lat go through .compute() before printing; time is printed directly.
print(f"Time\nMin: {ds_swot.time.min().data}, Max: {ds_swot.time.max().data}")
print(f"Longitude\nMin: {ds_swot.lon.min().compute().data}, Max: {ds_swot.lon.max().compute().data}")
print(f"Latitude\nMin: {ds_swot.lat.min().compute().data}, Max: {ds_swot.lat.max().compute().data}")

Time
Min: 2012-10-01T03:20:37.213526000, Max: 2013-09-29T05:35:56.659327000
Longitude
Min: 295.007584, Max: 304.991328
Latitude
Min: 33.007988, Max: 42.998908


In [8]:
from ml_collections import config_dict

def get_preprocess_config():
    """Build the preprocessing configuration used in this notebook.

    Returns
    -------
    config : ml_collections.config_dict.ConfigDict
        * ``config.data.setup`` -- name of the observation scenario.
        * ``config.subset.spinup`` / ``config.subset.train`` /
          ``config.subset.evaluation`` -- one sub-config per period, each with
          ``lon_min``, ``lon_max``, ``lat_min``, ``lat_max`` (degrees, given in
          the [-180, 180] longitude convention) and ``time_min``, ``time_max``
          (date strings). ``train`` additionally carries the ``spinup`` flag.

    Notes
    -----
    The rescaling helpers in this notebook read ``time_delta``, ``lon_delta``
    and ``lat_delta``; those fields are not set here and must be supplied by
    the caller before using them.
    """
    # initialize config dictionary
    config = config_dict.ConfigDict()

    # observation scenario
    config.data = data = config_dict.ConfigDict()
    # Store the scenario on the config. The previous code rebound the local
    # name `data` to a string, which discarded the value and left
    # `config.data` empty.
    # NOTE(review): the value mirrors the setup name this notebook passes to
    # `get_swot_obs_setup_files` ("nadir1", "swot1nadir4"); confirm the full
    # list of accepted names against that function.
    data.setup = "swot1nadir4"

    ##########################
    # Spatial Temporal Subset
    ##########################
    config.subset = subset = config_dict.ConfigDict()

    # =======================
    # SPIN-UP REGION
    # =======================
    subset.spinup = spinup = config_dict.ConfigDict()
    # longitude bounds
    spinup.lon_min = -65.0
    spinup.lon_max = -55.0
    # latitude bounds
    spinup.lat_min = 33.0
    spinup.lat_max = 43.0
    # temporal bounds
    spinup.time_min = "2012-10-01"   # we can start in the beginning of the nature run :)
    spinup.time_max = "2012-10-22"

    # =======================
    # TRAINING REGION
    # =======================
    subset.train = train = config_dict.ConfigDict()
    train.spinup = True             # option to include spin in training
    # longitude bounds
    train.lon_min = -65.0
    train.lon_max = -55.0
    # latitude bounds
    train.lat_min = 33.0
    train.lat_max = 43.0
    # temporal bounds
    train.time_min = "2013-01-02"
    train.time_max = "2013-09-30"

    # =======================
    # TEST (EVALUATION) REGION
    # =======================
    subset.evaluation = evaluation = config_dict.ConfigDict()
    # longitude bounds
    evaluation.lon_min = -65.0
    evaluation.lon_max = -55.0
    # latitude bounds
    evaluation.lat_min = 33.0
    evaluation.lat_max = 43.0
    # temporal bounds (starts where the spin-up period ends)
    evaluation.time_min = "2012-10-22"
    evaluation.time_max = "2012-12-02"

    return config

In [12]:
def subset_temporal(ds, config):
    """Select the part of ``ds`` whose ``time`` lies in the configured window.

    Parameters
    ----------
    ds : object exposing ``.sel(time=...)``.
    config : object with ``time_min`` and ``time_max`` attributes; both are
        converted with ``np.datetime64``.

    Returns
    -------
    Whatever ``ds.sel`` returns for ``slice(time_min, time_max)``.
    """
    window = slice(
        np.datetime64(config.time_min),
        np.datetime64(config.time_max),
    )
    return ds.sel(time=window)

def rescale_temporal(ds, config):
    """Express ``ds["time"]`` as a number of ``time_delta`` steps since ``time_min``.

    Parameters
    ----------
    ds : mapping-like object; ``ds["time"].values`` is read and ``ds["time"]``
        is overwritten, so the input is modified in place.
    config : object with ``time_min`` (converted with ``np.datetime64``) and
        ``time_delta`` (converted with ``np.timedelta64``).

    Returns
    -------
    The same ``ds`` object, with the rescaled time.
    """
    origin = np.datetime64(config.time_min)
    step = np.timedelta64(config.time_delta)

    elapsed = ds["time"].values - origin
    ds["time"] = elapsed / step

    return ds

def subset_spatial(ds, config):
    """Keep the samples of ``ds`` that fall inside the configured lon/lat box.

    Parameters
    ----------
    ds : object with ``ds["lon"]`` / ``ds["lat"]`` and a
        ``.where(cond, drop=...)`` method.
    config : object with ``lon_min``, ``lon_max``, ``lat_min``, ``lat_max``;
        the bounds are inclusive.

    Returns
    -------
    The result of ``ds.where(mask, drop=True)``.
    """
    lon_ok = (ds["lon"] >= config.lon_min) & (ds["lon"] <= config.lon_max)
    lat_ok = (ds["lat"] >= config.lat_min) & (ds["lat"] <= config.lat_max)

    return ds.where(lon_ok & lat_ok, drop=True)


def correct_longitude(ds, angle: str="360"):
    """Shift the ``lon`` entry of ``ds`` by 360 degrees.

    Parameters
    ----------
    ds : mapping-like object; ``ds["lon"]`` is overwritten, so the input is
        modified in place.
    angle : str
        * ``"180"`` -- subtract 360 from every longitude, unconditionally.
        * anything else (default ``"360"``) -- subtract 360 only where the
          longitude is >= 180; other values are left unchanged.

    Returns
    -------
    The same ``ds`` object.
    """
    lon = ds["lon"]

    if angle != "180":
        # wrap only the upper half of the circle
        ds["lon"] = xr.where(lon >= 180., lon - 360., lon)
    else:
        # blanket shift of every value
        ds["lon"] = lon - 360

    return ds

def rescale_spatial(ds, config):
    """Rescale ``lon`` and ``lat`` to ``(value - <axis>_min) / <axis>_delta``.

    Parameters
    ----------
    ds : mapping-like object; ``ds["lon"].values`` and ``ds["lat"].values``
        are read and both entries are overwritten, so the input is modified
        in place.
    config : object with ``lon_min``, ``lon_delta``, ``lat_min``, ``lat_delta``.

    Returns
    -------
    The same ``ds`` object, with rescaled coordinates.
    """
    # longitude first, then latitude
    for axis in ("lon", "lat"):
        origin = getattr(config, f"{axis}_min")
        step = getattr(config, f"{axis}_delta")
        ds[axis] = (ds[axis].values - origin) / step

    return ds

def coarsen_alongtrack(ds, config):
    """Aggregate ``ds`` over fixed-size windows along ``time``.

    Parameters
    ----------
    ds : object exposing ``.coarsen({"time": n}, boundary=...)`` whose result
        offers ``.mean()`` and ``.median()``.
    config : object with a dict-style ``.get``; optional keys:
        ``"boundary"`` (default ``"trim"``), ``"coarsen"`` (window length,
        default ``5``) and ``"summary"`` (default ``"mean"``).

    Returns
    -------
    The median of each window when ``summary == "median"``; the mean for any
    other value.
    """
    edge_policy = config.get("boundary", "trim")
    window = config.get("coarsen", 5)
    statistic = config.get("summary", "mean")

    windows = ds.coarsen({"time": window}, boundary=edge_policy)

    # anything that is not explicitly "median" falls back to the mean
    reducer = "median" if statistic == "median" else "mean"
    return getattr(windows, reducer)()

In [10]:
preprocess_config = get_preprocess_config()
# `config` is read as a module-level global by the `preprocess` callback defined
# further down; here it selects the spin-up spatial/temporal bounds.
config = preprocess_config.subset.spinup

In [13]:
# list the files found in `dataset_dir`
all_files = list_all_files(dataset_dir)

# keep only the files belonging to the "swot1nadir4" observation setup
# NOTE(review): presumably the "1 SWOT + 4 NADIR" scenario from the checklist above -- confirm
setup_files = get_swot_obs_setup_files(all_files, setup="swot1nadir4")

# names of the variables kept from every file (read as a global by `preprocess`)
variables = ["ssh_obs", "ssh_model", "lon", "lat"]

def preprocess(x):
    """Per-file pipeline: select variables, coarsen, then subset in time and space.

    Reads the module-level globals ``variables`` and ``config``.

    Parameters
    ----------
    x : dataset handed in by the loader for a single file.

    Returns
    -------
    The coarsened dataset restricted to the time window and lon/lat box of
    ``config``.
    """
    # keep only the variables of interest and coarsen them along the track
    coarse = coarsen_alongtrack(x[variables], config)

    # temporal subset
    in_period = subset_temporal(coarse, config)

    # wrap longitudes >= 180 to negative values; this has to come before the
    # spatial subset because the bounds in `config` are given as negative longitudes
    wrapped = correct_longitude(in_period)

    # spatial subset
    return subset_spatial(wrapped, config)

# load the selected files through `preprocess`, order the samples by time and
# evaluate the result
ds_swot = (
    load_alongtrack_parallel(setup_files, preprocess=preprocess)
    .sortby("time")
    .compute()
)

ds_swot

In [105]:
# Report the temporal and spatial extent of the subset, as a check against the
# bounds in `config`.
print(f"Time\nMin: {ds_swot.time.min().data}, Max: {ds_swot.time.max().data}")
print(f"Longitude\nMin: {ds_swot.lon.min().compute().data}, Max: {ds_swot.lon.max().compute().data}")
print(f"Latitude\nMin: {ds_swot.lat.min().compute().data}, Max: {ds_swot.lat.max().compute().data}")

Time
Min: 2012-10-01T00:45:50.676369000, Max: 2012-10-21T21:20:30.937600000
Longitude
Min: -65.0, Max: -55.0
Latitude
Min: 33.000037999999996, Max: 43.0
