In [1]:
import sys, os
from pyprojroot import here

# spider up the directory tree to find the project root
# (identified by a ".root" marker file)
root = here(project_files=[".root"])

# append the project root to the import path
# (presumably so the `alti_tools` imports below resolve — TODO confirm)
sys.path.append(str(root))

In [3]:
import math
from pathlib import Path

import numpy as np
import matplotlib
import matplotlib.pyplot as plt

# Altitools
from alti_tools._src.data.ssh import download_ssh_toy
from alti_tools._src.transforms import spatial, temporal
from alti_tools._src.viz import psd as psd_plots
from alti_tools._src.preprocess.swot import preprocess_karin_swot
from alti_tools._src.utils.tracking import get_current_timestamp
from alti_tools._src.utils.files import list_all_files, check_list_equal_elem
from alti_tools._src.utils.files import check_if_directory, check_if_file
from alti_tools._src.data.configs.altimetry import get_raw_altimetry_config, get_raw_altimetry_files
from alti_tools._src.data.io import load_alongtrack_parallel

import seaborn as sns

# seaborn plotting context: "talk" preset with the font scale set to 0.7
sns.set_context(context="talk", font_scale=0.7)


import xarray as xr
import pandas as pd
from scipy import ndimage
import holoviews as hv
import xrft
from dask.diagnostics import ProgressBar

import tqdm

# autoreload (mode 2): re-import edited modules before executing each cell
%load_ext autoreload
%autoreload 2
# draw matplotlib figures inline, using the 'retina' figure format
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

* [X] Load data for Training scenarios
  * [X] 1 NADIR
  * [X] 4 NADIR
  * [X] 1 SWOT
  * [X] 1 SWOT + 4 NADIR
* Subset Region + Time
   1. Training
   2. Evaluation
   3. Spin-Up

## Data

### Download Data



In [21]:
# TODO: add dataset download

### Load Data

In [4]:
# NOTE(review): machine-specific absolute path — point this at your local copy
# of the preprocessed OSSE files before running the notebook.
dataset_dir = "/Volumes/EMANS_HDD/data/dc20a_osse/test/preprocess/osse_2020a_natl60/"
# shell escape: list the contents of the dataset directory
!ls $dataset_dir


2020a_SSH_mapping_NATL60_envisat.nc
2020a_SSH_mapping_NATL60_geosat2.nc
2020a_SSH_mapping_NATL60_jason1.nc
2020a_SSH_mapping_NATL60_karin_swot.nc
2020a_SSH_mapping_NATL60_nadir_swot.nc
2020a_SSH_mapping_NATL60_topex-poseidon_interleaved.nc


#### Load Individual Files

In [5]:
from alti_tools._src.utils.files import list_all_files, list_of_files_to_dict
from alti_tools._src.data.natl60.osse import get_swot_obs_setup_files

# get all files in directory
all_files = list_all_files(dataset_dir)

# get files required for the observations setup
# (here the "nadir1" setup; `setup_files` is opened file-by-file in the next cell)
setup_files = get_swot_obs_setup_files(all_files, setup="nadir1")

In [6]:
# open each file of the selected setup as its own xarray dataset
ds_files = [xr.open_dataset(ifile) for ifile in setup_files]

#### Loading Aggregated Data

In [14]:
from ml_collections import config_dict

def get_preprocess_config():
    """Build the configuration used by the preprocessing cells.

    Returns:
        A `ConfigDict` with three top-level groups:

        * ``data``: ``setup``, the name of the observation setup to load.
        * ``coarsen``: temporal coarsening options
          (``coarsen``, ``boundary``, ``time_steps``, ``summary``).
        * ``subset``: one group each for ``spinup``, ``train`` and
          ``evaluation``, each holding longitude/latitude bounds
          (``lon_min``, ``lon_max``, ``lat_min``, ``lat_max``) and temporal
          bounds (``time_min``, ``time_max``). ``train`` additionally carries
          a boolean ``spinup`` flag.
    """

    # initialize config dictionary
    config = config_dict.ConfigDict()

    ##########################
    # Observation Setup
    ##########################
    config.data = data = config_dict.ConfigDict()
    # Store the setup on the config group. Assigning the string to the bare
    # name `data` would only rebind the local and leave `config.data` empty.
    # Setup names used in this notebook: "nadir1", "swot1nadir5".
    data.setup = "swot1nadir5"

    ##########################
    # Temporal Coarsen
    ##########################
    config.coarsen = coarsen = config_dict.ConfigDict()
    coarsen.coarsen = True
    coarsen.boundary = "trim"
    coarsen.time_steps = 5
    coarsen.summary = "mean"

    ##########################
    # Spatial Temporal Subset
    ##########################
    config.subset = subset = config_dict.ConfigDict()

    # =======================
    # SPIN-UP REGION
    # =======================
    # spin-up scenario
    config.subset.spinup = spinup = config_dict.ConfigDict()
    # longitude bounds
    spinup.lon_min = -65.0
    spinup.lon_max = -55.0
    # latitude bounds
    spinup.lat_min = 33.0
    spinup.lat_max = 43.0
    # temporal bounds
    spinup.time_min = "2012-10-01"   # we can start in the beginning of the nature run :)
    spinup.time_max = "2012-10-22"

    # =======================
    # TRAINING REGION
    # =======================
    # training scenario
    config.subset.train = train = config_dict.ConfigDict()
    train.spinup = True             # option to include spin in training
    # longitude bounds
    train.lon_min = -65.0
    train.lon_max = -55.0
    # latitude bounds
    train.lat_min = 33.0
    train.lat_max = 43.0
    # temporal bounds
    train.time_min = "2013-01-02"
    train.time_max = "2013-09-30"

    # =======================
    # TEST REGION
    # =======================
    # evaluation scenario
    config.subset.evaluation = evaluation = config_dict.ConfigDict()
    # longitude bounds
    evaluation.lon_min = -65.0
    evaluation.lon_max = -55.0
    # latitude bounds
    evaluation.lat_min = 33.0
    evaluation.lat_max = 43.0
    # temporal bounds (starts where the spin-up period ends)
    evaluation.time_min = "2012-10-22"
    evaluation.time_max = "2012-12-02"

    return config

In [8]:
from alti_tools._src.preprocess.spatial import (
    rescale_spatial, correct_longitude, subset_spatial
)
from alti_tools._src.preprocess.temporal import subset_temporal, rescale_temporal
from alti_tools._src.preprocess.alongtrack import coarsen_alongtrack

In [60]:
# check code
# ??rescale_spatial

In [15]:
# build the preprocessing configuration
preprocess_config = get_preprocess_config()
# preprocess_config.subset.evaluation.get("lon_min", 100)
# use the spin-up region/period as the active subset for the cells below
subset_config = preprocess_config.subset.spinup
# temporal coarsening options
coarsen_config = preprocess_config.coarsen

In [16]:
# display the active spatial/temporal subset bounds
subset_config

lat_max: 43.0
lat_min: 33.0
lon_max: -55.0
lon_min: -65.0
time_max: '2012-10-22'
time_min: '2012-10-01'

In [17]:
# display the temporal coarsening options
coarsen_config

boundary: trim
coarsen: true
summary: mean
time_steps: 5

In [18]:
from loguru import logger
import time

In [19]:
logger.info("Starting preprocessing script...")
# wall-clock start; read again by the "Time Taken" log at the end of this cell
t0 = time.time()

# get all files in directory
logger.info("Getting files in directory...")
all_files = list_all_files(dataset_dir)

# get files required for the observations setup
# NOTE(review): the setup name is hard-coded here rather than read from the
# preprocessing config — keep the two in sync.
setup = "swot1nadir5"
logger.info(f"Loading files setup: '{setup}'...")
setup_files = get_swot_obs_setup_files(all_files, setup=setup)

# choose the variables we want to open
# (read as a global by the `preprocess` function defined below)
variables = ["ssh_obs", "ssh_model", "lon", "lat"]

def preprocess(x):
    """Per-file preprocessing passed to `load_alongtrack_parallel`.

    Reads the notebook globals `variables`, `coarsen_config` and
    `subset_config`.

    Args:
        x: the dataset for a single file (presumably an `xarray.Dataset` —
            TODO confirm against `load_alongtrack_parallel`).

    Returns:
        The dataset restricted to `variables`, after temporal coarsening,
        temporal subsetting, longitude correction and spatial subsetting,
        applied in that order.
    """
    # subset variables
    x = x[variables]

    # coarsen the data (temporally): this step takes the coarsening options,
    # not the region/period bounds held in `subset_config`
    x = coarsen_alongtrack(x, coarsen_config)

    # subset temporal period
    x = subset_temporal(x, subset_config)

    # correct longitude dimensions
    x = correct_longitude(x)

    # subset spatial region
    x = subset_spatial(x, subset_config)

    return x

# load every setup file, passing `preprocess` as the per-file preprocessing function
logger.info("Loading preprocessing script...")
ds_swot = load_alongtrack_parallel(setup_files, preprocess=preprocess)

# sort by time
# (`.compute()` presumably loads the lazily evaluated result into memory — TODO confirm)
logger.info("Sorting by time...")
ds_swot = ds_swot.sortby("time").compute()

logger.info("Done!")
# elapsed wall-clock time since `t0` was taken at the top of the cell
logger.debug(f"Time Taken: {time.time()-t0:.2f} secs")

# display the resulting dataset
ds_swot

2022-11-07 14:29:00.803 | INFO     | __main__:<cell line: 1>:1 - Starting preprocessing script...
2022-11-07 14:29:00.803 | INFO     | __main__:<cell line: 5>:5 - Getting files in directory...
2022-11-07 14:29:00.807 | INFO     | __main__:<cell line: 10>:10 - Loading files setup: 'swot1nadir5'...
2022-11-07 14:29:00.808 | INFO     | __main__:<cell line: 34>:34 - Loading preprocessing script...
HDF5-DIAG: Error detected in HDF5 (1.12.2) thread 1:
  #000: H5A.c line 528 in H5Aopen_by_name(): can't open attribute
    major: Attribute
    minor: Can't open object
  #001: H5VLcallback.c line 1091 in H5VL_attr_open(): attribute open failed
    major: Virtual Object Layer
    minor: Can't open object
  #002: H5VLcallback.c line 1058 in H5VL__attr_open(): attribute open failed
    major: Virtual Object Layer
    minor: Can't open object
  #003: H5VLnative_attr.c line 130 in H5VL__native_attr_open(): can't open attribute
    major: Attribute
    minor: Can't open object
  #004: H5Aint.c line 54

In [21]:
# sanity check: report the (min, max) of each coordinate after subsetting
coord_ranges = {
    "Time": (
        ds_swot.time.min().data,
        ds_swot.time.max().data,
    ),
    "Longitude": (
        ds_swot.lon.min().compute().data,
        ds_swot.lon.max().compute().data,
    ),
    "Latitude": (
        ds_swot.lat.min().compute().data,
        ds_swot.lat.max().compute().data,
    ),
}
for coord_name, (coord_min, coord_max) in coord_ranges.items():
    print(f"{coord_name}\nMin: {coord_min}\nMax: {coord_max}")

Time
Min: 2012-10-01T00:45:52.555031200
Max: 2012-10-21T21:20:27.971291200
Longitude
Min: -64.98904320000003
Max: -55.00801919999998
Latitude
Min: 33.00391
Max: 42.994517599999995
