# OSSE 2023a - Global Altimetry Dataset

In [None]:
import sys, os
from pyprojroot import here

# spider up the directory tree to find the project root (identified by a ".root" file)
root = here(project_files=[".root"])


# append the project root to the import path so the `inr4ssh` package can be imported
sys.path.append(str(root))

In [None]:
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import seaborn as sns

sns.reset_defaults()
sns.set_context(context="talk", font_scale=0.7)

import hvplot.xarray
import hvplot.pandas

from inr4ssh._src.operators.finite_diff import calculate_gradient, calculate_laplacian
from inr4ssh._src.preprocess.subset import temporal_subset, spatial_subset
from inr4ssh._src.preprocess.coords import (
    correct_coordinate_labels,
    correct_longitude_domain,
)
from inr4ssh._src.data.ssh_obs import load_ssh_altimetry_data_train

from inr4ssh._src.preprocess.coords import correct_coordinate_labels

# from inr4ssh._src.preprocess.
from inr4ssh._src.viz.movie import create_movie
from inr4ssh._src.metrics.psd import psd_isotropic
from inr4ssh._src.viz.psd.isotropic import plot_psd_isotropic
from inr4ssh._src.viz.obs import plot_obs_demo
from inr4ssh._src.metrics.psd import psd_spacetime, psd_spacetime_dask
from inr4ssh._src.viz.psd.spacetime import (
    plot_psd_spacetime_wavelength,
    plot_psd_spacetime_wavenumber,
)

%matplotlib inline
%load_ext autoreload
%autoreload 2

## Download

```bash
mkdir independent_data
mkdir independent_data/altika
mkdir independent_data/altika/2015
mv 2015/independent_data/altika/dt_global_al* independent_data/altika/2015
mkdir independent_data/altika/2016
mv 2016/independent_data/altika/dt_global_al* independent_data/altika/2016
mkdir independent_data/altika/2017
mv 2017/independent_data/altika/dt_global_al* independent_data/altika/2017
mkdir independent_data/altika/2018
mv 2018/independent_data/altika/dt_global_al* independent_data/altika/2018
mkdir independent_data/altika/2019
mv 2019/independent_data/altika/dt_global_al* independent_data/altika/2019
```

```bash
mkdir independent_data/sentinel3b
mkdir independent_data/sentinel3b/2018
mv 2018/independent_data/sentinel3b/dt_global_s3b_phy_l3* independent_data/sentinel3b/2018
mkdir independent_data/sentinel3b/2019
mv 2019/independent_data/sentinel3b/dt_global_s3b_phy_l3* independent_data/sentinel3b/2019
```

```bash
mkdir grid
mkdir grid/2015
mv 2015/grid/dt_upd_global_merged_msla* grid/2015
mkdir grid/2016
mv 2016/grid/dt_upd_global_merged_msla* grid/2016
mkdir grid/2017
mv 2017/grid/dt_upd_global_merged_msla* grid/2017
mkdir grid/2018
mv 2018/grid/dt_upd_global_merged_msla* grid/2018
mkdir grid/2019
mv 2019/grid/dt_upd_global_merged_msla* grid/2019
```

```bash
rm -rf 2015 2016 2017 2018 2019
```

## Preprocess

1. We will get filenames!
2.

In [None]:
from ml_collections import config_dict

# top-level configuration container for this notebook
config = config_dict.ConfigDict()

# altimetry configuration
# (chained assignment: `config.altimeters` and the short name `altimeters` are both
# assigned the same freshly created ConfigDict)
config.altimeters = altimeters = config_dict.ConfigDict()
# altimeters listed as "dependent"
altimeters.dependent = ["c2", "h2a", "h2ag", "h2b", "j2", "j2g", "j2n", "j3", "s3a"]

# altimeters listed as "independent"
altimeters.independent = [
    "sentinel3b",
    "altika",
]

# years covered, stored as strings
config.altimeters.years = ["2015", "2016", "2017", "2018", "2019"]

In [None]:
from pathlib import Path
from typing import List, Optional
from dataclasses import dataclass
from inr4ssh._src.files import list_all_files, get_subset_elements


@dataclass
class DC23aData:
    """Catalogue of the data files found under ``path``.

    Only ``path`` is a dataclass field (constructor argument). ``altimeters`` and
    ``years`` carry no annotation, so they remain plain class attributes shared by
    every instance.

    Attributes:
        path: Root directory handed to ``list_all_files``.
        altimeters: ``ConfigDict`` holding the ``dependent`` and ``independent``
            altimeter names.
        years: Years covered, as strings.
    """

    path: str
    # altimeter names grouped by role (class attribute, not a dataclass field)
    altimeters = config_dict.ConfigDict()
    altimeters.dependent = ["c2", "h2a", "h2ag", "h2b", "j2", "j2g", "j2n", "j3", "s3a"]
    altimeters.independent = [
        "sentinel3b",
        "altika",
    ]
    # years covered, as strings (class attribute, not a dataclass field)
    years = ["2015", "2016", "2017", "2018", "2019"]

    def get_altimeters(self, stage: str = "train"):
        """Return the altimeter names for ``stage``.

        Args:
            stage: ``"train"`` (dependent altimeters), ``"evaluation"``
                (independent altimeters) or ``"all"`` (dependent followed by
                independent).

        Returns:
            The list of altimeter names for the requested stage.

        Raises:
            ValueError: If ``stage`` is not one of the values above.
        """
        if stage == "train":
            return self.altimeters.dependent
        elif stage == "evaluation":
            return self.altimeters.independent
        elif stage == "all":
            return self.altimeters.dependent + self.altimeters.independent
        else:
            raise ValueError(f"Unrecognized stage: {stage}")

    def files_all(self):
        """Return every file that ``list_all_files`` finds under ``self.path``."""
        # Use the instance's own root rather than the notebook-global ``path_data``,
        # which is rebound to a different directory further down the notebook.
        return list_all_files(self.path)

    def files_from_str(self, altimeter: str, year: Optional[str] = None):
        """Return the files whose path contains ``altimeter`` (and ``year``, if given).

        Args:
            altimeter: String that must occur in the file path.
            year: Optional second string that must also occur in the file path.

        Returns:
            The filtered list of files.
        """
        files = self.files_all()
        # progress output: number of candidate files before filtering
        print(len(files))
        # TODO: ext=f"{altimeter}/{year}/**/*"
        files = get_subset_files_str(files, altimeter)
        # progress output: number of files left after the altimeter filter
        print(len(files))
        if year is not None:
            files = get_subset_files_str(files, year)
        return files

    def files_from_list(self, altimeters: List[str], years: Optional[List[str]] = None):
        """Return the files matching any of ``altimeters`` (and any of ``years``, if given).

        Args:
            altimeters: Strings of which at least one must occur in the file path.
            years: Optional strings of which at least one must also occur in the
                file path.

        Returns:
            The filtered list of files.
        """
        files = self.files_all()
        # progress output: number of candidate files before filtering
        print(len(files))
        files = get_subset_files_list(files, altimeters)
        # progress output: number of files left after the altimeter filter
        print(len(files))
        if years is not None:
            files = get_subset_files_list(files, years)
        return files

    def train_files_all(self):
        """Return the files of the training ("dependent") altimeters."""
        return self.files_from_list(altimeters=self.get_altimeters("train"))

    def grid_files_all(self):
        """Return the files whose path contains ``"grid"``."""
        return self.files_from_str(altimeter="grid")

    def valid_files_all(self):
        """Return the files of the evaluation ("independent") altimeters."""
        return self.files_from_list(altimeters=self.get_altimeters("evaluation"))


def get_subset_files_str(files_list, element: str = "c2"):
    """Keep the entries of ``files_list`` whose string form contains ``element``.

    The check is a plain substring test on ``str(entry)``, so ``element`` may match
    anywhere in the path (for example ``"j2"`` also matches a path containing
    ``"j2g"``).

    Args:
        files_list: Iterable of paths (anything convertible with ``str``).
        element: Substring to look for.

    Returns:
        A new list with the matching entries, in their original order.
    """
    assert isinstance(element, str)

    return [candidate for candidate in files_list if element in str(candidate)]


def get_subset_files_list(files_list, elements: List[str]):
    """Collect the entries of ``files_list`` that match any of ``elements``.

    Matching is delegated to ``get_subset_files_str`` (substring test on the path).
    Results are grouped by element, in the order the elements are given. Because the
    test is a substring test, one entry can match several elements (e.g. a path
    containing ``"j2g"`` also contains ``"j2"``); such an entry is returned only
    once, at the position of its first match.

    Args:
        files_list: Iterable of paths; entries must be hashable (``str`` and
            ``pathlib.Path`` are).
        elements: Substrings of which at least one must occur in the path.

    Returns:
        A list of the matching entries without repeats.
    """
    assert isinstance(elements, list)

    matches = list()
    for ielement in elements:
        matches += get_subset_files_str(files_list, ielement)

    # dict keys keep first-seen order, so this drops repeats without reordering
    return list(dict.fromkeys(matches))

In [None]:
# root directory of the raw data (hard-coded absolute path; adjust for your machine)
path_data = Path("/Volumes/EMANS_HDD/data/dc23a_ose/raw/data_emmanuel")

# IPython shell escape: list the contents of the data directory
!ls $path_data

In [None]:
# init class with the data root directory
data = DC23aData(path=path_data)

# get all files
all_files = data.files_all()
print(len(all_files))

# sanity check: the total number of files is expected to be 11,041
assert 11_041 == len(all_files)

# get altimeters (valid stages: "train", "evaluation", "all")
# NOTE: this rebinds the notebook-level name `altimeters`, which the config cell
# earlier bound to a ConfigDict, to a plain list of names
altimeters = data.get_altimeters("train")
print(altimeters)

In [None]:
# get the training files (the "dependent" altimeters)
files_train = data.train_files_all()
print(len(files_train))

# show the altimeter names used for training
data.get_altimeters("train")

In [None]:
# print every training file path, one per line
for i in files_train:
    print(i)

In [None]:
# get the training files (the "dependent" altimeters)
files_train = data.train_files_all()
print(len(files_train))
# sanity check: the number of training files is expected to be 6,988
assert 6_988 == len(files_train)

In [None]:
# get the training files (the "dependent" altimeters)
files_train = data.train_files_all()
print(len(files_train))
# sanity check: the number of training files is expected to be 6,988
assert 6_988 == len(files_train)

# evaluation ("independent") files -- currently disabled
# files_test = data.valid_files_all()
# print(len(files_test))
# assert 2_227 == len(files_test)


# get the gridded files (paths containing "grid")
files_grid = data.grid_files_all()
print(len(files_grid))
# sanity check: the number of gridded files is expected to be 1,826
assert 1_826 == len(files_grid)
#
# disabled check that the three subsets together account for every file
# assert len(files_grid) + len(files_train) + len(files_test) == len(all_files)

In [None]:
# `files_test` is required by the summary below, but the earlier cell that would
# define it has the assignment commented out, so compute it here.
files_test = data.valid_files_all()

# summary of file counts: (grid, train, test, grid + train + test, all files)
(
    len(files_grid),
    len(files_train),
    len(files_test),
    len(files_grid) + len(files_train) + len(files_test),
    len(all_files),
)

In [None]:
# get all files of a single altimeter (altimeter="sentinel3b", no year filter)
files = data.files_from_str("sentinel3b")
print(len(files))

# get files for a list of altimeters and years (altimeters=["c2", "j2"], years=["2015", "2016"])
files = data.files_from_list(["c2", "j2"], ["2015", "2016"])
print(len(files))

In [None]:
all_files

### Get Train/Valid/Test

In [None]:
!ls /Volumes/EMANS_HDD/data/dc23a_ose/raw/data_emmanuel

In [None]:
from pathlib import Path

In [None]:
# root directory of the raw data (same path as assigned in the earlier cell)
path_data = Path("/Volumes/EMANS_HDD/data/dc23a_ose/raw/data_emmanuel")

#### Training Data

For training, we have observations covering the entire globe for the years
2015-2019 (5 years).

We use the following 9 altimeters:

* c2
* h2a
* h2ag
* h2b
* j2
* j2g
* j2n
* j3
* s3a

In [None]:
# directory of the training data: the "NON_independent_data" subfolder of the data root
path_training = path_data.joinpath("NON_independent_data")

In [None]:
!ls $path_training

In [None]:
# NOTE: rebinds the notebook-global `path_data` to the "independent_data" subfolder
path_data = Path("/Volumes/EMANS_HDD/data/dc23a_ose/raw/data_emmanuel/independent_data")
# number of files found under that folder
len(list_all_files(path_data))

In [None]:
from typing import List
from tqdm.notebook import tqdm
import itertools


def get_altimetry_files_query(altimeter: str, year: str):
    """List the training files of one altimeter for one year.

    The search is run with ``list_all_files`` on the notebook-global
    ``path_training`` using the pattern ``"<altimeter>/<year>/**/*"``.

    Args:
        altimeter: Altimeter name used as the first path component of the pattern.
        year: Year used as the second path component of the pattern.

    Returns:
        Whatever ``list_all_files`` returns for that pattern.
    """
    pattern = f"{altimeter}/{year}/**/*"
    return list_all_files(path_training, ext=pattern)


def get_altimetry_files_dict(
    altimeters: List[str] = ["c2"], years: List[str] = ["2017"]
):
    """Build a nested ``{altimeter: {year: files}}`` mapping of training files.

    Every altimeter gets an entry, even when nothing is found for it; a year is
    only recorded when its query returns at least one file.

    Args:
        altimeters: Altimeter names to query.
        years: Years to query for each altimeter.

    Returns:
        Dictionary keyed by altimeter, each value a dictionary keyed by year.
    """
    catalogue = {}
    for name in altimeters:
        per_year = {}
        catalogue[name] = per_year
        for yr in years:
            found = get_altimetry_files_query(altimeter=name, year=yr)
            # skip years without any file for this altimeter
            if len(found) == 0:
                continue
            per_year[yr] = found

    return catalogue


def get_altimetry_files_list(
    altimeters: List[str] = ["c2"], years: List[str] = ["2017"]
):
    """Return one flat list with the training files of every (altimeter, year) pair.

    The pairs are the Cartesian product of ``altimeters`` and ``years``; a progress
    bar is shown while the queries run.

    Args:
        altimeters: Altimeter names to query.
        years: Years to query for each altimeter.

    Returns:
        The concatenated query results, in product order.
    """
    # materialise the product so the progress bar knows the total number of queries
    queries = list(itertools.product(altimeters, years))

    collected = []
    for altimeter, year in tqdm(queries):
        collected.extend(get_altimetry_files_query(altimeter=altimeter, year=year))

    return collected

In [None]:
# query a single (altimeter, year) pair: the first dependent altimeter and the first year
files = get_altimetry_files_query(
    config.altimeters.dependent[0], config.altimeters.years[0]
)
# number of files found
len(files)

In [None]:
# flat list of training files for every dependent altimeter and every configured year
files = get_altimetry_files_list(config.altimeters.dependent, config.altimeters.years)
# number of files found
len(files)

In [None]:
# example subset of altimeter names (not referenced by the other cells shown in this notebook)
subset = ["c2"]


def get_subset_files_str(files_list, element: str = "c2"):
    """Keep the entries of ``files_list`` whose string form contains ``element``.

    The check is a plain substring test on ``str(entry)``, so ``element`` may match
    anywhere in the path (for example ``"j2"`` also matches a path containing
    ``"j2g"``).

    Args:
        files_list: Iterable of paths (anything convertible with ``str``).
        element: Substring to look for.

    Returns:
        A new list with the matching entries, in their original order.
    """
    assert isinstance(element, str)

    return [candidate for candidate in files_list if element in str(candidate)]


def get_subset_files_list(files_list, elements: List[str]):
    """Collect the entries of ``files_list`` that match any of ``elements``.

    Matching is delegated to ``get_subset_files_str`` (substring test on the path).
    Results are grouped by element, in the order the elements are given. Because the
    test is a substring test, one entry can match several elements (e.g. a path
    containing ``"j2g"`` also contains ``"j2"``); such an entry is returned only
    once, at the position of its first match.

    Args:
        files_list: Iterable of paths; entries must be hashable (``str`` and
            ``pathlib.Path`` are).
        elements: Substrings of which at least one must occur in the path.

    Returns:
        A list of the matching entries without repeats.
    """
    assert isinstance(elements, list)

    matches = list()
    for ielement in elements:
        matches += get_subset_files_str(files_list, ielement)

    # dict keys keep first-seen order, so this drops repeats without reordering
    return list(dict.fromkeys(matches))

In [None]:
# chained string filters: first keep paths containing "c2", then, within those, "2015"
c2_files = get_subset_files_str(files, "c2")
c2_2015_files = get_subset_files_str(c2_files, "2015")
# number of files after each filter
len(c2_files), len(c2_2015_files),

In [None]:
# same chained filters as the string version, using the list-based helper
c2_files = get_subset_files_list(files, ["c2"])
c2_2015_files = get_subset_files_list(c2_files, ["2015"])
# number of files after each filter
len(c2_files), len(c2_2015_files),

In [None]:
# both filters are applied to the full `files` list independently
# (here the "2015" filter is NOT restricted to the "c2" subset)
c2_files_ = get_subset_files_list(files, ["c2"])
c2_2015_files_ = get_subset_files_list(files, ["2015"])

In [None]:
# `files` is a flat list at this point and cannot be indexed by altimeter name, so
# build the nested {altimeter: {year: files}} mapping explicitly and inspect that.
files_by_altimeter = get_altimetry_files_dict(
    config.altimeters.dependent, config.altimeters.years
)
# years for which "h2a" files were found
files_by_altimeter["h2a"].keys()

In [None]:
# `altimeter` and `year` only exist as function parameters above, never as notebook
# variables, so bind them explicitly: the first dependent altimeter and the first
# configured year (the same pair as the single-query example cell).
altimeter = config.altimeters.dependent[0]
year = config.altimeters.years[0]
training_files = list_all_files(path_training, ext=f"{altimeter}/{year}/**/*")

In [None]:
# open the listed training files as a single multi-file xarray dataset
ds_obs = xr.open_mfdataset(training_files)
# display the dataset summary
ds_obs

In [None]:
# get all files in subdirectories
training_files = list_all_files(path_training, ext="c2/*")

In [None]:
# get subset elements (c2)
files_c2 = get_subset_elements(["c2"], training_files)

len(files_c2)

In [None]:
len(training_files)

In [None]:
files_c2