# OSSE 2023a - Global Altimetry Dataset

In [None]:
import sys, os
from pyprojroot import here

# search upwards from the working directory for the project root,
# identified by the presence of a `.root` marker file
root = here(project_files=[".root"])


# append the root to sys.path so packages located there can be imported
sys.path.append(str(root))

In [None]:
import numpy as np
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import seaborn as sns

# restore the default plotting parameters, then switch to seaborn's "talk"
# context with the fonts scaled down to 70%
sns.reset_defaults()
sns.set_context(context="talk", font_scale=0.7)

import hvplot.xarray
import hvplot.pandas

from inr4ssh._src.operators.finite_diff import calculate_gradient, calculate_laplacian
from inr4ssh._src.preprocess.subset import temporal_subset, spatial_subset
from inr4ssh._src.preprocess.coords import (
    correct_coordinate_labels,
    correct_longitude_domain,
)
from inr4ssh._src.data.ssh_obs import load_ssh_altimetry_data_train

from inr4ssh._src.preprocess.coords import correct_coordinate_labels

# from inr4ssh._src.preprocess.
from inr4ssh._src.viz.movie import create_movie
from inr4ssh._src.metrics.psd import psd_isotropic
from inr4ssh._src.viz.psd.isotropic import plot_psd_isotropic
from inr4ssh._src.viz.obs import plot_obs_demo
from inr4ssh._src.metrics.psd import psd_spacetime, psd_spacetime_dask
from inr4ssh._src.viz.psd.spacetime import (
    plot_psd_spacetime_wavelength,
    plot_psd_spacetime_wavenumber,
)

%matplotlib inline
%load_ext autoreload
%autoreload 2

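## Download

The commands below re-arrange the downloaded files from a `<year>/<source>/` layout into a `<source>/<year>/` layout, and then remove the emptied year directories.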

```bash
mkdir independent_data
mkdir independent_data/altika
mkdir independent_data/altika/2015
mv 2015/independent_data/altika/dt_global_al* independent_data/altika/2015
mkdir independent_data/altika/2016
mv 2016/independent_data/altika/dt_global_al* independent_data/altika/2016
mkdir independent_data/altika/2017
mv 2017/independent_data/altika/dt_global_al* independent_data/altika/2017
mkdir independent_data/altika/2018
mv 2018/independent_data/altika/dt_global_al* independent_data/altika/2018
mkdir independent_data/altika/2019
mv 2019/independent_data/altika/dt_global_al* independent_data/altika/2019
```

```bash
mkdir independent_data/sentinel3b
mkdir independent_data/sentinel3b/2018
mv 2018/independent_data/sentinel3b/dt_global_s3b_phy_l3* independent_data/sentinel3b/2018
mkdir independent_data/sentinel3b/2019
mv 2019/independent_data/sentinel3b/dt_global_s3b_phy_l3* independent_data/sentinel3b/2019
```

```bash
mkdir grid
mkdir grid/2015
mv 2015/grid/dt_upd_global_merged_msla* grid/2015
mkdir grid/2016
mv 2016/grid/dt_upd_global_merged_msla* grid/2016
mkdir grid/2017
mv 2017/grid/dt_upd_global_merged_msla* grid/2017
mkdir grid/2018
mv 2018/grid/dt_upd_global_merged_msla* grid/2018
mkdir grid/2019
mv 2019/grid/dt_upd_global_merged_msla* grid/2019
```

```bash
rm -rf 2015 2016 2017 2018 2019
```

## Preprocess

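1. Collect the filenames for each altimeter and year.
2. Open the selected files and apply basic preprocessing (coordinate labels, longitude convention, time subset).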

In [None]:
from inr4ssh._src.data.dc23a import DC23aDataFiles
from pathlib import Path

In [None]:
# NOTE: machine-specific absolute path to the raw data — adjust for your setup
path_data = Path("/Volumes/EMANS_HDD/data/dc23a_ose/raw/data_emmanuel")

# IPython shell escape: list the directory as a quick check that it is reachable
!ls $path_data

In [None]:
# build the DC23a file helper on top of the raw data directory
data = DC23aDataFiles(path=path_data)

# print the altimeters returned for each subset name in turn; `altimeters`
# is left holding the result of the last query ("grid")
for subset in ("train", "evaluation", "all", "grid"):
    altimeters = data.get_altimeters(subset)
    print(altimeters)

In [None]:
# files for a single altimeter and a single year (altimeter="c2", year="2015")
files = data.files_from_str("c2", "2015")
print(len(files))

# files for a single altimeter, no year argument given (altimeter="c2")
files = data.files_from_str("c2")
print(len(files))

# files for several altimeters and years (altimeters=["c2", "j2"], years=["2015", "2016"])
files = data.files_from_list(["c2", "j2"], ["2015", "2016"])
print(len(files))

# another combination (altimeters=["altika", "c2"], years=["2015", "2017", "2018"])
files = data.files_from_list(["altika", "c2"], ["2015", "2017", "2018"])
print(len(files))

### Get Train/Valid/Test

In [None]:
# The expected counts below are hard-coded for this particular copy of the
# dataset; a failing assert means files are missing or were added.

# every file known to the helper
all_files = data.files_all()
print(len(all_files))
assert len(all_files) == 11_041

# training subset
# NOTE(review): presumably the altimeters that are NOT under `independent_data/` — confirm
files_train = data.train_files_all()
print(len(files_train))
assert len(files_train) == 6_988

# validation subset
# NOTE(review): presumably the held-out altimeters under `independent_data/` — confirm
files_valid = data.valid_files_all()
print(len(files_valid))
assert len(files_valid) == 2_227

# gridded product
files_grid = data.grid_files_all()
print(len(files_grid))
assert len(files_grid) == 1_826

# the three subset sizes must add up to the total
assert len(all_files) == len(files_train) + len(files_valid) + len(files_grid)

In [None]:
# NOTE(review): same value as the `path_data` assigned earlier in this notebook;
# this re-assignment looks redundant unless the section is meant to run on its own
path_data = Path("/Volumes/EMANS_HDD/data/dc23a_ose/raw/data_emmanuel")

#### Training Data

For training, we have observations covering the entire globe for the years
2015-2019 (5 years).

We use the following 9 altimeters:

* c2
* h2a
* h2ag
* h2b
* j2
* j2g
* j2n
* j3
* s3a

In [None]:
from inr4ssh._src.preprocess.spatial import convert_lon_360_180
from inr4ssh._src.preprocess.coords import correct_coordinate_labels

# files for a single altimeter and a single year (altimeter="c2", year="2015");
# this is the list that gets opened as one dataset below
files = data.files_from_str("c2", "2015")


def preprocess(x):
    """Prepare the dataset of a single file before the files are combined.

    Intended to be passed as the ``preprocess`` callback of
    ``xr.open_mfdataset``. Steps, in order:

    1. pass the dataset through ``correct_coordinate_labels``
       (presumably standardises the coordinate names — defined elsewhere);
    2. overwrite ``longitude`` with ``convert_lon_360_180`` applied to it
       (presumably remaps 0..360 to -180..180 — defined elsewhere);
    3. keep only the samples whose ``time`` label lies between
       2015-02-01 and 2015-03-01.

    Args:
        x: dataset opened from one file (assumed to be an ``xr.Dataset``
            with a ``time`` index).

    Returns:
        The relabelled, longitude-converted and time-subset dataset.
    """
    ds = correct_coordinate_labels(x)
    ds["longitude"] = convert_lon_360_180(ds["longitude"])

    # label-based selection on the time coordinate
    t_start = np.datetime64("2015-02-01")
    t_stop = np.datetime64("2015-03-01")
    return ds.sel(time=slice(t_start, t_stop))


# open all selected files as one dataset; `preprocess` is applied to each
# file's dataset before they are combined
ds_data = xr.open_mfdataset(files, preprocess=preprocess, engine="netcdf4")

In [None]:
# size reported by `nbytes`, converted from bytes using 1024 * 1024 bytes per unit
size_mb = ds_data.nbytes / (1024 * 1024)
print(f"nbytes: {size_mb:.2f} MB")

In [None]:
ds_data

In [None]:
# get specific file (altimeter=c2)
files = data.files_from_str("c2")
print(len(files))