In [None]:
import importlib
import os
import warnings

from IPython.display import Markdown
from scipy.stats import zscore
import holoviews as hv
import hvplot.xarray
import numpy as np
import pandas as pd
import xarray as xr

from re_nobm_pcc import preprocess
from re_nobm_pcc import kit

# Silence FutureWarning messages so they do not clutter cell output.
warnings.filterwarnings(action='ignore', category=FutureWarning)
# Render holoviews/hvplot output with the bokeh backend, without the logo banner.
hv.extension('bokeh', logo=False)

In [None]:
# Re-import the project modules so edits to them are picked up without
# restarting the kernel, then rebind the names taken from them.
preprocess = importlib.reload(preprocess)
kit = importlib.reload(kit)
HyperLwn, PhytoChl = preprocess.HyperLwn, preprocess.PhytoChl

# Load the sample and give `pxl` an explicit 0..n-1 coordinate.
sample = xr.open_dataset(kit.DATA_DIR / 'sample.nc')
sample['pxl'] = range(sample.sizes['pxl'])

# Stack the per-taxon variables into one (pxl, component, ...) array.
taxa_stack = sample[kit.TAXA].to_array(dim='component')
sample['labels'] = taxa_stack.transpose('pxl', 'component', ...)

# Mean-centre over pixels only; no division by sample.std('pxl') is applied.
sample_n = sample - sample.mean('pxl')

# Datasets

The features and labels are both model output from NASA GMAO using the [NOBM and OASIM](https://gmao.gsfc.nasa.gov/gmaoftp/NOBM) models. The labels are four phytoplankton chlorophyll densities output by NOBM. The features are normalized water-leaving radiances output by OASIM, which uses the NOBM model output as input.

## Features

One NetCDF file contains all the predictor data. Note that the `FillValue` attribute is not set to `9.99e11` in the netCDF file (Cecile will fix in next version). There are no explicit coordinates given; they are documented as attributes.

In [None]:
# Print only the header (-h) of the features file; the path is built from the PWD environment variable.
!ncdump -h {os.environ['PWD']}/data/nobm/HyperLwn.R2014.nc4

In [None]:
# Count the non-null cells in the first wavelength/month slice only; the text
# below presumes every other slice shares the same mask.
first_slice = HyperLwn.isel(wavelength=0, month=0)
nonnull_grid = int(first_slice.notnull().sum())
Markdown(f"""
Variable `HyperLwn` has non-null values at {nonnull_grid:,} pixels for each month
and wavelength.

In total, that gives {nonnull_grid * HyperLwn.sizes['month']:,} samples (that are highly non-independent!).
""")

In [None]:
# Total number of non-null values across the whole array.
nonnull = int(HyperLwn.notnull().sum())
Markdown(f"""
Augmented with coordinates, variable `HyperLwn` is a xarray.DataArray with {nonnull:,} values.
""")

In [None]:
# Display the repr of the features array.
HyperLwn

## Labels

Each of twelve NetCDF files contains a month of NOBM model output. The first is representative. Unlike the HyperLwn file, this one contains coordinates.

In [None]:
# Print only the header (-h) of the first monthly labels file; the path is built from the PWD environment variable.
!ncdump -h {os.environ['PWD']}/data/nobm/monthly/mon200701.R2014.nc4

The `PhytoChl` xarray.Dataset includes the different phytoplankton groups as variables.

In [None]:
# Display the repr of the labels dataset.
PhytoChl

# Plot your Data

## Features

The radiances currently make a nice map, but the data should be more sparsely sampled.

In [None]:
# Select the months/wavelengths nearest the requested values, then draw one
# rasterized map per (month, wavelength) pair with independent axes.
lwn_subset = HyperLwn.sel(
    month=[2, 6, 10],
    wavelength=[465, 665],
    method='nearest',
)
dmap = lwn_subset.hvplot.image(
    groupby=['month', 'wavelength'],
    subplots=True,
    clabel='Lwn (mW cm-2 microm-1 sr-1)',
    rasterize=True,
).opts(shared_axes=False)
dmap

A few "typical" hyperspectral radiances.

In [None]:
# Spectra at the grid cell nearest lon=-120, lat=-15, one line per month.
nearest_point = {'lon': -120, 'lat': -15, 'month': [2, 6, 10]}
point_spectra = HyperLwn.sel(nearest_point, method='nearest')
dmap = point_spectra.hvplot.line(by='month', ylabel='Lwn')
dmap

Mean centered radiances and corresponding phytoplankton abundances.

In [None]:
pxl = [4, 34, 53, 283]
grays = ['#000000', '#444444', '#777777', '#aaaaaa']
pigments = ['#47AC5F', '#FBEC2C', '#F884AB', '#E93429']

# Upper panel: one gray curve per selected pixel.
line = (
    sample['features']
    .sel(pxl=pxl)
    .hvplot.line(x='wavelength', by='pxl', ylabel='Lwn', legend=True)
    .options('Curve', fontscale=1.4, color=hv.Cycle(grays))
    .options('NdOverlay', legend_position='top_right')
)

# Lower panel: the label components of the same pixels as bars.
bars = (
    sample['labels']
    .reset_coords(drop=True)
    .isel(pxl=pxl)
    .hvplot.bar(by='component')
    .options('Bars', fontscale=1.4, color=hv.Cycle(pigments))
)

# Stack the two panels in a single column.
(line + bars).cols(1)

SVD to reduce the wavelength dimension to `k` vectors accounting for the most variation in the features. The singular values are:

In [None]:
# Number of leading components to keep.
k = 5
# kit.svd is a project helper; presumably it returns per-pixel scores, the
# singular values and the component vectors over `wavelength` — confirm in kit.
scores, s, vectors = kit.svd(sample_n['features'], dim='wavelength', k=k)
# Show the singular values rounded to six decimals.
list(s.round(6))

The corresponding vectors:

In [None]:
# One curve per principal component (`pc`) across wavelength.
vectors.hvplot.line(x='wavelength', by='pc')

A matrix of univariate (diagonal) and bivariate (off-diagonal) histograms of the `scores`, i.e. the coefficients that reconstruct each pixel's spectrum as a linear combination of the `vectors` above.

In [None]:
# One column per principal component, then a hexbin scatter matrix of them.
feature_score_table = scores.to_dataset(dim='pc').to_dataframe()
hvplot.scatter_matrix(
    feature_score_table,
    chart='hexbin',
    gridsize=16,
).opts(hv.opts.HexTiles(cmap='Viridis', tools=['hover']))

## Labels

Maps of the phytoplankton labels in `PhytoChl` at four months (2, 5, 8 and 11).

In [None]:
# Pick four months and draw one rasterized map per month for the taxa variables.
selected_months = PhytoChl.sel(month=[2, 5, 8, 11])
selected_months.hvplot.image(
    z=kit.TAXA,
    groupby=['month'],
    subplots=True,
    clabel='chl-a',
    rasterize=True,
)

The distribution of the four phytoplankton groups.

In [None]:
# Store the output of kit.ecdf on the same dims as the labels; it is plotted
# below as the ECDF "probability" (see kit.ecdf for the exact definition).
sample['labels_p'] = (sample['labels'].dims, kit.ecdf(sample['labels']))

In [None]:
# Scatter each label value against its ECDF value, coloured by component.
ecdf_vars = sample[['labels', 'labels_p']].drop_vars('pxl')
ecdf_vars.hvplot.scatter(
    x='labels',
    y='labels_p',
    by='component',
    xlabel='chl-a',
    ylabel='probability',
).opts(title='ECDF of phytoplankton by component')

In [None]:
# Decompose the mean-centred labels along `component`; no `k` is passed here.
# Note: this rebinds `scores`, `s` and `vectors`, replacing the results of the
# earlier SVD of the features.
scores, s, vectors = kit.svd(sample_n['labels'], dim='component')
s

In [None]:
# Covariance between the score columns (rowvar=False: each column is a
# variable, each row an observation), rounded to eight decimals.
np.cov(scores, rowvar=False).round(8)

In [None]:
# Pair the label scores with their ECDF values in one dataset.
score_vars = {
    'scores': scores,
    'scores_p': (scores.dims, kit.ecdf(scores)),
}
labels = xr.Dataset(score_vars)

# Scatter each score against its ECDF value, coloured by principal component.
labels[['scores', 'scores_p']].hvplot.scatter(
    x='scores',
    y='scores_p',
    by='pc',
    xlabel='score',
    ylabel='probability',
).opts(title='ECDF of phytoplankton PCA by component')

In [None]:
# One column per principal component of the labels, then a hexbin scatter matrix.
label_score_table = scores.to_dataset(dim='pc').to_dataframe()
hvplot.scatter_matrix(
    label_score_table,
    chart='hexbin',
    gridsize=16,
).opts(hv.opts.HexTiles(cmap='Viridis', tools=['hover']))

# OBPG Algorithms

OC4 (SeaWiFS) from https://oceancolor.gsfc.nasa.gov/atbd/chlor_a/

In [None]:
# Polynomial coefficients: a[0] is the constant term and a[1:] weight
# powers 1..4 of the log band ratio.
a = [
    0.3272,
    -2.9940,
    2.7218,
    -1.2259,
    -0.5683,
]
# Wavelengths of the candidate blue bands and of the green reference band
# (presumably nm, matching the `wavelength` coordinate — confirm).
blue = [443, 490, 510]
green = 555

In [None]:
# OC4 band-ratio estimate of total chlorophyll:
#   log10(chl) = a[0] + sum_{i=1..4} a[i] * R**i
#   R          = log10(max(blue bands) / green band)
# NOTE(review): OC4 is defined on remote-sensing reflectance, while `features`
# holds water-leaving radiance — confirm the band ratio is comparable.
log_band_ratio = np.log10(
    sample['features'].loc[{'wavelength': blue}].max(dim='wavelength')
    / sample['features'].loc[{'wavelength': green}]
)
# Raise R to the powers 1..4 along a new `power` dim, then contract that dim
# against the matching coefficients a[1:].
log10_chl = a[0] + np.power(
    log_band_ratio,
    xr.DataArray(np.arange(1, 5), dims='power'),
) @ xr.DataArray(a[1:], dims='power')
# The polynomial is log10(chl), so the estimate is 10**polynomial. The earlier
# np.power(polynomial, 10) raised the polynomial to the 10th power instead.
sample['tot_hat'] = 10.0 ** log10_chl

In [None]:
# Compare the `tot` variable with the OC4 estimate, pixel by pixel.
tot_pair = sample[['tot', 'tot_hat']].reset_coords(drop=True)
tot_pair.hvplot.scatter(x='tot', y='tot_hat', groupby=[])

# Issues

- branch nobm_day to nobm_mon and finish pipeline to get monthly rrs
- transform of outputs
- pca outputs, to 
- pca inputs, to reduce complexity
- test for signal
  - 1 vs 2 nearest neighbor outputs
  - chl-a retrieval algorithms
- reflectance vs radiance
- additional data from cecile
- unbalanced data
- classification only