In [None]:
import importlib
import os

from IPython.display import Markdown
from scipy.stats import zscore
import holoviews as hv
import hvplot.xarray
import numpy as np
import pandas as pd
import xarray as xr

from re_nobm_pcc import preprocess
from re_nobm_pcc import kit

# Work from the directory named by $PWD so the relative paths used below
# (e.g. data/nobm/...) resolve. PWD is not set in every environment (for
# example on Windows), so fall back to the current directory instead of
# failing with a bare KeyError on the first cell.
os.chdir(os.environ.get('PWD', os.getcwd()))

In [None]:
# Reload the project modules so edits made on disk are picked up without
# restarting the kernel, then re-bind the names taken from them.
preprocess = importlib.reload(preprocess)
kit = importlib.reload(kit)
HyperLwn, PhytoChl = preprocess.HyperLwn, preprocess.PhytoChl

sample_path = kit.DATA_DIR / 'sample.nc'
sample = xr.open_dataset(sample_path)
# Mean-centred over 'pxl' only: dividing by sample.std('pxl') is currently
# disabled, so despite the `_z` suffix this is not a full z-score.
sample_z = sample - sample.mean('pxl')

# Data Summary

## Datasets

The features and labels are both model output from NASA GMAO, produced with the [NOBM and OASIM](https://gmao.gsfc.nasa.gov/gmaoftp/NOBM) models. The labels are four phytoplankton chlorophyll densities output by NOBM. The features are normalized water-leaving radiances output by OASIM, using the NOBM model as input.

### Features

One NetCDF file contains all the predictor data. Note that the `FillValue` attribute is not set to `9.99e11` in the netCDF file (Cecile will fix in next version). There are no explicit coordinates given; they are documented as attributes.

In [None]:
# Shell escape: print only the header (dimensions, variables, attributes) of
# the feature file; `-h` leaves out the data values.
!ncdump -h data/nobm/HyperLwn.R2014.nc4

In [None]:
# Number of entries in HyperLwn that are not missing.
nonnull = int(HyperLwn.notnull().sum())
summary_md = (
    "\n"
    f"Augmented with coordinates, variable `HyperLwn` is a xarray.DataArray with {nonnull:,} values.\n"
)
Markdown(summary_md)

In [None]:
# Bare expression: shows the notebook repr of the HyperLwn DataArray.
HyperLwn

### Labels

Each of the twelve NetCDF files contains one month of NOBM model output. The first is representative. Unlike the HyperLwn file, this one contains coordinates.

In [None]:
# Shell escape: print only the header (dimensions, variables, attributes) of
# the first monthly label file; `-h` leaves out the data values.
!ncdump -h data/nobm/monthly/mon200701.R2014.nc4

The `PhytoChl` xarray.DataArray includes the different phytoplankton groups along a 'component' dimension.

In [None]:
# Bare expression: shows the notebook repr of the PhytoChl DataArray.
PhytoChl

## Plot your data

### Features

The radiances currently make a nice map, but the data should be more sparsely sampled.

In [None]:
# Two wavelengths near the chl-a peaks; method='nearest' selects the closest
# wavelengths actually present on the coordinate.
chl_peak_wavelengths = [465, 665]
peak_radiances = HyperLwn.sel(wavelength=chl_peak_wavelengths, method='nearest')
# Average over months, then draw one map per selected wavelength in a single column.
peak_maps = peak_radiances.mean(dim='month').hvplot.image(
    by='wavelength', subplots=True, clabel='Lwn'
)
peak_maps.cols(1).opts(title='Time average radiances near chl-a peaks')

A few "typical" hyperspectral radiances.

In [None]:
# One location (nearest grid point to lon -120, lat -15) at four months of the year.
pixel_months = {'lon': -120, 'lat': -15, 'month': [1, 4, 7, 10]}
pixel_spectra = HyperLwn.sel(pixel_months, method='nearest')
# One line per selected month.
pixel_lines = pixel_spectra.hvplot.line(by='month', ylabel='Lwn')
pixel_lines.opts(title='Radiances at one pixel over different months')

In [None]:
# First ten pixels of the mean-centred features, one unlabelled line each.
first_pixels = sample_z['features'].isel(pxl=slice(10))
centered_lines = first_pixels.hvplot.line(x='wavelength', by='pxl', legend=False)
centered_lines.opts(title='A sample of mean centered radiances')

A singular value decomposition (SVD) reduces the wavelength dimension to the `k` vectors that account for the most variation in the features. The singular values are:

In [None]:
# Number of leading singular vectors to keep.
k = 5
centered_features = sample_z['features']
# `scores`, `s` and `vectors` are reused by the cells that follow.
scores, s, vectors = kit.svd(centered_features, dim='wavelength', k=k)
# Singular values, rounded to 6 decimals for display.
list(s.round(6))

The corresponding vectors:

In [None]:
# One line per 'pc' entry, plotted against wavelength. The title is set the
# same way as on the other line plots so the figure stands on its own.
vectors.hvplot.line(x='wavelength', by='pc').opts(
    title='Leading singular vectors of the mean centered radiances'
)

A matrix of univariate (diagonal) and bivariate (off-diagonal) histograms of the `scores`, i.e. the coefficients that generate each spectrum as a linear combination of the `vectors` above.

In [None]:
# Spread the 'pc' dimension into one column per component so the scores can
# be handed to scatter_matrix as a DataFrame.
pc_scores_df = scores.to_dataset(dim='pc').to_dataframe()
hex_style = hv.opts.HexTiles(cmap='Viridis', tools=['hover'])
# Hexbin panels off the diagonal; the style applies to those HexTiles elements.
hvplot.scatter_matrix(pc_scores_df, chart='hexbin', gridsize=16).opts(hex_style)

### Labels

A map of the phytoplankton labels in `PhytoChl` at one month. Since we can map, we map...

In [None]:
# One map per (month, component) pair, stacked in a single column. Month is
# selected with a list so the 'month' dimension is kept and can be used in `by`.
# The title follows the other map figure so this one also stands on its own.
(
    PhytoChl
    .sel(month=[4])
    .hvplot.image(by=['month', 'component'], subplots=True, clabel='chl-a')
    .cols(1)
    .opts(title='Phytoplankton chl-a by component (month 4)')
)

- try different log bases
- pca ideas for independent response variable? embedding on output?

The distribution of the four phytoplankton groups.

In [None]:
# Store the ECDF values next to the labels. This adds a 'labels_p' variable
# to `sample`, on the same dimensions as 'labels'.
label_values = sample['labels']
sample['labels_p'] = (label_values.dims, kit.ecdf(label_values))

# Drawn as a scatter. The disabled alternative was a stepped line:
# .line(x='labels', y='labels_p', by='component') with
# hv.opts.Curve(interpolation='steps-pre').
ecdf_points = sample[['labels', 'labels_p']].hvplot.scatter(
    x='labels', y='labels_p', by='component', xlabel='chl-a', ylabel='probability'
)
ecdf_points.opts(title='ECDF of phytoplankton by component')

In [None]:
# SVD of the mean-centred labels along 'component'. No `k` is passed, so
# kit.svd's own default applies.
# NOTE: this rebinds `scores`, `s` and `vectors` from the feature SVD, so
# re-running the feature plots after this cell would use the label results.
centered_labels = sample_z['labels']
scores, s, vectors = kit.svd(centered_labels, dim='component')
s

In [None]:
# Covariance between the score columns (rowvar=False: each column is a
# variable), rounded to 8 decimals for display.
np.round(np.cov(scores, rowvar=False), 8)

In [None]:
# Pair each score with its ECDF value, on the same dimensions as `scores`.
score_ecdf = kit.ecdf(scores)
labels = xr.Dataset({'scores': scores, 'scores_p': (scores.dims, score_ecdf)})

# Drawn as a scatter; a stepped line (hv.opts.Curve(interpolation='steps-pre'))
# is the disabled alternative.
score_points = labels[['scores', 'scores_p']].hvplot.scatter(
    x='scores', y='scores_p', by='pc', xlabel='score', ylabel='probability'
)
score_points.opts(title='ECDF of phytoplankton PCA by component')

In [None]:
# Same scatter-matrix view as for the features, now on whatever `scores`
# currently holds (the label SVD when the notebook is run top to bottom).
label_pc_df = scores.to_dataset(dim='pc').to_dataframe()
label_hex_style = hv.opts.HexTiles(cmap='Viridis', tools=['hover'])
hvplot.scatter_matrix(label_pc_df, chart='hexbin', gridsize=16).opts(label_hex_style)