In [11]:
import os
import random
import numpy as np
import cupy
import h5py
import matplotlib.pyplot as plt

from numba import cuda, jit

from dask import delayed, compute
from dask.diagnostics import ProgressBar

from cuml.experimental.preprocessing import  StandardScaler
from cuml import PCA

In [2]:
# path and sitename
site = 'TALL'
path = '/rapids/notebooks/data'
data = f'{path}/hyperspectral/DP3.30006.001/2021/FullSite/D08/2021_TALL_6/L3/Spectrometer/Reflectance'

# find the filenames
#  - match on the extension only: the substring test `'.h5' in f` would also
#    accept names such as 'tile.h5.tmp'
#  - sort the result: os.listdir returns entries in arbitrary order and a
#    later cell indexes into this list (files[0])
files = sorted(
    os.path.join(data, f) for f in os.listdir(data) if f.endswith('.h5')
)

In [3]:
def band_list():
    '''
    Returns the band indices to keep, i.e. excluding the bands
    with H2O or CO2 absorption.

    The kept indices are three inclusive runs: 0-188, 211-269 and 316-425.
    '''
    # (first, last) index of each run of good bands, both ends inclusive
    keep_runs = ((0, 188), (211, 269), (316, 425))

    return np.concatenate([np.arange(first, last + 1) for first, last in keep_runs])


In [9]:
@delayed
def sample_from_file(fname, size):
    '''
    Draws a random subset of the valid pixel spectra in one HDF5 file.

    args:
        fname    - full path of the file to open with h5py.
        size     - fraction of the valid pixels to keep.

    returns:
        np array of shape (n, wl) where n is the number of sampled
        pixels and wl is the number of bands kept by band_list().

    Reads the module-level `site` to locate the reflectance group.
    The function is wrapped with dask's `delayed`; sample_from_all
    collects the results with `compute`.
    '''

    # open the file in a context manager so the handle is released as soon
    # as the cube has been copied into memory (it used to stay open)
    with h5py.File(fname, 'r') as f:
        # np.array makes an in-memory copy, so nothing below needs the file
        refl_array = np.array(np.rot90(f[site]['Reflectance']['Reflectance_Data']))

    # drop bad bands from refl_array
    refl_array = refl_array[:, :, band_list()]

    # get length of the wavelength dimension
    wl = refl_array.shape[2]

    # reshape to one row per pixel
    flat_refl = refl_array.reshape(-1, wl)

    # drop nulls: rows holding -9999 or NaN in any band
    flat_refl = flat_refl[
        (~np.any(flat_refl == -9999, axis=1)) &
        (~np.any(np.isnan(flat_refl), axis=1))]

    # get random sample indices (without replacement)
    n_pixels = flat_refl.shape[0]
    sample_idx = random.sample(range(n_pixels), int(n_pixels * size))

    # return sample
    return flat_refl[sample_idx, :]
    

def sample_from_all(files, size):
    '''
    Samples every file and stacks the results into one np array of
    shape (N, wl), where N is the total number of samples and wl is
    the length of the wavelength dimension.

    args:
        files    - list of full paths to the files to sample; each one
                   is handed to sample_from_file.
        size     - fraction of data to be used.
    '''
    # one delayed sampling task per file
    tasks = [sample_from_file(fname, size) for fname in files]

    # run the tasks with a progress bar, then stack the per-file samples
    with ProgressBar():
        per_file = compute(*tasks)
        sample = np.vstack(per_file)

    return sample


def plot_pca_var(pca, max_components=20):
    '''
    Plots the cumulative explained variance ratio by PCA component.

    args:
        pca             - fitted PCA object exposing
                          `explained_variance_ratio_`.
        max_components  - upper limit of the x axis; defaults to 20,
                          the value that used to be hard-coded.
    '''

    # cumulative share of variance, copied to the host for matplotlib
    cum_var = cupy.asnumpy(pca.explained_variance_ratio_.cumsum())

    # derive the x positions from the data: the former literal
    # range(1, 359) only matched when exactly 358 components were fitted
    component_no = range(1, len(cum_var) + 1)

    # make fig
    fig, ax = plt.subplots(figsize=(10, 4))

    # plot
    ax.plot(component_no, cum_var, marker='o', linestyle='--')

    # details
    ax.set_title('Explained Variance by Number of Components')
    ax.set_xlabel('Components')
    ax.set_ylabel('Cumulative explained Var')
    ax.set_xlim(0, max_components)
    plt.show()

In [5]:
# fraction of the valid pixels to draw from each file (one in 500)
size = 1 / 500

# sample every file and stack the results into a single array
sample = sample_from_all(files=files, size=size)

[########################################] | 100% Completed |  4min  9.2s


In [8]:
# copy the sampled spectra into a cupy array
sample = cupy.array(sample)

# scale the data: fit the scaler on the sample, then apply it to the same sample
scaler = StandardScaler()
scaler.fit(sample)
scaled = scaler.transform(sample)

# PCA with default settings
pca = PCA()

# fit the pca model on the scaled sample (last expression, so the cell shows its repr)
pca.fit(scaled)



PCA()

In [15]:
#plot_pca_var(pca)

# cumulative explained variance, converted with cupy.asnumpy
cum_var = cupy.asnumpy(pca.explained_variance_ratio_.cumsum())

# show which array type the conversion produced
type(cum_var)

numpy.ndarray

In [42]:
def _read_reflectance_metadata(refl, cube_shape, good_bands):
    '''Collects wavelength, geographic and scaling info from an open Reflectance group.'''

    # get wavelength info, dropping the bad bands
    wavelengths = np.array(refl['Metadata']['Spectral_Data']['Wavelength'])[good_bands]

    # bag geographic info
    coord_sys = refl['Metadata']['Coordinate_System']
    epsg = coord_sys['EPSG Code'][()].decode("utf-8")
    crs_info = coord_sys['Map_Info'][()].decode("utf-8").split(',')

    # the Map_Info fields are picked by position
    xmin = float(crs_info[3])
    ymax = float(crs_info[4])
    res = (float(crs_info[5]), float(crs_info[6]))

    # opposite corner = origin plus/minus pixel count times pixel size
    xmax = xmin + (cube_shape[1] * res[0])
    ymin = ymax - (cube_shape[0] * res[1])

    # scale factor and data ignore value are attrs of the data set
    attrs = refl['Reflectance_Data'].attrs

    return {
        'wavelengths': wavelengths,
        'epsg': f'EPSG:{epsg}',
        'utm_zone': int(crs_info[7]),
        'resolution': res,
        'extent': (xmin, xmax, ymin, ymax),
        'scale_factor': attrs['Scale_Factor'],
        'no_data_value': attrs['Data_Ignore_Value'],
    }


def read_h5_return_dict(fname, with_metadata=False):
    '''
    Reads the reflectance cube of one HDF5 file into a cupy array,
    keeping only the bands listed by band_list().

    args:
        fname          - full path of the file to open with h5py.
        with_metadata  - if False (default) only the cupy array is
                         returned, as before. If True a dict is returned
                         holding the array under 'reflectance' plus
                         'wavelengths', 'epsg', 'utm_zone', 'resolution',
                         'extent', 'scale_factor' and 'no_data_value'.

    Reads the module-level `site` to locate the reflectance group.
    '''
    good_bands = band_list()

    # open the file in a context manager so the handle is always released
    # (it used to stay open after the function returned)
    with h5py.File(fname, 'r') as f:

        # seperate out reflectance
        refl = f[site]['Reflectance']

        # get the actual data within reflectance as cupy array
        refl_array = cupy.array(np.rot90(refl['Reflectance_Data']))

        # drop bad bands from refl_array
        refl_array = refl_array[:, :, good_bands]

        # the metadata used to be read and then thrown away; now it is
        # only read when the caller asks for it
        if not with_metadata:
            return refl_array

        result = _read_reflectance_metadata(refl, refl_array.shape, good_bands)

    result['reflectance'] = refl_array
    return result
    
    
# call the reader defined in this cell; the name used before,
# read_h5to_xarray_with_spectral_indices, is not defined in this notebook
a = read_h5_return_dict(files[0])


In [None]:
    # get wavelength info
    wavelengths = refl['Metadata']['Spectral_Data']['Wavelength']

    # bag geographic info
    epsg = refl['Metadata']['Coordinate_System']['EPSG Code'][()].decode("utf-8")
    epsg = f'EPSG:{epsg}'
    crs_info = refl['Metadata']['Coordinate_System']['Map_Info'][()].decode("utf-8").split(',')
    utm_zone = int(crs_info[7])

    xmin = float(crs_info[3])
    ymax = float(crs_info[4])
    res = (float(crs_info[5]), float(crs_info[6]))

    xmax = xmin + (refl_array.shape[1] * res[0]) 
    ymin = ymax - (refl_array.shape[0] * res[1])

    extent = (xmin, xmax, ymin, ymax) 

    # find and add scale factor and data ignore value as attrs
    scale_factor = refl['Reflectance_Data'].attrs['Scale_Factor']
    no_data_value = refl['Reflectance_Data'].attrs['Data_Ignore_Value']
    d_all.attrs = {'scale_factor': scale_factor, 'no_data_value': no_data_value}
    
    # select only good bands
    d_all =  d_all.isel(wl=band_list()[0]).chunk({'x':'auto','y':'auto','wl':-1})
    
    # calculate the spectral indices and add to dataset
    ndvi = ((d_all.reflectance.sel(wl=858.6,
                                   method='nearest') -
             d_all.reflectance.sel(wl=648.2,
                                   method='nearest')) /
            (d_all.reflectance.sel(wl=858.6,
                                   method='nearest') +
             d_all.reflectance.sel(wl=648.2,
                                   method='nearest'))
           ).assign_coords(index='ndvi').expand_dims('index')


    cai = ((0.5 *
            (d_all.reflectance.sel(wl=2000,
                                   method='nearest') /
             10000.0 +
             d_all.reflectance.sel(wl=2200,
                                   method='nearest') /
             10000.0)) - d_all.reflectance.sel(wl=2100.0,
                                              method='nearest') /
          10000.0).drop('wl').assign_coords(index='cai').expand_dims('index')


    ndli = ((np.log(1. /
                           (d_all.reflectance.sel(wl=1754.,
                                                  method='nearest') /
                            10000.0)) -
             np.log(1.0 /
                           (d_all.reflectance.sel(wl=1680.0,
                                                  method='nearest') /
                          10000.0))) /
            (np.log(d_all.reflectance.sel(wl=1754.0,
                                                 method='nearest') /
                           10000.0) +
             np.log(d_all.reflectance.sel(wl=1680,
                                               method='nearest') /
                           10000.0))).assign_coords(index='ndli').expand_dims('index')


    mrendvi = ((d_all.reflectance.sel(wl=750.0,
                                      method='nearest') -
                d_all.reflectance.sel(wl=705.0,
                                      method='nearest')) /
               (d_all.reflectance.sel(wl=750.0,
                                      method='nearest') +
                d_all.reflectance.sel(wl=705.0,
                                      method='nearest') -
                (2.0 *
                 d_all.reflectance.sel(wl=445.0,
                                       method='nearest')
                )
               )
              ).drop('wl').assign_coords(index='mrendvi').expand_dims('index')


    sipi = ((d_all.reflectance.sel(wl=800.0,
                                   method='nearest') -
             d_all.reflectance.sel(wl=445.0,
                                   method='nearest')) /
            (d_all.reflectance.sel(wl=800.0,
                                   method='nearest') -
             d_all.reflectance.sel(wl=680.0,
                                   method='nearest')
            )
           ).assign_coords(index='sipi').expand_dims('index')


    ndni = ((np.log(10000.0 /
                           d_all.reflectance.sel(wl=1510.0,
                                                 method='nearest')
                          ) -
             np.log(10000.0 /
                           d_all.reflectance.sel(wl=1680.0,
                                                 method='nearest')
                          )
            ) / 
            (np.log(10000.0 /
                           d_all.reflectance.sel(wl=1510.0,
                                                 method='nearest')
                          )+np.log(10000.0 /
                           d_all.reflectance.sel(wl=1680.0,
                                                 method='nearest')
                                         )
            )
           ).assign_coords(index='ndni').expand_dims('index')


    cri1 = ((1.0 /
             (d_all.reflectance.sel(wl=510.0,
                                    method='nearest') /
              10000.0)
            ) -
            (1.0 /
             (d_all.reflectance.sel(wl=550.0,
                                    method='nearest') /
              10000.0)
            )
           ).assign_coords(index='cri1').expand_dims('index')


    cri2 = ((1.0 /
             (d_all.reflectance.sel(wl=510.0,
                                    method='nearest') / 10000.0)
            ) - 
            (1.0 /
             (d_all.reflectance.sel(wl=700.0,
                                    method='nearest') /
              10000.0)
            )
           ).assign_coords(index='cri2').expand_dims('index')


    d_all['indices'] = xr.concat(
        [ndvi,
         cai, 
         ndli, 
         mrendvi, 
         sipi, 
         cri1, 
         cri2],
        dim='index').chunk((1.0,
                            d_all.reflectance.data.chunksize[0],
                            d_all.reflectance.data.chunksize[1])
                          ).transpose('y','x','index').chunk(('auto','auto',1))

    return d_all