In [174]:
from matplotlib.pyplot import thetagrids
import h5py
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from tqdm import tqdm

from sklearn.mixture import GaussianMixture
from sklearn import metrics

#from dask import delayed, compute
#from dask.diagnostics import ProgressBar

pd.options.mode.chained_assignment = None

import subprocess

#from dask.distributed import Client
#c = Client()
#c.cluster

Below are the years and sites to download. These are also used later when examining the downloaded data. The download script requires your NEON API token.

In [147]:
# NEON API Token
# Read from the NEON_TOKEN environment variable so the secret is never
# saved inside the notebook; falls back to an empty string when unset.
token = os.environ.get('NEON_TOKEN', '')

# path where downloads will be saved upon download
data_path = '/media/storage/NEON'

# path where files we are working with will be saved
neon_path = os.path.join('/media', 'data', 'NEON')

# years we want
years = ['2018', '2019', '2020', '2021']

# sites we want
sites = [ 'ABBY', 'BARR', 'BLAN', 'BONA', 'CLBJ', 'CPER', 'DCFS', 'DEJU',
          'DELA', 'DSNY', 'GRSM', 'GUAN', 'HARV', 'HEAL', 'JERC', 'JORN',
          'KONA', 'LAJA', 'LENO', 'MLBS', 'NIWO', 'NOGP', 'OAES', 'ONAQ',
          'ORNL', 'OSBS', 'PUUM', 'RMNP', 'SERC', 'SJER', 'SOAP', 'SRER',
          'STEI', 'STER', 'TALL', 'TEAK', 'TOOL', 'TREE', 'UKFS', 'UNDE',
          'WOOD', 'WREF', 'YELL']


## Downloading files

This section creates a shell script to download the files using neonUtilities. Run the script in a terminal; it will take a long time. The script downloads the bundled eddy flux data (DP4.00200.001), then looks at the footprint and downloads the tiled hyperspectral data (DP3.30006.001) covering the footprint.

In [97]:
def make_cmds(site, year, token, data_path) :
    '''
    Build the shell command that runs start_get_flux.sh for one
    site and year.

    site      - site code, also used as the sub directory of data_path
    year      - year; the command is given {year}-04 and {year}-07
                (presumably the first and last month to fetch)
    token     - not used here: the command refers to the shell
                variable $TOKEN instead of embedding the value
    data_path - root directory of the downloads

    Returns the command as a single string.
    '''
    pieces = [
        './start_get_flux.sh',
        f'{site}',
        f'{year}-04',
        f'{year}-07',
        '$TOKEN',
        f'{data_path}/{site}',
    ]

    return ' '.join(pieces)

In [None]:
# one download command per (site, year) pair
cmds = []
for site in tqdm(sites):
    cmds.extend(make_cmds(site, year, token, data_path) for year in years)

# note: launching these commands through subprocess works poorly,
# so they are written to a shell script instead

# assemble the script: shebang, blank line, token assignment, commands
script_lines = ['#!/bin/sh\n', f'TOKEN={token}', *cmds]

with open('download.sh', 'w') as script:
    script.write('\n'.join(script_lines) + '\n')

# now run download.sh in a terminal

## Finding Valid Observations

In this section we will determine which sites have enough valid observations to be useful.

In [148]:
def count_valid_observations(site, files, out_path):
    '''
    Goes through all HDF5 flux files for a site and writes a csv with
    counts of valid observations to out_path. Valid means that
    they exist and have a passing final QF flag (qfFinl == 0).

    site     - site code; used in the HDF5 table keys and the csv name
    files    - paths of the site's HDF5 files; the day is parsed from
               the text following 'nsae.' in each file name
    out_path - existing directory that receives
               qfqm_counts_{site}.csv

    The csv has one row per readable file, indexed by day, with columns:
    'site', 'CO2', 'H2O','T', 'footprint', 'all'

    Each contains the number of valid observations for that
    quantity. The 'all' column counts rows where all quantities
    have valid values.

    Files that lack one of the required tables (KeyError) are skipped.
    Returns None.
    '''

    columns = ['site', 'CO2', 'H2O', 'T', 'footprint', 'all']

    # one entry per file that could be read; the frame is built once
    # at the end instead of being grown with concat inside the loop
    days = []
    rows = []

    for f in files:

        # get the day
        day = pd.to_datetime(f.split('nsae.')[1].split('.')[0]).date()

        # the context manager closes the file even when a table is
        # missing; the file is only read, so it is opened read-only
        with pd.HDFStore(f, mode='r') as hdf:
            try:
                # get the flux quality flags
                qfqm_CO2 = hdf.get(f'{site}/dp04/qfqm/fluxCo2/nsae')
                qfqm_H2O = hdf.get(f'{site}/dp04/qfqm/fluxH2o/nsae')
                qfqm_T = hdf.get(f'{site}/dp04/qfqm/fluxTemp/nsae')
                qfqm_foot = hdf.get(f'{site}/dp04/qfqm/foot/turb')

                # Select observations with no bad flags
                qfqm_CO2  = qfqm_CO2.loc[qfqm_CO2.qfFinl == 0]
                qfqm_H2O  = qfqm_H2O.loc[qfqm_H2O.qfFinl == 0]
                qfqm_T    = qfqm_T.loc[qfqm_T.qfFinl == 0]
                qfqm_foot = qfqm_foot.loc[qfqm_foot.qfFinl == 0]

                # get the footprint input stats
                stat = hdf.get(f'{site}/dp04/data/foot/stat/')

                # get the time indices of the dfs from above
                istat  = stat.set_index('timeBgn').index
                iqfqmC = qfqm_CO2.set_index('timeBgn').index
                iqfqmH = qfqm_H2O.set_index('timeBgn').index
                iqfqmT = qfqm_T.set_index('timeBgn').index
                iqfqmf = qfqm_foot.set_index('timeBgn').index

                # keep only entries in stat which correspond to good
                # qfqm flags for all variables
                good = stat[
                    (istat.isin(iqfqmC)) &
                    (istat.isin(iqfqmH)) &
                    (istat.isin(iqfqmT)) &
                    (istat.isin(iqfqmf))
                ]
            except KeyError:
                # a required table or column is missing: skip the file
                continue

        # record the counts for each quantity and for all together
        days.append(day)
        rows.append({
            'site': site,
            'CO2': len(qfqm_CO2),
            'H2O': len(qfqm_H2O),
            'T': len(qfqm_T),
            'footprint': len(qfqm_foot),
            'all': len(good)
        })

    # build the quality table in one go, indexed by day
    qdf = pd.DataFrame(rows, index=days, columns=columns)

    # write a copy to csv
    qdf.to_csv(os.path.join(out_path, f'qfqm_counts_{site}.csv'))

    




# _____________________________________________________

# directory that receives the per-site qfqm count csvs
out_path = '/media/data/NEON/all_sites'
os.makedirs(out_path, exist_ok=True)

for site in tqdm(sites):
    # directory holding the downloaded flux files of the site
    file_path = f'{data_path}/{site}/filesToStack00200'

    # full paths of every h5 file found for the site
    files = []
    for name in os.listdir(file_path):
        if '.h5' in name:
            files.append(os.path.join(file_path, name))

    # tally the valid observations and write them to out_path
    _ = count_valid_observations(site, files, out_path)


100%|██████████| 43/43 [05:47<00:00,  8.09s/it]


In [149]:
# make list of files in the dir where the csvs were written
files = [os.path.join(out_path,f)
         for f
         in os.listdir(out_path)
         if f.endswith('.csv')]

# read all the csvs into df         
qdf = pd.concat((pd.read_csv(f)) for f in files)

# group by site
sums = qdf.groupby('site').sum()

# filter for sites with more than 100 valid observations
sums = sums.loc[sums['all'] > 100]

sums

Unnamed: 0_level_0,CO2,H2O,T,footprint,all
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BLAN,231,228,1784,5205,180
BONA,1589,1596,1987,5066,501
DCFS,189,187,3020,5660,166
DEJU,841,800,3770,5404,715
DELA,1674,1638,573,5652,319
HARV,768,650,3956,5053,567
HEAL,2985,2854,1431,5656,875
KONA,2448,2551,1609,5706,651
LAJA,321,332,3806,4015,283
LENO,800,693,4474,5316,579


In [150]:
# total number of valid 'all' observations across the selected sites
sums.loc[:, 'all'].sum()

17643

In [151]:
# how many sites passed the filter
sums.shape[0]

24

## Creating csv files of valid observations


In [152]:
def get_valid_observations(site, files):
    '''
    Goes through all HDF5 flux files for a site and returns a df of
    valid observations. Valid means that they exist and have a
    passing final QF flag (qfFinl == 0) for the CO2, H2O and
    temperature fluxes and for the footprint.

    site  - site code; used in the HDF5 table keys
    files - paths of the site's HDF5 files

    Each returned row is a footprint stat entry with the three flux
    tables merged on by 'timeBgn'. Columns whose names collide during
    the merges receive the suffixes '_CO2', '_H20' (spelled with a
    zero) and '_T'.

    Files that lack one of the required tables (KeyError) are skipped.
    An empty DataFrame is returned when no file yields any data.
    '''

    # make empty list for dfs
    dfs = []

    for f in files:

        # the context manager closes the file even when a table is
        # missing; the file is only read, so it is opened read-only
        with pd.HDFStore(f, mode='r') as hdf:
            try:
                # get the flux quality flags
                qfqm_CO2 = hdf.get(f'{site}/dp04/qfqm/fluxCo2/nsae')
                qfqm_H2O = hdf.get(f'{site}/dp04/qfqm/fluxH2o/nsae')
                qfqm_T = hdf.get(f'{site}/dp04/qfqm/fluxTemp/nsae')
                qfqm_foot = hdf.get(f'{site}/dp04/qfqm/foot/turb')

                # Select observations with no bad flags
                qfqm_CO2  = qfqm_CO2.loc[qfqm_CO2.qfFinl == 0]
                qfqm_H2O  = qfqm_H2O.loc[qfqm_H2O.qfFinl == 0]
                qfqm_T    = qfqm_T.loc[qfqm_T.qfFinl == 0]
                qfqm_foot = qfqm_foot.loc[qfqm_foot.qfFinl == 0]

                # get the footprint input stats
                stat = hdf.get(f'{site}/dp04/data/foot/stat/')

                # get the time indices of the dfs from above
                istat  = stat.set_index('timeBgn').index
                iqfqmC = qfqm_CO2.set_index('timeBgn').index
                iqfqmH = qfqm_H2O.set_index('timeBgn').index
                iqfqmT = qfqm_T.set_index('timeBgn').index
                iqfqmf = qfqm_foot.set_index('timeBgn').index

                # keep only entries in stat which correspond to good
                # qfqm flags for all variables
                stat = stat[
                    (istat.isin(iqfqmC)) &
                    (istat.isin(iqfqmH)) &
                    (istat.isin(iqfqmT)) &
                    (istat.isin(iqfqmf))
                ]

                # get the flux data
                fluxCo2 = hdf.get(f'{site}/dp04/data/fluxCo2/nsae').drop('timeEnd', axis=1)
                fluxH2o = hdf.get(f'{site}/dp04/data/fluxH2o/nsae').drop('timeEnd', axis=1)
                fluxTemp = hdf.get(f'{site}/dp04/data/fluxTemp/nsae').drop('timeEnd', axis=1)

                # now merge dfs onto stat; the suffixes rename the
                # colliding column of the previous merge each time
                stat = stat.merge(fluxCo2, how='left', on='timeBgn', suffixes=('_stat', ''))
                stat = stat.merge(fluxH2o, how='left', on='timeBgn', suffixes=('_CO2', ''))
                stat = stat.merge(fluxTemp, how='left', on='timeBgn', suffixes=('_H20', '_T'))
            except KeyError:
                # a required table or column is missing: skip the file
                continue

        dfs.append(stat)

    # pd.concat raises on an empty list, so handle that case explicitly
    if not dfs:
        return pd.DataFrame()

    df = pd.concat(dfs)

    return df

In [169]:


# from here on only the sites that passed the filter are used
sites = sums.index.tolist()

for site in tqdm(sites):
    # working directory of the site and the csv written into it
    site_path = os.path.join(neon_path, site)
    os.makedirs(site_path, exist_ok=True)
    csv_path = os.path.join(site_path, 'flux_data.csv')

    # directory holding the downloaded flux files of the site
    file_path = f'{data_path}/{site}/filesToStack00200'

    # full paths of every h5 file found for the site
    files = [os.path.join(file_path, f) for f in os.listdir(file_path) if '.h5' in f]

    # collect the valid observations and save them without the index
    df = get_valid_observations(site, files)
    df.to_csv(csv_path, index=False)


100%|██████████| 24/24 [05:12<00:00, 13.01s/it]


## Creating a stratified sample


In [159]:
def find_sectors(df, theta=10):
    '''
    Adds a 'sector' column to df and returns df (df is modified
    in place).

    df    - dataframe with an 'angZaxsErth' column
    theta - sector width; when it does not divide 360 evenly it is
            increased in steps of 1 until it does, and a message
            reporting the new value is printed

    The sector of a row is its angZaxsErth rounded down to a
    multiple of theta.
    '''

    requested = theta

    # step theta up until it goes into 360 a whole number of times
    while 360 % theta:
        theta += 1

    if theta != requested:
        print(f'theta has been forced to {theta} for even division of 360')

    # lower edge of the sector in which each angle falls
    df['sector'] = (df['angZaxsErth'] // theta) * theta

    return df

In [170]:
# width of one sector, in degrees
θ = 18

# empty frame with the timeBgn and sector columns
sectors = pd.DataFrame(columns=['timeBgn', 'sector'])

# only the first selected site is processed here
for site in sites[:1]:
    csv_path = os.path.join(neon_path, site, 'flux_data.csv')

    # load the site's flux data and add the sector of each observation
    flux_df = find_sectors(pd.read_csv(csv_path), theta=θ)

    

In [171]:
# display the flux observations together with their assigned sector
flux_df

Unnamed: 0,timeEnd,angZaxsErth,distReso,veloYaxsHorSd,veloZaxsHorSd,veloFric,distZaxsMeasDisp,distZaxsRgh,distZaxsAbl,distXaxs90,distXaxsMax,distYaxs90,flux_CO2,flux_H20,timeBgn,flux_T,sector
0,2018-04-08T00:59:59.950Z,345.043925,6.34,0.627431,0.297594,0.227588,6.34,0.634000,1000.0,152.16,63.40,31.70,4.345192,-3.036561,2018-04-08T00:30:00.000Z,-34.410926,342.0
1,2018-04-08T01:29:59.950Z,357.408726,6.34,0.395602,0.245316,0.200000,6.34,0.634000,1000.0,158.50,69.74,25.36,1.491616,1.493820,2018-04-08T01:00:00.000Z,-26.646935,342.0
2,2018-04-08T01:59:59.950Z,351.889928,6.34,0.245366,0.230000,0.200000,6.34,0.485931,1000.0,177.52,76.08,19.02,1.988864,16.345876,2018-04-08T01:30:00.000Z,-15.238791,342.0
3,2018-04-08T03:59:59.950Z,347.133152,6.34,0.532786,0.366245,0.237374,6.34,0.564311,1000.0,139.48,57.06,19.02,3.452583,6.248057,2018-04-08T03:30:00.000Z,-50.946447,342.0
4,2018-04-08T04:29:59.950Z,350.910019,6.34,0.510539,0.334296,0.235507,6.34,0.468437,1000.0,152.16,63.40,19.02,1.694864,5.221214,2018-04-08T04:00:00.000Z,-33.512011,342.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,2018-04-03T19:29:59.950Z,64.203144,6.34,0.944606,0.243230,0.200000,6.34,0.634000,1000.0,158.50,69.74,57.06,0.250161,-7.060549,2018-04-03T19:00:00.000Z,29.907526,54.0
176,2018-04-03T19:59:59.950Z,39.960324,6.34,0.731510,0.268563,0.208439,6.34,0.364205,1000.0,177.52,69.74,31.70,-0.223327,1.635765,2018-04-03T19:30:00.000Z,39.869987,36.0
177,2018-04-03T20:59:59.950Z,182.123728,6.34,0.749050,0.481824,0.421962,6.34,0.634000,1000.0,171.18,69.74,25.36,2.467898,3.776534,2018-04-03T20:30:00.000Z,90.947143,180.0
178,2018-04-03T22:29:59.950Z,144.742877,6.34,0.514034,0.364015,0.294367,6.34,0.423205,1000.0,177.52,69.74,19.02,1.448730,2.351311,2018-04-03T22:00:00.000Z,10.831379,144.0


In [179]:
def gmm_cluster(df,k_range=(2, 10), verbose=True, random_state=1):
    '''
    Performs Gaussian Mixture Model clustering.

    A model is fitted for every candidate number of clusters and the
    number with the highest silhouette score is kept. A final model
    with that many clusters is then fitted and its labels returned.

    df - dataframe containing only the features to
         be used in clustering
    k_range - tuple, passed to range(), representing the numbers
         of clusters to be tried; the upper bound is therefore
         exclusive (the default tries 2 through 9).
    verbose - when True, print a summary of the final model.
    random_state - seed given to every model, so that both the
         selected number of clusters and the labels are
         reproducible from run to run.

    Returns the cluster label of each row of df.
    Raises ValueError when k_range contains no values.
    '''
    # make list of range
    k_r = list(range(*k_range))

    # fail with a clear message instead of an argmax error further down
    if not k_r:
        raise ValueError(f'k_range {k_range} contains no cluster counts to try')

    # empty list
    silhouette_scores  = []

    for k in k_r:
        # make model; seeded so the model selection is repeatable
        model = GaussianMixture(n_components=k,
                                n_init=10,
                                init_params='kmeans',
                                random_state=random_state)
        # fit
        labels = model.fit_predict(df)

        # calculate silhouette score
        silhouette_scores.append(
            metrics.silhouette_score(df, labels, metric='euclidean')
            )

    # choose the highest silhouette scoring k
    k = k_r[np.argmax(silhouette_scores)]

    # now create full model
    modelk = GaussianMixture(
        n_components=k,
        covariance_type='full',
        random_state=random_state
        )

    cluster = modelk.fit(df)
    labels = modelk.predict(df)

    if verbose:
        print(f'*************** {k} Cluster Model ***************')
        print('Weights: ', cluster.weights_)
        print('Converged: ', cluster.converged_)
        print('No. of Iterations: ', cluster.n_iter_)
        print('Lower Bound: ', cluster.lower_bound_)
        print(f'*************************************************')

    return labels

In [180]:
# feature columns handed to the clustering
cols = [
    'veloYaxsHorSd',
    'veloZaxsHorSd',
    'veloFric',
    'distZaxsMeasDisp',
    'distZaxsRgh',
    'distZaxsAbl',
    'distXaxs90',
    'distXaxsMax',
    'distYaxs90',
]

# cluster on those columns and store the label of each observation
flux_df['label'] = gmm_cluster(flux_df.loc[:, cols])

*************** 8 Cluster Model ***************
Weights:  [0.34415242 0.17224005 0.00555556 0.36111012 0.00555556 0.01666667
 0.0333155  0.06140413]
Converged:  True
No. of Iterations:  12
Lower Bound:  12.81607213852765
*************************************************


## Copying footprint files

In [194]:
# shutil is imported here because the notebook's only `import shutil`
# sits in a later cell, so a top-to-bottom run would otherwise fail
# with a NameError at the copy below
import shutil

for site in tqdm(sites):
    # footprint tiffs downloaded for the site
    src_dir = os.path.join(data_path, site, 'footprints')
    src_files = [os.path.join(src_dir, f)
                for f in os.listdir(src_dir)
                if f.endswith('.tiff')]

    # matching directory under the working path
    dst = os.path.join(neon_path, site, 'footprints')
    os.makedirs(dst, exist_ok=True)

    # copy every tiff into the working directory
    for src in src_files:
        shutil.copy(src, dst)


100%|██████████| 24/24 [04:02<00:00, 10.09s/it]


In [185]:
# show the path where the files being worked with are saved
neon_path

'/media/data/NEON'

In [186]:
# show the path where the downloads are saved
data_path

'/media/storage/NEON'

In [187]:
import shutil