In [1]:
from matplotlib.pyplot import thetagrids
import h5py
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from tqdm import tqdm

#from dask import delayed, compute
#from dask.diagnostics import ProgressBar

pd.options.mode.chained_assignment = None

import subprocess

#from dask.distributed import Client
#c = Client()
#c.cluster

Below are the years and sites to download. These are also used later when examining the downloaded data. For the download script, enter your NEON API token.

In [2]:
# NEON API token. It is read from the NEON_TOKEN environment variable so the
# secret does not have to be saved in the notebook; when the variable is not
# set this falls back to an empty string (the previous default).
token = os.environ.get('NEON_TOKEN', '')

# path where downloads will be saved
data_path = '/media/storage/NEON'

# years we want
years = ['2018', '2019', '2020', '2021']

# sites we want (four-letter site codes)
sites = [ 'ABBY', 'BARR', 'BLAN', 'BONA', 'CLBJ', 'CPER', 'DCFS', 'DEJU',
          'DELA', 'DSNY', 'GRSM', 'GUAN', 'HARV', 'HEAL', 'JERC', 'JORN',
          'KONA', 'LAJA', 'LENO', 'MLBS', 'NIWO', 'NOGP', 'OAES', 'ONAQ',
          'ORNL', 'OSBS', 'PUUM', 'RMNP', 'SERC', 'SJER', 'SOAP', 'SRER',
          'STEI', 'STER', 'TALL', 'TEAK', 'TOOL', 'TREE', 'UKFS', 'UNDE',
          'WOOD', 'WREF', 'YELL']


## Downloading files

This section creates a shell script to download the files using neonUtilities. Run the script in a terminal. It will take a long time.

In [97]:
def make_cmds(site, year, token, data_path, start_month='04', end_month='07') :
    '''
    Return the shell command that downloads flux data for one site and year
    via start_get_flux.sh.

    site        : site code, used as an argument and as the output sub-directory
    year        : year string; combined with the months as "{year}-{month}"
    token       : accepted for interface compatibility but NOT interpolated;
                  the command refers to the shell variable $TOKEN instead, so
                  the script running the command is expected to define it
    data_path   : base directory; output goes to "{data_path}/{site}"
    start_month : two-digit first month of the window (default '04')
    end_month   : two-digit last month of the window (default '07')
    '''

    # $TOKEN is left as a literal so the secret is expanded by the shell at
    # run time rather than being repeated on every line.
    return (f'./start_get_flux.sh {site} {year}-{start_month} '
            f'{year}-{end_month} $TOKEN {data_path}/{site}')

In [None]:
# collect one download command for every (site, year) combination
cmds = []
for site in tqdm(sites):
    for year in years:
        command = make_cmds(site, year, token, data_path)
        cmds.append(command)

# NOTE: launching these with subprocess.run(cmd, shell=True,
# capture_output=True) worked poorly, so they are written to a shell
# script that is run by hand instead.

# script layout: shebang, blank line, TOKEN assignment, then one command per line
script_lines = ['#!/bin/sh\n', f'TOKEN={token}'] + cmds

with open('download.sh', 'w') as dst:
    dst.write('\n'.join(script_lines) + '\n')


# go run the script in a terminal

## Finding Valid Observations

In this section we will determine which sites have enough valid observations to be useful.

In [98]:
def _count_valid_in_store(hdf, site):
    '''
    Return a dict of valid-observation counts for one opened daily HDF store.

    Keys are 'site', 'CO2', 'H2O', 'T', 'footprint' and 'all'. A row of a
    quality table counts as valid when its qfFinl value is 0. 'all' is the
    number of rows of the footprint stat table whose timeBgn appears among
    the valid rows of every one of the four quality tables.

    Raises KeyError (from hdf.get) when an expected table is missing.
    '''
    # final quality-flag table for each quantity
    qfqm_keys = {
        'CO2': f'{site}/dp04/qfqm/fluxCo2/nsae',
        'H2O': f'{site}/dp04/qfqm/fluxH2o/nsae',
        'T': f'{site}/dp04/qfqm/fluxTemp/nsae',
        'footprint': f'{site}/dp04/qfqm/foot/turb',
    }

    counts = {'site': site}
    good_times = {}
    for name, key in qfqm_keys.items():
        qfqm = hdf.get(key)
        # keep observations with no bad final flag
        passing = qfqm.loc[qfqm.qfFinl == 0]
        counts[name] = len(passing)
        good_times[name] = passing['timeBgn']

    # footprint input stats; matched to the quality tables on timeBgn
    stat_times = hdf.get(f'{site}/dp04/data/foot/stat/')['timeBgn']

    good_in_all = (
        stat_times.isin(good_times['CO2']) &
        stat_times.isin(good_times['H2O']) &
        stat_times.isin(good_times['T']) &
        stat_times.isin(good_times['footprint'])
    )
    counts['all'] = int(good_in_all.sum())

    return counts


def count_valid_observations(site, files, out_path):
    '''
    Goes through all HDF5 files for a site and writes a csv with
    counts of valid observations to out_path. Valid means that
    they exist and have a passing final QF flag (qfFinl == 0).

    The csv is named qfqm_counts_{site}.csv, is indexed by the day
    parsed from each file name (the text between 'nsae.' and the
    next '.'), and has columns:
    'site', 'CO2', 'H2O', 'T', 'footprint', 'all'

    Each contains the number of valid observations for that
    quantity. The 'all' column counts rows where all quantities
    have valid values.

    Files that lack one of the expected tables (KeyError) are
    skipped and contribute no row. Returns None.
    '''

    columns = ['site', 'CO2', 'H2O', 'T', 'footprint', 'all']

    # collect plain records and build the frame once at the end, rather
    # than growing a DataFrame with concat on every iteration
    days = []
    records = []

    for f in files:

        # get the day from the file name only, so directory names
        # cannot interfere with the parse
        name = os.path.basename(f)
        day = pd.to_datetime(name.split('nsae.')[1].split('.')[0]).date()

        # open read-only; the context manager closes the store even
        # when a table is missing and KeyError is raised
        try:
            with pd.HDFStore(f, mode='r') as hdf:
                record = _count_valid_in_store(hdf, site)
        except KeyError:
            # deliberate best effort: a file without the expected
            # tables simply has no valid observations to report
            continue

        days.append(day)
        records.append(record)

    qdf = pd.DataFrame(records, index=days, columns=columns)

    # write a copy to csv
    qdf.to_csv(os.path.join(out_path, f'qfqm_counts_{site}.csv'))

    




# _____________________________________________________

# path where qfqm counts will be saved
out_path = '/media/data/NEON/all_sites'
os.makedirs(out_path, exist_ok=True)

for site in tqdm(sites):
    # directory holding the downloaded daily files for this site
    file_path = f'{data_path}/{site}/filesToStack00200'

    # list the site's HDF5 files; match the extension exactly so names such
    # as 'x.h5.gz' are not picked up, and sort because os.listdir order is
    # arbitrary and would otherwise make the csv row order vary between runs
    files = sorted(
        os.path.join(file_path, f)
        for f in os.listdir(file_path)
        if f.endswith('.h5')
    )

    # count the valid observations (writes qfqm_counts_{site}.csv to out_path)
    count_valid_observations(site, files, out_path)


 30%|███       | 13/43 [01:44<03:34,  7.16s/it]

In [94]:
# minimum number of fully valid observations a site needs to be kept
MIN_VALID_OBS = 100

# make list of the per-site count csvs in the dir where they were written;
# other csv files that may live in the same directory are ignored
files = sorted(os.path.join(out_path, f)
               for f
               in os.listdir(out_path)
               if f.startswith('qfqm_counts_') and f.endswith('.csv'))

# read all the csvs into one df; the first csv column is the day index, so
# load it as the index instead of letting it become an 'Unnamed: 0' string
# column that would be caught up in the sum below
qdf = pd.concat(pd.read_csv(f, index_col=0, parse_dates=True) for f in files)

# total the counts for each site
site_totals = qdf.groupby('site').sum()

# filter for sites with more than MIN_VALID_OBS valid observations
sums = site_totals.loc[site_totals['all'] > MIN_VALID_OBS]

sums

Unnamed: 0_level_0,CO2,H2O,T,footprint,all
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BLAN,231,228,1784,5205,180
BONA,1589,1596,1987,5066,501
DCFS,189,187,3020,5660,166
DEJU,841,800,3770,5404,715
DELA,1674,1638,573,5652,319
HARV,768,650,3956,5053,567
HEAL,2985,2854,1431,5656,875
KONA,2448,2551,1609,5706,651
LAJA,321,332,3806,4015,283
LENO,800,693,4474,5316,579


In [95]:
# total of the 'all' column over the sites kept in `sums`
sums['all'].sum()

17643

In [96]:
# number of sites kept in `sums`
len(sums)

24