In [19]:
from matplotlib.pyplot import thetagrids
import h5py
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
site = 'TEAK'
file_path = f'/media/data/NEON/{site}/filesToStack00200'
files = [os.path.join(file_path, f) for f in os.listdir(file_path) if '.h5' in f]

Below we define some functions for later use.  The docstring describe what they do.  The function `find_sectors()` will be used to evenly sample from different wind directions. 

In [54]:
def find_valid_observations(f):
    '''
    Reads footprint statistics from f,
    drops bad observations,
    then returns dataframe.
    '''
    # open the hdf
    hdf = pd.HDFStore(f)

    # get the flux quality flags
    qfqm_CO2 = hdf.get(f'{site}/dp04/qfqm/fluxCo2/nsae')
    qfqm_H2O = hdf.get(f'{site}/dp04/qfqm/fluxH2o/nsae')
    qfqm_T = hdf.get(f'{site}/dp04/qfqm/fluxTemp/nsae')
    qfqm_foot = hdf.get(f'{site}/dp04/qfqm/foot/turb')

    # Select observations with no bad flags
    qfqm_CO2  = qfqm_CO2.loc[qfqm_CO2.qfFinl == 0]
    qfqm_H2O  = qfqm_H2O.loc[qfqm_H2O.qfFinl == 0]
    qfqm_T    = qfqm_T.loc[qfqm_T.qfFinl == 0]
    qfqm_foot = qfqm_foot.loc[qfqm_foot.qfFinl == 0]

    # get the footprint input stats
    stat = hdf.get(f'{site}/dp04/data/foot/stat/')

    # get indices of the dfs from above
    istat  = stat.set_index('timeBgn').index
    iqfqmC = qfqm_CO2.set_index('timeBgn').index
    iqfqmH = qfqm_H2O.set_index('timeBgn').index
    iqfqmT = qfqm_T.set_index('timeBgn').index
    iqfqmf = qfqm_foot.set_index('timeBgn').index

    # keep only entries in stat which correspond to good
    # qfqm flags for all variables
    good = stat[
        (istat.isin(iqfqmC)) &
        (istat.isin(iqfqmH)) &
        (istat.isin(iqfqmT)) &
        (istat.isin(iqfqmf))
    ]

    hdf.close()

    return good


def find_sectors(stat, theta=10):
    '''
    Returns a df of timestamps and sectors of the mean wind direction
    '''

    # make sure theta goes into 360 an even number of times
    if 360 % theta != 0:
        while 360 % theta != 0:
            theta= theta + 1
        print(f'theta has been forced to {theta} for even division of 360')

    # set start angle, and empy list
    stat['sector'] = theta * (stat.angZaxsErth // theta)

    return stat[['timeBgn', 'sector']]


    

In [110]:
# degrees per sector
θ = 18

# make empty df
sectors = pd.DataFrame(columns=['timeBgn', 'sector'])

# fill df with timestamps and sectors of valid observations
for f in files:
    # find footprint stats of the valid observations
    stat = find_valid_observations(f)

    # find sectors in which observations lie
    sects = find_sectors(stat, theta=θ)

    # scrunch the latest observations onto the df
    sectors = pd.concat([sectors, sects], axis=0)

In [100]:
# look at some of smallest sector counts
sectors.sector.value_counts().sort_values().head(10)

18.0      9
72.0     14
108.0    16
90.0     17
0.0      21
36.0     22
54.0     23
342.0    23
126.0    32
324.0    38
Name: sector, dtype: int64

Looking at the above code we can see that there is descent representation across sectors. Let's choose 30 as a sample size from each sector. There will be three underrepresented sectors, but they will not be hugely underrepresented.

In [7]:
# sample size
samp_size = 10

def get_data_by_sector(files, sectors, theta, sample_size):
    '''sectors is a list or 1d array of sectors'''
    
    for sector in sectors:

        sect_dfs = []

        for f in files:
                # open the hdf
            hdf = pd.HDFStore(f)

            # get the flux quality flags
            qfqm_CO2 = hdf.get(f'{site}/dp04/qfqm/fluxCo2/nsae')
            qfqm_H2O = hdf.get(f'{site}/dp04/qfqm/fluxH2o/nsae')
            qfqm_T = hdf.get(f'{site}/dp04/qfqm/fluxTemp/nsae')
            qfqm_foot = hdf.get(f'{site}/dp04/qfqm/foot/turb')

            # Select observations with no bad flags
            qfqm_CO2  = qfqm_CO2.loc[qfqm_CO2.qfFinl == 0]
            qfqm_H2O  = qfqm_H2O.loc[qfqm_H2O.qfFinl == 0]
            qfqm_T    = qfqm_T.loc[qfqm_T.qfFinl == 0]
            qfqm_foot = qfqm_foot.loc[qfqm_foot.qfFinl == 0]

            # get the footprint input stats
            stat = hdf.get(f'{site}/dp04/data/foot/stat/')

            # get indices of the dfs from above
            istat  = stat.set_index('timeBgn').index
            iqfqmC = qfqm_CO2.set_index('timeBgn').index
            iqfqmH = qfqm_H2O.set_index('timeBgn').index
            iqfqmT = qfqm_T.set_index('timeBgn').index
            iqfqmf = qfqm_foot.set_index('timeBgn').index

            # keep only entries in stat which correspond to good
            # qfqm flags for all variables
            good = stat[
                (istat.isin(iqfqmC)) &
                (istat.isin(iqfqmH)) &
                (istat.isin(iqfqmT)) &
                (istat.isin(iqfqmf))
            ]

            # find the sectors that exist in this df
            sects = find_sectors(good, theta=theta).set_index('timeBgn')

            # if sector is not represented here, break out of file loop
            if sector not in sects.sector.values:
                break

            # get fluxes
            flux_CO2 = hdf.get(f'{site}/dp04/data/fluxCo2/nsae')
            flux_H2O = hdf.get(f'{site}/dp04/data/fluxH2o/nsae')
            flux_T   = hdf.get(f'{site}/dp04/data/fluxTemp/nsae')

            # change names of flux columns
            flux_CO2.columns = ['timeBgn', 'timeEnd', 'flux_CO2']
            flux_H2O.columns = ['timeBgn', 'timeEnd', 'flux_H2O']
            flux_T.columns = ['timeBgn', 'timeEnd', 'flux_Temp']

            # drop timeEnd, set index to timeBgn
            flux_CO2 = flux_CO2.drop(columns=['timeEnd']).set_index('timeBgn')
            flux_H2O = flux_H2O.drop(columns=['timeEnd']).set_index('timeBgn')
            flux_T = flux_T.drop(columns=['timeEnd']).set_index('timeBgn')

            # merge dfs
            dfs = [good, flux_CO2, flux_H2O, flux_T]
            data = pd.concat(dfs, axis=1, join='inner')

            # add sectors column to data
            data = pd.concat([data, sects], axis=1, join='inner')

            # group the data by sector
            sect_group = data.groupby('sector')

            # get data for sector
            df = sect_group.get_group(sector)

            # stick it on the list
            sect_dfs.append(df)

        # concat data from all files pertaining to sector
        df = pd.concat(sect_dfs)
        
        # now cluster and draw equaly from clusters

NameError: name 'hdf' is not defined

In [108]:
np.min([1,2])

1

In [45]:
h = h5py.File(files[0])

In [47]:
h['TEAK']['dp04']['qfqm']['fluxH2o']['nsae'].keys()

AttributeError: 'Dataset' object has no attribute 'keys'

In [50]:
h.close()

In [64]:
with pd.HDFStore(f) as hdf:
    x = hdf.get(f'{site}/dp04/data/fluxTemp/nsae')
    print(x.head())


                    timeBgn                   timeEnd       flux
0  2019-09-18T00:00:00.000Z  2019-09-18T00:29:59.000Z  67.947599
1  2019-09-18T00:30:00.000Z  2019-09-18T00:59:59.000Z  48.981145
2  2019-09-18T01:00:00.000Z  2019-09-18T01:29:59.000Z -25.885673
3  2019-09-18T01:30:00.000Z  2019-09-18T01:59:59.000Z -17.070132
4  2019-09-18T02:00:00.000Z  2019-09-18T02:29:59.000Z -27.002853


In [111]:
sectorz = find_sectors(stat, theta=18).set_index('timeBgn')
stat = stat.set_index('timeBgn')
dzf = pd.concat([stat, sectorz], axis=1, join='inner')

In [112]:
dzf.sample(len(dzf))

Unnamed: 0_level_0,timeEnd,angZaxsErth,distReso,veloYaxsHorSd,veloZaxsHorSd,veloFric,distZaxsMeasDisp,distZaxsRgh,distZaxsAbl,distXaxs90,distXaxsMax,distYaxs90,sector,sector
timeBgn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-09-18T22:30:00.000Z,2019-09-18T22:59:59.950Z,180.326083,40.0,0.907912,0.680906,0.490375,42.41,2.381215,1000.0,680.0,280.0,160.0,180.0,180.0
2019-09-18T16:00:00.000Z,2019-09-18T16:29:59.950Z,183.789341,40.0,0.36587,0.508242,0.205687,42.41,2.9905,1000.0,400.0,200.0,80.0,180.0,180.0
2019-09-18T21:30:00.000Z,2019-09-18T21:59:59.950Z,182.927345,40.0,1.147007,0.795618,0.483812,42.41,0.766017,1000.0,840.0,360.0,160.0,180.0,180.0
2019-09-18T20:00:00.000Z,2019-09-18T20:29:59.950Z,176.832765,40.0,1.242696,1.021844,0.954721,42.41,4.241,1000.0,640.0,280.0,200.0,162.0,162.0
2019-09-18T19:30:00.000Z,2019-09-18T19:59:59.950Z,171.001467,40.0,1.324906,1.009159,0.962321,42.41,4.241,1000.0,640.0,280.0,200.0,162.0,162.0
2019-09-18T22:00:00.000Z,2019-09-18T22:29:59.950Z,191.161758,40.0,0.941799,0.654411,0.499504,42.41,4.241,1000.0,560.0,240.0,200.0,180.0,180.0
2019-09-18T19:00:00.000Z,2019-09-18T19:29:59.950Z,159.43725,40.0,1.057959,0.907864,0.802859,42.41,2.44563,1000.0,760.0,320.0,160.0,144.0,144.0
2019-09-18T14:00:00.000Z,2019-09-18T14:29:59.950Z,289.882401,40.0,0.302366,0.23,0.2,42.41,4.241,1000.0,600.0,280.0,200.0,288.0,288.0
2019-09-18T21:00:00.000Z,2019-09-18T21:29:59.950Z,185.103548,40.0,1.063621,0.890364,0.723416,42.41,3.546116,1000.0,640.0,280.0,160.0,180.0,180.0
2019-09-18T23:00:00.000Z,2019-09-18T23:29:59.950Z,167.296886,40.0,0.948387,0.792969,0.669109,42.41,4.241,1000.0,600.0,280.0,200.0,162.0,162.0


In [75]:
sect_group.get_group(sects[i]).sample()

pandas.core.frame.DataFrame

In [82]:

i = 0

(sect_group.get_group(sects[i])
          .sample(samp_size)
          )
        

Unnamed: 0,timeBgn,sector
33,2019-10-11T16:30:00.000Z,180.0
37,2019-06-29T18:30:00.000Z,180.0
44,2019-06-30T22:00:00.000Z,180.0
45,2019-10-17T22:30:00.000Z,180.0
43,2019-06-29T21:30:00.000Z,180.0
46,2019-07-08T23:00:00.000Z,180.0
3,2019-08-27T01:30:00.000Z,180.0
41,2019-09-16T20:30:00.000Z,180.0
6,2019-07-06T03:00:00.000Z,180.0
43,2019-07-07T21:30:00.000Z,180.0


In [81]:
timestamps

['2019-10-09T15:00:00.000Z',
 '2019-09-24T21:00:00.000Z',
 '2019-06-29T19:00:00.000Z',
 '2019-10-24T23:00:00.000Z',
 '2019-09-20T17:30:00.000Z',
 '2019-09-15T20:00:00.000Z',
 '2019-10-02T23:00:00.000Z',
 '2019-08-30T22:00:00.000Z',
 '2019-10-25T17:30:00.000Z',
 '2019-09-29T01:00:00.000Z']