In [1]:
from calendar import monthrange
from datetime import datetime
import pandas as pd
from pandas.api.types import CategoricalDtype
import os
import json

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load locale custome modules
import sys
sys.path.append(os.path.abspath("../../tools"))
from edahelpers import *

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

In [2]:
os.path.abspath("../../tools")

'C:\\Users\\mchau\\Documents\\co2-flux-hourly-gpp-modeling\\code\\src\\tools'

In [3]:
root_dir =  os.path.abspath(os.getcwd()+"..\\..\\..\\..\\..\\")
tmp_dir = root_dir + "\\.tmp"
data_dir = root_dir + "\\data\\"

site_metadata_filename = data_dir + "site-metadata.csv"

In [4]:
# "Golden" Sites
target_sites = ["US-MMS", "US-Vcp", "FR-Pue", "CH-Lae", "US-Var", "US-Ne2", "ES-LJu", "US-Ton"]

In [5]:
# Load Site data
site_metadata_df = pd.read_csv(site_metadata_filename)

# only focus on target sites
site_metadata_df= site_metadata_df.loc[site_metadata_df['site_id'].isin(target_sites)]
print(f"size:{site_metadata_df.shape}")
site_metadata_df

size:(8, 26)


Unnamed: 0,site_id,dataset,start_year,end_year,file,is_dup,IGBP,elevation,lat,long,site_name,koppen_sub,koppen_main,koppen_name,koppen_main_name,c3c4,c4_percent,filename,size,country,record_count,site_IGBP,site_koppen,start_time,end_time,recorded_day_count
67,FR-Pue,FLUXNET,2000,2014,FLX_FR-Pue_FLUXNET2015_FULLSET_MM_2000-2014_2-...,False,EBF,270.0,43.7413,3.5957,Puechabon,8,3,Csa,Temperate,C3,6.59,data_full_half_hourly_raw_v0_1_FR-Pue.csv,109116169.0,FR,245760.0,EBF,Temperate,2000-07-26 00:00:00,2014-12-31 23:30:00,5120.0
119,US-Ne2,FLUXNET,2001,2013,FLX_US-Ne2_FLUXNET2015_FULLSET_MM_2001-2013_1-...,False,CRO,362.0,41.16487,-96.4701,Mead - irrigated maize-soybean rotation site,25,4,Dfa,Cold,rotation,48.91,,,,,,,,,
127,US-Ton,FLUXNET,2001,2014,FLX_US-Ton_FLUXNET2015_FULLSET_MM_2001-2014_1-...,False,WSA,177.0,38.4316,-120.96598,Tonzi Ranch,8,3,Csa,Temperate,C3,0.0,data_full_half_hourly_raw_v0_1_US-Ton.csv,103999932.0,US,230928.0,WSA,Temperate,2001-05-24 00:00:00,2014-12-31 23:30:00,4811.0
130,US-Var,FLUXNET,2000,2014,FLX_US-Var_FLUXNET2015_FULLSET_MM_2000-2014_1-...,False,GRA,129.0,38.4133,-120.9507,Vaira Ranch- Ione,8,3,Csa,Temperate,C3,0.0,data_full_half_hourly_raw_v0_1_US-Var.csv,110098318.0,US,245712.0,GRA,Temperate,2000-11-01 00:00:00,2014-12-31 23:30:00,5119.0
181,US-MMS,AmeriFlux,1999,2017,FLX_US-MMS_FLUXNET2015_FULLSET_MM_1999-2017_be...,True,DBF,275.0,39.3232,-86.4131,Morgan Monroe State Forest,25,4,Dfa,Cold,C3,42.28,,,,,,,,,
206,US-Vcp,AmeriFlux,2007,2017,FLX_US-Vcp_FLUXNET2015_FULLSET_MM_2007-2017_be...,False,ENF,2542.0,35.8624,-106.5974,Valles Caldera Ponderosa Pine,26,4,Dfb,Cold,C3,0.04,data_full_half_hourly_raw_v0_1_US-Vcp.csv,72934242.0,US,174528.0,ENF,Cold,2007-01-01 00:00:00,2017-12-31 23:30:00,3636.0
219,CH-Lae,ICOS2020,2004,2020,FLX_CH-Lae_FLUXNET2015_FULLSET_MM_2004-2020_be...,True,MF,689.0,47.47833,8.36439,Laegern,26,4,Dfb,Cold,C3,0.0,data_full_half_hourly_raw_v0_1_CH-Lae.csv,116210397.0,CH,288384.0,MF,Cold,2004-04-09 00:00:00,2020-12-17 23:30:00,6008.0
244,ES-LJu,ICOS2020,2004,2020,FLX_ES-LJu_FLUXNET2015_FULLSET_MM_2004-2020_be...,True,OSH,1600.0,36.92659,-2.75212,Llano de los Juanes,8,3,Csa,Temperate,C3,0.0,data_full_half_hourly_raw_v0_1_ES-LJu.csv,111661016.0,ES,239616.0,OSH,Temperate,2004-05-26 00:00:00,2020-12-31 23:30:00,4992.0


# Define Features

In [6]:
all_features = ['TIMESTAMP_START', 'TIMESTAMP_END', 'TA_F', 'TA_F_QC', 'TA_ERA',
       'SW_IN_POT', 'SW_IN_F', 'SW_IN_F_QC', 'SW_IN_ERA', 'LW_IN_F',
       'LW_IN_F_QC', 'LW_IN_ERA', 'VPD_F', 'VPD_F_QC', 'VPD_ERA', 'P_F',
       'P_F_QC', 'P_ERA', 'PA_F', 'PA_F_QC', 'PA_ERA', 'NETRAD', 'PPFD_IN',
       'G_F_MDS', 'G_F_MDS_QC', 'LE_F_MDS', 'LE_F_MDS_QC', 'LE_CORR',
       'H_F_MDS', 'H_F_MDS_QC', 'H_CORR', 'NEE_VUT_REF', 'NEE_VUT_REF_QC',
       'NEE_CUT_REF', 'NEE_CUT_REF_QC', 'GPP_NT_VUT_REF', 'GPP_DT_VUT_REF',
       'GPP_NT_CUT_REF', 'GPP_DT_CUT_REF', 'RECO_NT_VUT_REF',
       'RECO_DT_VUT_REF', 'RECO_NT_CUT_REF', 'RECO_DT_CUT_REF', 'datetime',
       'year', 'month', 'day', 'hour', 'SITE_ID', 'date', 'NEE_VUT_REF_qa',
       'SW_DIF', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6',
       'b7', 'IGBP', 'koppen']
qc_flag_dtype = CategoricalDtype([0, 1, 2, 3], ordered=True)
qc_flags_features = [s for s in all_features if "_QC" in s]
qc_flags_features

['TA_F_QC',
 'SW_IN_F_QC',
 'LW_IN_F_QC',
 'VPD_F_QC',
 'P_F_QC',
 'PA_F_QC',
 'G_F_MDS_QC',
 'LE_F_MDS_QC',
 'H_F_MDS_QC',
 'NEE_VUT_REF_QC',
 'NEE_CUT_REF_QC']

# Process Single File

In [64]:
r = site_metadata_df[['site_id','filename']].iloc[-1]
print(r.filename)
local_filename = tmp_dir + "\\" + r.filename

data_full_half_hourly_raw_v0_1_ES-LJu.csv


In [65]:
site_df = pd.read_csv(local_filename)

In [66]:
#features = ['SITE_ID','TIMESTAMP_START', 'TIMESTAMP_END', 'datetime', 'date', 'year', 'month', 'day', 'hour',"GPP_NT_VUT_REF"]
#site_df = pd.read_csv(local_filename, usecols=features + qc_glags_features)
site_df['datetime'] = pd.to_datetime(site_df['datetime'])
site_df['date'] = pd.to_datetime(site_df['date'])
site_df[qc_flags_features] = site_df[qc_flags_features].astype(qc_flag_dtype)
print(f"size:{site_df.shape}")
site_df.head()

size:(239616, 64)


Unnamed: 0,TIMESTAMP_START,TIMESTAMP_END,TA_F,TA_F_QC,TA_ERA,SW_IN_POT,SW_IN_F,SW_IN_F_QC,SW_IN_ERA,LW_IN_F,LW_IN_F_QC,LW_IN_ERA,VPD_F,VPD_F_QC,VPD_ERA,P_F,P_F_QC,P_ERA,PA_F,PA_F_QC,PA_ERA,NETRAD,PPFD_IN,G_F_MDS,G_F_MDS_QC,LE_F_MDS,LE_F_MDS_QC,LE_CORR,H_F_MDS,H_F_MDS_QC,H_CORR,NEE_VUT_REF,NEE_VUT_REF_QC,NEE_CUT_REF,NEE_CUT_REF_QC,GPP_NT_VUT_REF,GPP_DT_VUT_REF,GPP_NT_CUT_REF,GPP_DT_CUT_REF,RECO_NT_VUT_REF,RECO_DT_VUT_REF,RECO_NT_CUT_REF,RECO_DT_CUT_REF,datetime,year,month,day,hour,SITE_ID,date,NEE_VUT_REF_qa,SW_DIF,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen
0,200405260000,200405260030,7.856,0,8.325,0.0,0.0,0,0.0,314.215,2,314.215,0.0,0,3.144,0.194,2,0.194,84.186,2,84.186,-32.207,0.0,-17.4639,0,-0.130344,1,-0.181093,-1.57822,1,-2.19269,1.24478,1,0.877808,2,,0.0,-0.224026,0.0,,2.99961,0.670451,1.08157,2004-05-26 00:00:00,2004,5,26,0,ES-LJu,2004-05-26,1.24478,,0.179995,0.326648,0.071307,0.1108,0.2183,0.052,0.0884,0.262,0.2695,0.1945,OSH,Temperate
1,200405260030,200405260100,7.65,0,8.18,0.0,0.0,0,0.0,314.215,2,314.215,0.0,0,3.043,0.194,2,0.194,84.173,2,84.173,-30.418,0.0,-22.0652,0,-0.130344,1,-0.181093,-1.57822,1,-2.19269,1.24478,1,0.877808,2,,0.0,-0.229381,0.0,,2.98894,0.665096,1.05119,2004-05-26 00:30:00,2004,5,26,0,ES-LJu,2004-05-26,1.24478,,0.179995,0.326648,0.071307,0.1108,0.2183,0.052,0.0884,0.262,0.2695,0.1945,OSH,Temperate
2,200405260100,200405260130,7.672,0,8.035,0.0,0.0,0,0.0,328.276,2,328.276,0.0,0,2.942,0.09,2,0.09,84.16,2,84.16,-19.96,0.0,-16.2372,0,-0.130344,1,-0.181093,-1.57822,1,-2.19269,1.24478,1,0.877808,2,,0.0,-0.228809,0.0,,2.99008,0.665668,1.0544,2004-05-26 01:00:00,2004,5,26,1,ES-LJu,2004-05-26,1.24478,,0.179995,0.326648,0.071307,0.1108,0.2183,0.052,0.0884,0.262,0.2695,0.1945,OSH,Temperate
3,200405260130,200405260200,7.793,0,8.3,0.0,0.0,0,0.0,328.276,2,328.276,0.0,0,3.064,0.09,2,0.09,84.137,2,84.137,-25.722,0.0,-14.2753,0,-0.130344,1,-0.181093,-1.57822,1,-2.19269,1.24478,1,0.877808,2,,0.0,-0.225664,0.0,,2.99635,0.668813,1.07221,2004-05-26 01:30:00,2004,5,26,1,ES-LJu,2004-05-26,1.24478,,0.179995,0.326648,0.071307,0.1108,0.2183,0.052,0.0884,0.262,0.2695,0.1945,OSH,Temperate
4,200405260200,200405260230,7.638,0,8.565,0.0,0.0,0,0.0,339.403,2,339.403,0.0,0,3.185,0.047,2,0.047,84.113,2,84.113,-48.71,0.0,-23.5167,0,-0.130344,1,-0.181093,-1.57822,1,-2.19269,1.24478,1,0.877808,2,,0.0,-0.229693,0.0,,2.98832,0.664785,1.04944,2004-05-26 02:00:00,2004,5,26,2,ES-LJu,2004-05-26,1.24478,,0.179995,0.326648,0.071307,0.1108,0.2183,0.052,0.0884,0.262,0.2695,0.1945,OSH,Temperate


In [67]:
site_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239616 entries, 0 to 239615
Data columns (total 64 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   TIMESTAMP_START  239616 non-null  int64         
 1   TIMESTAMP_END    239616 non-null  int64         
 2   TA_F             239616 non-null  float64       
 3   TA_F_QC          239616 non-null  category      
 4   TA_ERA           239616 non-null  float64       
 5   SW_IN_POT        239616 non-null  float64       
 6   SW_IN_F          239616 non-null  float64       
 7   SW_IN_F_QC       239616 non-null  category      
 8   SW_IN_ERA        239616 non-null  float64       
 9   LW_IN_F          239616 non-null  float64       
 10  LW_IN_F_QC       239616 non-null  category      
 11  LW_IN_ERA        239616 non-null  float64       
 12  VPD_F            239616 non-null  float64       
 13  VPD_F_QC         239616 non-null  category      
 14  VPD_ERA          239

In [68]:
site_df[qc_flags_features].describe()

Unnamed: 0,TA_F_QC,SW_IN_F_QC,LW_IN_F_QC,VPD_F_QC,P_F_QC,PA_F_QC,G_F_MDS_QC,LE_F_MDS_QC,H_F_MDS_QC,NEE_VUT_REF_QC,NEE_CUT_REF_QC
count,239616,239616,239616,239616,239616,239616,239616,239616,239616,239616,239616
unique,3,3,1,3,2,2,4,4,4,4,4
top,0,0,2,0,0,0,0,0,0,1,1
freq,234618,230126,239616,227728,209448,229143,233686,199073,217577,125237,121605


In [69]:
display(site_df[qc_flags_features].apply(pd.value_counts))

Unnamed: 0,TA_F_QC,SW_IN_F_QC,LW_IN_F_QC,VPD_F_QC,P_F_QC,PA_F_QC,G_F_MDS_QC,LE_F_MDS_QC,H_F_MDS_QC,NEE_VUT_REF_QC,NEE_CUT_REF_QC
0,234618,230126,0,227728,209448,229143,233686,199073,217577,105053,108549
1,3356,1421,0,7392,0,0,3443,37078,18929,125237,121605
2,1642,8069,239616,4496,30168,10473,1457,2609,2273,7594,7291
3,0,0,0,0,0,0,1030,856,837,1732,2171


# Process All Files

In [47]:
tmp_df = qc_flags_count_df
tmp_df.columns = tmp_df.columns.astype(str)
tmp_df['site_id'] = r.site_id
tmp_df

Unnamed: 0,0,1,2,3,site_id
TA_F_QC,244413,656,691,0,FR-Pue
SW_IN_F_QC,243991,887,882,0,FR-Pue
LW_IN_F_QC,157188,2650,85922,0,FR-Pue
VPD_F_QC,244036,1031,693,0,FR-Pue
P_F_QC,245066,0,694,0,FR-Pue
PA_F_QC,167305,0,78455,0,FR-Pue
G_F_MDS_QC,118776,2590,5147,16981,FR-Pue
LE_F_MDS_QC,200474,42644,2472,170,FR-Pue
H_F_MDS_QC,174992,68291,2272,205,FR-Pue
NEE_VUT_REF_QC,104186,129287,11770,517,FR-Pue


In [86]:
qc_df = None
for i, r in site_metadata_df[['site_id','filename']].iterrows():
    if not r.filename or type(r.filename) != type(""):
        print(f'\nERROR: {r.site_id} is mssing hourly data.')
        continue

    local_filename = tmp_dir + "\\" + r.filename
    site_df = pd.read_csv(local_filename)
    site_df['datetime'] = pd.to_datetime(site_df['datetime'])
    site_df['date'] = pd.to_datetime(site_df['date'])
    site_df[qc_flags_features] = site_df[qc_flags_features].astype(qc_flag_dtype)
    
    print(f"\n{r.site_id}")
#     if site_df.isna().sum().sum() != 0:
#         print(f'{site_df.isna().sum()}\n')
    
    qc_flags_count_df = site_df[qc_flags_features].apply(pd.value_counts).T
    qc_flags_count_df.columns = qc_flags_count_df.columns.astype(str)
    qc_flags_count_df['site_id'] = r.site_id
    qc_flags_count_df.reset_index(inplace=True)
    qc_flags_count_df.rename(columns={"index": "data_type"}, inplace=True)
    display(qc_flags_count_df)
    
    
    if type(qc_df) == type(None):
        qc_df = qc_flags_count_df
    else:
        qc_df = pd.concat([qc_df, qc_flags_count_df])
    
    #qc_flags_count_df.T.plot.bar(stacked=True, title = f'{r.site_id}: Data QC Distributions')


FR-Pue


Unnamed: 0,data_type,0,1,2,3,site_id
0,TA_F_QC,244413,656,691,0,FR-Pue
1,SW_IN_F_QC,243991,887,882,0,FR-Pue
2,LW_IN_F_QC,157188,2650,85922,0,FR-Pue
3,VPD_F_QC,244036,1031,693,0,FR-Pue
4,P_F_QC,245066,0,694,0,FR-Pue
5,PA_F_QC,167305,0,78455,0,FR-Pue
6,G_F_MDS_QC,118776,2590,5147,16981,FR-Pue
7,LE_F_MDS_QC,200474,42644,2472,170,FR-Pue
8,H_F_MDS_QC,174992,68291,2272,205,FR-Pue
9,NEE_VUT_REF_QC,104186,129287,11770,517,FR-Pue



ERROR: US-Ne2 is mssing hourly data.

US-Ton


Unnamed: 0,data_type,0,1,2,3,site_id
0,TA_F_QC,228390,2357,181,0,US-Ton
1,SW_IN_F_QC,230048,695,185,0,US-Ton
2,LW_IN_F_QC,0,0,230928,0,US-Ton
3,VPD_F_QC,230264,636,28,0,US-Ton
4,P_F_QC,230621,0,307,0,US-Ton
5,PA_F_QC,230211,0,717,0,US-Ton
6,G_F_MDS_QC,227740,2779,327,82,US-Ton
7,LE_F_MDS_QC,166059,59897,4944,28,US-Ton
8,H_F_MDS_QC,177417,49063,4424,24,US-Ton
9,NEE_VUT_REF_QC,65566,149202,15939,221,US-Ton



US-Var


Unnamed: 0,data_type,0,1,2,3,site_id
0,TA_F_QC,242346,3081,285,0,US-Var
1,SW_IN_F_QC,244314,750,648,0,US-Var
2,LW_IN_F_QC,177806,5255,62651,0,US-Var
3,VPD_F_QC,242784,2667,261,0,US-Var
4,P_F_QC,244960,0,752,0,US-Var
5,PA_F_QC,239952,0,5760,0,US-Var
6,G_F_MDS_QC,236712,6564,2260,176,US-Var
7,LE_F_MDS_QC,208189,35631,1829,63,US-Var
8,H_F_MDS_QC,226061,18463,1126,62,US-Var
9,NEE_VUT_REF_QC,103239,130944,11371,158,US-Var



ERROR: US-MMS is mssing hourly data.

US-Vcp


Unnamed: 0,data_type,0,1,2,3,site_id
0,TA_F_QC,164905,8610,1013,0,US-Vcp
1,SW_IN_F_QC,171379,1580,1569,0,US-Vcp
2,LW_IN_F_QC,0,0,174528,0,US-Vcp
3,VPD_F_QC,149022,9745,15761,0,US-Vcp
4,P_F_QC,115066,0,59462,0,US-Vcp
5,PA_F_QC,169772,0,4756,0,US-Vcp
6,G_F_MDS_QC,0,0,0,0,US-Vcp
7,LE_F_MDS_QC,0,0,0,0,US-Vcp
8,H_F_MDS_QC,0,0,0,0,US-Vcp
9,NEE_VUT_REF_QC,70948,95952,6987,641,US-Vcp



CH-Lae


Unnamed: 0,data_type,0,1,2,3,site_id
0,TA_F_QC,278102,1258,9024,0,CH-Lae
1,SW_IN_F_QC,277629,1173,9561,21,CH-Lae
2,LW_IN_F_QC,260919,1497,25968,0,CH-Lae
3,VPD_F_QC,278014,1346,9024,0,CH-Lae
4,P_F_QC,272208,0,0,0,CH-Lae
5,PA_F_QC,245053,0,27155,0,CH-Lae
6,G_F_MDS_QC,0,0,0,0,CH-Lae
7,LE_F_MDS_QC,182597,102424,3087,276,CH-Lae
8,H_F_MDS_QC,226736,60632,994,22,CH-Lae
9,NEE_VUT_REF_QC,89467,183615,14415,887,CH-Lae



ES-LJu


Unnamed: 0,data_type,0,1,2,3,site_id
0,TA_F_QC,234618,3356,1642,0,ES-LJu
1,SW_IN_F_QC,230126,1421,8069,0,ES-LJu
2,LW_IN_F_QC,0,0,239616,0,ES-LJu
3,VPD_F_QC,227728,7392,4496,0,ES-LJu
4,P_F_QC,209448,0,30168,0,ES-LJu
5,PA_F_QC,229143,0,10473,0,ES-LJu
6,G_F_MDS_QC,233686,3443,1457,1030,ES-LJu
7,LE_F_MDS_QC,199073,37078,2609,856,ES-LJu
8,H_F_MDS_QC,217577,18929,2273,837,ES-LJu
9,NEE_VUT_REF_QC,105053,125237,7594,1732,ES-LJu


In [87]:
qc_df.to_csv(data_dir+"golden_sites_qc_count.csv", index=False)

# Plot Data QC Flag Distributions

In [7]:
qc_df = pd.read_csv(data_dir+"golden_sites_qc_count.csv")

In [60]:
# stacked and grouped barchart
# ref: https://dev.to/fronkan/stacked-and-grouped-bar-charts-using-plotly-python-a4p
# ref: https://community.plotly.com/t/grouped-stacked-bar-chart/60805/5

features = ["0","1","2","3"]
features_labels = ["0(measured)","1","2(medium)","3(bad)"]
index = ["US-MMS", "US-Vcp", "FR-Pue", "CH-Lae", "US-Var", "US-Ne2", "ES-LJu", "US-Ton"]
total_offset = len(index) + 1
fig_data = []

hover_options = {
    "0": "%{x}<br>\"0\":%{y}",
    "1": "%{x}<br>\"1\":%{y}",
    "2": "%{x}<br>\"2\":%{y}",
    "3": "%{x}<br>\"3\":%{y}"
}

color_options = {
    "0": "#3DB14E",
    "1": "#007B93",
    "2": "#FFC107",
    "3": "#FF4500"
}

# Add the traces
for i, s in enumerate(index):
    site_df = qc_df[qc_df['site_id'] == s]
    for j, q in enumerate(features):
        bar_plot = go.Bar(
            x = site_df["data_type"], y=site_df[q],
            name = features_labels[j],
            offsetgroup = i,
            hovertemplate = hover_options[q],
            marker_color = color_options[q],
            showlegend = False,
        )
        
        if j > 0: # Move vertically
            bar_plot.base = site_df[features[:j]].sum(axis=1)
        
        if j == len(stacked_features)-1:
            bar_plot.text = s
            bar_plot.textposition = "outside"
            bar_plot.textangle = -60
            bar_plot.textfont = dict(size = 100)
        
        if i == 1:
            bar_plot.showlegend = True
        
        fig_data.append(bar_plot)

fig = go.Figure(
    data=fig_data,
    layout=go.Layout(
        height = 600,
        width = 1000,
        legend_title_text= "QC Flag",
        title="Half Hourly Data QC Distributions",
        xaxis_title="Data Type",
        yaxis_title="Record Count",
        legend_orientation="h",
        legend_yanchor="top",
        legend_x=0,
        legend_y=1.1,
    )  
)

fig.show()

# Export Fig
#fig.write_html(data_dir + "\\figures\\GoldSitesQCDistribution.html")

In [9]:
fig = px.bar(qc_df, 
             x="data_type", y=["0","1","2","3"],
             barmode="stack",
             title=f'Half Hourly Data QC Distributions')
fig.show()