In [None]:
from calendar import monthrange
from datetime import datetime
import pandas as pd
from pandas.api.types import CategoricalDtype
import os
import json

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load local custom modules
import sys
# Make the shared ../../tools folder importable; the path is resolved against
# the current working directory, so the notebook must be run from its own folder.
sys.path.append(os.path.abspath("../../tools"))
# NOTE(review): the wildcard import hides which names this notebook takes from
# data_pipeline_lib -- confirm what is actually needed and import it explicitly.
from data_pipeline_lib import *

# Let pandas render up to 500 rows and every column when displaying frames.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

In [None]:
# Resolve the project root relative to the notebook's working directory.
# The previous string concatenation (cwd + "..\\..\\..\\..\\..\\") fused the first
# ".." onto the last component of cwd, so on Windows it effectively climbed FOUR
# levels. That depth is kept here, but built with os.path.join so it no longer
# depends on backslash separators.
root_dir = os.path.abspath(os.path.join(os.getcwd(), "..", "..", "..", ".."))
tmp_dir = os.path.join(root_dir, ".tmp")
# Keep the trailing separator: other cells build paths as `data_dir + "<file>"`.
data_dir = os.path.join(root_dir, "data") + os.sep

site_metadata_filename = os.path.join(data_dir, "site-metadata.csv")

In [None]:
# "Golden" Sites
# First tier of sites.
tier1_sites = [
    "US-MMS", "US-Vcp", "FR-Pue", "CH-Lae",
    "US-Var", "US-Ne2", "ES-LJu", "US-Ton",
]
# Second tier of sites.
tier2_sites = [
    "US-UMB", "US-Me2", "FI-Hyy", "US-NR1",
    "IT-Lav", "US-Wkg", "US-ARM", "US-SRM",
]

# Every site analysed in this notebook: tier 1 first, then tier 2.
target_sites = [*tier1_sites, *tier2_sites]
target_sites

['US-MMS',
 'US-Vcp',
 'FR-Pue',
 'CH-Lae',
 'US-Var',
 'US-Ne2',
 'ES-LJu',
 'US-Ton',
 'US-UMB',
 'US-Me2',
 'FI-Hyy',
 'US-NR1',
 'IT-Lav',
 'US-Wkg',
 'US-ARM',
 'US-SRM']

In [None]:
# Read the site metadata table, keeping only the site id and data file name.
site_metadata_df = pd.read_csv(
    site_metadata_filename,
    usecols=['site_id', 'filename'],
)

# Keep just the rows that belong to one of the target sites.
site_metadata_df = site_metadata_df.loc[
    lambda frame: frame['site_id'].isin(target_sites)
]
print(f"size:{site_metadata_df.shape}")
site_metadata_df

size:(16, 26)


Unnamed: 0,site_id,dataset,start_year,end_year,file,is_dup,IGBP,elevation,lat,long,site_name,koppen_sub,koppen_main,koppen_name,koppen_main_name,c3c4,c4_percent,filename,size,country,record_count,site_IGBP,site_koppen,start_time,end_time,recorded_day_count
67,FR-Pue,FLUXNET,2000,2014,FLX_FR-Pue_FLUXNET2015_FULLSET_MM_2000-2014_2-...,False,EBF,270.0,43.7413,3.5957,Puechabon,8,3,Csa,Temperate,C3,6.59,data_full_half_hourly_raw_v0_1_FR-Pue.csv,109116169.0,FR,245760.0,EBF,Temperate,2000-07-26 00:00:00,2014-12-31 23:30:00,5120.0
117,US-NR1,FLUXNET,1998,2014,FLX_US-NR1_FLUXNET2015_FULLSET_MM_1998-2014_1-...,False,ENF,3050.0,40.0329,-105.5464,Niwot Ridge Forest (LTER NWT1),27,4,Dfc,Cold,C3,0.35,data_full_half_hourly_raw_v0_1_US-NR1.csv,112936778.0,US,270768.0,ENF,Cold,1999-05-02 00:00:00,2014-12-31 23:30:00,5641.0
119,US-Ne2,FLUXNET,2001,2013,FLX_US-Ne2_FLUXNET2015_FULLSET_MM_2001-2013_1-...,False,CRO,362.0,41.16487,-96.4701,Mead - irrigated maize-soybean rotation site,25,4,Dfa,Cold,rotation,48.91,,,,,,,,,
124,US-SRM,FLUXNET,2004,2014,FLX_US-SRM_FLUXNET2015_FULLSET_MM_2004-2014_1-...,False,WSA,1120.0,31.8214,-110.8661,Santa Rita Mesquite,6,2,BSh,Arid,C3,55.39,data_full_half_hourly_raw_v0_1_US-SRM.csv,85092857.0,US,190752.0,WSA,Arid,2004-01-01 00:00:00,2014-12-31 23:30:00,3974.0
127,US-Ton,FLUXNET,2001,2014,FLX_US-Ton_FLUXNET2015_FULLSET_MM_2001-2014_1-...,False,WSA,177.0,38.4316,-120.96598,Tonzi Ranch,8,3,Csa,Temperate,C3,0.0,data_full_half_hourly_raw_v0_1_US-Ton.csv,103999932.0,US,230928.0,WSA,Temperate,2001-05-24 00:00:00,2014-12-31 23:30:00,4811.0
130,US-Var,FLUXNET,2000,2014,FLX_US-Var_FLUXNET2015_FULLSET_MM_2000-2014_1-...,False,GRA,129.0,38.4133,-120.9507,Vaira Ranch- Ione,8,3,Csa,Temperate,C3,0.0,data_full_half_hourly_raw_v0_1_US-Var.csv,110098318.0,US,245712.0,GRA,Temperate,2000-11-01 00:00:00,2014-12-31 23:30:00,5119.0
144,US-Wkg,FLUXNET,2004,2014,FLX_US-Wkg_FLUXNET2015_FULLSET_MM_2004-2014_1-...,False,GRA,1531.0,31.7365,-109.9419,Walnut Gulch Kendall Grasslands,7,2,BSk,Arid,C4,40.93,data_full_half_hourly_raw_v0_1_US-Wkg.csv,83565537.0,US,186768.0,GRA,Arid,2004-05-07 00:00:00,2014-12-31 23:30:00,3891.0
166,US-ARM,AmeriFlux,2003,2018,FLX_US-ARM_FLUXNET2015_FULLSET_MM_2003-2018_be...,True,CRO,314.0,36.6058,-97.4888,ARM Southern Great Plains site- Lamont,14,3,Cfa,Temperate,mix,15.97,data_full_half_hourly_raw_v0_1_US-ARM.csv,117878083.0,US,259104.0,CRO,Temperate,2003-01-01 00:00:00,2018-12-31 23:30:00,5398.0
181,US-MMS,AmeriFlux,1999,2017,FLX_US-MMS_FLUXNET2015_FULLSET_MM_1999-2017_be...,True,DBF,275.0,39.3232,-86.4131,Morgan Monroe State Forest,25,4,Dfa,Cold,C3,42.28,,,,,,,,,
182,US-Me2,AmeriFlux,2002,2017,FLX_US-Me2_FLUXNET2015_FULLSET_MM_2002-2017_be...,True,ENF,1253.0,44.4523,-121.5574,Metolius mature ponderosa pine,18,4,Dsb,Cold,C3,0.03,data_full_half_hourly_raw_v0_1_US-Me2.csv,97235990.0,US,230688.0,ENF,Cold,2002-01-01 00:00:00,2017-12-31 23:30:00,4806.0


# Define Features

In [None]:
# Column names of the per-site data files used in this notebook.
all_features = [
    'TIMESTAMP_START', 'TIMESTAMP_END',
    'TA_F', 'TA_F_QC', 'TA_ERA',
    'SW_IN_POT', 'SW_IN_F', 'SW_IN_F_QC', 'SW_IN_ERA',
    'LW_IN_F', 'LW_IN_F_QC', 'LW_IN_ERA',
    'VPD_F', 'VPD_F_QC', 'VPD_ERA',
    'P_F', 'P_F_QC', 'P_ERA',
    'PA_F', 'PA_F_QC', 'PA_ERA',
    'NETRAD', 'PPFD_IN',
    'G_F_MDS', 'G_F_MDS_QC',
    'LE_F_MDS', 'LE_F_MDS_QC', 'LE_CORR',
    'H_F_MDS', 'H_F_MDS_QC', 'H_CORR',
    'NEE_VUT_REF', 'NEE_VUT_REF_QC', 'NEE_CUT_REF', 'NEE_CUT_REF_QC',
    'GPP_NT_VUT_REF', 'GPP_DT_VUT_REF', 'GPP_NT_CUT_REF', 'GPP_DT_CUT_REF',
    'RECO_NT_VUT_REF', 'RECO_DT_VUT_REF', 'RECO_NT_CUT_REF', 'RECO_DT_CUT_REF',
    'datetime', 'year', 'month', 'day', 'hour', 'SITE_ID', 'date',
    'NEE_VUT_REF_qa', 'SW_DIF',
    'EVI', 'NDVI', 'NIRv',
    'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
    'IGBP', 'koppen',
]
# QC flags are stored as an ordered categorical over the codes 0..3.
qc_flag_dtype = CategoricalDtype(categories=[0, 1, 2, 3], ordered=True)
# A column is treated as a QC flag when its name contains "_QC" (case-sensitive).
qc_flags_features = list(filter(lambda name: "_QC" in name, all_features))
qc_flags_features

['TA_F_QC',
 'SW_IN_F_QC',
 'LW_IN_F_QC',
 'VPD_F_QC',
 'P_F_QC',
 'PA_F_QC',
 'G_F_MDS_QC',
 'LE_F_MDS_QC',
 'H_F_MDS_QC',
 'NEE_VUT_REF_QC',
 'NEE_CUT_REF_QC']

# Process Single File

In [None]:
# Pick one site to prototype the QC summary on: the last target site that has a
# data file. Sites without one carry a missing `filename`, which would otherwise
# raise a TypeError when the path is built.
r = site_metadata_df[['site_id','filename']].dropna(subset=['filename']).iloc[-1]
print(r.filename)
# os.path.join instead of a hard-coded "\\" so the path is valid on any OS.
local_filename = os.path.join(tmp_dir, r.filename)

data_full_half_hourly_raw_v0_1_IT-Lav.csv


In [None]:
# Read the selected site's CSV (path built in the previous cell) into a DataFrame.
site_df = pd.read_csv(local_filename)

In [None]:
# Note: to load fewer columns, pass `usecols=<wanted columns> + qc_flags_features`
# to pd.read_csv when reading the site file.

# Parse the two timestamp columns into pandas datetimes.
for time_col in ('datetime', 'date'):
    site_df[time_col] = pd.to_datetime(site_df[time_col])

# Store every QC flag column with the shared ordered categorical dtype.
site_df[qc_flags_features] = site_df[qc_flags_features].astype(qc_flag_dtype)

print(f"size:{site_df.shape}")
site_df.head()

size:(297840, 64)


Unnamed: 0,TIMESTAMP_START,TIMESTAMP_END,TA_F,TA_F_QC,TA_ERA,SW_IN_POT,SW_IN_F,SW_IN_F_QC,SW_IN_ERA,LW_IN_F,LW_IN_F_QC,LW_IN_ERA,VPD_F,VPD_F_QC,VPD_ERA,P_F,P_F_QC,P_ERA,PA_F,PA_F_QC,PA_ERA,NETRAD,PPFD_IN,G_F_MDS,G_F_MDS_QC,LE_F_MDS,LE_F_MDS_QC,LE_CORR,H_F_MDS,H_F_MDS_QC,H_CORR,NEE_VUT_REF,NEE_VUT_REF_QC,NEE_CUT_REF,NEE_CUT_REF_QC,GPP_NT_VUT_REF,GPP_DT_VUT_REF,GPP_NT_CUT_REF,GPP_DT_CUT_REF,RECO_NT_VUT_REF,RECO_DT_VUT_REF,RECO_NT_CUT_REF,RECO_DT_CUT_REF,datetime,year,month,day,hour,SITE_ID,date,NEE_VUT_REF_qa,SW_DIF,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen
0,200301010000,200301010030,2.675,0,0.95,0.0,0.0,0,0.0,239.608,0,248.415,3.26,0,1.058,0.0,2,0.0,85.355,2,85.355,-83.1372,-0.680282,,,21.0126,1,33.79,-52.9562,0,-85.0948,0.138548,1,0.193352,1,-0.04041,0.0,-0.042584,0.0,0.135054,0.610416,0.021264,0.906946,2003-01-01 00:00:00,2003,1,1,0,IT-Lav,2003-01-01,0.138548,,0.286627,0.820698,0.124172,0.0149,0.1513,0.0068,0.0148,0.106,0.0484,0.0152,ENF,Cold
1,200301010030,200301010100,2.864,0,0.863,0.0,0.0,0,0.0,252.438,0,248.415,3.469,0,1.075,0.0,2,0.0,85.365,2,85.365,-70.924,-0.688867,,,11.6173,0,18.6819,-35.586,0,-57.1904,0.138548,1,0.193352,1,-0.038505,0.0,-0.042155,0.0,0.136677,0.612844,0.021511,0.910553,2003-01-01 00:30:00,2003,1,1,0,IT-Lav,2003-01-01,0.138548,,0.286627,0.820698,0.124172,0.0149,0.1513,0.0068,0.0148,0.106,0.0484,0.0152,ENF,Cold
2,200301010100,200301010130,2.973,0,0.775,0.0,0.0,0,0.0,254.031,0,245.571,3.49,0,1.093,0.0,2,0.0,85.374,2,85.374,-72.0011,-0.710112,,,12.8265,0,20.6262,-34.7839,0,-55.9035,0.138548,1,0.193352,1,-0.037402,0.0,-0.041907,0.0,0.137616,0.61424,0.021654,0.912628,2003-01-01 01:00:00,2003,1,1,1,IT-Lav,2003-01-01,0.138548,,0.286627,0.820698,0.124172,0.0149,0.1513,0.0068,0.0148,0.106,0.0484,0.0152,ENF,Cold
3,200301010130,200301010200,2.879,0,0.82,0.0,0.0,0,0.0,258.347,0,245.571,3.424,0,1.114,0.0,2,0.0,85.396,2,85.396,-67.1425,-0.703996,,,16.8189,0,27.0496,-41.9477,0,-67.4125,0.138548,1,0.365681,0,-0.210683,0.0,-0.337074,0.0,0.136806,0.613037,0.021531,0.910839,2003-01-01 01:30:00,2003,1,1,1,IT-Lav,2003-01-01,0.138548,,0.286627,0.820698,0.124172,0.0149,0.1513,0.0068,0.0148,0.106,0.0484,0.0152,ENF,Cold
4,200301010200,200301010230,3.09,0,0.865,0.0,0.0,0,0.0,262.149,0,239.737,3.585,0,1.135,0.0,2,0.0,85.417,2,85.417,-63.8005,-0.703245,,,4.91604,0,7.90654,-19.6448,0,-31.5706,0.138548,1,-0.071667,0,0.228803,0.0,0.100756,0.0,0.138626,0.615736,0.021808,0.914849,2003-01-01 02:00:00,2003,1,1,2,IT-Lav,2003-01-01,0.138548,,0.286627,0.820698,0.124172,0.0149,0.1513,0.0068,0.0148,0.106,0.0484,0.0152,ENF,Cold


In [None]:
# List each column's dtype and non-null count to spot columns with missing data.
site_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297840 entries, 0 to 297839
Data columns (total 64 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   TIMESTAMP_START  297840 non-null  int64         
 1   TIMESTAMP_END    297840 non-null  int64         
 2   TA_F             297840 non-null  float64       
 3   TA_F_QC          297840 non-null  category      
 4   TA_ERA           280272 non-null  float64       
 5   SW_IN_POT        297840 non-null  float64       
 6   SW_IN_F          297840 non-null  float64       
 7   SW_IN_F_QC       297840 non-null  category      
 8   SW_IN_ERA        280272 non-null  float64       
 9   LW_IN_F          297840 non-null  float64       
 10  LW_IN_F_QC       297840 non-null  category      
 11  LW_IN_ERA        280272 non-null  float64       
 12  VPD_F            297840 non-null  float64       
 13  VPD_F_QC         297840 non-null  category      
 14  VPD_ERA          280

In [None]:
# Summary (count / unique / top / freq) of the categorical QC flag columns.
site_df[qc_flags_features].describe()

Unnamed: 0,TA_F_QC,SW_IN_F_QC,LW_IN_F_QC,VPD_F_QC,P_F_QC,PA_F_QC,G_F_MDS_QC,LE_F_MDS_QC,H_F_MDS_QC,NEE_VUT_REF_QC,NEE_CUT_REF_QC
count,297840,297840,297840,297840,280272,280272,278514,297840,297840,297840,297840
unique,3,3,3,3,2,2,4,4,4,4,4
top,0,0,0,0,0,0,0,0,0,1,1
freq,290263,293731,290245,290258,229477,215978,272042,167458,185770,182631,184725


In [None]:
# Count how many records carry each QC flag value, one column per QC feature.
# `pd.Series.value_counts` replaces the top-level `pd.value_counts`, which is
# deprecated in pandas and emits a FutureWarning.
display(site_df[qc_flags_features].apply(pd.Series.value_counts))

Unnamed: 0,TA_F_QC,SW_IN_F_QC,LW_IN_F_QC,VPD_F_QC,P_F_QC,PA_F_QC,G_F_MDS_QC,LE_F_MDS_QC,H_F_MDS_QC,NEE_VUT_REF_QC,NEE_CUT_REF_QC
0,290263,293731,290245,290258,229477,215978,272042,167458,185770,103368,101168
1,1175,2030,1192,1180,0,0,970,125506,107483,182631,184725
2,6402,2079,6403,6402,50795,64294,2196,4045,3779,10598,10729
3,0,0,0,0,0,0,3306,831,808,1243,1218


# Process All Files

In [None]:
def _count_qc_flags(site_df, site_id):
    """Tabulate how often each QC flag value occurs in one site's data.

    Parameters
    ----------
    site_df : pandas.DataFrame
        Site data containing every column named in ``qc_flags_features``.
    site_id : str
        Site identifier written to the ``site_id`` column of the result.

    Returns
    -------
    pandas.DataFrame
        One row per QC feature, with a ``data_type`` column holding the feature
        name, one string-named column per flag value, and a ``site_id`` column.
    """
    # pd.Series.value_counts replaces the deprecated top-level pd.value_counts.
    counts = site_df[qc_flags_features].apply(pd.Series.value_counts).T
    # Use plain string column labels so 'site_id' can sit next to the flag columns.
    counts.columns = counts.columns.astype(str)
    counts['site_id'] = site_id
    return counts.reset_index().rename(columns={"index": "data_type"})


# Collect one QC-count table per site and concatenate once after the loop
# (avoids repeatedly copying the growing frame inside the loop).
site_qc_frames = []
for i, r in site_metadata_df[['site_id','filename']].iterrows():
    # Sites without a data file have a NaN (non-string) or empty filename.
    if not isinstance(r.filename, str) or not r.filename:
        print(f'\nERROR: {r.site_id} is mssing hourly data.')
        continue

    # os.path.join instead of a hard-coded "\\" so the path is valid on any OS.
    local_filename = os.path.join(tmp_dir, r.filename)
    site_df = pd.read_csv(local_filename)
    site_df['datetime'] = pd.to_datetime(site_df['datetime'])
    site_df['date'] = pd.to_datetime(site_df['date'])
    site_df[qc_flags_features] = site_df[qc_flags_features].astype(qc_flag_dtype)

    print(f"\n{r.site_id}")

    qc_flags_count_df = _count_qc_flags(site_df, r.site_id)
    display(qc_flags_count_df)
    site_qc_frames.append(qc_flags_count_df)

# As before, qc_df stays None when no site had a data file.
qc_df = pd.concat(site_qc_frames) if site_qc_frames else None


FR-Pue


Unnamed: 0,data_type,0,1,2,3,site_id
0,TA_F_QC,244413,656,691,0,FR-Pue
1,SW_IN_F_QC,243991,887,882,0,FR-Pue
2,LW_IN_F_QC,157188,2650,85922,0,FR-Pue
3,VPD_F_QC,244036,1031,693,0,FR-Pue
4,P_F_QC,245066,0,694,0,FR-Pue
5,PA_F_QC,167305,0,78455,0,FR-Pue
6,G_F_MDS_QC,118776,2590,5147,16981,FR-Pue
7,LE_F_MDS_QC,200474,42644,2472,170,FR-Pue
8,H_F_MDS_QC,174992,68291,2272,205,FR-Pue
9,NEE_VUT_REF_QC,104186,129287,11770,517,FR-Pue



US-NR1


Unnamed: 0,data_type,0,1,2,3,site_id
0,TA_F_QC,270046,721,1,0,US-NR1
1,SW_IN_F_QC,269305,852,611,0,US-NR1
2,LW_IN_F_QC,225186,3160,42422,0,US-NR1
3,VPD_F_QC,269468,1294,6,0,US-NR1
4,P_F_QC,266976,0,3792,0,US-NR1
5,PA_F_QC,180819,0,89949,0,US-NR1
6,G_F_MDS_QC,257751,5871,4936,2210,US-NR1
7,LE_F_MDS_QC,244800,25076,890,2,US-NR1
8,H_F_MDS_QC,243833,25750,1184,1,US-NR1
9,NEE_VUT_REF_QC,126191,133400,11041,136,US-NR1



ERROR: US-Ne2 is mssing hourly data.

US-SRM


Unnamed: 0,data_type,0,1,2,3,site_id
0,TA_F_QC,188615,826,1311,0,US-SRM
1,SW_IN_F_QC,189563,687,502,0,US-SRM
2,LW_IN_F_QC,180505,3893,6354,0,US-SRM
3,VPD_F_QC,188613,828,1311,0,US-SRM
4,P_F_QC,190572,0,180,0,US-SRM
5,PA_F_QC,187701,0,3051,0,US-SRM
6,G_F_MDS_QC,190156,545,51,0,US-SRM
7,LE_F_MDS_QC,169248,20552,783,169,US-SRM
8,H_F_MDS_QC,181370,8651,564,167,US-SRM
9,NEE_VUT_REF_QC,96601,88681,5165,305,US-SRM



US-Ton


Unnamed: 0,data_type,0,1,2,3,site_id
0,TA_F_QC,228390,2357,181,0,US-Ton
1,SW_IN_F_QC,230048,695,185,0,US-Ton
2,LW_IN_F_QC,0,0,230928,0,US-Ton
3,VPD_F_QC,230264,636,28,0,US-Ton
4,P_F_QC,230621,0,307,0,US-Ton
5,PA_F_QC,230211,0,717,0,US-Ton
6,G_F_MDS_QC,227740,2779,327,82,US-Ton
7,LE_F_MDS_QC,166059,59897,4944,28,US-Ton
8,H_F_MDS_QC,177417,49063,4424,24,US-Ton
9,NEE_VUT_REF_QC,65566,149202,15939,221,US-Ton



US-Var


Unnamed: 0,data_type,0,1,2,3,site_id
0,TA_F_QC,242346,3081,285,0,US-Var
1,SW_IN_F_QC,244314,750,648,0,US-Var
2,LW_IN_F_QC,177806,5255,62651,0,US-Var
3,VPD_F_QC,242784,2667,261,0,US-Var
4,P_F_QC,244960,0,752,0,US-Var
5,PA_F_QC,239952,0,5760,0,US-Var
6,G_F_MDS_QC,236712,6564,2260,176,US-Var
7,LE_F_MDS_QC,208189,35631,1829,63,US-Var
8,H_F_MDS_QC,226061,18463,1126,62,US-Var
9,NEE_VUT_REF_QC,103239,130944,11371,158,US-Var



US-Wkg


Unnamed: 0,data_type,0,1,2,3,site_id
0,TA_F_QC,185695,1067,6,0,US-Wkg
1,SW_IN_F_QC,185998,768,2,0,US-Wkg
2,LW_IN_F_QC,176736,2588,7444,0,US-Wkg
3,VPD_F_QC,185319,1440,9,0,US-Wkg
4,P_F_QC,186767,0,1,0,US-Wkg
5,PA_F_QC,184715,0,2053,0,US-Wkg
6,G_F_MDS_QC,186479,286,3,0,US-Wkg
7,LE_F_MDS_QC,161679,24721,368,0,US-Wkg
8,H_F_MDS_QC,172514,14115,139,0,US-Wkg
9,NEE_VUT_REF_QC,78418,103014,5332,4,US-Wkg



US-ARM


Unnamed: 0,data_type,0,1,2,3,site_id
0,TA_F_QC,238691,3084,17329,0,US-ARM
1,SW_IN_F_QC,222488,1128,35488,0,US-ARM
2,LW_IN_F_QC,190553,1042,67509,0,US-ARM
3,VPD_F_QC,238466,3309,17329,0,US-ARM
4,P_F_QC,236110,0,22994,0,US-ARM
5,PA_F_QC,250627,0,8477,0,US-ARM
6,G_F_MDS_QC,173986,20106,19251,34298,US-ARM
7,LE_F_MDS_QC,227623,25977,4864,640,US-ARM
8,H_F_MDS_QC,248438,7931,2329,406,US-ARM
9,NEE_VUT_REF_QC,165388,79751,12846,1119,US-ARM



ERROR: US-MMS is mssing hourly data.

US-Me2


Unnamed: 0,data_type,0,1,2,3,site_id
0,TA_F_QC,207904,5319,17465,0,US-Me2
1,SW_IN_F_QC,228941,966,781,0,US-Me2
2,LW_IN_F_QC,0,0,230688,0,US-Me2
3,VPD_F_QC,204778,6686,19224,0,US-Me2
4,P_F_QC,227966,0,2722,0,US-Me2
5,PA_F_QC,166696,0,63992,0,US-Me2
6,G_F_MDS_QC,87940,1530,3860,14479,US-Me2
7,LE_F_MDS_QC,103503,22110,1046,120,US-Me2
8,H_F_MDS_QC,111828,14119,716,116,US-Me2
9,NEE_VUT_REF_QC,55075,155508,19190,915,US-Me2



US-UMB


Unnamed: 0,data_type,0,1,2,3,site_id
0,TA_F_QC,184833,4098,2973,0,US-UMB
1,SW_IN_F_QC,188356,559,2989,0,US-UMB
2,LW_IN_F_QC,0,0,191904,0,US-UMB
3,VPD_F_QC,167469,21393,3042,0,US-UMB
4,P_F_QC,190053,0,1851,0,US-UMB
5,PA_F_QC,184211,0,7693,0,US-UMB
6,G_F_MDS_QC,0,0,0,0,US-UMB
7,LE_F_MDS_QC,0,0,0,0,US-UMB
8,H_F_MDS_QC,0,0,0,0,US-UMB
9,NEE_VUT_REF_QC,93256,91939,6101,608,US-UMB



US-Vcp


Unnamed: 0,data_type,0,1,2,3,site_id
0,TA_F_QC,164905,8610,1013,0,US-Vcp
1,SW_IN_F_QC,171379,1580,1569,0,US-Vcp
2,LW_IN_F_QC,0,0,174528,0,US-Vcp
3,VPD_F_QC,149022,9745,15761,0,US-Vcp
4,P_F_QC,115066,0,59462,0,US-Vcp
5,PA_F_QC,169772,0,4756,0,US-Vcp
6,G_F_MDS_QC,0,0,0,0,US-Vcp
7,LE_F_MDS_QC,0,0,0,0,US-Vcp
8,H_F_MDS_QC,0,0,0,0,US-Vcp
9,NEE_VUT_REF_QC,70948,95952,6987,641,US-Vcp



CH-Lae


Unnamed: 0,data_type,0,1,2,3,site_id
0,TA_F_QC,278102,1258,9024,0,CH-Lae
1,SW_IN_F_QC,277629,1173,9561,21,CH-Lae
2,LW_IN_F_QC,260919,1497,25968,0,CH-Lae
3,VPD_F_QC,278014,1346,9024,0,CH-Lae
4,P_F_QC,272208,0,0,0,CH-Lae
5,PA_F_QC,245053,0,27155,0,CH-Lae
6,G_F_MDS_QC,0,0,0,0,CH-Lae
7,LE_F_MDS_QC,182597,102424,3087,276,CH-Lae
8,H_F_MDS_QC,226736,60632,994,22,CH-Lae
9,NEE_VUT_REF_QC,89467,183615,14415,887,CH-Lae



ES-LJu


Unnamed: 0,data_type,0,1,2,3,site_id
0,TA_F_QC,234618,3356,1642,0,ES-LJu
1,SW_IN_F_QC,230126,1421,8069,0,ES-LJu
2,LW_IN_F_QC,0,0,239616,0,ES-LJu
3,VPD_F_QC,227728,7392,4496,0,ES-LJu
4,P_F_QC,209448,0,30168,0,ES-LJu
5,PA_F_QC,229143,0,10473,0,ES-LJu
6,G_F_MDS_QC,233686,3443,1457,1030,ES-LJu
7,LE_F_MDS_QC,199073,37078,2609,856,ES-LJu
8,H_F_MDS_QC,217577,18929,2273,837,ES-LJu
9,NEE_VUT_REF_QC,105053,125237,7594,1732,ES-LJu



FI-Hyy


Unnamed: 0,data_type,0,1,2,3,site_id
0,TA_F_QC,385754,9970,4594,7154,FI-Hyy
1,SW_IN_F_QC,402073,1267,4132,0,FI-Hyy
2,LW_IN_F_QC,167226,6634,233612,0,FI-Hyy
3,VPD_F_QC,324133,26799,49386,7154,FI-Hyy
4,P_F_QC,370634,0,19270,0,FI-Hyy
5,PA_F_QC,381522,0,8382,0,FI-Hyy
6,G_F_MDS_QC,233156,29209,3016,3372,FI-Hyy
7,LE_F_MDS_QC,350420,54003,2982,67,FI-Hyy
8,H_F_MDS_QC,353795,50807,2801,69,FI-Hyy
9,NEE_VUT_REF_QC,187212,208953,10958,349,FI-Hyy



IT-Lav


Unnamed: 0,data_type,0,1,2,3,site_id
0,TA_F_QC,290263,1175,6402,0,IT-Lav
1,SW_IN_F_QC,293731,2030,2079,0,IT-Lav
2,LW_IN_F_QC,290245,1192,6403,0,IT-Lav
3,VPD_F_QC,290258,1180,6402,0,IT-Lav
4,P_F_QC,229477,0,50795,0,IT-Lav
5,PA_F_QC,215978,0,64294,0,IT-Lav
6,G_F_MDS_QC,272042,970,2196,3306,IT-Lav
7,LE_F_MDS_QC,167458,125506,4045,831,IT-Lav
8,H_F_MDS_QC,185770,107483,3779,808,IT-Lav
9,NEE_VUT_REF_QC,103368,182631,10598,1243,IT-Lav


In [None]:
# Save the combined per-site QC counts (without the row index) into the data folder.
qc_df.to_csv(data_dir+"golden_sites_qc_count.csv", index=False)

# Plot Data QC Flag Distributions

In [None]:
# Reload the saved QC counts from disk; the plotting cells below work from this copy.
qc_df = pd.read_csv(data_dir+"golden_sites_qc_count.csv")

In [None]:
# stacked and grouped barchart
# ref: https://dev.to/fronkan/stacked-and-grouped-bar-charts-using-plotly-python-a4p
# ref: https://community.plotly.com/t/grouped-stacked-bar-chart/60805/5

groups = qc_df["data_type"].unique()
subgroup_text = qc_df["site_id"].unique()
features = ["0","1","2","3"]
features_labels = ["0(measured)","1(good quality gapfill)","2(medium)","3(bad)"]
index = target_sites
total_offset = len(index) + 1  # not read in this cell; kept in case other cells use it
fig_data = []

color_options = {
    "0": "#3DB14E",
    "1": "#007B93",
    "2": "#FFC107",
    "3": "#FF4500"
}

# Add the traces: one offset group per site, with the four flag counts stacked
# on top of each other through each bar's `base`.
legend_shown = False
for i, s in enumerate(index):
    # Separate name so the raw `site_df` from the earlier cells is not overwritten.
    site_qc_df = qc_df[qc_df['site_id'] == s]
    # Attach the legend to the first site that actually has rows, rather than a
    # fixed position in `index` that may belong to a site without data.
    show_in_legend = (not legend_shown) and (not site_qc_df.empty)
    for j, q in enumerate(features):
        bar_plot = go.Bar(
            x = site_qc_df["data_type"], y=site_qc_df[q],
            name = features_labels[j],
            offsetgroup = i,
            customdata= site_qc_df[["site_id"]],
            hovertemplate = "%{x}-%{customdata[0]}<br>%{y}",
            marker_color = color_options[q],
            showlegend = show_in_legend,
        )

        if j > 0: # Move vertically: start this bar where the lower flags end
            bar_plot.base = site_qc_df[features[:j]].sum(axis=1)

        fig_data.append(bar_plot)
    legend_shown = legend_shown or show_in_legend

fig = go.Figure(
    data=fig_data,
    layout=go.Layout(
        height = 800,
        width = 1400,
        legend_title_text= "QC Flag",
        title="Half Hourly Data QC Distributions",
        xaxis_title="Data Type",
        yaxis_title="Record Count",
        legend_orientation="h",
        legend_yanchor="top",
        legend_x=0,
        legend_y=1.05,
        template='plotly_white'
    )
)

# Label every bar with its site id. Only sites present in qc_df are labelled;
# the x offsets (-0.55 start, 0.06 step) are hand-tuned to line up with the bars.
for i, g in enumerate(groups):
    for j, site_id in enumerate([ x for x in index if x in subgroup_text]):
        fig.add_annotation(
                x=-0.55 + (i*1) + (j*0.06),
                y=-0.5,
                text= site_id,
                showarrow=False,
                textangle=-80,
                yanchor='top',
                xanchor='left',
                font=dict(color='#333', size=8))

fig.show()

# Export Fig: create the figures folder if needed and build the path with
# os.path.join instead of hard-coded backslashes.
figures_dir = os.path.join(data_dir, "figures")
os.makedirs(figures_dir, exist_ok=True)
fig.write_html(os.path.join(figures_dir, "GoldSitesQCDistribution.html"))

In [None]:
fig_data = []

# One trace per QC flag, drawn over a two-level (data_type, site_id) category
# axis and stacked through each bar's `base`.
# `features`, `features_labels` and `color_options` come from the previous cell.
for j, q in enumerate(features):
    bar_plot = go.Bar(
        x = [qc_df["data_type"], qc_df['site_id']], y=qc_df[q],
        name = features_labels[j],
        offsetgroup = 0,
        hovertemplate = "%{x}<br>%{y}",
        marker_color = color_options[q],
        showlegend = True,
    )

    if j > 0: # Move vertically: start this bar where the lower flags end
        bar_plot.base = qc_df[features[:j]].sum(axis=1)

    fig_data.append(bar_plot)

fig = go.Figure(
    data=fig_data,
    layout=go.Layout(
        height = 600,
        width = 1200,
        title="Half Hourly Data QC Distributions",
        yaxis_title="Record Count",
        xaxis=dict(
            # `title.font` replaces the deprecated `titlefont` attribute.
            title=dict(font=dict(size=10)),
            tickfont=dict(size=8)),
        legend_title_text= "QC Flag",
        legend_orientation="h",
        legend_yanchor="top",
        legend_x=0,
        legend_y=1.1,
        template='plotly_white'
    )
)

# Draw a thin separator between consecutive data-type groups. Each group holds
# one bar per site, so boundaries sit at multiples of the site count minus 0.5.
# This assumes every site has a row for every data type.
sites_per_group = qc_df['site_id'].nunique()
group_count = qc_df['data_type'].nunique()
for i in range(1, group_count):
    fig.add_vline(x=sites_per_group*i - 0.5, line_width=0.5)

fig.show()


In [None]:
# Same QC counts drawn with Plotly Express: one stacked bar per data type,
# with the four flag columns as the stacked series.
fig = px.bar(
    qc_df,
    x="data_type",
    y=["0", "1", "2", "3"],
    barmode="stack",
    title="Half Hourly Data QC Distributions",
)
fig.show()