In [1]:
from calendar import monthrange
from datetime import datetime
import pandas as pd
from pandas.api.types import CategoricalDtype
from io import BytesIO
import os
import json

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load local custom modules
import sys
sys.path.append(os.path.abspath("../tools"))
from CloudIO.AzStorageClient import AzStorageClient
from edahelpers import *

# Pandas display limits: show up to 500 rows and never truncate columns.
pd.options.display.max_rows = 500
pd.options.display.max_columns = None

In [2]:
# Project directory layout, resolved relative to the notebook's working directory.
#
# The project root is THREE levels above the working directory. The previous
# string concatenation (cwd + "..\\..\\..\\..\\") had no separator after the
# cwd, so the first ".." fused with the last path component and only three
# levels were actually climbed on Windows; that effective behavior is kept
# here. os.path.join is used so the paths also resolve on non-Windows systems.
root_dir = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
)
tmp_dir = os.path.join(root_dir, ".tmp")
# The empty last component keeps the trailing separator, so existing
# `data_dir + "<name>"` style concatenation keeps working.
data_dir = os.path.join(root_dir, "data", "")
cred_dir = os.path.join(root_dir, ".cred")
az_cred_file = os.path.join(cred_dir, "azblobcred.json")

site_metadata_filename = os.path.join(data_dir, "site-metadata.csv")

In [3]:
# "Golden" sites, split into two tiers.
tier1_sites = [
    "US-MMS", "US-Vcp", "FR-Pue", "CH-Lae",
    "US-Var", "US-Ne2", "ES-LJu", "US-Ton",
]
tier2_sites = [
    "US-UMB", "US-Me2", "FI-Hyy", "US-NR1",
    "IT-Lav", "US-Wkg", "US-ARM", "US-SRM",
]

# All sites of interest: tier 1 first, then tier 2.
target_sites = [*tier1_sites, *tier2_sites]

In [4]:
# Read the site metadata, keeping only the site id and data-file name columns.
site_metadata_df = pd.read_csv(
    site_metadata_filename,
    usecols=['site_id', 'filename'],
)

# Keep only the rows whose site_id is one of the target sites.
site_metadata_df = site_metadata_df[site_metadata_df['site_id'].isin(target_sites)]
print(f"size:{site_metadata_df.shape}")
site_metadata_df

size:(16, 2)


Unnamed: 0,site_id,filename
67,FR-Pue,data_full_half_hourly_raw_v0_1_FR-Pue.csv
117,US-NR1,data_full_half_hourly_raw_v0_1_US-NR1.csv
119,US-Ne2,
124,US-SRM,data_full_half_hourly_raw_v0_1_US-SRM.csv
127,US-Ton,data_full_half_hourly_raw_v0_1_US-Ton.csv
130,US-Var,data_full_half_hourly_raw_v0_1_US-Var.csv
144,US-Wkg,data_full_half_hourly_raw_v0_1_US-Wkg.csv
166,US-ARM,data_full_half_hourly_raw_v0_1_US-ARM.csv
181,US-MMS,
182,US-Me2,data_full_half_hourly_raw_v0_1_US-Me2.csv


In [5]:
# Column names of the per-site data files (64 columns), grouped by prefix.
all_features = [
    'TIMESTAMP_START', 'TIMESTAMP_END',
    'TA_F', 'TA_F_QC', 'TA_ERA',
    'SW_IN_POT', 'SW_IN_F', 'SW_IN_F_QC', 'SW_IN_ERA',
    'LW_IN_F', 'LW_IN_F_QC', 'LW_IN_ERA',
    'VPD_F', 'VPD_F_QC', 'VPD_ERA',
    'P_F', 'P_F_QC', 'P_ERA',
    'PA_F', 'PA_F_QC', 'PA_ERA',
    'NETRAD', 'PPFD_IN',
    'G_F_MDS', 'G_F_MDS_QC',
    'LE_F_MDS', 'LE_F_MDS_QC', 'LE_CORR',
    'H_F_MDS', 'H_F_MDS_QC', 'H_CORR',
    'NEE_VUT_REF', 'NEE_VUT_REF_QC', 'NEE_CUT_REF', 'NEE_CUT_REF_QC',
    'GPP_NT_VUT_REF', 'GPP_DT_VUT_REF', 'GPP_NT_CUT_REF', 'GPP_DT_CUT_REF',
    'RECO_NT_VUT_REF', 'RECO_DT_VUT_REF', 'RECO_NT_CUT_REF', 'RECO_DT_CUT_REF',
    'datetime', 'year', 'month', 'day', 'hour', 'SITE_ID', 'date',
    'NEE_VUT_REF_qa', 'SW_DIF',
    'EVI', 'NDVI', 'NIRv',
    'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
    'IGBP', 'koppen',
]

# Ordered categorical dtype for the quality-control flag columns (values 0-3).
qc_flag_dtype = CategoricalDtype(categories=[0, 1, 2, 3], ordered=True)

# QC flag columns are the ones whose name contains "_QC" (case-sensitive, so
# 'NEE_VUT_REF_qa' is not included).
qc_flags_features = [name for name in all_features if "_QC" in name]

In [6]:
# Read files
data_df = None
for i, r in site_metadata_df[['site_id','filename']].iterrows():
    if not r.filename or type(r.filename) != type(""):
        print(f'\nERROR: {r.site_id} is mssing hourly data.')
        continue
    
    local_filename = tmp_dir + "\\" + r.filename
    site_df = pd.read_csv(local_filename)
    site_df['datetime'] = pd.to_datetime(site_df['datetime'])
    site_df['date'] = pd.to_datetime(site_df['date'])
    site_df[qc_flags_features] = site_df[qc_flags_features].astype(qc_flag_dtype)
    
    print(f"{r.site_id}: {site_df.shape}")
    if type(data_df) == type(None):
        data_df = site_df
    else:
        data_df = pd.concat([data_df, site_df])

FR-Pue: (245760, 64)
US-NR1: (270768, 64)

ERROR: US-Ne2 is mssing hourly data.
US-SRM: (190752, 64)
US-Ton: (230928, 64)
US-Var: (245712, 64)
US-Wkg: (186768, 64)
US-ARM: (259104, 64)

ERROR: US-MMS is mssing hourly data.
US-Me2: (230688, 64)
US-UMB: (191904, 64)
US-Vcp: (174528, 64)
CH-Lae: (288384, 64)
ES-LJu: (239616, 64)
FI-Hyy: (407472, 64)
IT-Lav: (297840, 64)


In [7]:
# Manual sanity check: sum of the per-site row counts printed by the loading
# cell (one term per loaded site, in load order). The total should equal
# data_df.shape[0].
245760+270768+190752+230928+245712+186768+259104+230688+191904+174528+288384+239616+407472+297840

3460224

In [8]:
# (rows, columns) of the combined frame; compare the row count with the
# hand-computed per-site total.
data_df.shape

(3460224, 64)

In [10]:
# Serialize data_df to Parquet in memory and upload it to Azure Blob Storage.
# ref: https://stackoverflow.com/a/54666079
container = "gold-samples-data"
blob_name = "gold_samples_full_data.parquet"

parquet_file = BytesIO()
data_df.to_parquet(parquet_file, engine='pyarrow')
parquet_file.seek(0)  # rewind so the buffer is read from its start

azStorageClient = AzStorageClient(az_cred_file)
azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

File uploaded to gold-samples-data/gold_samples_full_data.parquet


# Load Data from Parquet

In [5]:
# Fetch the Parquet blob from Azure Blob Storage and load it into a DataFrame.
# ref: https://stackoverflow.com/a/68940709
container = "gold-samples-data"
blob_name = "gold_samples_full_data.parquet"

azStorageClient = AzStorageClient(az_cred_file)
file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
uploaded_df = pd.read_parquet(
    file_stream,
    engine='pyarrow',
)

print(f"data size: {uploaded_df.shape}")
uploaded_df.head(5)

data size: (3460224, 64)


Unnamed: 0,TIMESTAMP_START,TIMESTAMP_END,TA_F,TA_F_QC,TA_ERA,SW_IN_POT,SW_IN_F,SW_IN_F_QC,SW_IN_ERA,LW_IN_F,LW_IN_F_QC,LW_IN_ERA,VPD_F,VPD_F_QC,VPD_ERA,P_F,P_F_QC,P_ERA,PA_F,PA_F_QC,PA_ERA,NETRAD,PPFD_IN,G_F_MDS,G_F_MDS_QC,LE_F_MDS,LE_F_MDS_QC,LE_CORR,H_F_MDS,H_F_MDS_QC,H_CORR,NEE_VUT_REF,NEE_VUT_REF_QC,NEE_CUT_REF,NEE_CUT_REF_QC,GPP_NT_VUT_REF,GPP_DT_VUT_REF,GPP_NT_CUT_REF,GPP_DT_CUT_REF,RECO_NT_VUT_REF,RECO_DT_VUT_REF,RECO_NT_CUT_REF,RECO_DT_CUT_REF,datetime,year,month,day,hour,SITE_ID,date,NEE_VUT_REF_qa,SW_DIF,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen
0,200007260000,200007260030,15.82,0,18.55,0.0,0.0,0,0.0,384.688,2,384.688,0.162,0,1.593,0.0,0.0,1.638,97.769,2.0,97.769,-36.32,0.419,,,4.80329,2.0,,-19.6606,2.0,,2.95832,2,2.95832,2,0.260122,0.0,0.263564,0.0,3.20784,2.26316,3.22063,1.78651,2000-07-26 00:00:00,2000,7,26,0,FR-Pue,2000-07-26,,,,,,,,,,,,,EBF,Temperate
1,200007260030,200007260100,15.62,0,18.37,0.0,0.019,0,0.0,384.688,2,384.688,0.195,0,1.552,0.0,0.0,1.638,97.771,2.0,97.771,-32.59,0.519,,,5.0434,2.0,,-19.6606,2.0,,2.95832,2,2.95832,2,0.221846,0.000644,0.225435,0.000644,3.16912,2.25724,3.18245,1.78182,2000-07-26 00:30:00,2000,7,26,0,FR-Pue,2000-07-26,,,,,,,,,,,,,EBF,Temperate
2,200007260100,200007260130,16.48,0,18.19,0.0,0.0,0,0.0,340.124,2,340.124,1.331,0,1.511,0.0,0.0,0.0,97.773,2.0,97.773,-38.81,-0.043,,,3.94853,1.0,,-24.5284,2.0,,3.03104,2,3.03104,2,0.315191,0.0,0.318129,0.0,3.33718,2.28257,3.3481,1.80182,2000-07-26 01:00:00,2000,7,26,1,FR-Pue,2000-07-26,,,,,,,,,,,,,EBF,Temperate
3,200007260130,200007260200,17.29,0,17.924,0.0,0.0,0,0.0,340.124,2,340.124,2.685,0,1.724,0.0,0.0,0.0,97.797,2.0,97.797,-45.83,-0.183,,,4.77351,1.0,,-34.0815,1.0,,4.31938,1,4.31938,1,-0.813255,0.0,-0.810981,0.0,3.49911,2.30605,3.50757,1.82036,2000-07-26 01:30:00,2000,7,26,1,FR-Pue,2000-07-26,4.31938,,,,,,,,,,,,EBF,Temperate
4,200007260200,200007260230,17.31,0,17.658,0.0,0.0,0,0.0,340.124,2,340.124,2.53,0,1.938,0.0,0.0,0.0,97.821,2.0,97.821,-45.37,-0.544,,,3.37369,1.0,,-30.9598,1.0,,4.34014,1,4.34014,1,-0.830025,0.0,-0.827767,0.0,3.50316,2.30663,3.51155,1.82082,2000-07-26 02:00:00,2000,7,26,2,FR-Pue,2000-07-26,4.34014,,,,,,,,,,,,EBF,Temperate


In [6]:
# Show the last five rows of the downloaded frame.
uploaded_df.tail(5)

Unnamed: 0,TIMESTAMP_START,TIMESTAMP_END,TA_F,TA_F_QC,TA_ERA,SW_IN_POT,SW_IN_F,SW_IN_F_QC,SW_IN_ERA,LW_IN_F,LW_IN_F_QC,LW_IN_ERA,VPD_F,VPD_F_QC,VPD_ERA,P_F,P_F_QC,P_ERA,PA_F,PA_F_QC,PA_ERA,NETRAD,PPFD_IN,G_F_MDS,G_F_MDS_QC,LE_F_MDS,LE_F_MDS_QC,LE_CORR,H_F_MDS,H_F_MDS_QC,H_CORR,NEE_VUT_REF,NEE_VUT_REF_QC,NEE_CUT_REF,NEE_CUT_REF_QC,GPP_NT_VUT_REF,GPP_DT_VUT_REF,GPP_NT_CUT_REF,GPP_DT_CUT_REF,RECO_NT_VUT_REF,RECO_DT_VUT_REF,RECO_NT_CUT_REF,RECO_DT_CUT_REF,datetime,year,month,day,hour,SITE_ID,date,NEE_VUT_REF_qa,SW_DIF,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen
297835,202012312130,202012312200,-3.131,0,,0.0,0.0,0,,282.582,0,,0.288,0,,,,,,,,,-0.253429,-2.9179,0.0,4.36705,0.0,,-5.64245,0.0,,-1.2748,1,-1.46179,1,1.38967,0.0,1.34514,0.0,1.34742e-07,0.996276,1.43386e-07,0.388514,2020-12-31 21:30:00,2020,12,31,21,IT-Lav,2020-12-31,-1.2748,,0.30255,0.263407,0.084396,0.1868,0.3204,0.1783,0.1821,0.0978,0.0305,0.016,ENF,Cold
297836,202012312200,202012312230,-3.068,0,,0.0,0.0,0,,283.016,0,,0.461,0,,,,,,,,,-0.249702,-2.9202,0.0,3.85966,0.0,,-5.25952,0.0,,-1.2748,1,-1.46179,1,1.37019,0.0,1.34514,0.0,1.36081e-07,1.00167,1.44767e-07,0.38949,2020-12-31 22:00:00,2020,12,31,22,IT-Lav,2020-12-31,-1.2748,,0.30255,0.263407,0.084396,0.1868,0.3204,0.1783,0.1821,0.0978,0.0305,0.016,ENF,Cold
297837,202012312230,202012312300,-2.953,0,,0.0,0.0,0,,281.737,0,,0.704,0,,,,,,,,,-0.257156,-2.92335,0.0,5.59511,0.0,,-7.25914,0.0,,-1.2945,1,-1.48421,1,1.38881,0.0,1.36431,0.0,1.38551e-07,1.01154,1.47312e-07,0.39127,2020-12-31 22:30:00,2020,12,31,22,IT-Lav,2020-12-31,-1.2945,,0.30255,0.263407,0.084396,0.1868,0.3204,0.1783,0.1821,0.0978,0.0305,0.016,ENF,Cold
297838,202012312300,202012312330,-2.803,0,,0.0,0.0,0,,282.721,0,,0.957,0,,,,,,,,,-0.253429,-2.92745,0.0,1.93679,0.0,,-3.23033,0.0,,-1.2945,1,-1.48421,1,1.38905,0.0,1.36428,0.0,1.4182e-07,1.02449,1.50679e-07,0.393589,2020-12-31 23:00:00,2020,12,31,23,IT-Lav,2020-12-31,-1.2945,,0.30255,0.263407,0.084396,0.1868,0.3204,0.1783,0.1821,0.0978,0.0305,0.016,ENF,Cold
297839,202012312330,202101010000,-3.092,1,,0.0,0.0,0,,249.128,1,,0.807,1,,,,,,,,,,-3.16808,1.0,26.5049,1.0,,-2.50662,0.0,,-1.31299,1,-1.49651,1,1.57321,0.0,1.53974,0.0,1.3557e-07,0.999613,1.4424e-07,0.389118,2020-12-31 23:30:00,2020,12,31,23,IT-Lav,2020-12-31,-1.31299,,0.30255,0.263407,0.084396,0.1868,0.3204,0.1783,0.1821,0.0978,0.0305,0.016,ENF,Cold


In [29]:
# Per-column missing-value summary, indexed by column name.
# "count" is the number of NaN entries; "percentage" holds count divided by
# the total row count, i.e. a 0-1 fraction despite the column name.
total_record_count = uploaded_df.shape[0]
na_df = (
    uploaded_df.isna()
    .sum()
    .to_frame(name="count")
    .assign(percentage=lambda counts: counts["count"] / total_record_count)
)

In [32]:
# Columns that have at least one missing value, most-affected first.
na_df[na_df["count"].ne(0)].sort_values(by="percentage", ascending=False)

Unnamed: 0,count,percentage
SW_DIF,3245894,0.938059
G_F_MDS,1049469,0.303295
G_F_MDS_QC,1049469,0.303295
H_CORR,1002477,0.289714
LE_CORR,1002477,0.289714
NETRAD,669801,0.193572
b6,500400,0.144615
LE_F_MDS_QC,470341,0.135928
H_F_MDS_QC,470341,0.135928
b5,403776,0.116691
