In [1]:
from calendar import monthrange
from datetime import datetime
import pandas as pd
from pandas.api.types import CategoricalDtype
from io import BytesIO
import os
import json

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load local custom modules
import sys
sys.path.append(os.path.abspath("../tools"))
from CloudIO.AzStorageClient import AzStorageClient
from edahelpers import *

# Notebook-wide pandas display settings: show up to 500 rows, never truncate
# columns, and render floats with five decimal places.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.5f}'.format)

In [2]:
# Project root, resolved relative to the notebook's working directory.
# The previous expression concatenated "..\\..\\..\\..\\" directly onto
# os.getcwd() with no separator, so the first ".." fused with the last
# directory name; after normalisation the net effect on Windows was three
# levels up. That effective location is reproduced here explicitly and in a
# platform-independent way.
root_dir = os.path.abspath(os.path.join(os.getcwd(), "..", "..", ".."))
tmp_dir = os.path.join(root_dir, ".tmp")        # per-site CSV files are read from here
data_dir = os.path.join(root_dir, "data", "")   # trailing separator kept on purpose
cred_dir = os.path.join(root_dir, ".cred")
az_cred_file = os.path.join(cred_dir, "azblobcred.json")

site_metadata_filename = os.path.join(data_dir, "site-metadata.csv")

# Output
tag = "1_raw"
container = "gold-samples-data"
blob_name = f"gold_samples_trim_v_{tag}.parquet" #Advisor suggested features only

In [3]:
# "Golden" sites, split into two tiers; `target_sites` is tier 1 followed by tier 2.
tier1_sites = [
    "US-MMS", "US-Vcp", "FR-Pue", "CH-Lae",
    "US-Var", "US-Ne2", "ES-LJu", "US-Ton",
]
tier2_sites = [
    "US-UMB", "US-Me2", "FI-Hyy", "US-NR1",
    "IT-Lav", "US-Wkg", "US-ARM", "US-SRM",
]

target_sites = [*tier1_sites, *tier2_sites]

# Get Gold Sample Site Data

In [4]:
# Site metadata: read only the columns listed below from the metadata CSV.
included_site_features = [
    'site_id', 'filename', 'elevation', 'lat', 'long',
    'koppen_sub', 'koppen_main', 'koppen_name', 'koppen_main_name',
    'c3c4', 'c4_percent',
]
site_metadata_df = pd.read_csv(site_metadata_filename, usecols=included_site_features)

# Keep the target sites only, report the shape, then renumber rows from 0.
site_metadata_df = site_metadata_df[site_metadata_df['site_id'].isin(target_sites)]
print(f"size:{site_metadata_df.shape}")
site_metadata_df = site_metadata_df.reset_index(drop=True)
site_metadata_df

size:(16, 11)


Unnamed: 0,site_id,elevation,lat,long,koppen_sub,koppen_main,koppen_name,koppen_main_name,c3c4,c4_percent,filename
0,FR-Pue,270.0,43.7413,3.5957,8,3,Csa,Temperate,C3,6.59,data_full_half_hourly_raw_v0_1_FR-Pue.csv
1,US-NR1,3050.0,40.0329,-105.5464,27,4,Dfc,Cold,C3,0.35,data_full_half_hourly_raw_v0_1_US-NR1.csv
2,US-Ne2,362.0,41.16487,-96.4701,25,4,Dfa,Cold,rotation,48.91,
3,US-SRM,1120.0,31.8214,-110.8661,6,2,BSh,Arid,C3,55.39,data_full_half_hourly_raw_v0_1_US-SRM.csv
4,US-Ton,177.0,38.4316,-120.96598,8,3,Csa,Temperate,C3,0.0,data_full_half_hourly_raw_v0_1_US-Ton.csv
5,US-Var,129.0,38.4133,-120.9507,8,3,Csa,Temperate,C3,0.0,data_full_half_hourly_raw_v0_1_US-Var.csv
6,US-Wkg,1531.0,31.7365,-109.9419,7,2,BSk,Arid,C4,40.93,data_full_half_hourly_raw_v0_1_US-Wkg.csv
7,US-ARM,314.0,36.6058,-97.4888,14,3,Cfa,Temperate,mix,15.97,data_full_half_hourly_raw_v0_1_US-ARM.csv
8,US-MMS,275.0,39.3232,-86.4131,25,4,Dfa,Cold,C3,42.28,
9,US-Me2,1253.0,44.4523,-121.5574,18,4,Dsb,Cold,C3,0.03,data_full_half_hourly_raw_v0_1_US-Me2.csv


In [5]:
# Reference list of column names. Presumably the full set of columns available
# in each per-site half-hourly CSV -- TODO confirm against the files.
# NOTE(review): not referenced by any other cell visible in this notebook;
# the columns actually loaded are chosen via `included_features`.
all_features = ['TIMESTAMP_START', 'TIMESTAMP_END', 'TA_F', 'TA_F_QC', 'TA_ERA',
       'SW_IN_POT', 'SW_IN_F', 'SW_IN_F_QC', 'SW_IN_ERA', 'LW_IN_F',
       'LW_IN_F_QC', 'LW_IN_ERA', 'VPD_F', 'VPD_F_QC', 'VPD_ERA', 'P_F',
       'P_F_QC', 'P_ERA', 'PA_F', 'PA_F_QC', 'PA_ERA', 'NETRAD', 'PPFD_IN',
       'G_F_MDS', 'G_F_MDS_QC', 'LE_F_MDS', 'LE_F_MDS_QC', 'LE_CORR',
       'H_F_MDS', 'H_F_MDS_QC', 'H_CORR', 'NEE_VUT_REF', 'NEE_VUT_REF_QC',
       'NEE_CUT_REF', 'NEE_CUT_REF_QC', 'GPP_NT_VUT_REF', 'GPP_DT_VUT_REF',
       'GPP_NT_CUT_REF', 'GPP_DT_CUT_REF', 'RECO_NT_VUT_REF',
       'RECO_DT_VUT_REF', 'RECO_NT_CUT_REF', 'RECO_DT_CUT_REF', 'datetime',
       'year', 'month', 'day', 'hour', 'SITE_ID', 'date', 'NEE_VUT_REF_qa',
       'SW_DIF', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6',
       'b7', 'IGBP', 'koppen']

In [6]:
def data_cleanup(site_id_file_df, target, target_qc, features, source_dir=None):
    """Load, filter and stack the per-site CSV of every site that has one.

    For each row of ``site_id_file_df`` the site's CSV is read (restricted to
    ``features + target + target_qc``) and rows are removed when

    * ``SW_IN_ERA`` is zero or negative,
    * any ``target`` column is NA,
    * the first ``target_qc`` column equals 3,
    * any remaining column is NA (checked after the ``target_qc`` columns
      have been dropped).

    A ``minute`` column (from ``datetime``) and a ``site_id`` column are added.
    The original CSV row index of each surviving row is preserved.

    Parameters
    ----------
    site_id_file_df : pandas.DataFrame
        Needs ``site_id`` and ``filename`` columns. Rows whose ``filename`` is
        empty or not a string (e.g. NaN) are reported and skipped.
    target : list of str
        Target column name(s).
    target_qc : list of str
        Quality-flag column name(s) for the target. Only the first one is used
        for filtering; all of them are dropped from the result.
    features : list of str
        Predictor columns to load. Must contain ``'datetime'``, ``'date'`` and
        ``'SW_IN_ERA'`` because the function uses them directly. Any name
        containing ``"_QC"`` is cast to an ordered categorical over 0..3.
    source_dir : str, optional
        Directory holding the CSV files. Defaults to the notebook-level
        ``tmp_dir`` when omitted, which keeps existing calls working.

    Returns
    -------
    pandas.DataFrame or None
        The concatenation of all cleaned site frames, or ``None`` when no site
        had a usable filename.
    """
    qc_flag_dtype = CategoricalDtype([0, 1, 2, 3], ordered=True)
    # Use the arguments, not notebook globals, so the function does not depend
    # on variables that happen to be defined in another cell.
    qc_flag_columns = [name for name in features if "_QC" in name]
    read_columns = list(features) + list(target) + list(target_qc)
    # `tmp_dir` is only looked up when the caller did not supply a directory.
    base_dir = tmp_dir if source_dir is None else source_dir

    site_frames = []
    for site_id, filename in zip(site_id_file_df['site_id'], site_id_file_df['filename']):
        if not isinstance(filename, str) or not filename:
            print(f'\nERROR: {site_id} is mssing hourly data.')
            continue

        site_df = pd.read_csv(os.path.join(base_dir, filename), usecols=read_columns)
        site_df['datetime'] = pd.to_datetime(site_df['datetime'])
        site_df['date'] = pd.to_datetime(site_df['date'])
        site_df['minute'] = site_df['datetime'].dt.minute
        if qc_flag_columns:
            site_df[qc_flag_columns] = site_df[qc_flag_columns].astype(qc_flag_dtype)
        site_df['site_id'] = site_id

        # Build one row mask for all row-level filters. `~(x <= 0)` rather than
        # `x > 0` so that NA shortwave values are left for the final dropna,
        # exactly as before.
        keep = ~(site_df['SW_IN_ERA'] <= 0)
        keep &= site_df[list(target)].notna().all(axis=1)
        if target_qc:
            keep &= site_df[target_qc[0]] != 3

        site_df = (
            site_df[keep]
            .drop(columns=list(target_qc))
            .dropna(axis=0)
        )

        print(f"{site_id}: {site_df.shape}")
        site_frames.append(site_df)

    # Concatenate once at the end instead of growing a frame inside the loop.
    if not site_frames:
        return None
    return pd.concat(site_frames)

def merg_site_metadata(data_df, site_metadata_df):
    """Left-join ``site_metadata_df`` onto ``data_df`` using the ``site_id`` column.

    Every row of ``data_df`` is kept; the merged frame is returned and neither
    input is modified.
    """
    return data_df.merge(site_metadata_df, how='left', on='site_id')

In [7]:
# Columns to load from each site file, plus the target and its QC flag.
included_features = [
    'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
    'datetime', 'year', 'month', 'day', 'hour', 'date',
    'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
    'IGBP', 'koppen',
]
target_variable_qc = ['NEE_VUT_REF_QC']
target_variable = ['GPP_NT_VUT_REF']

# Build the cleaned dataset from every site that has a data file.
data_df = data_cleanup(
    site_metadata_df[['site_id', 'filename']],
    target_variable,
    target_variable_qc,
    included_features,
)
print(f"Data size after cleanup: {data_df.shape}")

# Attach the per-site metadata (everything except the filename column).
data_df = merg_site_metadata(data_df, site_metadata_df.drop(columns=['filename']))
print(f"Data size after after merged with site metadata: {data_df.shape}")
display(data_df.head())


FR-Pue: (117200, 27)
US-NR1: (98652, 27)

ERROR: US-Ne2 is mssing hourly data.
US-SRM: (95419, 27)
US-Ton: (113031, 27)
US-Var: (119950, 27)
US-Wkg: (93319, 27)
US-ARM: (125756, 27)

ERROR: US-MMS is mssing hourly data.
US-Me2: (99780, 27)
US-UMB: (70639, 27)
US-Vcp: (78491, 27)
CH-Lae: (112718, 27)
ES-LJu: (112724, 27)
FI-Hyy: (127362, 27)
IT-Lav: (120885, 27)
Data size after cleanup: (1485926, 27)


Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,GPP_NT_VUT_REF,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id
6449,5.311,25.016,272.218,1.708,0.0,97.939,-0.53574,2001-01-01 08:30:00,2001,1,1,8,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue
6450,5.744,59.734,272.218,1.738,0.0,97.939,0.86438,2001-01-01 09:00:00,2001,1,1,9,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,0,FR-Pue
6451,6.176,91.235,272.218,1.767,0.0,97.939,-0.02627,2001-01-01 09:30:00,2001,1,1,9,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue
6452,6.608,79.264,333.933,1.797,0.05,97.939,-0.17229,2001-01-01 10:00:00,2001,1,1,10,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,0,FR-Pue
6453,7.043,94.929,333.933,1.817,0.0,97.923,1.20865,2001-01-01 10:30:00,2001,1,1,10,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue


Data size after after merged with site metadata: (1485926, 36)


Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,GPP_NT_VUT_REF,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,koppen_main,koppen_name,koppen_main_name,c3c4,c4_percent
0,5.311,25.016,272.218,1.708,0.0,97.939,-0.53574,2001-01-01 08:30:00,2001,1,1,8,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,3,Csa,Temperate,C3,6.59
1,5.744,59.734,272.218,1.738,0.0,97.939,0.86438,2001-01-01 09:00:00,2001,1,1,9,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,0,FR-Pue,270.0,43.7413,3.5957,8,3,Csa,Temperate,C3,6.59
2,6.176,91.235,272.218,1.767,0.0,97.939,-0.02627,2001-01-01 09:30:00,2001,1,1,9,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,3,Csa,Temperate,C3,6.59
3,6.608,79.264,333.933,1.797,0.05,97.939,-0.17229,2001-01-01 10:00:00,2001,1,1,10,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,0,FR-Pue,270.0,43.7413,3.5957,8,3,Csa,Temperate,C3,6.59
4,7.043,94.929,333.933,1.817,0.0,97.923,1.20865,2001-01-01 10:30:00,2001,1,1,10,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,3,Csa,Temperate,C3,6.59


In [8]:
# Summary statistics of the numeric columns of the merged dataset.
data_df.describe()

Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,GPP_NT_VUT_REF,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,minute,elevation,lat,long,koppen_sub,koppen_main,c4_percent
count,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0
mean,14.87724,377.56323,316.15936,10.11574,0.03757,90.79849,5.86211,2009.75172,6.53593,15.74438,11.89376,0.29739,0.53815,0.13063,0.07894,0.24123,0.04288,0.07109,0.25797,0.20036,0.11929,14.99925,971.07777,41.75447,-62.35813,16.51431,3.34981,8.82611
std,9.04516,268.53741,47.26856,9.61817,0.18091,8.84375,7.52193,4.52259,2.98991,8.81076,3.7739,0.13132,0.21932,0.07137,0.0554,0.06086,0.03907,0.0416,0.0696,0.09369,0.07861,15.00001,866.88847,7.77408,59.15444,8.71496,0.69389,15.9951
min,-29.74,0.001,142.77,0.0,0.0,67.405,-49.7372,2001.0,1.0,1.0,3.0,-0.11958,-0.18252,-0.01715,0.0054,0.0305,0.0,0.0,0.0,0.0132,0.0,0.0,129.0,31.7365,-121.5574,6.0,2.0,0.0
25%,8.692,138.70625,283.869,3.18,0.0,85.242,0.45157,2006.0,4.0,8.0,9.0,0.20336,0.3415,0.08147,0.0337,0.1984,0.0196,0.0422,0.2021,0.1189,0.0515,0.0,234.0,36.6058,-110.8661,8.0,3.0,0.0
50%,14.931,344.5285,317.35,6.987,0.0,93.167,3.23428,2010.0,7.0,16.0,12.0,0.28756,0.56813,0.11822,0.0639,0.2308,0.034,0.0632,0.2611,0.1868,0.1033,0.0,689.0,40.0329,-97.4888,14.0,3.0,0.04
75%,21.282,587.463,349.289,13.732,0.0,98.717,9.4872,2013.0,9.0,23.0,15.0,0.36012,0.70597,0.15872,0.1166,0.2775,0.0545,0.0912,0.3152,0.2791,0.1824,30.0,1531.0,45.5598,3.5957,26.0,4.0,10.72
max,42.587,1094.341,473.011,75.684,15.493,103.383,85.0309,2020.0,12.0,31.0,23.0,2.38835,0.93551,0.42385,0.7971,0.7729,0.7689,0.7865,0.4666,0.428,0.3573,30.0,3050.0,61.84741,24.29477,27.0,4.0,55.39


In [9]:
# Sites that actually made it into the dataset (sites without a data file are skipped).
data_df.site_id.unique()

array(['FR-Pue', 'US-NR1', 'US-SRM', 'US-Ton', 'US-Var', 'US-Wkg',
       'US-ARM', 'US-Me2', 'US-UMB', 'US-Vcp', 'CH-Lae', 'ES-LJu',
       'FI-Hyy', 'IT-Lav'], dtype=object)

## Upload Data to Azure Storage Blob as Parquet
**Run with Caution!!!**

In [None]:
# Upload to Azure Storage Blob
# ref: https://stackoverflow.com/a/54666079
# Serialise the dataset to Parquet in an in-memory buffer (no temp file on disk).
parquet_file = BytesIO()
data_df.to_parquet(parquet_file, engine='pyarrow')
# Rewind the buffer so that a subsequent read starts at the first byte.
parquet_file.seek(0)

# AzStorageClient is a project helper imported from ../tools; presumably it
# reads the storage credentials from `az_cred_file` -- TODO confirm.
azStorageClient = AzStorageClient(az_cred_file)
# NOTE(review): overwrite=True is passed, so an existing blob with the same
# name is presumably replaced -- confirm before running.
azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

## Sanity check: there should be no NA data

In [None]:
# Per-column NA tally for `data_df`: one row per column, with the NA count and
# the share of all records it represents. Note that "percentage" holds a
# fraction (count / total rows); it is not multiplied by 100.
total_record_count = data_df.shape[0]
na_df = data_df.isna().sum().to_frame(name="count")
na_df["percentage"] = na_df["count"] / total_record_count

In [None]:
# Show only the columns that contain at least one NA, largest share first.
na_df[na_df['count'].ne(0)].sort_values(by="percentage", ascending=False)