# Notebook Setup

In [1]:
# True when the active IPython shell is a Google Colab runtime.
IN_COLLAB = 'google.colab' in str(get_ipython())

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
# Absolute path of the project checkout; later cells build every other path from it.
MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"

In [2]:
# Only Colab needs Google Drive mounted; elsewhere this cell is a no-op.
if IN_COLLAB:
  from google.colab import drive
  # Mount point matches the '/content/drive/...' prefix used by MY_HOME_ABS_PATH.
  drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Import Modules

In [3]:
# install required modules quietly
required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

In [4]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import math
import json

import pyspark.pandas as pd
from calendar import monthrange
from datetime import datetime
from io import BytesIO

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load local custom modules
import sys
# Make the helper modules under ./code/src/tools importable.
if not IN_COLLAB:
  sys.path.append(os.path.abspath("./code/src/tools"))
else:
  # On Colab, switch to the project root first so the relative path resolves
  # there, and put the tools directory at the front of the search path.
  os.chdir(MY_HOME_ABS_PATH)
  sys.path.insert(0, os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
from edahelpers import *

# Display tweaks: up to 500 rows, no column limit, floats rendered with 5 decimals.
# NOTE(review): `pd` is bound to pyspark.pandas in this cell — confirm that these
# option keys (display.max_columns, display.float_format) are accepted there.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [5]:
from pyspark.sql import SparkSession

# Reuse the running session if there is one, otherwise start a local session
# with master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Leave the session as the cell's last expression so its summary is rendered.
spark

## Define Local Files System Constants

In [6]:
# Local directory layout, all derived from the project root.
# os.path.join is used instead of manual `+ os.sep +` concatenation so a trailing
# separator in MY_HOME_ABS_PATH does not produce doubled separators.
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.path.join(root_dir, '.tmp')
data_dir = os.path.join(root_dir, 'data')
cred_dir = os.path.join(root_dir, '.cred')
az_cred_file = os.path.join(cred_dir, 'azblobcred.json')

# Raw per-site CSV files: a fixed Drive folder on Colab, the local .tmp folder otherwise.
if IN_COLLAB:
  raw_data_dir = "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data/half_hourly_data"
else:
  raw_data_dir = tmp_dir

site_metadata_filename = os.path.join(data_dir, 'site-metadata.csv')

# Output
tag = "0_raw"
container = "all-sites-data"
blob_name = f"all_site_trim_v_{tag}.parquet" #Advisor suggested features only

# Get Site Metadata

In [7]:
# Columns to keep from the site metadata CSV.
included_site_features = [
    'site_id', 'filename', 'elevation', 'lat', 'long',
    'koppen_sub', 'koppen_main', 'koppen_name', 'koppen_main_name',
    'c3c4', 'c4_percent',
]

# Read only those columns and make sure the frame has a fresh positional index.
site_metadata_df = (
    pd.read_csv(site_metadata_filename, usecols=included_site_features)
    .reset_index(drop=True)
)

print(f"size:{site_metadata_df.shape}")
site_metadata_df.head()

size:(286, 11)


Unnamed: 0,site_id,elevation,lat,long,koppen_sub,koppen_main,koppen_name,koppen_main_name,c3c4,c4_percent,filename
0,AR-SLu,,-33.4648,-66.4598,7,2,BSk,Arid,C3,67.08,data_full_half_hourly_raw_v0_1_AR-SLu.csv
1,AR-Vir,,-28.2395,-56.1886,14,3,Cfa,Temperate,C3,8.75,data_full_half_hourly_raw_v0_1_AR-Vir.csv
2,AT-Neu,970.0,47.11667,11.3175,26,4,Dfb,Cold,C3,0.0,data_full_half_hourly_raw_v0_1_AT-Neu.csv
3,AU-ASM,,-22.283,133.249,4,2,BWh,Arid,C3,100.0,data_full_half_hourly_raw_v0_1_AU-ASM.csv
4,AU-Ade,,-13.0769,131.1178,3,1,Aw,Tropical,C3,79.57,data_full_half_hourly_raw_v0_1_AU-Ade.csv


# Run Data Pipeline on All Sites

In [8]:
# Reference list of column names, grouped by name prefix for readability.
# Order is preserved exactly. No other cell visible in this notebook reads it.
all_features = [
    # TIMESTAMP_*
    'TIMESTAMP_START', 'TIMESTAMP_END',
    # TA_*
    'TA_F', 'TA_F_QC', 'TA_ERA',
    # SW_IN_*
    'SW_IN_POT', 'SW_IN_F', 'SW_IN_F_QC', 'SW_IN_ERA',
    # LW_IN_*
    'LW_IN_F', 'LW_IN_F_QC', 'LW_IN_ERA',
    # VPD_*
    'VPD_F', 'VPD_F_QC', 'VPD_ERA',
    # P_*
    'P_F', 'P_F_QC', 'P_ERA',
    # PA_*
    'PA_F', 'PA_F_QC', 'PA_ERA',
    'NETRAD', 'PPFD_IN',
    # G_*, LE_*, H_*
    'G_F_MDS', 'G_F_MDS_QC',
    'LE_F_MDS', 'LE_F_MDS_QC', 'LE_CORR',
    'H_F_MDS', 'H_F_MDS_QC', 'H_CORR',
    # NEE_*
    'NEE_VUT_REF', 'NEE_VUT_REF_QC', 'NEE_CUT_REF', 'NEE_CUT_REF_QC',
    # GPP_*
    'GPP_NT_VUT_REF', 'GPP_DT_VUT_REF', 'GPP_NT_CUT_REF', 'GPP_DT_CUT_REF',
    # RECO_*
    'RECO_NT_VUT_REF', 'RECO_DT_VUT_REF', 'RECO_NT_CUT_REF', 'RECO_DT_CUT_REF',
    # date/time parts
    'datetime', 'year', 'month', 'day', 'hour',
    'SITE_ID', 'date', 'NEE_VUT_REF_qa', 'SW_DIF',
    # index and band columns
    'EVI', 'NDVI', 'NIRv',
    'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
    'IGBP', 'koppen',
]

In [9]:
def data_cleanup(site_id_file_df, target, target_qc, features, raw_dir=None):
    """Load every site's CSV, filter its rows, and stack the results.

    For each row of ``site_id_file_df`` the site's CSV is read (requested
    columns only), ``datetime`` and ``date`` are parsed, and ``minute`` and
    ``site_id`` columns are added. Rows are then removed when:

    * ``SW_IN_ERA`` is zero or negative,
    * any target column is NA,
    * the first QC column in ``target_qc`` equals 3,
    * any remaining column is NA.

    The QC columns in ``target_qc`` are dropped from the result.

    Args:
        site_id_file_df: DataFrame with ``site_id`` and ``filename`` columns.
            Rows whose filename is missing or not a string are reported and
            skipped.
        target: list of target column names to read.
        target_qc: list of QC column names to read. The first one drives the
            "== 3" filter and all of them are dropped afterwards.
        features: list of feature column names to read. Must include
            ``datetime``, ``date`` and ``SW_IN_ERA``, which the function uses.
            Columns whose name contains ``"_QC"`` are cast to int.
        raw_dir: directory holding the per-site CSV files. Defaults to the
            module-level ``raw_data_dir`` when omitted.

    Returns:
        The concatenated DataFrame of all usable sites, or ``None`` when no
        site had a usable file.
    """
    # Use the arguments, not same-purpose globals, so the function works for
    # any feature/target selection and does not depend on cell execution order.
    features = list(features)
    target = list(target)
    target_qc = list(target_qc)
    qc_flags_features = [name for name in features if "_QC" in name]

    site_frames = []

    # Iterate through each site:
    for i, r in site_id_file_df.iterrows():
        # Type check first: truth-testing a non-string placeholder such as
        # pd.NA would raise instead of skipping the site.
        if not isinstance(r.filename, str) or not r.filename:
            print(f'\nERROR: {r.site_id} is mssing hourly data.')
            continue

        # Resolved lazily so the global is only needed when a file is read.
        base_dir = raw_data_dir if raw_dir is None else raw_dir
        local_filename = base_dir + os.sep + r.filename

        # Read only the requested columns from the site file
        site_df = pd.read_csv(local_filename, usecols=features + target + target_qc)
        site_df['datetime'] = pd.to_datetime(site_df['datetime'])
        site_df['date'] = pd.to_datetime(site_df['date'])
        site_df['minute'] = site_df['datetime'].dt.minute
        if qc_flags_features:
            site_df[qc_flags_features] = site_df[qc_flags_features].astype('int')
        site_df['site_id'] = r.site_id

        # Remove zero or negative SW
        site_df = site_df.drop(site_df[site_df['SW_IN_ERA'] <= 0].index)

        # Drop rows with NAs for Target Variable
        site_df = site_df.dropna(subset=target, axis=0)

        # Drop rows whose first QC column equals 3 (flagged as bad records in
        # the original notebook), then discard the QC columns themselves.
        if target_qc:
            site_df = site_df.drop(site_df[site_df[target_qc[0]] == 3].index)
            site_df = site_df.drop(target_qc, axis=1)

        # Drop rows with any NA
        site_df = site_df.dropna(axis=0)

        print(f"{i+1:3}.{r.site_id}: {site_df.shape}")
        site_frames.append(site_df)

    if not site_frames:
        return None

    # Concatenate once at the end; concatenating inside the loop re-copies all
    # previously loaded rows on every iteration.
    return pd.concat(site_frames)

def merg_site_metadata(data_df, site_metadata_df):
    """Attach the site metadata columns to every row of ``data_df``.

    Performs a left join on ``site_id``, so every row of ``data_df`` is kept
    and rows without a matching site get NA in the metadata columns.

    Args:
        data_df: DataFrame with a ``site_id`` column.
        site_metadata_df: per-site metadata, also keyed by ``site_id``.

    Returns:
        The joined DataFrame.
    """
    join_key = 'site_id'
    return data_df.merge(
        site_metadata_df,
        left_on=join_key,
        right_on=join_key,
        how='left',
    )

In [10]:
# Columns read from each site file.
included_features = [
    'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
    'datetime', 'year', 'month', 'day', 'hour', 'date',
    'EVI', 'NDVI', 'NIRv',
    'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
    'IGBP', 'koppen',
]
target_variable_qc = ['NEE_VUT_REF_QC']
target_variable = ['GPP_NT_VUT_REF']

# Get Train Dataset
data_df = data_cleanup(
    site_id_file_df=site_metadata_df[['site_id', 'filename']],
    target=target_variable,
    target_qc=target_variable_qc,
    features=included_features,
)
print(f"Data size after cleanup: {data_df.shape}")

# 'filename' is only needed to locate the CSVs, so it is left out of the join.
data_df = merg_site_metadata(data_df, site_metadata_df.drop(columns=['filename']))
print(f"Data size after after merged with site metadata: {data_df.shape}")

display(data_df.head())


  1.AR-SLu: (11274, 27)
  2.AR-Vir: (16714, 27)
  3.AT-Neu: (72353, 27)
  4.AU-ASM: (36657, 27)
  5.AU-Ade: (8936, 27)
  6.AU-Cpr: (35642, 27)
  7.AU-Cum: (18649, 27)
  8.AU-DaP: (36943, 27)
  9.AU-DaS: (50668, 27)
 10.AU-Dry: (37785, 27)
 11.AU-Emr: (19999, 27)
 12.AU-Fog: (18397, 27)
 13.AU-GWW: (16051, 27)
 14.AU-Gin: (24512, 27)
 15.AU-How: (74809, 27)
 16.AU-Lox: (7292, 27)
 17.AU-RDF: (13818, 27)
 18.AU-Rig: (31940, 27)
 19.AU-Rob: (6043, 27)
 20.AU-Stp: (46142, 27)
 21.AU-TTE: (21356, 27)

ERROR: AU-Tum is mssing hourly data.
 23.AU-Wac: (23493, 27)
 24.AU-Whr: (26820, 27)
 25.AU-Wom: (39035, 27)
 26.AU-Ync: (13974, 27)

ERROR: BR-Sa1 is mssing hourly data.
 28.BR-Sa3: (5176, 27)
 29.CA-Man: (34218, 27)
 30.CA-NS4: (18816, 27)
 31.CA-NS7: (25196, 27)
 32.CA-Oas: (71623, 27)
 33.CA-Obs: (73559, 27)
 34.CA-SF1: (16717, 27)
 35.CA-SF3: (20640, 27)
 36.CA-TP1: (56949, 27)
 37.CA-TP2: (16145, 27)
 38.CA-TP3: (64247, 27)
 39.CA-TPD: (21323, 27)
 40.CG-Tch: (168, 27)
 41.CH-Oe1: (48974

Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,GPP_NT_VUT_REF,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,koppen_main,koppen_name,koppen_main_name,c3c4,c4_percent
0,22.185,49.06,363.949,7.694,0.0,95.309,0.3401,2009-12-21 06:30:00,2009,12,21,6,2009-12-21,0.27248,0.47574,0.11142,0.0832,0.2342,0.0464,0.0781,0.2884,0.2482,0.1604,MF,Arid,30,AR-SLu,,-33.4648,-66.4598,7,2,BSk,Arid,C3,67.08
1,22.852,103.302,363.949,8.299,0.0,95.348,6.23618,2009-12-21 07:00:00,2009,12,21,7,2009-12-21,0.27248,0.47574,0.11142,0.0832,0.2342,0.0464,0.0781,0.2884,0.2482,0.1604,MF,Arid,0,AR-SLu,,-33.4648,-66.4598,7,2,BSk,Arid,C3,67.08
2,23.519,157.819,363.949,8.904,0.0,95.386,7.54983,2009-12-21 07:30:00,2009,12,21,7,2009-12-21,0.27248,0.47574,0.11142,0.0832,0.2342,0.0464,0.0781,0.2884,0.2482,0.1604,MF,Arid,30,AR-SLu,,-33.4648,-66.4598,7,2,BSk,Arid,C3,67.08
3,24.186,211.68,363.949,9.509,0.0,95.425,13.0622,2009-12-21 08:00:00,2009,12,21,8,2009-12-21,0.27248,0.47574,0.11142,0.0832,0.2342,0.0464,0.0781,0.2884,0.2482,0.1604,MF,Arid,0,AR-SLu,,-33.4648,-66.4598,7,2,BSk,Arid,C3,67.08
4,24.853,263.963,363.949,10.113,0.0,95.464,17.6049,2009-12-21 08:30:00,2009,12,21,8,2009-12-21,0.27248,0.47574,0.11142,0.0832,0.2342,0.0464,0.0781,0.2884,0.2482,0.1604,MF,Arid,30,AR-SLu,,-33.4648,-66.4598,7,2,BSk,Arid,C3,67.08


In [11]:
# Summary statistics of the merged dataset; useful for spotting implausible
# ranges (the captured output shows EVI with a min of about -745.9).
data_df.describe()

Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,GPP_NT_VUT_REF,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,minute,elevation,lat,long,koppen_sub,koppen_main,c4_percent
count,12568371.0,12568371.0,12568371.0,12568371.0,12568371.0,12568371.0,12568371.0,12568371.0,12568371.0,12568371.0,12568371.0,12568371.0,12568371.0,12568371.0,12568371.0,12568371.0,12568371.0,12568371.0,12568371.0,12568371.0,12568371.0,12568371.0,11649078.0,12568371.0,12568371.0,12568371.0,12568371.0,12568371.0
mean,14.07966,334.29476,323.35014,7.8594,0.04412,95.21526,7.00687,2010.52958,6.56535,15.75316,11.87929,0.32403,0.56984,0.14942,0.08357,0.26335,0.05447,0.0822,0.26196,0.18487,0.10345,14.99031,572.0153,42.7132,-23.94728,18.74177,3.41511,6.34569
std,9.51895,255.46547,50.02858,8.22979,0.20169,7.17344,8.38258,4.99838,2.89526,8.80578,3.92622,1.46744,0.23238,0.08912,0.09849,0.09492,0.09556,0.09347,0.07363,0.08415,0.07233,15.0,672.08006,18.05087,69.90465,8.56902,0.81154,17.69283
min,-47.961,0.001,99.71,0.0,0.0,59.006,-71.4779,2001.0,1.0,1.0,0.0,-745.90909,-1.0,-0.11533,0.0,0.0,0.0,0.0,0.0,0.0011,0.0,0.0,-9.0,-37.4259,-157.4089,1.0,1.0,0.0
25%,8.024,108.633,291.916,2.161,0.0,94.013,0.73239,2006.0,4.0,8.0,9.0,0.21621,0.40848,0.08389,0.0332,0.1977,0.0187,0.0427,0.2063,0.1219,0.0519,0.0,161.5,38.70184,-93.0898,8.0,3.0,0.0
50%,14.469,288.7,326.888,5.17,0.0,97.879,3.99393,2011.0,7.0,16.0,12.0,0.31095,0.62267,0.13208,0.0548,0.2469,0.0293,0.0591,0.2637,0.1749,0.0802,0.0,272.0,45.74048,4.96859,25.0,4.0,0.0
75%,20.68,523.377,357.662,10.599,0.0,99.724,11.1117,2014.0,9.0,23.0,15.0,0.43137,0.7582,0.20322,0.0941,0.312,0.0476,0.0839,0.3178,0.2319,0.1328,30.0,657.0,50.89306,13.10177,26.0,4.0,1.47
max,45.052,1221.683,537.808,85.178,23.923,105.491,98.492,2020.0,12.0,31.0,23.0,164.375,1.0,0.55736,0.9337,0.9175,0.9367,0.9394,0.5946,0.6104,0.5202,30.0,3197.0,78.186,161.34143,29.0,5.0,100.0


In [12]:
# Number of distinct sites that survived cleanup, followed by the ids themselves.
print(f"site count: {len(data_df['site_id'].unique())}")
data_df['site_id'].unique()

site count: 271


array(['AR-SLu', 'AR-Vir', 'AT-Neu', 'AU-ASM', 'AU-Ade', 'AU-Cpr',
       'AU-Cum', 'AU-DaP', 'AU-DaS', 'AU-Dry', 'AU-Emr', 'AU-Fog',
       'AU-GWW', 'AU-Gin', 'AU-How', 'AU-Lox', 'AU-RDF', 'AU-Rig',
       'AU-Rob', 'AU-Stp', 'AU-TTE', 'AU-Wac', 'AU-Whr', 'AU-Wom',
       'AU-Ync', 'BR-Sa3', 'CA-Man', 'CA-NS4', 'CA-NS7', 'CA-Oas',
       'CA-Obs', 'CA-SF1', 'CA-SF3', 'CA-TP1', 'CA-TP2', 'CA-TP3',
       'CA-TPD', 'CG-Tch', 'CH-Oe1', 'CN-Cha', 'CN-Cng', 'CN-Dan',
       'CN-Din', 'CN-Du2', 'CN-Du3', 'CN-Ha2', 'CN-HaM', 'CN-Qia',
       'CN-Sw2', 'CZ-BK2', 'DE-Lkb', 'DE-Lnf', 'DE-Seh', 'DE-SfN',
       'DE-Spw', 'DE-Zrk', 'DK-Eng', 'DK-Fou', 'ES-Amo', 'ES-LgS',
       'ES-Ln2', 'FI-Jok', 'FI-Lom', 'FI-Sod', 'FR-LBr', 'FR-Pue',
       'GH-Ank', 'GL-NuF', 'GL-ZaF', 'GL-ZaH', 'IT-CA1', 'IT-CA2',
       'IT-CA3', 'IT-Col', 'IT-Cpz', 'IT-Isp', 'IT-La2', 'IT-Noe',
       'IT-PT1', 'IT-Ro1', 'IT-Ro2', 'IT-SRo', 'JP-MBF', 'JP-SMF',
       'MY-PSO', 'NL-Hor', 'PA-SPn', 'PA-SPs', 'RU-Che', 'RU-C

# Upload Data to Azure Storage Blob as Parquet
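**Run with caution:** this cell uploads with `overwrite=True`, so an existing blob with the same name is expected to be replaced.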

In [None]:
# Upload to Azure Storage Blob
# ref: https://stackoverflow.com/a/54666079
# Serialize the DataFrame into an in-memory buffer instead of a file on disk.
# NOTE(review): `pd` is pyspark.pandas in this notebook — confirm that its
# to_parquet accepts a BytesIO target and the `engine` argument.
parquet_file = BytesIO()
data_df.to_parquet(parquet_file, engine='pyarrow')
# Rewind so the upload reads the buffer from the beginning.
parquet_file.seek(0)

# AzStorageClient is a project-local helper; presumably it reads credentials
# from az_cred_file — TODO confirm against CloudIO.AzStorageClient.
azStorageClient = AzStorageClient(az_cred_file)
# overwrite=True is passed explicitly; presumably this replaces an existing blob — confirm.
azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

File uploaded to all-sites-data/all_site_trim_v_0_raw.parquet


## Check for NA Values (the cleaned data should contain none)

In [None]:
# Per-column NA count plus its share of all rows.
# Note: "percentage" holds a fraction in [0, 1], not a value scaled to 100.
total_record_count = len(data_df)
na_df = (
    data_df.isna().sum()
    .to_frame(name="count")
    .assign(percentage=lambda counts: counts["count"] / total_record_count)
)

In [None]:
# Show only the columns that still contain NAs, worst first.
na_df[na_df['count'].ne(0)].sort_values(by="percentage", ascending=False)

Unnamed: 0,count,percentage
elevation,919293,0.07314
