# Notebook Setup

In [1]:
# True when the notebook kernel is running inside Google Colab.
IN_COLLAB = 'google.colab' in str(get_ipython())

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"

In [2]:
# Colab only: mount Google Drive, since MY_HOME_ABS_PATH lives under /content/drive/.
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

Mounted at /content/drive/


## Import Modules

In [3]:
# install required modules quietly
required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m64.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.6/96.6 KB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
flask 1.1.4 requires click<8.0,>=5.1, but you have click 8.1.3 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [4]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import math
import json

import pyspark.pandas as pd
from calendar import monthrange
from datetime import datetime
from io import BytesIO

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load local custom modules
import sys
if IN_COLLAB:
  # In Colab, switch to the project root and put the tools directory first on the import path.
  os.chdir(MY_HOME_ABS_PATH)
  sys.path.insert(0,os.path.abspath("./code/src/tools"))
else:
  # Elsewhere, the relative tools path is resolved against the current working directory.
  sys.path.append(os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
# NOTE(review): the star import hides which names this notebook takes from `edahelpers`.
from edahelpers import *

# Display options.
# NOTE(review): `pd` is `pyspark.pandas` in this notebook; confirm each option name below is supported there.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [5]:
# Import SparkSession
from pyspark.sql import SparkSession
# Create (or reuse) a Spark session with a local master ("local[*]")
spark = SparkSession.builder.master("local[*]").getOrCreate()
# Show the Spark session information as the cell output
spark

## Define Local Files System Constants

In [6]:
# Local directory layout, all rooted at the project home.
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.sep.join([root_dir, '.tmp'])
data_dir = os.sep.join([root_dir, 'data'])
cred_dir = os.sep.join([root_dir, '.cred'])
az_cred_file = os.sep.join([cred_dir, 'azblobcred.json'])

# Raw half-hourly files: a fixed Drive folder in Colab, the temp directory otherwise.
raw_data_dir = (
  "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data/half_hourly_data"
  if IN_COLLAB
  else tmp_dir
)

site_metadata_filename = os.sep.join([data_dir, 'site-metadata.csv'])

# Output
tag = "0"
container = "test-sites-data"
blob_name = f"test-sites-data_v_{tag}.parquet" #Advisor suggested features only

In [7]:
# Selected Test Sites
# Each entry is annotated with a vegetation class code and a main climate label
# (presumably the values of the `IGBP` and `koppen` columns; verify against the data).
test_sites = ["US-GLE", # ENF, Cold
              "US-AR1", # GRA, Temperate
              "US-Seg", # GRA, Arid
              "US-FR2", # WSA, Temperate
              "ES-LM2", # WSA, Arid
              "CA-Cbo", # DBF, Cold
              "FR-Lam", # CRO, Temperate
              "IT-Cpz", # EBF, Temperate
              "CN-Cha", # MF, Cold
              "IT-Lsn", # OSH, Temperate
              ]

# Get Selected Site Metadata

In [8]:
# Columns to keep from the site metadata file
included_site_features = [
    'site_id', 'filename', 'elevation', 'lat', 'long',
    'koppen_sub', 'koppen_main', 'koppen_name', 'koppen_main_name',
    'c3c4', 'c4_percent',
]

# Read the metadata, then restrict it to the selected test sites
site_metadata_df = pd.read_csv(site_metadata_filename, usecols=included_site_features)
is_test_site = site_metadata_df['site_id'].isin(test_sites)
site_metadata_df = site_metadata_df.loc[is_test_site]
print(f"size:{site_metadata_df.shape}")

# Renumber the rows from zero, discarding the original index
site_metadata_df = site_metadata_df.reset_index(drop=True)
site_metadata_df

size:(10, 11)


Unnamed: 0,site_id,elevation,lat,long,koppen_sub,koppen_main,koppen_name,koppen_main_name,c3c4,c4_percent,filename
0,CN-Cha,,42.4025,128.0958,22,4,Dwb,Cold,C3,12.17,data_full_half_hourly_raw_v0_1_CN-Cha.csv
1,IT-Cpz,68.0,41.70525,12.37611,8,3,Csa,Temperate,C3,0.0,data_full_half_hourly_raw_v0_1_IT-Cpz.csv
2,US-GLE,3197.0,41.36653,-106.2399,27,4,Dfc,Cold,C3,0.16,data_full_half_hourly_raw_v0_1_US-GLE.csv
3,CA-Cbo,120.0,44.3167,-79.9333,26,4,Dfb,Cold,C3,1.09,data_full_half_hourly_raw_v0_1_CA-Cbo.csv
4,US-AR1,611.0,36.4267,-99.42,14,3,Cfa,Temperate,C4,27.54,data_full_half_hourly_raw_v0_1_US-AR1.csv
5,US-FR2,271.9,29.9495,-97.9962,14,3,Cfa,Temperate,C3,18.37,data_full_half_hourly_raw_v0_1_US-FR2.csv
6,US-Seg,1622.0,34.3623,-106.7019,5,2,BWk,Arid,mix,0.24,data_full_half_hourly_raw_v0_1_US-Seg.csv
7,ES-LM2,270.0,39.93459,-5.77588,7,2,BSk,Arid,C3,1.3,data_full_half_hourly_raw_v0_1_ES-LM2.csv
8,FR-Lam,180.0,43.49644,1.23788,14,3,Cfa,Temperate,rotation,3.1,data_full_half_hourly_raw_v0_1_FR-Lam.csv
9,IT-Lsn,1.0,45.74048,12.7503,14,3,Cfa,Temperate,C3,2.7,data_full_half_hourly_raw_v0_1_IT-Lsn.csv


# Run Data Pipeline on Gold Sample Sites

In [9]:
# Reference list of column names (presumably every column available in the raw
# half-hourly site files; verify). It is not used by the visible cells below,
# which read only `included_features` plus the target and its QC flag.
all_features = ['TIMESTAMP_START', 'TIMESTAMP_END', 'TA_F', 'TA_F_QC', 'TA_ERA',
       'SW_IN_POT', 'SW_IN_F', 'SW_IN_F_QC', 'SW_IN_ERA', 'LW_IN_F',
       'LW_IN_F_QC', 'LW_IN_ERA', 'VPD_F', 'VPD_F_QC', 'VPD_ERA', 'P_F',
       'P_F_QC', 'P_ERA', 'PA_F', 'PA_F_QC', 'PA_ERA', 'NETRAD', 'PPFD_IN',
       'G_F_MDS', 'G_F_MDS_QC', 'LE_F_MDS', 'LE_F_MDS_QC', 'LE_CORR',
       'H_F_MDS', 'H_F_MDS_QC', 'H_CORR', 'NEE_VUT_REF', 'NEE_VUT_REF_QC',
       'NEE_CUT_REF', 'NEE_CUT_REF_QC', 'GPP_NT_VUT_REF', 'GPP_DT_VUT_REF',
       'GPP_NT_CUT_REF', 'GPP_DT_CUT_REF', 'RECO_NT_VUT_REF',
       'RECO_DT_VUT_REF', 'RECO_NT_CUT_REF', 'RECO_DT_CUT_REF', 'datetime',
       'year', 'month', 'day', 'hour', 'SITE_ID', 'date', 'NEE_VUT_REF_qa',
       'SW_DIF', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6',
       'b7', 'IGBP', 'koppen']

In [10]:
def data_cleanup(site_id_file_df, target, target_qc, features, data_dir=None):
    """Load each site's raw CSV, drop unusable records, and stack the results.

    Args:
        site_id_file_df: DataFrame with one row per site and the columns
            `site_id` and `filename`.
        target: list of target column name(s); rows missing any of them are
            dropped.
        target_qc: list of quality-flag column name(s) for the target. Rows
            whose first flag equals 3 are dropped, then the flag column(s)
            are removed from the result.
        features: list of feature columns to read from each file. `datetime`,
            `date` and `SW_IN_ERA` must be among the columns read.
        data_dir: directory that holds the site files. Defaults to the
            notebook-level `raw_data_dir`.

    Returns:
        One DataFrame holding the cleaned rows of every site that could be
        loaded, with the derived `minute` and `site_id` columns added, or
        None when no site had a usable filename.
    """
    if data_dir is None:
        data_dir = raw_data_dir

    # Use the arguments rather than same-purpose notebook globals, so the
    # function works for any feature/target selection passed by the caller.
    features, target, target_qc = list(features), list(target), list(target_qc)
    columns_to_read = features + target + target_qc
    qc_flags_features = [s for s in features if "_QC" in s]

    site_dfs = []

    # Iterate through each site:
    for _, r in site_id_file_df.iterrows():
        # Check the type first so missing values (None / NaN / NA) are never truth-tested.
        if not isinstance(r.filename, str) or not r.filename:
            print(f'\nERROR: {r.site_id} is mssing hourly data.')
            continue

        # Read only the requested columns from the site file
        local_filename = data_dir + os.sep + r.filename
        site_df = pd.read_csv(local_filename, usecols=columns_to_read)
        site_df['datetime'] = pd.to_datetime(site_df['datetime'])
        site_df['date'] = pd.to_datetime(site_df['date'])
        site_df['minute'] = site_df['datetime'].dt.minute
        if len(qc_flags_features) != 0:
            site_df[qc_flags_features] = site_df[qc_flags_features].astype('int')
        site_df['site_id'] = r.site_id

        # Remove zero or negative SW
        site_df.drop(site_df[site_df['SW_IN_ERA'] <= 0].index, inplace=True)

        # Drop rows with NAs for Target Variable
        site_df.dropna(subset=target, axis=0, inplace=True)

        # Drop rows whose target quality flag is 3 (bad record), then discard the flag column(s)
        if target_qc:
            site_df.drop(site_df[site_df[target_qc[0]] == 3].index, inplace=True)
            site_df.drop(target_qc, axis=1, inplace=True)

        # Drop rows with any NA
        site_df.dropna(axis=0, inplace=True)

        print(f"{r.site_id}: {site_df.shape}")
        site_dfs.append(site_df)

    if not site_dfs:
        return None

    # Concatenate once at the end instead of re-copying the accumulated frame per site.
    return pd.concat(site_dfs)

def merg_site_metadata(data_df, site_metadata_df):
    """Left-join the per-site metadata columns onto the data, matching on `site_id`.

    Every row of `data_df` is kept; a site with no metadata row gets missing
    values in the metadata columns.
    """
    join_key = 'site_id'
    merged_df = data_df.merge(
        site_metadata_df,
        how='left',
        left_on=join_key,
        right_on=join_key,
    )
    return merged_df

In [11]:
# Feature columns read from every site file
included_features = [
    'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
    'datetime', 'year', 'month', 'day', 'hour', 'date',
    'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
    'IGBP', 'koppen',
]
target_variable = ['GPP_NT_VUT_REF']
target_variable_qc = ['NEE_VUT_REF_QC']

# Clean and stack the raw data of the selected sites
site_files_df = site_metadata_df[['site_id','filename']]
data_df = data_cleanup(site_files_df, target_variable, target_variable_qc, included_features)
print(f"Data size after cleanup: {data_df.shape}")

# Attach the site-level attributes (everything except the file name)
site_attributes_df = site_metadata_df.drop(['filename'], axis=1)
data_df = merg_site_metadata(data_df, site_attributes_df)
print(f"Data size after after merged with site metadata: {data_df.shape}")
display(data_df.head())


CN-Cha: (16228, 27)
IT-Cpz: (59175, 27)
US-GLE: (54687, 27)
CA-Cbo: (79273, 27)
US-AR1: (28956, 27)
US-FR2: (30426, 27)
US-Seg: (91884, 27)
ES-LM2: (58806, 27)
FR-Lam: (115812, 27)
IT-Lsn: (40182, 27)
Data size after cleanup: (575429, 27)
Data size after after merged with site metadata: (575429, 36)


Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,GPP_NT_VUT_REF,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,koppen_main,koppen_name,koppen_main_name,c3c4,c4_percent
0,-4.691,29.009,278.697,1.171,0.064,94.149,0.1353,2003-03-13 06:00:00,2003,3,13,6,2003-03-13,0.10572,0.15259,0.0302,0.1455,0.1979,0.1109,0.1448,0.155,0.101,0.0658,MF,Cold,0,CN-Cha,,42.4025,128.0958,22,4,Dwb,Cold,C3,12.17
1,-4.48,62.796,278.697,1.279,0.064,94.163,-0.473,2003-03-13 06:30:00,2003,3,13,6,2003-03-13,0.10572,0.15259,0.0302,0.1455,0.1979,0.1109,0.1448,0.155,0.101,0.0658,MF,Cold,30,CN-Cha,,42.4025,128.0958,22,4,Dwb,Cold,C3,12.17
2,-4.269,95.273,278.697,1.388,0.0,94.177,-0.46365,2003-03-13 07:00:00,2003,3,13,7,2003-03-13,0.10572,0.15259,0.0302,0.1455,0.1979,0.1109,0.1448,0.155,0.101,0.0658,MF,Cold,0,CN-Cha,,42.4025,128.0958,22,4,Dwb,Cold,C3,12.17
3,-4.059,125.886,278.697,1.496,0.0,94.191,-0.43926,2003-03-13 07:30:00,2003,3,13,7,2003-03-13,0.10572,0.15259,0.0302,0.1455,0.1979,0.1109,0.1448,0.155,0.101,0.0658,MF,Cold,30,CN-Cha,,42.4025,128.0958,22,4,Dwb,Cold,C3,12.17
4,-3.848,385.607,226.914,1.604,0.0,94.205,1.2968,2003-03-13 08:00:00,2003,3,13,8,2003-03-13,0.10572,0.15259,0.0302,0.1455,0.1979,0.1109,0.1448,0.155,0.101,0.0658,MF,Cold,0,CN-Cha,,42.4025,128.0958,22,4,Dwb,Cold,C3,12.17


In [12]:
# Summary statistics of the numeric columns, as a sanity check on value ranges
data_df.describe()

Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,GPP_NT_VUT_REF,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,minute,elevation,lat,long,koppen_sub,koppen_main,c4_percent
count,575429.0,575429.0,575429.0,575429.0,575429.0,575429.0,575429.0,575429.0,575429.0,575429.0,575429.0,575429.0,575429.0,575429.0,575429.0,575429.0,575429.0,575429.0,575429.0,575429.0,575429.0,575429.0,559201.0,575429.0,575429.0,575429.0,575429.0,575429.0
mean,15.87811,371.66242,327.43647,10.00838,0.04451,93.77548,5.96719,2010.85302,6.60275,15.77924,11.97021,0.30026,0.50196,0.13399,0.0971,0.26551,0.05552,0.08818,0.28358,0.22491,0.14341,15.00571,715.54907,40.45408,-42.89669,14.34476,2.99913,3.84936
std,9.45291,263.22687,53.82894,9.50595,0.2121,9.79924,7.8872,4.92395,3.03538,8.82158,3.71476,0.13962,0.22899,0.07661,0.07166,0.06554,0.05709,0.058,0.06519,0.09441,0.09571,15.00001,979.83832,4.33831,58.56188,7.69604,0.7231,6.93971
min,-30.687,0.001,107.639,0.0,0.0,66.031,-49.3833,2001.0,1.0,1.0,4.0,-0.09768,-0.02273,-0.01764,0.0148,0.084,0.005,0.0193,0.0913,0.0299,0.0123,0.0,1.0,29.9495,-106.7019,5.0,2.0,0.0
25%,9.674,136.921,288.878,3.344,0.0,84.935,0.44793,2007.0,4.0,8.0,9.0,0.19508,0.33521,0.07072,0.0458,0.2139,0.0247,0.051,0.237,0.1558,0.0697,0.0,120.0,36.4267,-106.2399,7.0,2.0,0.24
50%,16.302,339.359,333.216,6.949,0.0,98.668,2.98209,2011.0,7.0,16.0,12.0,0.29302,0.52421,0.12272,0.0755,0.2655,0.0389,0.0731,0.2885,0.2074,0.1147,30.0,180.0,41.70525,-5.77588,14.0,3.0,1.3
75%,22.616,578.559,368.875,13.459,0.0,99.82,9.46656,2015.0,9.0,23.0,15.0,0.39185,0.69042,0.18176,0.1278,0.3112,0.0635,0.1036,0.3305,0.2922,0.1885,30.0,1622.0,43.49644,1.23788,22.0,4.0,3.1
max,44.087,1050.626,473.085,80.091,13.035,103.921,83.613,2020.0,12.0,31.0,23.0,1.41056,0.90943,0.38458,0.812,0.7762,0.764,0.8053,0.4418,0.4792,0.4347,30.0,3197.0,45.74048,128.0958,27.0,4.0,27.54


In [13]:
# List the sites present in the merged data
data_df.site_id.unique()

array(['CN-Cha', 'IT-Cpz', 'US-GLE', 'CA-Cbo', 'US-AR1', 'US-FR2',
       'US-Seg', 'ES-LM2', 'FR-Lam', 'IT-Lsn'], dtype=object)

## Check for Missing Values (There Should Be None)

In [14]:
# Count missing values per column and express each count relative to the row total.
# Note: despite its name, "percentage" holds a fraction in [0, 1], not a value out of 100.
total_record_count = data_df.shape[0]
na_df = pd.DataFrame(data_df.isna().sum()).rename(columns={0: "count"})
na_df["percentage"] = na_df["count"] / total_record_count

In [15]:
# Show only the columns that contain missing values, largest share first
na_df.loc[(na_df['count'] != 0)].sort_values("percentage", ascending=False)

Unnamed: 0,count,percentage
elevation,16228,0.0282


In [16]:
# Drop rows with NA
# NOTE(review): in the recorded run the only column with NAs here is `elevation`
# (16,228 rows). That equals the CN-Cha row count, and CN-Cha has no elevation in
# the site metadata, so this drop appears to remove the whole CN-Cha site.
# Confirm that this is intended.
data_df.dropna(axis=0, inplace=True)
print(f"Data size after after final drop: {data_df.shape}")

Data size after after final drop: (559201, 36)


In [17]:
# Re-check: every column should now report zero missing values
data_df.isna().sum()

TA_ERA              0
SW_IN_ERA           0
LW_IN_ERA           0
VPD_ERA             0
P_ERA               0
PA_ERA              0
GPP_NT_VUT_REF      0
datetime            0
year                0
month               0
day                 0
hour                0
date                0
EVI                 0
NDVI                0
NIRv                0
b1                  0
b2                  0
b3                  0
b4                  0
b5                  0
b6                  0
b7                  0
IGBP                0
koppen              0
minute              0
site_id             0
elevation           0
lat                 0
long                0
koppen_sub          0
koppen_main         0
koppen_name         0
koppen_main_name    0
c3c4                0
c4_percent          0
dtype: int64

# Upload Data to Azure Storage Blob as Parquet
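**Run with caution: the upload below is called with `overwrite=True`, so it replaces any existing blob with the same name.**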

In [18]:
# Upload to Azure Storage Blob
# ref: https://stackoverflow.com/a/54666079
# Serialize the dataframe to Parquet into an in-memory buffer instead of a local file.
parquet_file = BytesIO()
data_df.to_parquet(parquet_file, engine='pyarrow')
# Rewind the buffer so it is read from the beginning.
parquet_file.seek(0)

# The client is built from the credential file at `az_cred_file` (presumably Azure
# storage credentials in JSON; see AzStorageClient). overwrite=True is passed, so an
# existing blob with this name is presumably replaced.
azStorageClient = AzStorageClient(az_cred_file)
azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

File uploaded to test-sites-data/test-sites-data_v_0.parquet
