# Notebook Setup

In [1]:
# True when the kernel runs inside Google Colab, detected by looking for the
# Colab package name in the string form of the active IPython shell.
IN_COLLAB = 'google.colab' in str(get_ipython())

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
# Absolute path of the project checkout; later cells use it as the project root.
MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"

In [2]:
# Mount Google Drive at /content/drive/ so the project files stored there are
# reachable from the notebook. Only done when running in Colab.
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Import Modules

In [3]:
# install required modules quietly
required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

In [4]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import math
import json

import pyspark.pandas as pd
from calendar import monthrange
from datetime import datetime
from io import BytesIO

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load local custom modules
import sys

# In Colab, move into the project root first so the relative tools path below
# is resolved against the project checkout.
if IN_COLLAB:
  os.chdir(MY_HOME_ABS_PATH)

_tools_dir = os.path.abspath("./code/src/tools")
if IN_COLLAB:
  # Front of the search path: project modules win over same-named installed ones.
  sys.path.insert(0, _tools_dir)
else:
  # End of the search path: project modules are looked up last.
  sys.path.append(_tools_dir)

# NOTE(review): the star import hides where names come from. The pipeline cell
# calls data_cleanup, merge_site_metadata and check_and_drop_na, which are not
# defined in this notebook -- presumably they come from data_pipeline_lib.
from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

# Notebook-wide DataFrame display preferences: up to 500 rows, no limit on the
# number of columns, and floats rendered with five decimal places.
# NOTE(review): `pd` is bound to pyspark.pandas in this notebook; confirm that
# it accepts these pandas option names.
for _opt_name, _opt_value in (
    ('display.max_rows', 500),
    ('display.max_columns', None),
    ('display.float_format', lambda x: '%.5f' % x),
):
  pd.set_option(_opt_name, _opt_value)

In [5]:
# Spark entry point used by the notebook.
from pyspark.sql import SparkSession

# Reuse the active session if one exists, otherwise start one with the
# "local[*]" master setting.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Last expression of the cell: renders the session information.
spark

## Define Local Files System Constants

In [6]:
# Local directory layout, derived from the project root chosen in the setup cell.
# os.path.join replaces manual `+ os.sep +` concatenation so that a trailing
# separator on MY_HOME_ABS_PATH (which users are told to edit) does not yield
# doubled separators.
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.path.join(root_dir, '.tmp')
raw_data_dir = tmp_dir  # default location of the raw per-site files
data_dir = os.path.join(root_dir, 'data')
cred_dir = os.path.join(root_dir, '.cred')
az_cred_file = os.path.join(cred_dir, 'azblobcred.json')

# In Colab the raw files are read from a separate Drive folder instead of .tmp.
if IN_COLLAB:
  raw_data_dir = "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data/half_hourly_data"

site_metadata_filename = os.path.join(data_dir, 'site-metadata.csv')

# Output
tag = "0"  # version suffix embedded in the blob name
container = "test-sites-data"
blob_name = f"test-sites-data_v_{tag}.parquet" #Advisor suggested features only

In [7]:
# Selected Test Sites
# The trailing comment on each entry gives the site's vegetation class and
# main climate zone.
test_sites = [
    "US-GLE",  # ENF, Cold
    "US-AR1",  # GRA, Temperate
    "US-Seg",  # GRA, Arid
    "US-FR2",  # WSA, Temperate
    "ES-LM2",  # WSA, Arid
    "CA-Cbo",  # DBF, Cold
    "FR-Lam",  # CRO, Temperate
    "IT-Cpz",  # EBF, Temperate
    "CN-Cha",  # MF, Cold
    "IT-Lsn",  # OSH, Temperate
]

# Get Selected Site Metadata

In [8]:
# Load site metadata, reading only the columns the pipeline needs.
included_site_features = [
    'site_id', 'filename', 'elevation', 'lat', 'long',
    'koppen_sub', 'koppen_main', 'koppen_name', 'koppen_main_name',
    'c3c4', 'c4_percent',
]
all_site_metadata_df = pd.read_csv(site_metadata_filename, usecols=included_site_features)

# Keep only the selected test sites and renumber the rows from 0.
is_test_site = all_site_metadata_df['site_id'].isin(test_sites)
site_metadata_df = all_site_metadata_df.loc[is_test_site].reset_index(drop=True)
print(f"size:{site_metadata_df.shape}")
site_metadata_df

size:(10, 11)


Unnamed: 0,site_id,elevation,lat,long,koppen_sub,koppen_main,koppen_name,koppen_main_name,c3c4,c4_percent,filename
0,CN-Cha,,42.4025,128.0958,22,4,Dwb,Cold,C3,12.17,data_full_half_hourly_raw_v0_1_CN-Cha.csv
1,IT-Cpz,68.0,41.70525,12.37611,8,3,Csa,Temperate,C3,0.0,data_full_half_hourly_raw_v0_1_IT-Cpz.csv
2,US-GLE,3197.0,41.36653,-106.2399,27,4,Dfc,Cold,C3,0.16,data_full_half_hourly_raw_v0_1_US-GLE.csv
3,CA-Cbo,120.0,44.3167,-79.9333,26,4,Dfb,Cold,C3,1.09,data_full_half_hourly_raw_v0_1_CA-Cbo.csv
4,US-AR1,611.0,36.4267,-99.42,14,3,Cfa,Temperate,C4,27.54,data_full_half_hourly_raw_v0_1_US-AR1.csv
5,US-FR2,271.9,29.9495,-97.9962,14,3,Cfa,Temperate,C3,18.37,data_full_half_hourly_raw_v0_1_US-FR2.csv
6,US-Seg,1622.0,34.3623,-106.7019,5,2,BWk,Arid,mix,0.24,data_full_half_hourly_raw_v0_1_US-Seg.csv
7,ES-LM2,270.0,39.93459,-5.77588,7,2,BSk,Arid,C3,1.3,data_full_half_hourly_raw_v0_1_ES-LM2.csv
8,FR-Lam,180.0,43.49644,1.23788,14,3,Cfa,Temperate,rotation,3.1,data_full_half_hourly_raw_v0_1_FR-Lam.csv
9,IT-Lsn,1.0,45.74048,12.7503,14,3,Cfa,Temperate,C3,2.7,data_full_half_hourly_raw_v0_1_IT-Lsn.csv


# Run Data Pipeline on Gold Sample Sites

In [9]:
# Catalogue of column names kept for reference; no other visible cell reads it.
# Presumably this is the full column set of the raw per-site files -- TODO confirm.
all_features = [
    'TIMESTAMP_START', 'TIMESTAMP_END',
    'TA_F', 'TA_F_QC', 'TA_ERA',
    'SW_IN_POT', 'SW_IN_F', 'SW_IN_F_QC', 'SW_IN_ERA',
    'LW_IN_F', 'LW_IN_F_QC', 'LW_IN_ERA',
    'VPD_F', 'VPD_F_QC', 'VPD_ERA',
    'P_F', 'P_F_QC', 'P_ERA',
    'PA_F', 'PA_F_QC', 'PA_ERA',
    'NETRAD', 'PPFD_IN',
    'G_F_MDS', 'G_F_MDS_QC',
    'LE_F_MDS', 'LE_F_MDS_QC', 'LE_CORR',
    'H_F_MDS', 'H_F_MDS_QC', 'H_CORR',
    'NEE_VUT_REF', 'NEE_VUT_REF_QC', 'NEE_CUT_REF', 'NEE_CUT_REF_QC',
    'GPP_NT_VUT_REF', 'GPP_DT_VUT_REF', 'GPP_NT_CUT_REF', 'GPP_DT_CUT_REF',
    'RECO_NT_VUT_REF', 'RECO_DT_VUT_REF', 'RECO_NT_CUT_REF', 'RECO_DT_CUT_REF',
    'datetime', 'year', 'month', 'day', 'hour',
    'SITE_ID', 'date', 'NEE_VUT_REF_qa',
    'SW_DIF', 'EVI', 'NDVI', 'NIRv',
    'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
    'IGBP', 'koppen',
]

In [None]:
# Define features and target variables of the data pipelines.
# These three lists are passed to data_cleanup in the pipeline cell.
included_features = [
    # columns carrying the `_ERA` suffix
    'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
    # time columns
    'datetime', 'year', 'month', 'day', 'hour', 'date',
    # index columns followed by b1..b7
    'EVI', 'NDVI', 'NIRv',
    *[f'b{band}' for band in range(1, 8)],
    # per-site categorical columns
    'IGBP', 'koppen',
]
target_variable_qc = ['NEE_VUT_REF_QC']
target_variable = ['GPP_NT_VUT_REF']

In [10]:
# Build the test-site dataset

# initial data clean and feature selections from raw data
data_df = data_cleanup(raw_data_dir, site_metadata_df[['site_id','filename']],
                  target_variable, target_variable_qc,
                  included_features)
print(f"Data size after cleanup: {data_df.shape}")

# Merge with site metadata
data_df = merge_site_metadata(data_df, site_metadata_df.drop(['filename'], axis=1))
print(f"Data size after after merged with site metadata: {data_df.shape}")

# Drop rows with NA
# The return value is not captured: the helper is relied on to drop the rows
# from data_df in place (the shape printed below shrinks without reassignment).
check_and_drop_na(data_df)
print(f"Data size after after final drop: {data_df.shape}")

# The NA drop can silently remove a whole site: one missing metadata value
# (e.g. elevation) puts an NA in every merged row of that site. Report any
# selected test site that no longer has rows so the loss is visible here.
sites_lost = sorted(set(test_sites) - set(data_df['site_id'].unique().tolist()))
if sites_lost:
  print(f"WARNING: no rows left after NA drop for test site(s): {sites_lost}")

display(data_df.head())


CN-Cha: (16228, 27)
IT-Cpz: (59175, 27)
US-GLE: (54687, 27)
CA-Cbo: (79273, 27)
US-AR1: (28956, 27)
US-FR2: (30426, 27)
US-Seg: (91884, 27)
ES-LM2: (58806, 27)
FR-Lam: (115812, 27)
IT-Lsn: (40182, 27)
Data size after cleanup: (575429, 27)
Data size after after merged with site metadata: (575429, 36)
Data has NA.


Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,GPP_NT_VUT_REF,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,koppen_main,koppen_name,koppen_main_name,c3c4,c4_percent
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16228,0,0,0,0,0,0,0,0


Data size after after final drop: (559201, 36)


Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,GPP_NT_VUT_REF,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,koppen_main,koppen_name,koppen_main_name,c3c4,c4_percent
16228,4.66,5.449,218.92,2.904,0.0,100.765,2.1909,2001-01-01 07:30:00,2001,1,1,7,2001-01-01,0.30272,0.74517,0.12452,0.0244,0.1671,0.018,0.0291,0.2352,0.0977,0.0484,EBF,Temperate,30,IT-Cpz,68.0,41.70525,12.37611,8,3,Csa,Temperate,C3,0.0
16229,5.086,71.318,218.92,2.952,0.0,100.789,2.83647,2001-01-01 08:00:00,2001,1,1,8,2001-01-01,0.30272,0.74517,0.12452,0.0244,0.1671,0.018,0.0291,0.2352,0.0977,0.0484,EBF,Temperate,0,IT-Cpz,68.0,41.70525,12.37611,8,3,Csa,Temperate,C3,0.0
16230,5.511,132.32,218.92,3.0,0.0,100.813,4.03906,2001-01-01 08:30:00,2001,1,1,8,2001-01-01,0.30272,0.74517,0.12452,0.0244,0.1671,0.018,0.0291,0.2352,0.0977,0.0484,EBF,Temperate,30,IT-Cpz,68.0,41.70525,12.37611,8,3,Csa,Temperate,C3,0.0
16231,5.937,187.411,218.92,3.048,0.0,100.837,7.79772,2001-01-01 09:00:00,2001,1,1,9,2001-01-01,0.30272,0.74517,0.12452,0.0244,0.1671,0.018,0.0291,0.2352,0.0977,0.0484,EBF,Temperate,0,IT-Cpz,68.0,41.70525,12.37611,8,3,Csa,Temperate,C3,0.0
16232,6.362,235.65,218.92,3.096,0.0,100.861,6.93281,2001-01-01 09:30:00,2001,1,1,9,2001-01-01,0.30272,0.74517,0.12452,0.0244,0.1671,0.018,0.0291,0.2352,0.0977,0.0484,EBF,Temperate,30,IT-Cpz,68.0,41.70525,12.37611,8,3,Csa,Temperate,C3,0.0


In [11]:
# Summary statistics of the pipeline output, as a sanity check on value ranges.
data_df.describe()

Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,GPP_NT_VUT_REF,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,minute,elevation,lat,long,koppen_sub,koppen_main,c4_percent
count,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0,559201.0
mean,15.99888,372.78984,328.34315,10.09542,0.04491,93.80726,5.89935,2011.04924,6.58824,15.7697,12.00068,0.29945,0.49832,0.13357,0.09872,0.26706,0.05637,0.08935,0.28554,0.22718,0.1455,15.00486,715.54907,40.39754,-47.85888,14.1226,2.97008,3.6079
std,9.4485,263.79306,53.348,9.59278,0.21359,9.93809,7.84509,4.85439,3.04618,8.8167,3.70952,0.13835,0.22863,0.07576,0.07196,0.06416,0.05761,0.05835,0.06413,0.09469,0.09622,15.00001,979.83832,4.38791,51.53537,7.69401,0.71284,6.89128
min,-30.687,0.001,107.639,0.0,0.0,66.031,-49.3833,2001.0,1.0,1.0,4.0,-0.09768,-0.02273,-0.01764,0.0148,0.0986,0.005,0.0193,0.0913,0.0299,0.0123,0.0,1.0,29.9495,-106.7019,5.0,2.0,0.0
25%,9.804,137.523,289.936,3.352,0.0,84.793,0.42672,2007.0,4.0,8.0,9.0,0.19767,0.32305,0.07163,0.0473,0.2155,0.0251,0.0522,0.2401,0.1575,0.0707,0.0,120.0,36.4267,-106.2399,7.0,2.0,0.24
50%,16.4,340.396,334.09,6.995,0.0,98.739,2.93147,2011.0,7.0,16.0,12.0,0.29334,0.52381,0.12294,0.0776,0.2663,0.0398,0.0745,0.2901,0.2111,0.1177,30.0,180.0,41.70525,-79.9333,14.0,3.0,1.09
75%,22.745,580.106,369.349,13.612,0.0,99.859,9.38355,2015.0,9.0,23.0,15.0,0.38945,0.68621,0.18062,0.1304,0.3117,0.0646,0.1043,0.332,0.2952,0.1927,30.0,1622.0,43.49644,1.23788,14.0,3.0,3.1
max,44.087,1050.626,473.085,80.091,13.035,103.921,83.613,2020.0,12.0,31.0,23.0,1.41056,0.90239,0.38458,0.812,0.7762,0.764,0.8053,0.4418,0.4792,0.4347,30.0,3197.0,45.74048,12.7503,27.0,4.0,27.54


In [12]:
# List the sites that still have rows after the pipeline; compare this against
# the selected test sites to spot any site that was dropped entirely.
data_df['site_id'].unique()

array(['IT-Cpz', 'US-GLE', 'CA-Cbo', 'US-AR1', 'US-FR2', 'US-Seg',
       'ES-LM2', 'FR-Lam', 'IT-Lsn'], dtype=object)

# Upload Data to Azure Storage Blob as Parquet
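**Run with caution:** the upload cell below calls `uploadBlob` with `overwrite=True`, so re-running it replaces the blob that already exists under the same name.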

In [13]:
# Upload to Azure Storage Blob
# ref: https://stackoverflow.com/a/54666079
# Serialize the frame to Parquet in an in-memory buffer instead of a local file.
parquet_file = BytesIO()
data_df.to_parquet(parquet_file, engine='pyarrow')
# Rewind the buffer so the upload reads it from the first byte.
parquet_file.seek(0)

# Presumably the client loads its credentials from the given JSON file --
# verify against CloudIO.AzStorageClient.
azStorageClient = AzStorageClient(az_cred_file)
# overwrite=True is passed explicitly: presumably an existing blob with the
# same name is replaced -- confirm before re-running.
azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

File uploaded to test-sites-data/test-sites-data_v_0.parquet
