# Notebook Setup

In [1]:
# True when the notebook kernel is a Google Colab runtime; detected from the
# string form of the active IPython shell object.
IN_COLLAB = 'google.colab' in str(get_ipython())

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
# Absolute path of the project checkout; every other path in this notebook is built from it.
MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"

In [2]:
# On Colab only: mount Google Drive so that paths under /content/drive/
# (including MY_HOME_ABS_PATH) become readable. Skipped entirely elsewhere.
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

Mounted at /content/drive/


## Import Modules

In [3]:
# install required modules quietly
required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.6/96.6 KB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
flask 1.1.4 requires click<8.0,>=5.1, but you have click 8.1.3 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [5]:
# Standard-library imports. PYARROW_IGNORE_TIMEZONE is deliberately set
# before pyspark.pandas is imported further down in this cell.
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import math
import json

# NOTE(review): `pd` is bound to the pandas API on Spark here, not to plain pandas.
import pyspark.pandas as pd
from calendar import monthrange
from datetime import datetime
from io import BytesIO

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load local custom modules
# On Colab the working directory is switched to the project home and the tools
# directory is placed first on sys.path; elsewhere it is appended, resolved
# against the current working directory.
import sys
if IN_COLLAB:
  os.chdir(MY_HOME_ABS_PATH)
  sys.path.insert(0,os.path.abspath("./code/src/tools"))
else:
  sys.path.append(os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
# NOTE(review): a star-import can silently rebind names already defined in this
# cell (for example `pd`) if data_pipeline_lib exports them. The helpers used
# later (data_cleanup, merge_site_metadata, check_and_drop_na) presumably come
# from this module -- confirm, and prefer importing them explicitly.
from data_pipeline_lib import *

# Display options: up to 500 rows, no column limit, floats shown with 5 decimals.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [6]:
# Spark entry point
from pyspark.sql import SparkSession

# Start a local session on all available cores, or reuse one that already exists
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Show the session details as the cell output
spark

## Define Local File System Constants

In [7]:
# Project directory layout, all rooted at the configured home path
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.sep.join((root_dir, '.tmp'))
data_dir = os.sep.join((root_dir, 'data'))
cred_dir = os.sep.join((root_dir, '.cred'))
az_cred_file = os.sep.join((cred_dir, 'azblobcred.json'))
site_metadata_filename = os.sep.join((data_dir, 'site-metadata.csv'))

# Raw input location: a fixed Google Drive folder on Colab, the temp dir otherwise
raw_data_dir = (
  "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data/half_hourly_data"
  if IN_COLLAB
  else tmp_dir
)

# Output
tag = "0_raw"
container = "all-sites-data"
blob_name = f"all_site_trim_v_{tag}.parquet"  # Advisor suggested features only

# Get Site Metadata

In [8]:
# Columns to keep from the site metadata CSV
included_site_features = [
    'site_id', 'filename',
    'elevation', 'lat', 'long',
    'koppen_sub', 'koppen_main', 'koppen_name', 'koppen_main_name',
    'c3c4', 'c4_percent',
]

# Read only those columns and give the frame a fresh default index.
# No site filtering happens here: every row of the file is kept.
site_metadata_df = (
    pd.read_csv(site_metadata_filename, usecols=included_site_features)
    .reset_index(drop=True)
)

print(f"size:{site_metadata_df.shape}")
site_metadata_df.head()

size:(286, 11)


Unnamed: 0,site_id,elevation,lat,long,koppen_sub,koppen_main,koppen_name,koppen_main_name,c3c4,c4_percent,filename
0,AR-SLu,,-33.4648,-66.4598,7,2,BSk,Arid,C3,67.08,data_full_half_hourly_raw_v0_1_AR-SLu.csv
1,AR-Vir,,-28.2395,-56.1886,14,3,Cfa,Temperate,C3,8.75,data_full_half_hourly_raw_v0_1_AR-Vir.csv
2,AT-Neu,970.0,47.11667,11.3175,26,4,Dfb,Cold,C3,0.0,data_full_half_hourly_raw_v0_1_AT-Neu.csv
3,AU-ASM,,-22.283,133.249,4,2,BWh,Arid,C3,100.0,data_full_half_hourly_raw_v0_1_AU-ASM.csv
4,AU-Ade,,-13.0769,131.1178,3,1,Aw,Tropical,C3,79.57,data_full_half_hourly_raw_v0_1_AU-Ade.csv


# Run Data Pipeline on All Sites

In [9]:
# Column names grouped by shared prefix, one group per line.
# NOTE(review): appears to be the full raw column set kept for reference;
# no cell visible in this notebook reads it -- confirm before removing.
all_features = [
    'TIMESTAMP_START', 'TIMESTAMP_END',
    'TA_F', 'TA_F_QC', 'TA_ERA',
    'SW_IN_POT', 'SW_IN_F', 'SW_IN_F_QC', 'SW_IN_ERA',
    'LW_IN_F', 'LW_IN_F_QC', 'LW_IN_ERA',
    'VPD_F', 'VPD_F_QC', 'VPD_ERA',
    'P_F', 'P_F_QC', 'P_ERA',
    'PA_F', 'PA_F_QC', 'PA_ERA',
    'NETRAD', 'PPFD_IN',
    'G_F_MDS', 'G_F_MDS_QC',
    'LE_F_MDS', 'LE_F_MDS_QC', 'LE_CORR',
    'H_F_MDS', 'H_F_MDS_QC', 'H_CORR',
    'NEE_VUT_REF', 'NEE_VUT_REF_QC', 'NEE_CUT_REF', 'NEE_CUT_REF_QC',
    'GPP_NT_VUT_REF', 'GPP_DT_VUT_REF', 'GPP_NT_CUT_REF', 'GPP_DT_CUT_REF',
    'RECO_NT_VUT_REF', 'RECO_DT_VUT_REF', 'RECO_NT_CUT_REF', 'RECO_DT_CUT_REF',
    'datetime', 'year', 'month', 'day', 'hour',
    'SITE_ID', 'date', 'NEE_VUT_REF_qa', 'SW_DIF',
    'EVI', 'NDVI', 'NIRv',
    'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
    'IGBP', 'koppen',
]

In [10]:
# Inputs handed to the data pipeline: predictor columns, the target column,
# and the quality-control column passed alongside the target.
included_features = [
    # *_ERA columns
    'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
    # timestamp and its parts
    'datetime', 'year', 'month', 'day', 'hour', 'date',
    # index columns and b1-b7
    'EVI', 'NDVI', 'NIRv',
    'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
    # class labels
    'IGBP', 'koppen',
]
target_variable = ['GPP_NT_VUT_REF']
target_variable_qc = ['NEE_VUT_REF_QC']

In [12]:
# Build the training dataset in three steps.

# Step 1: initial cleanup and feature selection over the raw per-site files
site_files = site_metadata_df[['site_id','filename']]
data_df = data_cleanup(
    raw_data_dir, site_files,
    target_variable, target_variable_qc,
    included_features,
)
print(f"Data size after cleanup: {data_df.shape}")

# Step 2: merge in the site metadata, minus the file name column
site_attributes = site_metadata_df.drop(['filename'], axis=1)
data_df = merge_site_metadata(data_df, site_attributes)
print(f"Data size after after merged with site metadata: {data_df.shape}")

# Step 3: drop rows with NA. The return value is not captured, so this relies
# on the helper modifying data_df itself -- NOTE(review): confirm in data_pipeline_lib.
check_and_drop_na(data_df)
print(f"Data size after after final drop: {data_df.shape}")

display(data_df.head())

AR-SLu: (11274, 27)
AR-Vir: (16714, 27)
AT-Neu: (72353, 27)
AU-ASM: (36657, 27)
AU-Ade: (8936, 27)
AU-Cpr: (35642, 27)
AU-Cum: (18649, 27)
AU-DaP: (36943, 27)
AU-DaS: (50668, 27)
AU-Dry: (37785, 27)
AU-Emr: (19999, 27)
AU-Fog: (18397, 27)
AU-GWW: (16051, 27)
AU-Gin: (24512, 27)
AU-How: (74809, 27)
AU-Lox: (7292, 27)
AU-RDF: (13818, 27)
AU-Rig: (31940, 27)
AU-Rob: (6043, 27)
AU-Stp: (46142, 27)
AU-TTE: (21356, 27)

ERROR: AU-Tum is mssing hourly data.
AU-Wac: (23493, 27)
AU-Whr: (26820, 27)
AU-Wom: (39035, 27)
AU-Ync: (13974, 27)

ERROR: BR-Sa1 is mssing hourly data.
BR-Sa3: (5176, 27)
CA-Man: (34218, 27)
CA-NS4: (18816, 27)
CA-NS7: (25196, 27)
CA-Oas: (71623, 27)
CA-Obs: (73559, 27)
CA-SF1: (16717, 27)
CA-SF3: (20640, 27)
CA-TP1: (56949, 27)
CA-TP2: (16145, 27)
CA-TP3: (64247, 27)
CA-TPD: (21323, 27)
CG-Tch: (168, 27)
CH-Oe1: (48974, 27)
CN-Cha: (16228, 27)
CN-Cng: (26429, 27)
CN-Dan: (15425, 27)
CN-Din: (6564, 27)
CN-Du2: (14942, 27)
CN-Du3: (4676, 27)
CN-Ha2: (21062, 27)
CN-HaM: (148

Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,GPP_NT_VUT_REF,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,koppen_main,koppen_name,koppen_main_name,c3c4,c4_percent
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,919293,0,0,0,0,0,0,0,0


Data size after after final drop: (11649078, 36)


Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,GPP_NT_VUT_REF,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,koppen_main,koppen_name,koppen_main_name,c3c4,c4_percent
27988,-11.452,15.339,170.681,1.077,0.0,91.369,1.63,2002-01-01 08:00:00,2002,1,1,8,2002-01-01,0.32922,0.20119,0.08528,0.2819,0.4239,0.2716,0.2781,0.2956,0.1004,0.0581,GRA,Cold,0,AT-Neu,970.0,47.11667,11.3175,26,4,Dfb,Cold,C3,0.0
27989,-11.161,68.466,170.681,1.159,0.0,91.399,2.5725,2002-01-01 08:30:00,2002,1,1,8,2002-01-01,0.32922,0.20119,0.08528,0.2819,0.4239,0.2716,0.2781,0.2956,0.1004,0.0581,GRA,Cold,30,AT-Neu,970.0,47.11667,11.3175,26,4,Dfb,Cold,C3,0.0
27990,-10.87,116.63,170.681,1.241,0.0,91.428,1.2618,2002-01-01 09:00:00,2002,1,1,9,2002-01-01,0.32922,0.20119,0.08528,0.2819,0.4239,0.2716,0.2781,0.2956,0.1004,0.0581,GRA,Cold,0,AT-Neu,970.0,47.11667,11.3175,26,4,Dfb,Cold,C3,0.0
27991,-10.579,159.007,170.681,1.323,0.0,91.458,1.2618,2002-01-01 09:30:00,2002,1,1,9,2002-01-01,0.32922,0.20119,0.08528,0.2819,0.4239,0.2716,0.2781,0.2956,0.1004,0.0581,GRA,Cold,30,AT-Neu,970.0,47.11667,11.3175,26,4,Dfb,Cold,C3,0.0
27992,-10.288,230.987,182.029,1.405,0.0,91.488,1.2618,2002-01-01 10:00:00,2002,1,1,10,2002-01-01,0.32922,0.20119,0.08528,0.2819,0.4239,0.2716,0.2781,0.2956,0.1004,0.0581,GRA,Cold,0,AT-Neu,970.0,47.11667,11.3175,26,4,Dfb,Cold,C3,0.0


In [13]:
# Summary statistics of data_df, used as a sanity check on value ranges after the NA drop.
data_df.describe()

Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,GPP_NT_VUT_REF,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,minute,elevation,lat,long,koppen_sub,koppen_main,c4_percent
count,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0,11649078.0
mean,13.60254,327.68362,321.24704,7.37633,0.04356,95.12045,7.06948,2010.621,6.54862,15.75334,11.86917,0.32655,0.57479,0.15069,0.08286,0.26362,0.05506,0.08245,0.25919,0.18041,0.0999,14.99733,572.0153,46.11935,-33.14984,19.45874,3.49992,4.22409
std,9.26756,251.58909,48.385,7.64649,0.19381,7.08246,8.45908,5.07474,2.86422,8.8042,3.94893,1.52369,0.23246,0.08975,0.09985,0.0962,0.09743,0.09511,0.07344,0.08153,0.06988,15.0,672.08006,10.53639,59.36481,8.22415,0.72096,11.56868
min,-47.961,0.001,99.71,0.0,0.0,66.031,-71.4779,2001.0,1.0,1.0,0.0,-745.90909,-1.0,-0.11533,0.0,0.0,0.0,0.0,0.0,0.0011,0.0,0.0,-9.0,-37.4222,-157.4089,2.0,1.0,0.0
25%,7.744,105.555,290.689,2.074,0.0,93.839,0.72884,2006.0,4.0,8.0,9.0,0.21864,0.41777,0.08469,0.0327,0.1964,0.0185,0.0423,0.2036,0.1198,0.0512,0.0,161.5,40.60618,-97.4888,14.0,3.0,0.0
50%,14.084,281.819,325.234,4.925,0.0,97.726,3.99229,2011.0,7.0,16.0,12.0,0.31424,0.6302,0.1335,0.0535,0.2469,0.0288,0.0584,0.2601,0.1718,0.0781,0.0,272.0,46.01468,3.5957,26.0,4.0,0.0
75%,20.026,513.191,355.289,10.002,0.0,99.623,11.2631,2015.0,9.0,23.0,15.0,0.43523,0.76118,0.20543,0.0908,0.3139,0.0473,0.0834,0.316,0.2253,0.1265,30.0,657.0,50.96256,11.64464,26.0,4.0,1.37
max,45.052,1157.319,537.808,82.662,19.462,105.491,98.492,2020.0,12.0,31.0,23.0,164.375,1.0,0.54093,0.9337,0.8866,0.9367,0.9394,0.5946,0.6104,0.5202,30.0,3197.0,78.186,161.34143,29.0,5.0,62.17


In [14]:
# Distinct site ids remaining in the dataset; computed once, then counted and displayed
site_ids = data_df['site_id'].unique()
print(f"site count: {len(site_ids)}")
site_ids

site count: 230


array(['AT-Neu', 'AU-Wom', 'BR-Sa3', 'CA-Man', 'CA-NS4', 'CA-NS7',
       'CA-Oas', 'CA-Obs', 'CA-SF1', 'CA-SF3', 'CA-TP1', 'CA-TP2',
       'CA-TP3', 'CA-TPD', 'CG-Tch', 'CH-Oe1', 'CZ-BK2', 'DE-Lkb',
       'DE-Lnf', 'DE-Seh', 'DE-SfN', 'DE-Spw', 'DE-Zrk', 'DK-Eng',
       'DK-Fou', 'ES-Amo', 'ES-LgS', 'ES-Ln2', 'FI-Jok', 'FI-Lom',
       'FI-Sod', 'FR-LBr', 'FR-Pue', 'GH-Ank', 'GL-NuF', 'GL-ZaF',
       'GL-ZaH', 'IT-CA1', 'IT-CA2', 'IT-CA3', 'IT-Col', 'IT-Cpz',
       'IT-Isp', 'IT-La2', 'IT-Noe', 'IT-PT1', 'IT-Ro1', 'IT-Ro2',
       'IT-SRo', 'NL-Hor', 'PA-SPn', 'PA-SPs', 'RU-Che', 'RU-Cok',
       'RU-Ha1', 'SD-Dem', 'SJ-Adv', 'SN-Dhr', 'US-Atq', 'US-Blo',
       'US-GBT', 'US-GLE', 'US-Goo', 'US-IB2', 'US-Ivo', 'US-KS1',
       'US-KS2', 'US-Lin', 'US-Los', 'US-Me1', 'US-Me3', 'US-Me5',
       'US-Me6', 'US-Myb', 'US-NR1', 'US-ORv', 'US-SRC', 'US-SRM',
       'US-Sta', 'US-Syv', 'US-Ton', 'US-Tw3', 'US-Twt', 'US-Var',
       'US-WCr', 'US-WPT', 'US-Whs', 'US-Wi0', 'US-Wi1', 'US-W

# Upload Data to Azure Storage Blob as Parquet
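**Run with caution: this cell uploads with `overwrite=True`, so it replaces any blob of the same name already in the container.**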

In [15]:
# Upload to Azure Storage Blob
# ref: https://stackoverflow.com/a/54666079

# Build the storage client from the credential file first
azStorageClient = AzStorageClient(az_cred_file)

# Serialise the dataset to Parquet in an in-memory buffer, then rewind the
# buffer so the upload reads it from the beginning
parquet_file = BytesIO()
data_df.to_parquet(parquet_file, engine='pyarrow')
parquet_file.seek(0)

# overwrite=True is passed, so an existing blob of the same name is replaced
azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

File uploaded to all-sites-data/all_site_trim_v_0_raw.parquet
