Features:
- From the half-hourly dataset
  ```
  'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
  'datetime', 'year', 'month', 'day', 'hour', 'date',
  'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 
  'IGBP'
  ```
- Site metadata:
  ```
  'elevation', 'lat', 'long', 'koppen_sub', 'koppen_main',
  'c3c4', 'c4_percent'
  ```
- Monthly Average Features (`data/datasets/external_dataset/monthly/static_monthly_features_v1.csv`):
  ```
  'SITE_ID', 'month', 'TA_F_avg', 'VPD_F_avg', 'P_F_avg',
  'Ts_avg', 'Tmean_avg', 'prcp_lag3_avg',
  'vpd_avg',  'Percent_Snow_avg',
  'EVI_avg', 'NDVI_avg', 'NIRv_avg', 
  'b1_avg', 'b2_avg', 'b3_avg', 'b4_avg', 'b5_avg','b6_avg', 'b7_avg'
  ```

  On Limited Sites:
  ```
 TRAIN = ["US-MMS", "US-Vcp", "FR-Pue", "CH-Lae", "US-Var",
          "US-Ne2", "ES-LJu", "US-Ton", "US-UMB", "US-Me2",
          "FI-Hyy", "US-NR1", "IT-Lav", "US-Wkg", "US-ARM", "US-SRM"]
 TEST = ["US-GLE", # ENF, Cold
          "US-AR1", # GRA, Temperate
          "US-Seg", # GRA, Arid
          "US-FR2", # WSA, Temperate
          "ES-LM2", # WSA, Arid
          "CA-Cbo", # DBF, Cold
          "FR-Lam", # CRO, Temperate
          "IT-Cpz", # EBF, Temperate
          "CN-Cha", # MF, Cold // Gets removed due to missing values
          "IT-Lsn"] # OSH, Temperate
  ```

# Notebook Setup

In [2]:
# Flag Google Colab sessions: the check looks for 'google.colab' in the string
# form of the active IPython shell object.
IN_COLLAB = 'google.colab' in str(get_ipython())

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
# Absolute path of the project root; the path constants defined further down
# are derived from it.
MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"

In [3]:
# Mount Google Drive when running on Colab so that paths under /content/drive
# (including MY_HOME_ABS_PATH) become reachable. Skipped everywhere else.
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

Mounted at /content/drive/


## Import Modules

In [4]:
# install required modules quietly
required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m61.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.0/16.0 MB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.6/96.6 KB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
flask 1.1.4 requires click<8.0,>=5.1, but you have click 8.1.3 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [5]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import math
import json

from pyspark.sql.functions import col
import pyspark.pandas as pd
from calendar import monthrange
from datetime import datetime
from io import BytesIO

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load local custom modules
import sys
if IN_COLLAB:
  # On Colab: make the project root the working directory, then put the
  # project's tools directory at the front of the module search path.
  os.chdir(MY_HOME_ABS_PATH)
  sys.path.insert(0,os.path.abspath("./code/src/tools"))
else:
  # Elsewhere the working directory is left as-is and the tools directory
  # (resolved relative to it) is appended to the module search path.
  sys.path.append(os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
# NOTE(review): wildcard import -- presumably the source of data_cleanup,
# merge_site_metadata and check_and_drop_na used in Stage 1; confirm and
# consider importing those names explicitly.
from data_pipeline_lib import *

# Display options for rendered frames: up to 500 rows, no column limit, and
# floats formatted with 5 decimals.
# NOTE(review): `pd` is bound to pyspark.pandas by the import cell above;
# confirm these option names are supported by that module -- TODO confirm.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

In [6]:
# Import SparkSession
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session with master "local[*]" and request the
# hadoop-azure / azure-storage packages through spark.jars.packages.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-azure:3.3.1,com.microsoft.azure:azure-storage:8.6.6",
    )
    .getOrCreate()
)

# Check Spark Session Information
spark

# Define Local Files System Constants

In [12]:
# All local locations hang off the project root chosen in the setup cell.
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.sep.join([root_dir, '.tmp'])      # scratch / local cache directory
data_dir = os.sep.join([root_dir, 'data'])     # data files stored with the project
cred_dir = os.sep.join([root_dir, '.cred'])    # credential files
az_cred_file = os.sep.join([cred_dir, 'azblobcred.json'])

# Raw half-hourly files are read from a fixed Drive folder on Colab and from
# the scratch directory everywhere else.
raw_data_dir = (
    "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data/half_hourly_data"
    if IN_COLLAB
    else tmp_dir
)

site_metadata_filename = os.sep.join([data_dir, 'site-metadata.csv'])
monthly_features_filename = os.sep.join(
    [data_dir, "datasets", "external_dataset", "monthly", "static_monthly_features_v1.csv"]
)

# Data Definitions


In [8]:
# Azure Blob Storage naming used by the checkpoints in this notebook
container = "baseline-data"  # container that receives the Stage 1 checkpoint
ext = "parquet"              # extension appended to the checkpoint blob name
ver = "1"                    # version tag embedded in every blob name

blob_name_base = "baseline_all_v_" + ver
train_blob_name_base = "baseline-train-v-" + ver
test_blob_name_base = "baseline-test-v-" + ver

In [9]:
# "Golden" sites, kept as two tiers
tier1_sites = [
    "US-MMS", "US-Vcp", "FR-Pue", "CH-Lae",
    "US-Var", "US-Ne2", "ES-LJu", "US-Ton",
]
tier2_sites = [
    "US-UMB", "US-Me2", "FI-Hyy", "US-NR1",
    "IT-Lav", "US-Wkg", "US-ARM", "US-SRM",
]

# Training uses both tiers, tier 1 first.
train_sites = [*tier1_sites, *tier2_sites]

# Selected test sites; the trailing comments are the authors' land-cover and
# climate labels for each site.
test_sites = [
    "US-GLE",  # ENF, Cold
    "US-AR1",  # GRA, Temperate
    "US-Seg",  # GRA, Arid
    "US-FR2",  # WSA, Temperate
    "ES-LM2",  # WSA, Arid
    "CA-Cbo",  # DBF, Cold
    "FR-Lam",  # CRO, Temperate
    "IT-Cpz",  # EBF, Temperate
    "CN-Cha",  # MF Cold
    "IT-Lsn",  # OSH, Temperate
]

In [10]:
# Define features and target variables of the data pipelines
included_features = (
    # *_ERA columns
    ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA']
    # time columns
    + ['datetime', 'year', 'month', 'day', 'hour', 'date']
    # vegetation indices and b1..b7 band columns
    + ['EVI', 'NDVI', 'NIRv']
    + ['b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7']
    # categorical site descriptors
    + ['IGBP', 'koppen']
)

# Target column and the QC column handed to the cleanup step alongside it.
target_variable = 'GPP_NT_VUT_REF'
target_variable_qc = 'NEE_VUT_REF_QC'

# Get Gold Sample Site Data

In [18]:
# Load site metadata, reading only the columns listed here
included_site_features = [
    'site_id', 'filename',
    'elevation', 'lat', 'long',
    'koppen_sub', 'koppen_main', 'koppen_name',
    'c3c4', 'c4_percent',
]
site_metadata_df = pd.read_csv(site_metadata_filename, usecols=included_site_features)

# only focus on target sites (train and test sites together)
site_metadata_df = (
    site_metadata_df
    .loc[site_metadata_df['site_id'].isin(train_sites + test_sites)]
)
print(f"size:{site_metadata_df.shape}")

# Renumber the remaining rows from 0 and show the result.
site_metadata_df = site_metadata_df.reset_index(drop=True)
site_metadata_df

size:(26, 10)


Unnamed: 0,site_id,elevation,lat,long,koppen_sub,koppen_main,koppen_name,c3c4,c4_percent,filename
0,CN-Cha,,42.4025,128.0958,22,4,Dwb,C3,12.17,data_full_half_hourly_raw_v0_1_CN-Cha.csv
1,FR-Pue,270.0,43.7413,3.5957,8,3,Csa,C3,6.59,data_full_half_hourly_raw_v0_1_FR-Pue.csv
2,IT-Cpz,68.0,41.70525,12.37611,8,3,Csa,C3,0.0,data_full_half_hourly_raw_v0_1_IT-Cpz.csv
3,US-GLE,3197.0,41.36653,-106.2399,27,4,Dfc,C3,0.16,data_full_half_hourly_raw_v0_1_US-GLE.csv
4,US-NR1,3050.0,40.0329,-105.5464,27,4,Dfc,C3,0.35,data_full_half_hourly_raw_v0_1_US-NR1.csv
5,US-Ne2,362.0,41.16487,-96.4701,25,4,Dfa,rotation,48.91,
6,US-SRM,1120.0,31.8214,-110.8661,6,2,BSh,C3,55.39,data_full_half_hourly_raw_v0_1_US-SRM.csv
7,US-Ton,177.0,38.4316,-120.96598,8,3,Csa,C3,0.0,data_full_half_hourly_raw_v0_1_US-Ton.csv
8,US-Var,129.0,38.4133,-120.9507,8,3,Csa,C3,0.0,data_full_half_hourly_raw_v0_1_US-Var.csv
9,US-Wkg,1531.0,31.7365,-109.9419,7,2,BSk,C4,40.93,data_full_half_hourly_raw_v0_1_US-Wkg.csv


# Get Monthly Static Features

In [19]:
# A memo of all available features in the dataset.
# NOTE: this name is reassigned in the following cell to the subset that is
# actually read, so the list below serves only as a reference.
included_monthly_features = (
    # keys
    ['SITE_ID', 'SITE_IGBP', 'month']
    # *_F averages and NETRAD
    + ['TA_F_avg', 'VPD_F_avg', 'P_F_avg', 'NETRAD_avg']
    # NEE_* columns
    + ['NEE_VUT_REF_avg', 'NEE_VUT_REF_QC_avg', 'NEE_CUT_REF_avg', 'NEE_CUT_REF_QC_avg']
    # GPP_* columns
    + ['GPP_NT_VUT_REF_avg', 'GPP_DT_VUT_REF_avg', 'GPP_NT_CUT_REF_avg', 'GPP_DT_CUT_REF_avg']
    # RECO_* columns
    + ['RECO_NT_VUT_REF_avg', 'RECO_DT_VUT_REF_avg', 'RECO_NT_CUT_REF_avg', 'RECO_DT_CUT_REF_avg']
    # ET, BESS_*, CSIF_* and PET columns
    + ['ET_avg', 'BESS_PAR_avg', 'BESS_PARdiff_avg', 'BESS_RSDN_avg',
       'CSIF_SIFdaily_avg', 'CSIF_SIFinst_avg', 'PET_avg']
    # temperature / precipitation / vpd / soil-moisture columns
    + ['Ts_avg', 'Tmean_avg', 'prcp_avg', 'vpd_avg', 'prcp_lag3_avg', 'ESACCI_sm_avg']
    # b1..b7 band columns
    + ['b1_avg', 'b2_avg', 'b3_avg', 'b4_avg', 'b5_avg', 'b6_avg', 'b7_avg']
    # index columns
    + ['EVI_avg', 'GCI_avg', 'NDVI_avg', 'NDWI_avg', 'NIRv_avg', 'kNDVI_avg']
    # snow, Fpar/Lai, LST and CO2 columns
    + ['Percent_Snow_avg', 'Fpar_avg', 'Lai_avg',
       'LST_Day_avg', 'LST_Night_avg', 'CO2_concentration_avg']
    # dataset / classification / location columns
    + ['dataset', 'MODIS_LC', 'MODIS_IGBP', 'MODIS_PFT', 'koppen_sub', 'koppen',
       'hemisphere', 'LOCATION_LAT', 'LOCATION_LONG']
)

In [22]:
# Monthly static columns actually used by the pipeline
included_monthly_features = [
    'SITE_ID', 'month',
    'TA_F_avg', 'VPD_F_avg', 'P_F_avg',
    'Ts_avg', 'Tmean_avg', 'prcp_lag3_avg',
    'vpd_avg', 'Percent_Snow_avg',
    'EVI_avg', 'NDVI_avg', 'NIRv_avg',
    'b1_avg', 'b2_avg', 'b3_avg', 'b4_avg', 'b5_avg', 'b6_avg', 'b7_avg',
]
monthly_static_features_df = pd.read_csv(monthly_features_filename, usecols=included_monthly_features)

# only focus on target sites (train and test sites together)
monthly_static_features_df = (
    monthly_static_features_df
    .loc[monthly_static_features_df['SITE_ID'].isin(train_sites + test_sites)]
)
print(f"size:{monthly_static_features_df.shape}")
monthly_static_features_df.head()

size:(309, 20)


Unnamed: 0,SITE_ID,month,TA_F_avg,VPD_F_avg,P_F_avg,Ts_avg,Tmean_avg,vpd_avg,prcp_lag3_avg,b1_avg,b2_avg,b3_avg,b4_avg,b5_avg,b6_avg,b7_avg,EVI_avg,NDVI_avg,NIRv_avg,Percent_Snow_avg
396,CA-Cbo,1,-5.5405,0.916,1.361,266.85755,267.11847,0.09597,0.00533,0.04002,0.09694,0.0215,0.03524,0.10664,0.07074,0.04522,0.121,0.41535,0.04029,94.46774
397,CA-Cbo,2,-6.145,1.0845,2.157,266.15253,266.27877,0.11057,0.00483,0.14326,0.24064,0.1128,0.1345,0.2039,0.14334,0.08594,0.19437,0.32753,0.06559,94.42199
398,CA-Cbo,3,-1.625,1.6695,2.2125,271.89497,271.41314,0.17706,0.00517,0.15665,0.2486,0.126,0.14867,0.20599,0.14933,0.09412,0.17863,0.31446,0.06267,44.26825
399,CA-Cbo,4,6.182,3.523,3.046,279.27048,278.96313,0.30176,0.00722,0.08084,0.20583,0.04311,0.0711,0.24993,0.21843,0.1405,0.22827,0.43522,0.08979,0.0
400,CA-Cbo,5,10.156,4.979,2.708,283.75226,283.17087,0.43244,0.00836,0.07571,0.24356,0.04062,0.07347,0.27629,0.219,0.13343,0.30093,0.52414,0.1283,0.0


# Stage 1: Trim and Merge Site Metadata

In [None]:
# Reference list of column names; nothing else in the visible notebook reads
# it -- the pipeline uses `included_features` instead.
all_features = (
    ['TIMESTAMP_START', 'TIMESTAMP_END']
    # TA_* / SW_IN_* / LW_IN_* / VPD_* / P_* / PA_* columns
    + ['TA_F', 'TA_F_QC', 'TA_ERA']
    + ['SW_IN_POT', 'SW_IN_F', 'SW_IN_F_QC', 'SW_IN_ERA']
    + ['LW_IN_F', 'LW_IN_F_QC', 'LW_IN_ERA']
    + ['VPD_F', 'VPD_F_QC', 'VPD_ERA']
    + ['P_F', 'P_F_QC', 'P_ERA']
    + ['PA_F', 'PA_F_QC', 'PA_ERA']
    + ['NETRAD', 'PPFD_IN']
    # G_* / LE_* / H_* columns
    + ['G_F_MDS', 'G_F_MDS_QC', 'LE_F_MDS', 'LE_F_MDS_QC', 'LE_CORR',
       'H_F_MDS', 'H_F_MDS_QC', 'H_CORR']
    # NEE_* / GPP_* / RECO_* columns
    + ['NEE_VUT_REF', 'NEE_VUT_REF_QC', 'NEE_CUT_REF', 'NEE_CUT_REF_QC']
    + ['GPP_NT_VUT_REF', 'GPP_DT_VUT_REF', 'GPP_NT_CUT_REF', 'GPP_DT_CUT_REF']
    + ['RECO_NT_VUT_REF', 'RECO_DT_VUT_REF', 'RECO_NT_CUT_REF', 'RECO_DT_CUT_REF']
    # time and site key columns
    + ['datetime', 'year', 'month', 'day', 'hour', 'SITE_ID', 'date']
    + ['NEE_VUT_REF_qa', 'SW_DIF']
    # index and b1..b7 band columns
    + ['EVI', 'NDVI', 'NIRv']
    + ['b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7']
    + ['IGBP', 'koppen']
)

In [26]:
# Initial data clean and feature selections from raw data
data_df = data_cleanup(
    raw_data_dir,
    site_metadata_df[['site_id', 'filename']],
    target_variable,
    target_variable_qc,
    included_features,
)
print(f"Data size after cleanup: {data_df.shape}")

# Merge with site metadata ('filename' and 'koppen_name' are left out of the merge)
data_df = merge_site_metadata(
    data_df,
    site_metadata_df.drop(columns=['filename', 'koppen_name']),
)
print(f"Data size after after merged with site metadata: {data_df.shape}")

# Attach the monthly static features by (site, month); the left join keeps
# every half-hourly row, and the duplicate site key from the right side is
# removed afterwards.
data_df = (
    data_df
    .merge(
        monthly_static_features_df,
        how='left',
        left_on=['site_id', 'month'],
        right_on=['SITE_ID', 'month'],
    )
    .drop(columns=['SITE_ID'])
)
print(f"Data size after after merged with monthly static features: {data_df.shape}")

# Drop rows with NA. The helper's return value is not used; the recorded
# output shows the row count shrinking after this call, so it evidently
# modifies data_df in place.
check_and_drop_na(data_df)
print(f"Data size after after final drop: {data_df.shape}")

display(data_df.head())

CN-Cha: (16228, 27)
FR-Pue: (117200, 27)
IT-Cpz: (59175, 27)
US-GLE: (54687, 27)
US-NR1: (98652, 27)

ERROR: US-Ne2 is mssing hourly data.
US-SRM: (95419, 27)
US-Ton: (113031, 27)
US-Var: (119950, 27)
US-Wkg: (93319, 27)
CA-Cbo: (79273, 27)
US-AR1: (28956, 27)
US-ARM: (125756, 27)
US-FR2: (30426, 27)

ERROR: US-MMS is mssing hourly data.
US-Me2: (99780, 27)
US-Seg: (91884, 27)
US-UMB: (70639, 27)
US-Vcp: (78491, 27)
CH-Lae: (112718, 27)
ES-LJu: (112724, 27)
ES-LM2: (58806, 27)
FI-Hyy: (127362, 27)
FR-Lam: (115812, 27)
IT-Lav: (120885, 27)
IT-Lsn: (40182, 27)
Data size after cleanup: (2061355, 27)
Data size after after merged with site metadata: (2061355, 34)
Data size after after merged with monthly static features: (2061355, 52)
Data has NA.


Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,GPP_NT_VUT_REF,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,koppen_main,c3c4,c4_percent,TA_F_avg,VPD_F_avg,P_F_avg,Ts_avg,Tmean_avg,vpd_avg,prcp_lag3_avg,b1_avg,b2_avg,b3_avg,b4_avg,b5_avg,b6_avg,b7_avg,EVI_avg,NDVI_avg,NIRv_avg,Percent_Snow_avg
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16228,0,0,0,0,0,0,10732,10732,10732,10732,10732,10732,10732,10732,10732,10732,10732,10732,10732,10732,10732,10732,10732,10732


Data size after after final drop: (2034470, 52)


Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,GPP_NT_VUT_REF,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,koppen_main,c3c4,c4_percent,TA_F_avg,VPD_F_avg,P_F_avg,Ts_avg,Tmean_avg,vpd_avg,prcp_lag3_avg,b1_avg,b2_avg,b3_avg,b4_avg,b5_avg,b6_avg,b7_avg,EVI_avg,NDVI_avg,NIRv_avg,Percent_Snow_avg
16228,5.311,25.016,272.218,1.708,0.0,97.939,-0.53574,2001-01-01 08:30:00,2001,1,1,8,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,3,C3,6.59,5.82877,2.47162,2.89546,277.79545,278.44258,0.20391,0.00993,0.05071,0.17879,0.0279,0.0501,0.22795,0.16805,0.09303,0.27639,0.5842,0.11295,0.0
16229,5.744,59.734,272.218,1.738,0.0,97.939,0.86438,2001-01-01 09:00:00,2001,1,1,9,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,0,FR-Pue,270.0,43.7413,3.5957,8,3,C3,6.59,5.82877,2.47162,2.89546,277.79545,278.44258,0.20391,0.00993,0.05071,0.17879,0.0279,0.0501,0.22795,0.16805,0.09303,0.27639,0.5842,0.11295,0.0
16230,6.176,91.235,272.218,1.767,0.0,97.939,-0.02627,2001-01-01 09:30:00,2001,1,1,9,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,3,C3,6.59,5.82877,2.47162,2.89546,277.79545,278.44258,0.20391,0.00993,0.05071,0.17879,0.0279,0.0501,0.22795,0.16805,0.09303,0.27639,0.5842,0.11295,0.0
16231,6.608,79.264,333.933,1.797,0.05,97.939,-0.17229,2001-01-01 10:00:00,2001,1,1,10,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,0,FR-Pue,270.0,43.7413,3.5957,8,3,C3,6.59,5.82877,2.47162,2.89546,277.79545,278.44258,0.20391,0.00993,0.05071,0.17879,0.0279,0.0501,0.22795,0.16805,0.09303,0.27639,0.5842,0.11295,0.0
16232,7.043,94.929,333.933,1.817,0.0,97.923,1.20865,2001-01-01 10:30:00,2001,1,1,10,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,3,C3,6.59,5.82877,2.47162,2.89546,277.79545,278.44258,0.20391,0.00993,0.05071,0.17879,0.0279,0.0501,0.22795,0.16805,0.09303,0.27639,0.5842,0.11295,0.0


In [27]:
# Summary statistics of the cleaned, merged dataset (sanity check before the checkpoint).
data_df.describe()

Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,GPP_NT_VUT_REF,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,minute,elevation,lat,long,koppen_sub,koppen_main,c4_percent,TA_F_avg,VPD_F_avg,P_F_avg,Ts_avg,Tmean_avg,vpd_avg,prcp_lag3_avg,b1_avg,b2_avg,b3_avg,b4_avg,b5_avg,b6_avg,b7_avg,EVI_avg,NDVI_avg,NIRv_avg,Percent_Snow_avg
count,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0,2034470.0
mean,15.15152,376.15044,319.24421,10.11024,0.03945,91.58291,5.82982,2010.1154,6.54788,15.74974,11.92277,0.29652,0.52551,0.13057,0.08466,0.2478,0.04674,0.07624,0.2653,0.20793,0.12681,15.00108,905.30046,41.36808,-58.28076,15.80723,3.24203,7.43234,13.36164,8.38416,1.83126,287.31049,286.85906,0.7872,0.00611,0.08995,0.24101,0.04885,0.07973,0.26459,0.21593,0.1361,0.2743,0.48982,0.11838,5.12355
std,9.1793,267.34261,49.27554,9.62745,0.19056,9.26265,7.5679,4.65452,3.01317,8.81161,3.75362,0.13213,0.22186,0.0718,0.06103,0.06262,0.04536,0.04755,0.06936,0.09491,0.08469,15.0,906.96804,7.05377,57.6655,8.50556,0.71912,14.32281,7.89954,6.08844,1.14716,9.29213,8.19452,0.59728,0.00345,0.05134,0.05402,0.03406,0.03746,0.06362,0.08393,0.07653,0.10388,0.17887,0.05499,15.43168
min,-30.687,0.001,107.639,0.0,0.0,66.031,-49.7372,2001.0,1.0,1.0,3.0,-0.11958,-0.18252,-0.01764,0.0054,0.0305,0.0,0.0,0.0,0.0132,0.0,0.0,1.0,29.9495,-121.5574,5.0,2.0,0.0,-10.26233,0.27254,0.00757,262.42587,262.45474,0.04606,0.00023,0.02523,0.08125,0.01467,0.02519,0.09626,0.04705,0.02272,0.08726,0.10188,0.03309,0.0
25%,8.934,138.23825,284.976,3.222,0.0,85.178,0.4402,2007.0,4.0,8.0,9.0,0.20161,0.33887,0.07881,0.0371,0.2023,0.0213,0.0445,0.2093,0.1296,0.0558,0.0,180.0,36.6058,-109.9419,8.0,3.0,0.0,8.66691,3.825,0.9166,281.56132,281.70399,0.34413,0.00339,0.05179,0.20431,0.02835,0.05461,0.21475,0.15,0.0792,0.2045,0.36333,0.07795,0.0
50%,15.266,343.0415,320.818,6.974,0.0,97.089,3.11767,2010.0,7.0,16.0,12.0,0.28849,0.55313,0.11891,0.0678,0.2384,0.0359,0.0666,0.2702,0.1965,0.1095,30.0,314.0,40.0329,-97.4888,14.0,3.0,0.35,13.70956,6.55922,1.83736,287.60406,287.30859,0.5903,0.00644,0.07804,0.23686,0.04107,0.07206,0.27413,0.2131,0.12007,0.27086,0.52693,0.11104,0.0
75%,21.685,585.201,354.65,13.69,0.0,99.035,9.38782,2014.0,9.0,23.0,15.0,0.36668,0.69977,0.16395,0.1194,0.287,0.0562,0.0948,0.3202,0.283,0.184,30.0,1531.0,44.4523,3.5957,26.0,4.0,6.59,19.10033,11.39982,2.59723,293.63675,292.65724,1.06995,0.0084,0.11441,0.27768,0.0524,0.09168,0.31667,0.28253,0.17866,0.34288,0.61986,0.15144,0.23333
max,44.087,1094.341,473.085,80.091,15.493,103.921,85.0309,2020.0,12.0,31.0,23.0,2.38835,0.93551,0.42385,0.812,0.7762,0.7689,0.8053,0.4666,0.4792,0.4347,30.0,3197.0,61.84741,24.29477,27.0,4.0,55.39,28.525,32.3417,5.89112,305.67596,302.54859,3.10543,0.01598,0.28348,0.37637,0.30396,0.30001,0.37756,0.4091,0.3638,0.55422,0.79549,0.28293,96.55375


In [28]:
# List the sites that still have rows after cleanup and the NA drop.
data_df['site_id'].unique()

array(['FR-Pue', 'IT-Cpz', 'US-GLE', 'US-NR1', 'US-SRM', 'US-Ton',
       'US-Var', 'US-Wkg', 'CA-Cbo', 'US-AR1', 'US-ARM', 'US-FR2',
       'US-Me2', 'US-Seg', 'US-UMB', 'US-Vcp', 'CH-Lae', 'ES-LJu',
       'ES-LM2', 'FI-Hyy', 'FR-Lam', 'IT-Lav', 'IT-Lsn'], dtype=object)

# CheckPoint: Upload Data to Azure Storage Blob as Parquet

In [30]:
# Upload to Azure Storage Blob
# ref: https://stackoverflow.com/a/54666079
data_cleanup_checkpoint = True
tag = "raw"
blob_name = "{}_{}.{}".format(blob_name_base, tag, ext)

if data_cleanup_checkpoint:
  # Serialize the frame to parquet in memory, then rewind the buffer so the
  # upload reads it from the first byte.
  parquet_file = BytesIO()
  data_df.to_parquet(parquet_file, engine='pyarrow')
  parquet_file.seek(0)

  # Push the buffer to the checkpoint container, replacing any existing blob.
  azStorageClient = AzStorageClient(az_cred_file)
  azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

File uploaded to baseline-data/baseline_all_v_1_raw.parquet


# Stage 2: Data Transform - Convert to Model Ready Data
- Encode data
- Split into train and test
- Assemble data through VectorAssembler
- Normalize data through min-max scaling

In [31]:
# Reload the Stage 1 checkpoint as a Spark DataFrame for the transform stage.
# NOTE(review): the Stage 2 cells call Spark DataFrame methods on data_df, so
# they presumably require this flag to stay True -- confirm before disabling.
load_data_from_previous_checkpoint = True

if load_data_from_previous_checkpoint:
  data_df = None
  # Local cache location of the checkpoint blob (computed once, reused below).
  local_checkpoint_path = tmp_dir + os.sep + blob_name
  print(f"loading {local_checkpoint_path}...")

  # NOTE(review): an existing local copy is reused without checking whether the
  # blob was overwritten since; delete the cached file to force a fresh download.
  if not os.path.exists(local_checkpoint_path):
      # makedirs(exist_ok=True) also creates missing parent directories and
      # does not fail when the directory already exists.
      os.makedirs(tmp_dir, exist_ok=True)
      azStorageClient = AzStorageClient(az_cred_file)
      file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
      data_df = pd.read_parquet(file_stream, engine='pyarrow')
      data_df.to_parquet(local_checkpoint_path)

  data_df = spark.read.parquet(local_checkpoint_path)
  # Remove the '__index_level_0__' column if the parquet file carries one.
  data_df = data_df.drop(*['__index_level_0__'])
  print(f"Data loaded: {data_df.count()} rows x {len(data_df.columns)} columns.")


loading /content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling/.tmp/baseline_all_v_1_raw.parquet...
Data loaded: 2034470 rows x 52 columns.


In [32]:
# Put the target variable first, followed by every other column in its current order.
features = [name for name in data_df.columns if name != target_variable]
data_df = data_df.select([target_variable] + features) #reorder columns

# Columns that are indexed and one-hot encoded in the encoding step.
categorical_cols = ['IGBP', 'c3c4', 'koppen_sub', 'koppen_main']

# Drop 'datetime', 'date', and 'koppen' as they are already represented by other columns
data_df = data_df.drop('datetime', 'date', 'koppen')

In [33]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, MinMaxScaler

# Step 1: turn each categorical column into a numeric "<col>_Index" column.
index_cols = [f"{name}_Index" for name in categorical_cols]
string_indexer = StringIndexer(inputCols=categorical_cols, outputCols=index_cols)
data_df = string_indexer.fit(data_df).transform(data_df)

# Step 2: one-hot encode the index columns into "<col>_OHE" vector columns.
ohe_cols = [f"{name}_OHE" for name in categorical_cols]
one_hot_encoder = OneHotEncoder(inputCols=index_cols, outputCols=ohe_cols)
data_df = one_hot_encoder.fit(data_df).transform(data_df)

# Keep only the encoded columns: drop the originals and the intermediate indices.
data_df = data_df.drop(*categorical_cols, *index_cols)

print(f"Data size after encoding: {data_df.count()} rows x {len(data_df.columns)} columns.")
data_df.show()

Data size after encoding: 2034470 rows x 49 columns.
+--------------+------+---------+---------+-------+-----+------+----+-----+---+----+------------------+------------------+------------------+------+------+------+------+------+------+---+------+-------+---------+-------+------+----------+----------------+----------------+----------------+----------------+----------------+-----------------+------------------+------------------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------+-----------------+-----------------+----------------+-------------+-------------+--------------+---------------+
|GPP_NT_VUT_REF|TA_ERA|SW_IN_ERA|LW_IN_ERA|VPD_ERA|P_ERA|PA_ERA|year|month|day|hour|               EVI|              NDVI|              NIRv|    b1|    b2|    b3|    b4|    b5|    b6| b7|minute|site_id|elevation|    lat|  long|c4_percent|        TA_F_avg|       VPD_F_avg|         P_F_avg|          Ts_avg|       Tmean_avg|        

In [34]:
# Split into train and test sets by site membership, then drop the site
# identifier from both sets.
train_df = data_df.filter(col('site_id').isin(train_sites)).drop('site_id')
test_df = data_df.filter(col('site_id').isin(test_sites)).drop('site_id')

# Every remaining column except the target is a model feature.
features = [name for name in train_df.columns if name != target_variable]

print(f"Train data size: {train_df.count()} rows x {len(train_df.columns)} columns.")
print(f"Test data size: {test_df.count()} rows x {len(test_df.columns)} columns.")
print(f"Features: {features}")

# The combined frame is no longer needed once the split exists.
del data_df

Train data size: 1485926 rows x 48 columns.
Test data size: 548544 rows x 48 columns.
Features: ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA', 'year', 'month', 'day', 'hour', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'minute', 'elevation', 'lat', 'long', 'c4_percent', 'TA_F_avg', 'VPD_F_avg', 'P_F_avg', 'Ts_avg', 'Tmean_avg', 'vpd_avg', 'prcp_lag3_avg', 'b1_avg', 'b2_avg', 'b3_avg', 'b4_avg', 'b5_avg', 'b6_avg', 'b7_avg', 'EVI_avg', 'NDVI_avg', 'NIRv_avg', 'Percent_Snow_avg', 'IGBP_OHE', 'c3c4_OHE', 'koppen_sub_OHE', 'koppen_main_OHE']


In [35]:
# Assemble data: pack all feature columns into a single vector column,
# applying the same assembler to both sets.
assembler = VectorAssembler(inputCols=features, outputCol="vectorized_features")
train_df, test_df = (assembler.transform(frame) for frame in (train_df, test_df))

# Show the first five untruncated rows of each set.
for label, frame in (("Train", train_df), ("Test", test_df)):
  print(f"{label} data peak:")
  frame.show(5, False)

Train data peak:
+--------------+------+---------+---------+-------+-----+------+----+-----+---+----+------------------+------------------+------------------+------+------+------+------+------+------+---+------+---------+-------+------+----------+----------------+----------------+----------------+----------------+----------------+-----------------+------------------+------------------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------+-----------------+-----------------+----------------+-------------+-------------+--------------+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [36]:
# Normalize data: the min-max scaler is fitted on the training set only and
# the fitted model is then applied to both sets.
scaler = MinMaxScaler(inputCol='vectorized_features', outputCol='features')
scaler_model = scaler.fit(train_df)

# Scale each set and discard the unscaled vector column.
train_df = scaler_model.transform(train_df).drop('vectorized_features')
test_df = scaler_model.transform(test_df).drop('vectorized_features')

# Show the first five untruncated rows of each set.
for label, frame in (("Train", train_df), ("Test", test_df)):
  print(f"{label} data peak:")
  frame.show(5, False)

Train data peak:
+--------------+------+---------+---------+-------+-----+------+----+-----+---+----+------------------+------------------+------------------+------+------+------+------+------+------+---+------+---------+-------+------+----------+----------------+----------------+----------------+----------------+----------------+-----------------+------------------+------------------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------+-----------------+-----------------+----------------+-------------+-------------+--------------+---------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Checkpoint: Upload train and test to Azure Blob Storage

In [37]:
final_checkpoint = True

if final_checkpoint:
  model_data_container = "baseline-data"

  # Hand the storage key to the Spark session; per the usage below,
  # sessionkeys[0]/[1] are the config key/value pair and sessionkeys[2] is
  # the storage account name used in the blob URL.
  azStorageClient = AzStorageClient(az_cred_file)
  sessionkeys = azStorageClient.getSparkSessionKeys()
  spark.conf.set(sessionkeys[0],sessionkeys[1])

  # Both datasets are written under the same container root.
  wasbs_root = f"wasbs://{model_data_container}@{sessionkeys[2]}.blob.core.windows.net"
  train_blob_path = f"{wasbs_root}/{train_blob_name_base}"
  test_blob_path = f"{wasbs_root}/{test_blob_name_base}"

  # Upload train dataset, then test dataset (existing data is overwritten)
  for split_name, split_df, split_path in (
      ("train", train_df, train_blob_path),
      ("test", test_df, test_blob_path),
  ):
    print(f"Uploading {split_name} dataset to {split_path}...")
    split_df.write.format("parquet").mode("overwrite").save(split_path)

Uploading train dataset to wasbs://baseline-data@mids23spring.blob.core.windows.net/baseline-train-v-1...
Uploading test dataset to wasbs://baseline-data@mids23spring.blob.core.windows.net/baseline-test-v-1...
