# Notebook Setup

In [1]:
# Flag telling the rest of the notebook whether it runs inside Google Colab:
# the string form of the active IPython shell mentions 'google.colab' there.
IN_COLLAB = 'google.colab' in str(get_ipython())

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
# Absolute path of the project root; later cells derive every other path from it.
MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"

In [2]:
# Colab only: mount Google Drive at /content/drive/, the location that
# MY_HOME_ABS_PATH points into.
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Import Modules

In [3]:
# install required modules quietly
required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

In [4]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import math
import json

import pandas as pd
from calendar import monthrange
from datetime import datetime
from io import BytesIO

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Make the project's local custom modules (./code/src/tools) importable
import sys
if IN_COLLAB:
  # On Colab, work from the project root so the relative path below resolves
  os.chdir(MY_HOME_ABS_PATH)
tools_dir = os.path.abspath("./code/src/tools")
if IN_COLLAB:
  # Put the project tools first on the module search path
  sys.path.insert(0, tools_dir)
else:
  sys.path.append(tools_dir)

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

# Notebook-wide pandas display settings: show up to 500 rows, never truncate
# columns, and render floats with five decimal places.
display_options = {
    'display.max_rows': 500,
    'display.max_columns': None,
    'display.float_format': lambda x: '%.5f' % x,
}
for option_name, option_value in display_options.items():
  pd.set_option(option_name, option_value)

In [5]:
# Spark entry point
from pyspark.sql import SparkSession

# Build (or reuse) a session on the local master, using all local cores, with
# the hadoop-azure and azure-storage jars requested via spark.jars.packages.
spark_builder = SparkSession.builder.master("local[*]")
spark_builder = spark_builder.config(
    "spark.jars.packages",
    "org.apache.hadoop:hadoop-azure:3.3.1,com.microsoft.azure:azure-storage:8.6.6",
)
spark = spark_builder.getOrCreate()

# Show the session summary as the cell output
spark

# Define Constants

In [6]:
# Project directory layout, all derived from the project root.
# os.path.join is used instead of manual `+ os.sep +` concatenation so that a
# root path entered with a trailing separator does not produce doubled separators.
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.path.join(root_dir, '.tmp')                   # local scratch/cache directory
data_dir = os.path.join(root_dir, 'data')
cred_dir = os.path.join(root_dir, '.cred')
az_cred_file = os.path.join(cred_dir, 'azblobcred.json')   # handed to AzStorageClient in later cells

# Raw per-site files are read from the scratch directory by default; on Colab
# they are read from the shared Drive folder instead.
raw_data_dir = tmp_dir
if IN_COLLAB:
  raw_data_dir = "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data/half_hourly_data"

site_metadata_filename = os.path.join(data_dir, 'site-metadata.csv')

In [7]:
# Blob naming: target container, file extension, dataset version, and the base
# names (without tag/extension) of the full, train and test blobs.
container, ext, ver = "baseline-data", "parquet", "0"
blob_name_base = "baseline_all_v_" + ver
train_blob_name_base = "baseline-train-v-" + ver
test_blob_name_base = "baseline-test-v-" + ver

In [8]:
# "Golden" sites, kept in two tiers; together they form the training set
tier1_sites = [
    "US-MMS", "US-Vcp", "FR-Pue", "CH-Lae",
    "US-Var", "US-Ne2", "ES-LJu", "US-Ton",
]
tier2_sites = [
    "US-UMB", "US-Me2", "FI-Hyy", "US-NR1",
    "IT-Lav", "US-Wkg", "US-ARM", "US-SRM",
]
train_sites = [*tier1_sites, *tier2_sites]

# Selected test sites; each trailing comment gives the site's IGBP class and
# climate zone
test_sites = [
    "US-GLE",  # ENF, Cold
    "US-AR1",  # GRA, Temperate
    "US-Seg",  # GRA, Arid
    "US-FR2",  # WSA, Temperate
    "ES-LM2",  # WSA, Arid
    "CA-Cbo",  # DBF, Cold
    "FR-Lam",  # CRO, Temperate
    "IT-Cpz",  # EBF, Temperate
    "CN-Cha",  # MF Cold
    "IT-Lsn",  # OSH, Temperate
]

In [9]:
# Columns the data pipeline keeps from the raw data, listed group by group,
# followed by the names of the target column and of its quality-control column.
included_features = (
    # columns with the _ERA suffix
    ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA']
    # timestamp and its calendar parts
    + ['datetime', 'year', 'month', 'day', 'hour', 'date']
    # EVI / NDVI / NIRv and the b1..b7 columns
    + ['EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7']
    # categorical site descriptors
    + ['IGBP', 'koppen']
)
target_variable = 'GPP_NT_VUT_REF'
target_variable_qc = 'NEE_VUT_REF_QC'

# Get Gold Sample Site Data

In [10]:
# Read only the metadata columns this notebook needs
included_site_features = [
    'site_id', 'filename', 'elevation', 'lat', 'long',
    'koppen_sub', 'koppen_main', 'koppen_name',
    'c3c4', 'c4_percent',
]
site_metadata_df = pd.read_csv(site_metadata_filename, usecols=included_site_features)

# Keep only the train and test sites, then renumber the rows from zero
is_target_site = site_metadata_df['site_id'].isin(train_sites + test_sites)
site_metadata_df = site_metadata_df.loc[is_target_site]
print(f"size:{site_metadata_df.shape}")
site_metadata_df = site_metadata_df.reset_index(drop=True)
site_metadata_df

size:(26, 10)


Unnamed: 0,site_id,elevation,lat,long,koppen_sub,koppen_main,koppen_name,c3c4,c4_percent,filename
0,CN-Cha,,42.4025,128.0958,22,4,Dwb,C3,12.17,data_full_half_hourly_raw_v0_1_CN-Cha.csv
1,FR-Pue,270.0,43.7413,3.5957,8,3,Csa,C3,6.59,data_full_half_hourly_raw_v0_1_FR-Pue.csv
2,IT-Cpz,68.0,41.70525,12.37611,8,3,Csa,C3,0.0,data_full_half_hourly_raw_v0_1_IT-Cpz.csv
3,US-GLE,3197.0,41.36653,-106.2399,27,4,Dfc,C3,0.16,data_full_half_hourly_raw_v0_1_US-GLE.csv
4,US-NR1,3050.0,40.0329,-105.5464,27,4,Dfc,C3,0.35,data_full_half_hourly_raw_v0_1_US-NR1.csv
5,US-Ne2,362.0,41.16487,-96.4701,25,4,Dfa,rotation,48.91,
6,US-SRM,1120.0,31.8214,-110.8661,6,2,BSh,C3,55.39,data_full_half_hourly_raw_v0_1_US-SRM.csv
7,US-Ton,177.0,38.4316,-120.96598,8,3,Csa,C3,0.0,data_full_half_hourly_raw_v0_1_US-Ton.csv
8,US-Var,129.0,38.4133,-120.9507,8,3,Csa,C3,0.0,data_full_half_hourly_raw_v0_1_US-Var.csv
9,US-Wkg,1531.0,31.7365,-109.9419,7,2,BSk,C4,40.93,data_full_half_hourly_raw_v0_1_US-Wkg.csv


# Stage 1: Trim and Merge Site Metadata

In [None]:
# Reference list of column names (presumably the full column set of the raw
# site files -- not referenced by any other cell visible in this notebook).
all_features = [
    'TIMESTAMP_START', 'TIMESTAMP_END',
    'TA_F', 'TA_F_QC', 'TA_ERA',
    'SW_IN_POT', 'SW_IN_F', 'SW_IN_F_QC', 'SW_IN_ERA',
    'LW_IN_F', 'LW_IN_F_QC', 'LW_IN_ERA',
    'VPD_F', 'VPD_F_QC', 'VPD_ERA',
    'P_F', 'P_F_QC', 'P_ERA',
    'PA_F', 'PA_F_QC', 'PA_ERA',
    'NETRAD', 'PPFD_IN',
    'G_F_MDS', 'G_F_MDS_QC',
    'LE_F_MDS', 'LE_F_MDS_QC', 'LE_CORR',
    'H_F_MDS', 'H_F_MDS_QC', 'H_CORR',
    'NEE_VUT_REF', 'NEE_VUT_REF_QC', 'NEE_CUT_REF', 'NEE_CUT_REF_QC',
    'GPP_NT_VUT_REF', 'GPP_DT_VUT_REF', 'GPP_NT_CUT_REF', 'GPP_DT_CUT_REF',
    'RECO_NT_VUT_REF', 'RECO_DT_VUT_REF', 'RECO_NT_CUT_REF', 'RECO_DT_CUT_REF',
    'datetime', 'year', 'month', 'day', 'hour',
    'SITE_ID', 'date', 'NEE_VUT_REF_qa', 'SW_DIF',
    'EVI', 'NDVI', 'NIRv',
    'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
    'IGBP', 'koppen',
]

In [None]:
# Initial data clean and feature selections from raw data
# data_cleanup (from data_pipeline_lib) is given the raw-data directory, the
# (site_id, filename) pairs of the target sites, the target / QC column names
# and the list of features to keep.
# NOTE(review): the saved run reports US-Ne2 and US-MMS as missing hourly data,
# so these two training sites contribute no rows -- confirm this is expected.
data_df = data_cleanup(raw_data_dir, site_metadata_df[['site_id','filename']],
                  target_variable, target_variable_qc,
                  included_features)
print(f"Data size after cleanup: {data_df.shape}")

# Merge with site metadata
# ('filename', 'koppen_main' and 'koppen_name' are dropped before the merge)
data_df = merge_site_metadata(data_df, site_metadata_df.drop(['filename', 'koppen_main', 'koppen_name'], axis=1))
print(f"Data size after after merged with site metadata: {data_df.shape}")

# Drop rows with NA
# The return value is not used; the helper appears to modify data_df in place
# (the shape printed below is smaller than the one printed above in the saved run).
# NOTE(review): in the saved run the 16228 rows removed here match the CN-Cha
# row count, and CN-Cha has no elevation in the site metadata -- so that test
# site appears to be removed entirely; confirm this is intended.
check_and_drop_na(data_df)
print(f"Data size after after final drop: {data_df.shape}")

#reorder columns
# Move the target variable to the first column, keeping the others in order
features = data_df.columns.to_list()
features.remove(target_variable)
data_df = data_df[([target_variable] + features)]

display(data_df.head())

CN-Cha: (16228, 27)
FR-Pue: (117200, 27)
IT-Cpz: (59175, 27)
US-GLE: (54687, 27)
US-NR1: (98652, 27)

ERROR: US-Ne2 is mssing hourly data.
US-SRM: (95419, 27)
US-Ton: (113031, 27)
US-Var: (119950, 27)
US-Wkg: (93319, 27)
CA-Cbo: (79273, 27)
US-AR1: (28956, 27)
US-ARM: (125756, 27)
US-FR2: (30426, 27)

ERROR: US-MMS is mssing hourly data.
US-Me2: (99780, 27)
US-Seg: (91884, 27)
US-UMB: (70639, 27)
US-Vcp: (78491, 27)
CH-Lae: (112718, 27)
ES-LJu: (112724, 27)
ES-LM2: (58806, 27)
FI-Hyy: (127362, 27)
FR-Lam: (115812, 27)
IT-Lav: (120885, 27)
IT-Lsn: (40182, 27)
Data size after cleanup: (2061355, 27)
Data size after after merged with site metadata: (2061355, 33)
Data has NA.


Unnamed: 0,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,GPP_NT_VUT_REF,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,c3c4,c4_percent
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16228,0,0,0,0,0


Data size after after final drop: (2045127, 33)


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,datetime,year,month,day,hour,date,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,c3c4,c4_percent
16228,-0.53574,5.311,25.016,272.218,1.708,0.0,97.939,2001-01-01 08:30:00,2001,1,1,8,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,C3,6.59
16229,0.86438,5.744,59.734,272.218,1.738,0.0,97.939,2001-01-01 09:00:00,2001,1,1,9,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,0,FR-Pue,270.0,43.7413,3.5957,8,C3,6.59
16230,-0.02627,6.176,91.235,272.218,1.767,0.0,97.939,2001-01-01 09:30:00,2001,1,1,9,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,C3,6.59
16231,-0.17229,6.608,79.264,333.933,1.797,0.05,97.939,2001-01-01 10:00:00,2001,1,1,10,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,0,FR-Pue,270.0,43.7413,3.5957,8,C3,6.59
16232,1.20865,7.043,94.929,333.933,1.817,0.0,97.923,2001-01-01 10:30:00,2001,1,1,10,2001-01-01,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,C3,6.59


In [None]:
# Summary statistics of the numeric columns of the cleaned, merged data
data_df.describe()

Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,minute,elevation,lat,long,koppen_sub,c4_percent
count,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0,2045127.0
mean,5.87229,15.18393,376.25804,319.49078,10.11018,0.03958,91.62118,2010.1065,6.55024,15.7513,11.923,0.29796,0.52726,0.13143,0.08435,0.2483,0.04657,0.07608,0.26551,0.2077,0.12646,15.00078,901.20832,41.38344,-58.39359,15.86035,7.39929
std,7.61167,9.17084,267.25693,49.3058,9.61124,0.19043,9.25367,4.65177,3.00549,8.81239,3.75671,0.13328,0.22261,0.07261,0.06102,0.06284,0.04531,0.04748,0.06925,0.09472,0.08461,15.0,906.36713,7.03857,57.53619,8.51506,14.29274
min,-49.7372,-30.687,0.001,107.639,0.0,0.0,66.031,2001.0,1.0,1.0,3.0,-0.11958,-0.18252,-0.01764,0.0054,0.0305,0.0,0.0,0.0,0.0132,0.0,0.0,1.0,29.9495,-121.5574,5.0,0.0
25%,0.44478,8.974,138.391,285.186,3.23,0.0,85.196,2007.0,4.0,8.0,9.0,0.20223,0.3397,0.07913,0.0367,0.2025,0.0211,0.0444,0.2095,0.13,0.0559,0.0,180.0,36.6058,-109.9419,8.0,0.0
50%,3.14854,15.317,343.341,321.129,6.989,0.0,97.162,2010.0,7.0,16.0,12.0,0.28931,0.55534,0.11931,0.0675,0.2389,0.0357,0.0664,0.2708,0.1953,0.1088,30.0,314.0,40.0329,-97.4888,14.0,0.35
75%,9.46059,21.721,585.408,354.958,13.697,0.0,99.039,2014.0,9.0,23.0,15.0,0.36828,0.70229,0.16489,0.1191,0.2878,0.0561,0.0946,0.3201,0.2826,0.1836,30.0,1531.0,44.4523,3.5957,26.0,6.59
max,85.0309,44.087,1094.341,473.085,80.091,15.493,103.921,2020.0,12.0,31.0,23.0,2.38835,0.93551,0.42385,0.812,0.7762,0.7689,0.8053,0.4666,0.4792,0.4347,30.0,3197.0,61.84741,24.29477,27.0,55.39


In [None]:
# Sites that still have rows after cleanup and NA removal
data_df.site_id.unique()

array(['FR-Pue', 'IT-Cpz', 'US-GLE', 'US-NR1', 'US-SRM', 'US-Ton',
       'US-Var', 'US-Wkg', 'CA-Cbo', 'US-AR1', 'US-ARM', 'US-FR2',
       'US-Me2', 'US-Seg', 'US-UMB', 'US-Vcp', 'CH-Lae', 'ES-LJu',
       'ES-LM2', 'FI-Hyy', 'FR-Lam', 'IT-Lav', 'IT-Lsn'], dtype=object)

In [None]:
# Distinct IGBP values present in the cleaned data
data_df.IGBP.unique()

array(['EBF', 'ENF', 'WSA', 'GRA', 'DBF', 'CRO', 'MF', 'OSH'],
      dtype=object)

In [None]:
# Distinct koppen values present in the cleaned data
data_df.koppen.unique()

array(['Temperate', 'Cold', 'Arid'], dtype=object)

# Checkpoint: Upload Data to Azure Blob Storage as Parquet

In [10]:
# Optional checkpoint: store the cleaned dataset as a parquet blob.
# In-memory parquet upload follows https://stackoverflow.com/a/54666079
data_cleanup_checkpoint = False
tag = "raw"
blob_name = "{}_{}.{}".format(blob_name_base, tag, ext)

if data_cleanup_checkpoint:
  # Serialize the frame to parquet in memory, then rewind before uploading
  parquet_buffer = BytesIO()
  data_df.to_parquet(parquet_buffer, engine='pyarrow')
  parquet_buffer.seek(0)

  AzStorageClient(az_cred_file).uploadBlob(container, blob_name, parquet_buffer, overwrite=True)

In [13]:
# Save the non-transformed data: split the cleaned frame into train and test
# sets as-is and upload both, using the "raw" tag in the blob names.
get_non_transform_train_test = True
if get_non_transform_train_test:
  # No file path is given (None); the in-memory frame is used as the input
  data_transformer = TFTDataTransformer(train_sites, test_sites, None, data_df)
  train_df, test_df = data_transformer.get_test_train_raw()

  # Preview the first rows of each split
  for split_label, split_df in (("Train", train_df), ("Test", test_df)):
    print(f"{split_label} data peak:")
    display(split_df.head(5))

  train_blob_name = f"{train_blob_name_base}-{tag}.{ext}"
  test_blob_name = f"{test_blob_name_base}-{tag}.{ext}"
  data_transformer.upload_train_test_to_azure(
      az_cred_file, container, train_blob_name, test_blob_name)

Data size: (2045127, 32).
Train data size: (1485926, 32).
Test data size: (559201, 32).
Train data peak:


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,datetime,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,c3c4,c4_percent
16228,-0.53574,5.311,25.016,272.218,1.708,0.0,97.939,2001-01-01 08:30:00,2001,1,1,8,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,C3,6.59
16229,0.86438,5.744,59.734,272.218,1.738,0.0,97.939,2001-01-01 09:00:00,2001,1,1,9,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,0,FR-Pue,270.0,43.7413,3.5957,8,C3,6.59
16230,-0.02627,6.176,91.235,272.218,1.767,0.0,97.939,2001-01-01 09:30:00,2001,1,1,9,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,C3,6.59
16231,-0.17229,6.608,79.264,333.933,1.797,0.05,97.939,2001-01-01 10:00:00,2001,1,1,10,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,0,FR-Pue,270.0,43.7413,3.5957,8,C3,6.59
16232,1.20865,7.043,94.929,333.933,1.817,0.0,97.923,2001-01-01 10:30:00,2001,1,1,10,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,EBF,Temperate,30,FR-Pue,270.0,43.7413,3.5957,8,C3,6.59


Test data peak:


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,datetime,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,c3c4,c4_percent
133428,2.1909,4.66,5.449,218.92,2.904,0.0,100.765,2001-01-01 07:30:00,2001,1,1,7,0.30272,0.74517,0.12452,0.0244,0.1671,0.018,0.0291,0.2352,0.0977,0.0484,EBF,Temperate,30,IT-Cpz,68.0,41.70525,12.37611,8,C3,0.0
133429,2.83647,5.086,71.318,218.92,2.952,0.0,100.789,2001-01-01 08:00:00,2001,1,1,8,0.30272,0.74517,0.12452,0.0244,0.1671,0.018,0.0291,0.2352,0.0977,0.0484,EBF,Temperate,0,IT-Cpz,68.0,41.70525,12.37611,8,C3,0.0
133430,4.03906,5.511,132.32,218.92,3.0,0.0,100.813,2001-01-01 08:30:00,2001,1,1,8,0.30272,0.74517,0.12452,0.0244,0.1671,0.018,0.0291,0.2352,0.0977,0.0484,EBF,Temperate,30,IT-Cpz,68.0,41.70525,12.37611,8,C3,0.0
133431,7.79772,5.937,187.411,218.92,3.048,0.0,100.837,2001-01-01 09:00:00,2001,1,1,9,0.30272,0.74517,0.12452,0.0244,0.1671,0.018,0.0291,0.2352,0.0977,0.0484,EBF,Temperate,0,IT-Cpz,68.0,41.70525,12.37611,8,C3,0.0
133432,6.93281,6.362,235.65,218.92,3.096,0.0,100.861,2001-01-01 09:30:00,2001,1,1,9,0.30272,0.74517,0.12452,0.0244,0.1671,0.018,0.0291,0.2352,0.0977,0.0484,EBF,Temperate,30,IT-Cpz,68.0,41.70525,12.37611,8,C3,0.0


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,minute,elevation,lat,long,koppen_sub,c4_percent
count,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0
mean,5.86211,14.87724,377.56323,316.15936,10.11574,0.03757,90.79849,2009.75172,6.53593,15.74438,11.89376,0.29739,0.53815,0.13063,0.07894,0.24123,0.04288,0.07109,0.25797,0.20036,0.11929,14.99925,971.07777,41.75447,-62.35813,16.51431,8.82611
std,7.52193,9.04516,268.53741,47.26856,9.61817,0.18091,8.84375,4.52259,2.98991,8.81076,3.7739,0.13132,0.21932,0.07137,0.0554,0.06086,0.03907,0.0416,0.0696,0.09369,0.07861,15.00001,866.88847,7.77408,59.15444,8.71496,15.9951
min,-49.7372,-29.74,0.001,142.77,0.0,0.0,67.405,2001.0,1.0,1.0,3.0,-0.11958,-0.18252,-0.01715,0.0054,0.0305,0.0,0.0,0.0,0.0132,0.0,0.0,129.0,31.7365,-121.5574,6.0,0.0
25%,0.45157,8.692,138.70625,283.869,3.18,0.0,85.242,2006.0,4.0,8.0,9.0,0.20336,0.3415,0.08147,0.0337,0.1984,0.0196,0.0422,0.2021,0.1189,0.0515,0.0,234.0,36.6058,-110.8661,8.0,0.0
50%,3.23428,14.931,344.5285,317.35,6.987,0.0,93.167,2010.0,7.0,16.0,12.0,0.28756,0.56813,0.11822,0.0639,0.2308,0.034,0.0632,0.2611,0.1868,0.1033,0.0,689.0,40.0329,-97.4888,14.0,0.04
75%,9.4872,21.282,587.463,349.289,13.732,0.0,98.717,2013.0,9.0,23.0,15.0,0.36012,0.70597,0.15872,0.1166,0.2775,0.0545,0.0912,0.3152,0.2791,0.1824,30.0,1531.0,45.5598,3.5957,26.0,10.72
max,85.0309,42.587,1094.341,473.011,75.684,15.493,103.383,2020.0,12.0,31.0,23.0,2.38835,0.93551,0.42385,0.7971,0.7729,0.7689,0.7865,0.4666,0.428,0.3573,30.0,3050.0,61.84741,24.29477,27.0,55.39


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,minute,elevation,lat,long,koppen_sub,c4_percent
count,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0,1485926.0
mean,5.86211,14.87724,377.56323,316.15936,10.11574,0.03757,90.79849,2009.75172,6.53593,15.74438,11.89376,0.29739,0.53815,0.13063,0.07894,0.24123,0.04288,0.07109,0.25797,0.20036,0.11929,14.99925,971.07777,41.75447,-62.35813,16.51431,8.82611
std,7.52193,9.04516,268.53741,47.26856,9.61817,0.18091,8.84375,4.52259,2.98991,8.81076,3.7739,0.13132,0.21932,0.07137,0.0554,0.06086,0.03907,0.0416,0.0696,0.09369,0.07861,15.00001,866.88847,7.77408,59.15444,8.71496,15.9951
min,-49.7372,-29.74,0.001,142.77,0.0,0.0,67.405,2001.0,1.0,1.0,3.0,-0.11958,-0.18252,-0.01715,0.0054,0.0305,0.0,0.0,0.0,0.0132,0.0,0.0,129.0,31.7365,-121.5574,6.0,0.0
25%,0.45157,8.692,138.70625,283.869,3.18,0.0,85.242,2006.0,4.0,8.0,9.0,0.20336,0.3415,0.08147,0.0337,0.1984,0.0196,0.0422,0.2021,0.1189,0.0515,0.0,234.0,36.6058,-110.8661,8.0,0.0
50%,3.23428,14.931,344.5285,317.35,6.987,0.0,93.167,2010.0,7.0,16.0,12.0,0.28756,0.56813,0.11822,0.0639,0.2308,0.034,0.0632,0.2611,0.1868,0.1033,0.0,689.0,40.0329,-97.4888,14.0,0.04
75%,9.4872,21.282,587.463,349.289,13.732,0.0,98.717,2013.0,9.0,23.0,15.0,0.36012,0.70597,0.15872,0.1166,0.2775,0.0545,0.0912,0.3152,0.2791,0.1824,30.0,1531.0,45.5598,3.5957,26.0,10.72
max,85.0309,42.587,1094.341,473.011,75.684,15.493,103.383,2020.0,12.0,31.0,23.0,2.38835,0.93551,0.42385,0.7971,0.7729,0.7689,0.7865,0.4666,0.428,0.3573,30.0,3050.0,61.84741,24.29477,27.0,55.39


Uploading train dataset to baseline-train-v-0-raw.parquet...
File uploaded to baseline-data/baseline-train-v-0-raw.parquet
Uploading test dataset to baseline-test-v-0-raw.parquet...
File uploaded to baseline-data/baseline-test-v-0-raw.parquet


# Stage 2: Data Transform - Convert to Model Ready Data
- Encode data
- Split into train and test
- Assemble data through VectorAssembler
- Normalize data through min-max scaling

In [14]:
# Stage 2 input selection.
# With load_data_from_previous_checkpoint = True the cleaned dataset comes from
# the local parquet cache under tmp_dir (downloaded from Azure on first use);
# otherwise the in-memory data_df from Stage 1 is handed to the transformer.
load_data_from_previous_checkpoint = True
useSpark = False

# Always define the path so the transformer construction below cannot hit a
# NameError when the checkpoint is not loaded. None means "no file", the same
# value the non-transformed export cell passes to TFTDataTransformer.
# NOTE(review): confirm PySparkMLDataTransformer also accepts None here.
raw_data_file_path = None

if load_data_from_previous_checkpoint:
  data_df = None
  raw_data_file_path = tmp_dir + os.sep + blob_name
  print(f"loading {raw_data_file_path}...")
  if not os.path.exists(raw_data_file_path):
      # Cache miss: download the blob and keep a local parquet copy.
      # makedirs(exist_ok=True) also creates missing parents and tolerates an
      # already existing directory.
      os.makedirs(tmp_dir, exist_ok=True)
      azStorageClient = AzStorageClient(az_cred_file)
      file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
      data_df = pd.read_parquet(file_stream, engine='pyarrow')
      data_df.to_parquet(raw_data_file_path)

# Build the transformer for the selected backend
if useSpark:
  data_transformer = PySparkMLDataTransformer(spark, train_sites, test_sites,
                                              raw_data_file_path, data_df)
else:
  data_transformer = TFTDataTransformer(train_sites, test_sites,
                                        raw_data_file_path, data_df)

loading /content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling/.tmp/baseline_all_v_0_raw.parquet...
Data size: (2045127, 32).


In [15]:
# Column roles shared by both transformer backends
timestamp_col = 'datetime'
target_col = 'GPP_NT_VUT_REF'

if not useSpark: # TFT Data Transformer
  # Columns handled as categorical and as real-valued, plus the columns passed
  # as backup_cols to the transformer
  categorical_cols = ['IGBP', 'c3c4', 'koppen_sub', 'koppen', 'site_id',
                      'year', 'month', 'day', 'hour', 'minute']
  realNum_cols = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
                  'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
                  'elevation', 'lat', 'long', 'c4_percent']
  backup_cols = ['IGBP', 'koppen','site_id']
  data_transformer.data_transform(
      categorical_cols, realNum_cols, backup_cols, timestamp_col, target_col)

  # Preview the first rows of each split
  print("Train data peak:")
  display(data_transformer.train_df.head(5))
  print("Test data peak:")
  display(data_transformer.test_df.head(5))

  # Blob names for the final checkpoint (with file extension)
  train_blob_name = f"{train_blob_name_base}.{ext}"
  test_blob_name = f"{test_blob_name_base}.{ext}"

else: # Spark ML Data Transformer
  categorical_cols = ['IGBP', 'c3c4', 'koppen_sub', 'koppen']
  data_transformer.data_transform(categorical_cols, timestamp_col, target_col)

  # Preview the first rows of each split without truncating values
  print("Train data peak:")
  data_transformer.train_df.show(5, False)
  print("Test data peak:")
  data_transformer.test_df.show(5, False)

  # Blob names for the final checkpoint (no file extension in this branch)
  train_blob_name = f"{train_blob_name_base}"
  test_blob_name = f"{test_blob_name_base}"

Data size: (2045127, 35).
Data size after encoding: (2045127, 35)


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,datetime,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,c3c4,c4_percent,IGBP_name,koppen_name,site_id_name
16228,-0.53574,5.311,25.016,272.218,1.708,0.0,97.939,2001-01-01 08:30:00,0,0,0,5,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,2,2,1,6,270.0,43.7413,3.5957,3,0,6.59,EBF,Temperate,FR-Pue
16229,0.86438,5.744,59.734,272.218,1.738,0.0,97.939,2001-01-01 09:00:00,0,0,0,6,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,2,2,0,6,270.0,43.7413,3.5957,3,0,6.59,EBF,Temperate,FR-Pue
16230,-0.02627,6.176,91.235,272.218,1.767,0.0,97.939,2001-01-01 09:30:00,0,0,0,6,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,2,2,1,6,270.0,43.7413,3.5957,3,0,6.59,EBF,Temperate,FR-Pue
16231,-0.17229,6.608,79.264,333.933,1.797,0.05,97.939,2001-01-01 10:00:00,0,0,0,7,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,2,2,0,6,270.0,43.7413,3.5957,3,0,6.59,EBF,Temperate,FR-Pue
16232,1.20865,7.043,94.929,333.933,1.817,0.0,97.923,2001-01-01 10:30:00,0,0,0,7,0.24998,0.73349,0.10592,0.0222,0.1444,0.0074,0.0267,0.1486,0.0977,0.0,2,2,1,6,270.0,43.7413,3.5957,3,0,6.59,EBF,Temperate,FR-Pue


Features(30): ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA', 'year', 'month', 'day', 'hour', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'IGBP', 'koppen', 'minute', 'site_id', 'elevation', 'lat', 'long', 'koppen_sub', 'c3c4', 'c4_percent']
Train data size: (1485926, 35).
Test data size: (559201, 35).
Normalizinf features (20): ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 'elevation', 'lat', 'long', 'c4_percent']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


Train data size: (1485926, 35).
Test data size: (559201, 35).
Train data peak:


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,datetime,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,c3c4,c4_percent,IGBP_name,koppen_name,site_id_name
16228,-0.53574,-1.05761,-1.31284,-0.92961,-0.87415,-0.20769,0.80741,2001-01-01 08:30:00,0,0,0,5,-0.36105,0.89066,-0.34629,-1.0243,-1.5912,-0.90822,-1.06704,-1.57138,-1.09584,-1.51739,2,2,1,6,-0.80873,0.25557,1.11494,3,0,-0.1398,EBF,Temperate,FR-Pue
16229,0.86438,-1.00974,-1.18356,-0.92961,-0.87103,-0.20769,0.80741,2001-01-01 09:00:00,0,0,0,6,-0.36105,0.89066,-0.34629,-1.0243,-1.5912,-0.90822,-1.06704,-1.57138,-1.09584,-1.51739,2,2,0,6,-0.80873,0.25557,1.11494,3,0,-0.1398,EBF,Temperate,FR-Pue
16230,-0.02627,-0.96198,-1.06625,-0.92961,-0.86802,-0.20769,0.80741,2001-01-01 09:30:00,0,0,0,6,-0.36105,0.89066,-0.34629,-1.0243,-1.5912,-0.90822,-1.06704,-1.57138,-1.09584,-1.51739,2,2,1,6,-0.80873,0.25557,1.11494,3,0,-0.1398,EBF,Temperate,FR-Pue
16231,-0.17229,-0.91422,-1.11083,0.37601,-0.8649,0.0687,0.80741,2001-01-01 10:00:00,0,0,0,7,-0.36105,0.89066,-0.34629,-1.0243,-1.5912,-0.90822,-1.06704,-1.57138,-1.09584,-1.51739,2,2,0,6,-0.80873,0.25557,1.11494,3,0,-0.1398,EBF,Temperate,FR-Pue
16232,1.20865,-0.86613,-1.0525,0.37601,-0.86282,-0.20769,0.8056,2001-01-01 10:30:00,0,0,0,7,-0.36105,0.89066,-0.34629,-1.0243,-1.5912,-0.90822,-1.06704,-1.57138,-1.09584,-1.51739,2,2,1,6,-0.80873,0.25557,1.11494,3,0,-0.1398,EBF,Temperate,FR-Pue


Test data peak:


Unnamed: 0,GPP_NT_VUT_REF,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,datetime,year,month,day,hour,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,koppen,minute,site_id,elevation,lat,long,koppen_sub,c3c4,c4_percent,IGBP_name,koppen_name,site_id_name
133428,2.1909,-1.12958,-1.38571,-2.05717,-0.7498,-0.20769,1.12696,2001-01-01 07:30:00,0,0,0,4,0.04053,0.9439,-0.08565,-0.98459,-1.21818,-0.6369,-1.00935,-0.32717,-1.09584,-0.90172,2,2,1,7,-1.04175,-0.00633,1.26338,3,0,-0.5518,EBF,Temperate,IT-Cpz
133429,2.83647,-1.08248,-1.14042,-2.05717,-0.74481,-0.20769,1.12967,2001-01-01 08:00:00,0,0,0,5,0.04053,0.9439,-0.08565,-0.98459,-1.21818,-0.6369,-1.00935,-0.32717,-1.09584,-0.90172,2,2,0,7,-1.04175,-0.00633,1.26338,3,0,-0.5518,EBF,Temperate,IT-Cpz
133430,4.03906,-1.0355,-0.91326,-2.05717,-0.73982,-0.20769,1.13238,2001-01-01 08:30:00,0,0,0,5,0.04053,0.9439,-0.08565,-0.98459,-1.21818,-0.6369,-1.00935,-0.32717,-1.09584,-0.90172,2,2,1,7,-1.04175,-0.00633,1.26338,3,0,-0.5518,EBF,Temperate,IT-Cpz
133431,7.79772,-0.9884,-0.7081,-2.05717,-0.73483,-0.20769,1.1351,2001-01-01 09:00:00,0,0,0,6,0.04053,0.9439,-0.08565,-0.98459,-1.21818,-0.6369,-1.00935,-0.32717,-1.09584,-0.90172,2,2,0,7,-1.04175,-0.00633,1.26338,3,0,-0.5518,EBF,Temperate,IT-Cpz
133432,6.93281,-0.94141,-0.52847,-2.05717,-0.72984,-0.20769,1.13781,2001-01-01 09:30:00,0,0,0,6,0.04053,0.9439,-0.08565,-0.98459,-1.21818,-0.6369,-1.00935,-0.32717,-1.09584,-0.90172,2,2,1,7,-1.04175,-0.00633,1.26338,3,0,-0.5518,EBF,Temperate,IT-Cpz


# Checkpoint: Upload train and test to Azure Blob Storage

In [16]:
# Final checkpoint switch: when enabled, upload the transformed train and test
# sets under the blob names chosen in the transform cell.
final_checkpoint = True

if final_checkpoint:
  data_transformer.upload_train_test_to_azure(
      az_cred_file, container, train_blob_name, test_blob_name)

Uploading train dataset to baseline-train-v-0.parquet...
File uploaded to baseline-data/baseline-train-v-0.parquet
Uploading test dataset to baseline-test-v-0.parquet...
File uploaded to baseline-data/baseline-test-v-0.parquet
