# 0. Preparation

## (Optoinal) mount google drive
- Make sure that available access is the user's own drive(no access across files in shared folder)

In [2]:
if 'google.colab' in str(get_ipython()):
    IN_COLLAB = True
else:
    IN_COLLAB = False

if IN_COLLAB:
    #TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
    MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"
    from google.colab import drive
    drive.mount('/content/drive/')
else:
    MY_HOME_ABS_PATH = "/root/co2-flux-hourly-gpp-modeling/"

## Import libraries

In [3]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")
import copy
import json
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import pytorch_lightning as pl
import torch
import torch.nn as nn

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters
from pytorch_forecasting import BaseModel, MAE
from pytorch_forecasting.metrics.point import RMSE
from pytorch_forecasting.data.encoders import NaNLabelEncoder

from sklearn.metrics import r2_score
from timeit import default_timer
from datetime import datetime
import gc
import pickle

# Load locale custome modules
os.chdir(MY_HOME_ABS_PATH)
if IN_COLLAB:
     sys.path.insert(0,os.path.abspath("./code/src/tools"))
else:
    sys.path.append('./.cred')
    sys.path.append('./code/src/tools')
    sys.path.append(os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *
from model_pipeline_lib import *

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pl.seed_everything(42)

2023-03-27 21:23:56.482667: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-03-27 21:23:56.530273: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-03-27 21:23:56.531039: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Global seed set to 42
Global seed set to 42


42

# Load data from Azure blob

In [4]:
root_dir =  MY_HOME_ABS_PATH
tmp_dir =  root_dir + os.sep + '.tmp'
model_dir = root_dir + os.sep + 'data' + os.sep + 'models'
cred_dir = root_dir + os.sep + '.cred'
az_cred_file = cred_dir + os.sep + 'azblobcred.json'

container = "all-sites-data"
blob_name = "full_2010_2015_v_mvp_raw.parquet"
local_file = tmp_dir + os.sep + blob_name

In [5]:
# Download full data
root_dir  = MY_HOME_ABS_PATH
tmp_dir   = root_dir + os.sep + '.tmp'
model_dir = root_dir + os.sep + 'data' + os.sep + 'models'

container = "all-sites-data"
blob_name = "full_2010_2015_v_mvp_raw.parquet"
local_file = tmp_dir + os.sep + blob_name

data_df = get_raw_datasets(container, blob_name)

Data size: (4862712, 51)
Data Columns: Index(['GPP_NT_VUT_REF', 'site_id', 'timestep_idx_local',
       'timestep_idx_global', 'datetime', 'date', 'year', 'month', 'day',
       'hour', 'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA',
       'PA_ERA', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6',
       'b7', 'IGBP', 'lat', 'long', 'koppen_sub', 'koppen_main', 'c3c4',
       'c4_percent', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily',
       'PET', 'Ts', 'ESACCI-sm', 'MODIS_LC', 'NDWI', 'Percent_Snow', 'Fpar',
       'Lai', 'LST_Day', 'LST_Night', 'MODIS_IGBP', 'MODIS_PFT',
       'gap_flag_hour', 'gap_flag_month'],
      dtype='object')
NA count: 0


# Eval Model

In [6]:
exp_model_dir = "/root/co2-flux-hourly-gpp-modeling/data/models/tft_model_5YrTrain_3DEncode_SmallNetwork_SlimFeatures_NoGPP_230327_0111" # TODO: Replace to your model dir
best_model_path = "/root/co2-flux-hourly-gpp-modeling/data/models/tft_model_5YrTrain_3DEncode_SmallNetwork_SlimFeatures_NoGPP_230327_0111/lightning_logs/version_0/checkpoints/epoch=7-step=20104.ckpt"
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)
print(f"Quantiles: {best_tft.loss.quantiles}") # [0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98]

# Experiment constants
VAL_INDEX  = 3
TEST_INDEX = 4
SUBSET_LEN = 24*365*5 # 1 year
ENCODER_LEN = 24*3   # 3 days
print(f"Training timestemp length = {SUBSET_LEN}.")

# create dataloaders for model
# ref: https://pytorch-lightning.readthedocs.io/en/stable/guides/speed.html#dataloaders
batch_size = 128  # set this between 32 to 128
cpu_count = os.cpu_count()

Quantiles: [0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98]
Training timestemp length = 43800.


In [7]:
def setup_tsdataset_nogpp(train_df, val_df, test_df, min_encoder_len):
    # create training and validation TS dataset 
    training = TimeSeriesDataSet(
      train_df, # <------ no longer subsetting, option 1 split can use entire train site sequence
      time_idx="timestep_idx_global",
      target="GPP_NT_VUT_REF",
      group_ids=["site_id"],
      allow_missing_timesteps=False, # <---- turned on bc some rows are removed.
      min_encoder_length=min_encoder_len,
      max_encoder_length=min_encoder_len,
      min_prediction_length=1,
      max_prediction_length=1,
      static_categoricals=["koppen_main"],
      static_reals=[],
      time_varying_known_categoricals=["month", "hour"],
      time_varying_known_reals=['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
                                'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7'],
      time_varying_unknown_categoricals=[], 
      time_varying_unknown_reals=[],
      target_normalizer=None,
      categorical_encoders={'koppen_main': NaNLabelEncoder(add_nan=True),
                            },
      add_relative_time_idx=True,
      add_target_scales=False, # <------- turned off
      add_encoder_length=False, # <------- turned off
    )

    validation = TimeSeriesDataSet.from_dataset(training, val_df, predict=False, stop_randomization=True)
    
    if test_df is not None:
        testing = TimeSeriesDataSet.from_dataset(training, test_df, predict=False, stop_randomization=True)
    else:
        testing = None

    return (training, validation, testing)

In [8]:
# Setup dataset
train_df, val_df, _ = get_splited_datasets(data_df, VAL_INDEX, TEST_INDEX)
train_df, val_df, _ = subset_data(train_df, val_df, None, SUBSET_LEN)
_, validation, _ = setup_tsdataset_nogpp(train_df, val_df, None, ENCODER_LEN)

Subest length: 43800 timesteps for each sites
Subset num train timesteps: 2579424
Subset num val timesteps: 952752


## Default Evaluation (1 year of validation data)

In [9]:
# Eval on Validation set
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=cpu_count)

# Print Model Eval on Validation Set
start = default_timer()
print(f"Start eval on validation.")
val_actuals = torch.cat([y[0] for x, y in iter(val_dataloader)])
val_q_predictions = best_tft.predict(val_dataloader, mode="quantiles", show_progress_bar=True, return_x=False)
eval_time = default_timer() - start
print(f"Val eval time: {eval_time}")

# save eval results - also we can resume eval at a later point in time
with open(exp_model_dir + os.sep + "val_actuals.pkl", "wb") as fout:
    pickle.dump(val_actuals, fout)

with open(exp_model_dir + os.sep + "val_q_predictions.pkl", "wb") as fout:
    pickle.dump(val_q_predictions, fout)

Start eval on validation.


Predict: 100%|██████████| 7429/7429 [09:58<00:00, 12.41 batches/s]


Val eval time: 651.330793101999


In [11]:
# load eval results - also we can resume eval at a later point in time
# with open(exp_model_dir + os.sep + "val_actuals.pkl", "rb") as fin:
#     val_acutals = pickle.load(fin)

# with open(exp_model_dir + os.sep + "val_q_predictions.pkl", "rb") as fin:
#     val_q_predictions = pickle.load(fin)

# Eval on Validation set
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=cpu_count)

val_predictions = val_q_predictions[:, :, 3] # p50 == estimation
val_p90 = val_q_predictions[:, :,5] # p90 == estimation
#del val_q_predictions

# Unmasked evaltaion
eval_metric = get_eval_metrics(val_actuals, val_predictions, None, val_p90)
print(f"Unmasked Val eval evaluation: {eval_metric}")

# Masked evlationa
start = default_timer()
mask = torch.logical_not(torch.cat([x['decoder_cat'][:, :, -1].reshape(-1) for x, y in iter(val_dataloader)]))
masked_eval_metric = get_eval_metrics(val_actuals, val_predictions, mask, val_p90)
eval_time = default_timer() - start
print(f"Masked Val eval time: {eval_time}")
print(f"Masked Val eval evaluation: {masked_eval_metric}")

Unmasked Val eval evaluation: {'rmse': 3.6963117122650146, 'mae': 2.034262180328369, 'nse': 0.7267818450927734, 'r2': 0.7267818118600462, 'p50_loss': 0.5044232606887817, 'p90_loss': 0.28162509202957153}
Masked Val eval time: 51.08380620800017
Masked Val eval evaluation: {'rmse': 1.5254271030426025, 'mae': 0.8380093574523926, 'nse': 0.17887413501739502, 'r2': 0.1788742221373929, 'p50_loss': 0.9253355860710144, 'p90_loss': 0.5865048766136169}


## Eval 5 Years of Validation Data

In [13]:
# split data
train_df, val_df, _ = get_splited_datasets(data_df, VAL_INDEX, TEST_INDEX)
train_df, _, _ = subset_data(train_df, val_df, None, SUBSET_LEN)
_, validation, _ = setup_tsdataset_nogpp(train_df, val_df, None, ENCODER_LEN)
print(f"validation timesteps: {len(val_df)}")

del train_df
gc.collect()

# Eval on Validation set
val_dataloader = validation.to_dataloader(train=False, batch_size=128, num_workers=os.cpu_count())

start = default_timer()
val_actuals = torch.cat([y[0] for x, y in iter(val_dataloader)])
val_q_predictions = best_tft.predict(val_dataloader, mode="quantiles", show_progress_bar=True)
print(f"Val eval time: {default_timer() - start}")

# save eval results - also we can resume tuning at a later point in time
with open(exp_model_dir + os.sep + "val5yr_actuals.pkl", "wb") as fout:
    pickle.dump(val_actuals, fout)

with open(exp_model_dir + os.sep + "val5yr_q_predictions.pkl", "wb") as fout:
    pickle.dump(val_q_predictions, fout)

Subest length: 43800 timesteps for each sites
Subset num train timesteps: 2579424
Subset num val timesteps: 952752
validation timesteps: 1056072



Predict:   0%|          | 0/8236 [00:00<?, ? batches/s][A
Predict:   0%|          | 1/8236 [00:01<2:34:34,  1.13s/ batches][A
Predict:   0%|          | 3/8236 [00:01<48:08,  2.85 batches/s]  [A
Predict:   0%|          | 5/8236 [00:01<29:18,  4.68 batches/s][A
Predict:   0%|          | 7/8236 [00:01<22:11,  6.18 batches/s][A
Predict:   0%|          | 9/8236 [00:01<18:40,  7.34 batches/s][A
Predict:   0%|          | 11/8236 [00:01<16:06,  8.51 batches/s][A
Predict:   0%|          | 13/8236 [00:02<14:50,  9.23 batches/s][A
Predict:   0%|          | 15/8236 [00:02<13:39, 10.03 batches/s][A
Predict:   0%|          | 17/8236 [00:02<13:10, 10.39 batches/s][A
Predict:   0%|          | 19/8236 [00:02<12:24, 11.04 batches/s][A
Predict:   0%|          | 21/8236 [00:02<11:56, 11.46 batches/s][A
Predict:   0%|          | 23/8236 [00:02<11:52, 11.53 batches/s][A
Predict:   0%|          | 25/8236 [00:03<11:19, 12.08 batches/s][A
Predict:   0%|          | 27/8236 [00:03<11:13, 12.18 bat

Val eval time: 723.588577329001


In [14]:
# # load eval results - also we can resume eval at a later point in time
# with open(exp_model_dir + os.sep + "val5yr_actuals.pkl", "rb") as fin:
#     val_acutals = pickle.load(fin)

# with open(exp_model_dir + os.sep + "val5yr_q_predictions.pkl", "rb") as fin:
#     val_q_predictions = pickle.load(fin)

val_predictions = val_q_predictions[:, :, 3] # p50 == estimation
val_p90 = val_q_predictions[:, :,5] # p90
#del val_q_predictions
    
# Unmasked evaltaion
eval_metric = get_eval_metrics(val_actuals, val_predictions, None, val_p90)
print(f"Unmasked Val eval evaluation: {eval_metric}")

# Masked evlationa
start = default_timer()
mask = torch.logical_not(torch.cat([x['decoder_cat'][:, :, -1].reshape(-1) for x, y in iter(val_dataloader)]))
masked_eval_metric = get_eval_metrics(val_actuals, val_predictions, mask, val_p90)
eval_time = default_timer() - start
print(f"Masked Val eval time: {eval_time}")
print(f"Masked Val eval evaluation: {masked_eval_metric}")

Unmasked Val eval evaluation: {'rmse': 3.7135982513427734, 'mae': 2.0345048904418945, 'nse': 0.7280431985855103, 'r2': 0.728043250362639, 'p50_loss': 0.5005776882171631, 'p90_loss': 0.28066569566726685}
Masked Val eval time: 60.46160401199995
Masked Val eval evaluation: {'rmse': 1.5558595657348633, 'mae': 0.8453733325004578, 'nse': 0.20473861694335938, 'r2': 0.20473860699055235, 'p50_loss': 0.9071527719497681, 'p90_loss': 0.5814838409423828}


# Model Interpreation

In [15]:
# Setup dataset
train_df, val_df, _ = get_splited_datasets(data_df, VAL_INDEX, TEST_INDEX)
train_df, val_df, _ = subset_data(train_df, val_df, None, SUBSET_LEN)
_, validation, _ = setup_tsdataset(train_df, val_df, None, ENCODER_LEN)
raw_predictions = best_tft.predict(validation, mode="raw",  show_progress_bar=True)
interpretation = best_tft.interpret_output(raw_predictions, reduction="sum")
best_tft.plot_interpretation(interpretation)

Subest length: 43800 timesteps for each sites
Subset num train timesteps: 2579424
Subset num val timesteps: 952752



Predict:   0%|          | 0/14858 [00:00<?, ? batches/s][A

IndexError: index out of range in self

Predict:   0%|          | 0/8236 [1:23:55<?, ? batches/s]