# 0. Preparation

In [2]:
MY_HOME_ABS_PATH = "/root/co2-flux-hourly-gpp-modeling"

## Import libraries

In [16]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")
import copy
import json
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import torch
import torch.nn as nn

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters
from pytorch_forecasting import BaseModel, MAE
from pytorch_forecasting.metrics.point import RMSE
from pytorch_forecasting.data.encoders import NaNLabelEncoder

# Load locale custome modules
os.chdir(MY_HOME_ABS_PATH)
sys.path.append('./.cred')
sys.path.append('./code/src/tools')
sys.path.append('./code/src/tools/CloudIO')

from eval_functions import *
from azure.storage.blob import BlobServiceClient
from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pl.seed_everything(42)

Global seed set to 42


42

# Load data from Azure blob

In [4]:
root_dir =  MY_HOME_ABS_PATH
tmp_dir =  root_dir + os.sep + '.tmp'
raw_data_dir = tmp_dir
data_dir = root_dir + os.sep + 'data'
cred_dir = root_dir + os.sep + '.cred'
az_cred_file = cred_dir + os.sep + 'azblobcred.json'

In [5]:
# AzStorageClient.listBlobs(container)
container = "baseline-data"
cred_file = az_cred_file
ext = "parquet"

# Container and file name
blob_name_train = "baseline-train-v-1-i-knn.parquet"
blob_name_val = "baseline-test-v-1-i-knn.parquet"

In [6]:
if os.path.exists(cred_file):
  connect_str = ""
  with open(cred_file, "rb") as f:
      data = json.load(f)
      connect_str = data['connectionstr']
      blob_svc_client = BlobServiceClient.from_connection_string(connect_str)
      tokens = connect_str.split(';')
      for t in tokens:
        if "AccountName=" in t:
          AccountName = t[len("AccountName="):]
        elif "AccountKey=" in t:
          AccountKey = t[len("AccountKey="):]

# Download train file
train_df = None
if not (os.path.exists(tmp_dir+blob_name_train)):
    azStorageClient = AzStorageClient(az_cred_file)
    file_stream = azStorageClient.downloadBlob2Stream(container, blob_name_train)
    train_df = pd.read_parquet(file_stream, engine='pyarrow')
    train_df.to_parquet(tmp_dir + blob_name_train)
else:
    train_df = pd.read_parquet(tmp_dir + blob_name_train)


# Load Test Data
test_df = None
if not (os.path.exists(tmp_dir+blob_name_val)):
    azStorageClient = AzStorageClient(az_cred_file)
    file_stream = azStorageClient.downloadBlob2Stream(container, blob_name_val)
    test_df = pd.read_parquet(file_stream, engine='pyarrow')
    test_df.to_parquet(tmp_dir + blob_name_val)
else:
    test_df = pd.read_parquet(tmp_dir + blob_name_val)

print(f"Train size: {train_df.shape}")
print(f"Test size: {test_df.shape}")

Train size: (631032, 49)
Test size: (234888, 49)


# Data Preprocessing
For development, subsetting to a few sites, and ~300 days for each site

In [7]:
train_sites = ['CH-Lae', 'FR-Pue']
val_sites = ['FI-Hyy', 'ES-LJu']

# Subset train_df to save some time
print(f"Initial Train Sites: {train_df['site_id'].unique()}")
print(f"Initial num train timesteps: {len(train_df)}")
val_df_split1 = train_df.loc[train_df['site_id'].isin(val_sites), ].copy()
train_df = train_df.loc[train_df['site_id'].isin(train_sites), ].copy()
print(f"Trimmed Train Sites: {train_df['site_id'].unique()}")
print(f"Val Sites: {val_df_split1['site_id'].unique()}")
print(f"Test Sites: {test_df['site_id'].unique()}")

# Subset the time series within sites to save more time (300 days)
train_df = train_df.loc[train_df['timestep_idx_local'] < 300, ].copy()
print(f"Subset num train timesteps: {len(train_df)}")
val_df_split1 = val_df_split1.loc[val_df_split1['timestep_idx_local'] < 300, ].copy()
print(f"Subset num val timesteps: {len(val_df_split1)}")

Initial Train Sites: ['CH-Lae' 'ES-LJu' 'FI-Hyy' 'FR-Pue' 'IT-Lav' 'US-ARM' 'US-NR1' 'US-SRM'
 'US-Ton' 'US-UMB' 'US-Var' 'US-Vcp' 'US-Wkg']
Initial num train timesteps: 631032
Trimmed Train Sites: ['CH-Lae' 'FR-Pue']
Val Sites: ['ES-LJu' 'FI-Hyy']
Test Sites: ['CA-Cbo' 'ES-LM2' 'FR-Lam' 'US-AR1' 'US-GLE' 'US-Seg']
Subset num train timesteps: 600
Subset num val timesteps: 600


In [8]:
# Convert Dtypes
cat_cols = ["year", "month", "day", "hour", "IGBP", "koppen_main", "koppen_sub"]
for col in cat_cols:
  train_df[col] = train_df[col].astype(str).astype("category")
  val_df_split1[col] = val_df_split1[col].astype(str).astype("category")
  test_df[col] = test_df[col].astype(str).astype("category")

## Initialize TS Dataset

In [9]:
# Define encoder, decoder lengths
max_prediction_length = 1
max_encoder_length = 24*7

In [10]:
training = TimeSeriesDataSet(
    train_df, 
    time_idx="timestep_idx_global",
    target="GPP_NT_VUT_REF",
    group_ids=["site_id"],
    allow_missing_timesteps=False, 
    min_encoder_length=max_encoder_length,
    max_encoder_length=max_encoder_length,
    min_prediction_length=max_prediction_length,
    max_prediction_length=max_prediction_length,
    static_categoricals=["IGBP","koppen_main","koppen_sub"],
    static_reals=[], #elevation
    time_varying_known_categoricals=["year", "month", "day", "hour"],
    time_varying_known_reals=["timestep_idx_local", 'TA_ERA', 'SW_IN_ERA',
                              'LW_IN_ERA', 'VPD_ERA','P_ERA', 'PA_ERA', 'EVI', 
                              'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5',
                              'b6', 'b7'],
    time_varying_unknown_categoricals=[], 
    time_varying_unknown_reals=["GPP_NT_VUT_REF", "gap_flag_hour"], # <--- added for masking metrics
    target_normalizer=None,
    categorical_encoders={'IGBP': NaNLabelEncoder(add_nan=True),
                          'koppen_main': NaNLabelEncoder(add_nan=True),
                          'koppen_sub': NaNLabelEncoder(add_nan=True),
                          'year': NaNLabelEncoder(add_nan=True), 
                          'month': NaNLabelEncoder(add_nan=True),
                          'day': NaNLabelEncoder(add_nan=True),
                          },
    add_relative_time_idx=True,
    add_target_scales=False,
    add_encoder_length=False, 
)

# create validation set (predict=True) using the full site sequence for val sites
validation = TimeSeriesDataSet.from_dataset(training, val_df_split1, predict=False, stop_randomization=True)

# create dataloaders for model
batch_size = 1  # set this between 32 to 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0) # <--- removed the batch_size*10

# Temporal Fusion Transformer

In [11]:
tft = TemporalFusionTransformer.from_dataset(
    training,
    learning_rate=1e-3,
    hidden_size=16,  # most important hyperparameter apart from learning rate
    attention_head_size=1, #Set to up to 4 for large datasets
    dropout=0.1,  # between 0.1 and 0.3 are good values
    hidden_continuous_size=8,  # set to <= hidden_size
    output_size=7,  # 7 quantiles by default
    loss=QuantileLoss(),
    logging_metrics=nn.ModuleList([MAE(), RMSE()]), #SMAPE(), #MAPE() #<---- added metrics to report in TensorBoard
    reduce_on_plateau_patience=4, # reduce learning rate if no improvement in validation loss after x epochs
    optimizer="adam"
    )
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

Number of parameters in network: 43.3k


In [17]:
# configure network and trainer
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min")
lr_logger = LearningRateMonitor()  # log the learning rate
logger = TensorBoardLogger("lightning_logs")  # logging results to a tensorboard

trainer = pl.Trainer(
    max_epochs=1,
    enable_model_summary=False,
    gradient_clip_val=0.1,
    fast_dev_run=False,  # comment in to check that network or dataset has no serious bugs
    accelerator="gpu",
    gpus=1,
    callbacks=[lr_logger, early_stop_callback],
    logger=logger,
)

trainer.fit(
    tft,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 0:  18%|█▊        | 94/528 [01:35<07:21,  1.02s/it, loss=3.49, v_num=0, train_loss_step=0.624]
Epoch 1:  16%|█▌        | 84/528 [00:27<02:26,  3.02it/s, loss=1.28, v_num=1, train_loss_step=0.125, val_loss=0.622, train_loss_epoch=2.080]
Epoch 0:  50%|█████     | 264/528 [00:37<00:37,  7.01it/s, loss=1.57, v_num=2, train_loss_step=0.404] 
Validation: 0it [00:00, ?it/s][A
Validation:   0%|          | 0/264 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/264 [00:00<?, ?it/s][A
Epoch 0:  50%|█████     | 265/528 [00:37<00:37,  7.02it/s, loss=1.57, v_num=2, train_loss_step=0.404]
Epoch 0:  50%|█████     | 266/528 [00:37<00:37,  7.04it/s, loss=1.57, v_num=2, train_loss_step=0.404]
Epoch 0:  51%|█████     | 267/528 [00:37<00:36,  7.06it/s, loss=1.57, v_num=2, train_loss_step=0.404]
Epoch 0:  51%|█████     | 268/528 [00:37<00:36,  7.09it/s, loss=1.57, v_num=2, train_loss_step=0.404]
Epoch 0:  51%|█████     | 269/528 [00:37<00:36,  7.11it/s, loss=1.57, v_num=2, train_loss_

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 528/528 [00:45<00:00, 11.54it/s, loss=1.57, v_num=2, train_loss_step=0.404, val_loss=0.497, train_loss_epoch=1.930]


# Evaluate performance + CHECK NEW FUNCTIONS

In [18]:
# Run
masked_rmse, masked_mae, masked_nse, masked_r2 = masked_eval_metrics(val_dataloader, tft)
print(f"masked_rmse: {masked_rmse}")
print(f"masked_mae: {masked_mae}")
print(f"masked_nse: {masked_nse}")
print(f"masked_r2: {masked_r2}")

masked_rmse: 0.6350937485694885
masked_mae: 0.45401236414909363
masked_nse: -8.897841996772776
masked_r2: -1.0731758742542987


In [21]:
## CHECK NEW FUNCTION MANUALLY
# Subset val df to the timesteps that get preds
val_results = pd.DataFrame()
for site in ['ES-LJu', 'FI-Hyy']:
    site_df = val_df_split1.loc[val_df_split1['site_id']==site, ].copy()
    site_df_results = site_df.iloc[-132:, ].copy()
    val_results = pd.concat([val_results, site_df_results], axis=0)

# Append preds to df row
y_pred = tft.predict(val_dataloader).reshape(-1)
val_results['prediction'] = y_pred
val_results['mask'] = np.abs(val_results['gap_flag_hour'] - 1)

# Make timestep-level calcs
masked_val_results = val_results.loc[val_results['mask']==1, ].copy()
masked_val_results['ae'] = np.abs(val_results['GPP_NT_VUT_REF'] - val_results['prediction'])

# Get aggregate metric
check_mae = masked_val_results['ae'].mean()
print(f"Check Mask MAE: {check_mae}")
print(f"Function Mask MAE: {masked_mae}")

Check Mask MAE: 0.4540123451107527
Function Mask MAE: 0.45401236414909363


In [19]:
## COMPARE TO UNMASKED
# calcualte mean absolute error on validation set
actuals = torch.cat([y[0] for x, y in iter(val_dataloader)])
predictions = tft.predict(val_dataloader)
mae = (actuals - predictions).abs().mean()
print(f"Val MAE: {mae}")

# Derive Val RMSE
criterion = nn.MSELoss()
rmse = torch.sqrt(criterion(actuals, predictions))
print(f"Val RMSE: {rmse}")

# NSE metric
nse = he.nse(actuals.reshape(-1).numpy(), predictions.reshape(-1).numpy())
print(f"NSE: {nse}")

# R-Squared
r2 = r2_score(actuals.reshape(-1).numpy(), predictions.reshape(-1).numpy())
print(f"R2: {r2}")

Val MAE: 0.8530687093734741
Val RMSE: 1.3808634281158447
NSE: -29.040090860943412
R2: 0.18292750869969865
