# 0. Preparation

## Mount google drive
- Make sure that available access is the user's own drive(no access across files in shared folder)

In [22]:
if 'google.colab' in str(get_ipython()):
  IN_COLLAB = True
else:
  IN_COLLAB = False

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"

if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## (pip install)

In [23]:
# install required modules quietly
required_packages = ['azure-storage-blob', 'pytorch_forecasting',
                     'numba', 'scikit-learn']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

In [24]:
! pip install statsmodels --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [25]:
! pip install pytorch_lightning==1.9.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Import libraries

In [26]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")
import copy
import json
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import torch
import torch.nn as nn

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters
from pytorch_forecasting import BaseModel, MAE
from pytorch_forecasting.metrics.point import RMSE
from pytorch_forecasting.data.encoders import NaNLabelEncoder

from sklearn.metrics import r2_score
from timeit import default_timer

# Load locale custome modules
os.chdir(MY_HOME_ABS_PATH)
if IN_COLLAB:
  sys.path.insert(0,os.path.abspath("./code/src/tools"))
else:
  sys.path.append(os.path.abspath("./code/src/tools"))
  
from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pl.seed_everything(42)

INFO:lightning_fabric.utilities.seed:Global seed set to 42


42

# Load data from Azure blob

In [27]:
root_dir =  MY_HOME_ABS_PATH
tmp_dir =  root_dir + os.sep + '.tmp'
raw_data_dir = tmp_dir
data_dir = root_dir + os.sep + 'data'
model_dir = data_dir + os.sep + 'model'
cred_dir = root_dir + os.sep + '.cred'
az_cred_file = cred_dir + os.sep + 'azblobcred.json'

container = "all-sites-data"
blob_name = "full_2010_2015_all_v_exp_raw.parquet"
local_file = tmp_dir + os.sep + blob_name

In [28]:
# Download full data
data_df = None

if not (os.path.exists(local_file)):
    azStorageClient = AzStorageClient(az_cred_file)
    file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
    data_df = pd.read_parquet(file_stream, engine='pyarrow')
    data_df.to_parquet(local_file)
else:
    data_df = pd.read_parquet(local_file)
print(f"Data size: {data_df.shape}")

# Convert Dtypes
cat_cols = ["year", "month", "day", "hour", "IGBP", "koppen_main", "koppen_sub", 
            "gap_flag_month", "gap_flag_hour"]
for col in cat_cols:
  data_df[col] = data_df[col].astype(str).astype("category")

data_df.dropna(inplace=True)
print(f"Data size: {data_df.shape}")

Data size: (4822944, 50)
Data size: (4818384, 50)


In [29]:
data_df.columns

Index(['GPP_NT_VUT_REF', 'site_id', 'timestep_idx_local',
       'timestep_idx_global', 'datetime', 'date', 'year', 'month', 'day',
       'hour', 'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA',
       'PA_ERA', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6',
       'b7', 'IGBP', 'lat', 'long', 'koppen_sub', 'koppen_main', 'c3c4',
       'c4_percent', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily',
       'PET', 'Ts', 'ESACCI-sm', 'MODIS_LC', 'NDWI', 'Percent_Snow', 'Fpar',
       'Lai', 'LST_Day', 'LST_Night', 'MODIS_PFT', 'gap_flag_hour',
       'gap_flag_month'],
      dtype='object')

In [30]:
data_df.isna().sum().sum()

0

# Data Preprocessing Methods

In [33]:
SITE_SPLITS =[
  ['AR-SLu', 'AU-ASM', 'AU-Cum', 'AU-How', 'CA-TP3', 'CA-TPD', 'CN-Sw2', 'DE-Lnf',
   'IT-CA3', 'NL-Hor', 'US-Syv', 'US-AR2', 'US-ARM', 'US-Vcp', 'CH-Cha', 'CZ-KrP', 
   'CZ-Lnz', 'DE-Geb', 'DE-Obe', 'ES-LJu', 'FI-Let', 'IT-Lav', 'SE-Deg'],
  ['AU-Cpr', 'AU-Wom', 'CZ-BK2', 'DE-SfN', 'IT-CA1', 'IT-CA2', 'IT-Ro2', 'US-IB2', 
   'US-Me6', 'US-Ton', 'CA-Ca3', 'US-CRT', 'US-KFS', 'US-Mpj', 'US-Prr', 'US-Ro1', 
   'US-Tw4', 'BE-Bra', 'CZ-BK1', 'DE-Hai', 'IL-Yat', 'IT-Tor', 'SE-Htm'],
  ['AT-Neu', 'AU-RDF', 'AU-Whr', 'CA-TP1', 'DE-Zrk', 'IT-SRo', 'US-Wkg', 'CA-Ca2',
   'CA-TP4', 'US-Bar', 'US-Fmf', 'US-Oho', 'US-SRG', 'US-Ses', 'CH-Lae', 'CZ-RAJ',
   'CZ-wet', 'DE-Kli', 'DE-RuR', 'ES-LM2', 'FR-Fon', 'FR-Lam'],
  ['AR-Vir', 'AU-DaS', 'AU-Emr', 'AU-Gin', 'AU-Rig', 'AU-TTE', 'DE-Spw', 'FR-Pue',
   'IT-Isp', 'IT-Noe', 'US-Twt', 'US-WPT', 'CA-Cbo', 'US-Vcm', 'BE-Dor', 'BE-Vie',
   'CZ-Stn', 'FI-Hyy', 'SE-Nor', 'SE-Ros', 'NL-Loo', 'SE-Lnn'],
  ['AU-DaP', 'AU-GWW', 'AU-Rob', 'AU-Stp', 'US-GLE', 'US-NR1', 'US-Whs', 'CA-Ca1',
   'CA-Gro', 'US-AR1', 'US-Rws', 'US-UMd', 'US-Wjs', 'CH-Fru', 'CH-Oe2', 'DE-Tha',
   'DK-Sor', 'FR-Bil', 'FR-Hes', 'IT-BCi', 'IT-SR2', 'DE-Hte'],
  ['CA-Oas', 'ES-Amo', 'FI-Sod', 'US-Myb', 'US-SRM', 'US-Tw3', 'US-Var', 'US-WCr',
   'US-Ho1', 'US-Seg', 'US-UMB', 'BE-Lon', 'CH-Dav', 'DE-Gri', 'DE-HoH', 'ES-LM1',
   'FR-Aur', 'FR-FBn', 'GF-Guy', 'IT-MBo', 'IT-Ren', 'RU-Fyo']
]

def get_splited_datasets(df, val_index, test_index): 
  train_sites, val_sites, test_sites = [], [], []
  for i, subset in enumerate(SITE_SPLITS):
    if i == val_index:
      val_sites = SITE_SPLITS[i]
    elif i == test_index:
      test_sites = SITE_SPLITS[i]
    else:
      train_sites += SITE_SPLITS[i]

  train_df = data_df.loc[data_df['site_id'].isin(train_sites), ].copy()
  val_df   = data_df.loc[data_df['site_id'].isin(val_sites), ].copy()

  if len(train_df['site_id'].unique()) != len(train_sites):
    print(f"Expected Train({len(train_sites)}): {train_sites}")
    print(f"Actual Train({len(train_df['site_id'].unique())}): {train_df['site_id'].unique()}")
  
  if len(val_df['site_id'].unique()) != len(val_sites):
    print(f"Expected Val({len(val_sites)}): {val_sites}")
    print(f"Actual Val({len(val_df['site_id'].unique())}): {val_df['site_id'].unique()}")

  if test_index is not None:
    test_df = data_df.loc[data_df['site_id'].isin(test_sites), ].copy()
    if len(test_df['site_id'].unique()) != len(test_sites):
      print(f"Expected Test({len(test_sites)}): {test_sites}")
      print(f"Actual Test({len(test_df['site_id'].unique())}): {test_df['site_id'].unique()}")
  else:
    test_df = None

  return (train_df, val_df, test_df)

def subset_data(train_df, val_df, test_df, subset_len):
  print(f'Subest length: {subset_len} timesteps')
  # Subset the time series within sites to save more time (300 timesteps)
  train_df = train_df.loc[train_df['timestep_idx_global'] < subset_len, ].copy()
  print(f"Subset num train timesteps: {len(train_df)}")
  val_df = val_df.loc[val_df['timestep_idx_global'] < subset_len, ].copy()
  print(f"Subset num val timesteps: {len(val_df)}")
  if test_df is not None:
    test_df = test_df.loc[test_df['timestep_idx_global'] < subset_len, ].copy()
    print(f"Subset num test timesteps: {len(test_df)}")

  return (train_df, val_df, test_df)

# Run Experiment

In [34]:
def setup_train_val_tsdataset(train_df, val_df, min_encoder_len):
  # create training and validation TS dataset 
  training = TimeSeriesDataSet(
      train_df, # <------ no longer subsetting, option 1 split can use entire train site sequence
      time_idx="timestep_idx_global",
      target="GPP_NT_VUT_REF",
      group_ids=["site_id"],
      allow_missing_timesteps=True, # <---- turned on bc some rows are removed.
      min_encoder_length=min_encoder_len,
      max_encoder_length=min_encoder_len,
      min_prediction_length=max_prediction_length,
      max_prediction_length=max_prediction_length,
      static_categoricals=["IGBP","koppen_main","koppen_sub", "gap_flag_month", "gap_flag_hour"],
      static_reals=[], #elevation lat long
      time_varying_known_categoricals=["year", "month", "day", "hour"],
      time_varying_known_reals=["timestep_idx_global", 
                                'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
                                'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 
                                'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily', 'PET', 'Ts', 
                                'ESACCI-sm', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai', 'LST_Day','LST_Night'],
      time_varying_unknown_categoricals=[], 
      time_varying_unknown_reals=["GPP_NT_VUT_REF"],
      target_normalizer=None, # <---- not sure if we need this given we scale in data pipeline.... but might want to change to scale at Group level?
      categorical_encoders={'IGBP': NaNLabelEncoder(add_nan=True),
                            'koppen_main': NaNLabelEncoder(add_nan=True),
                            'koppen_sub': NaNLabelEncoder(add_nan=True),
                            'year': NaNLabelEncoder(add_nan=True), # temp for subset
                            'month': NaNLabelEncoder(add_nan=True), # temp for subset
                            'day': NaNLabelEncoder(add_nan=True), # temp for subset
                            },
      add_relative_time_idx=True,
      add_target_scales=False, # <------- turned off
      add_encoder_length=False, # <------- turned off
  )
  
  validation = TimeSeriesDataSet.from_dataset(training, val_df, predict=True, stop_randomization=True)

  return (training, validation)

def get_eval_metrics(predictions, actuals):
    
    mae = (actuals - predictions).abs().mean()
    
    criterion = nn.MSELoss()
    rmse = torch.sqrt(criterion(actuals, predictions))

    r2 = r2_score(actuals, predictions)

    return { 'mae': mae, 'rmse': rmse, 'r2':r2}

In [35]:
# (data is already splited to train/validation set)
max_prediction_length = 1 #24
max_encoder_length_candidates =  [24*7, 24*7*2, 24*7*4, 24*7*4*3]
exp_names = ["1 Week", "2 Weeks", "1 Month", "3 Months"]

TEST_INDEX = 5
VAL_INDICES = [0, 2, 4]
SUBSET_LEN = int(24*7*4*3*1.5)

if not (os.path.exists(model_dir)):
  os.makedirs(model_dir)

In [36]:
experiment_result = pd.DataFrame(columns=['experiment', 'fold', 'val_index',
                                        'training_time', 'eval_time',
                                        'MAE', 'RMSE', 'R2',
                                        'model_path'])

for i, max_encoder_len in enumerate(max_encoder_length_candidates):
  print(f'Experiment: max_encoder_length = {max_encoder_len} ({exp_names[i]})')
  
  for k, val_index in enumerate(VAL_INDICES):

    train_time, eval_time = 0, 0

    # split data
    train_df, val_df, test_df = get_splited_datasets(data_df, val_index, None)
    train_df, val_df, test_df = subset_data(train_df, val_df, test_df, SUBSET_LEN)
    (training, validation) = setup_train_val_tsdataset(train_df, val_df, max_encoder_len)

    # create dataloaders for model
    batch_size = 32  # set this between 32 to 128
    train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
    val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

    # Create TFT model from dataset
    tft = TemporalFusionTransformer.from_dataset(
        training,
        learning_rate=1e-3,
        hidden_size=16,  # most important hyperparameter apart from learning rate
        attention_head_size=1, # Set to up to 4 for large datasets
        dropout=0.1,           # Between 0.1 and 0.3 are good values
        hidden_continuous_size=8,  # set to <= hidden_size
        output_size=7,  # 7 quantiles by default
        loss=QuantileLoss(),
        logging_metrics=nn.ModuleList([MAE(), RMSE()]), #SMAPE(), #MAPE() #<---- added metrics to report in TensorBoard
        reduce_on_plateau_patience=4, # reduce learning rate if no improvement in validation loss after x epochs
        optimizer="adam"
        )
    print(f"  Number of parameters in network: {tft.size()/1e3:.1f}k")

    # configure network and trainer
    early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10, verbose=False, mode="min")
    lr_logger = LearningRateMonitor()  # log the learning rate
    logger = TensorBoardLogger("lightning_logs")  # logging results to a tensorboard

    trainer = pl.Trainer(
        max_epochs=2,
        enable_model_summary=True,
        gradient_clip_val=0.1,
        fast_dev_run=False,  # comment in to check that network or dataset has no serious bugs
        accelerator='gpu',
        devices=1,
        callbacks=[lr_logger, early_stop_callback],
        logger=logger,
    )

    start = default_timer()
    trainer.fit(
        tft,
        train_dataloaders=train_dataloader,
        val_dataloaders=val_dataloader,
    )
    train_time = default_timer() - start
    print(f"  Training time: {train_time}")

    # load the best model according to the validation loss
    best_model_path = trainer.checkpoint_callback.best_model_path
    print("  Best model path: " + best_model_path)
    best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

    exp_dir = model_dir + os.sep + exp_names[i]
    if not (os.path.exists(exp_dir)):
      os.makedirs(exp_dir)
    local_model_path = exp_dir + os.sep + f"fold-{k}-model.pth"
    torch.save(best_tft.state_dict(), local_model_path)
    print(f" Saved model to {local_model_path}")

    # Print Model Eval on Validation Set
    start = default_timer()
    actuals = torch.cat([y[0] for x, y in iter(val_dataloader)])
    predictions = best_tft.predict(val_dataloader)
    eval_time = default_timer() - start
    print(f"  Val eval time: {eval_time}")

    eval_metric = get_eval_metrics
    print(eval_metric)
    result_data = {  'experiment': exp_names[i], 
                     'fold': k+1, 
                     'val_index': val_index,
                     'training_time': train_time, 
                     'eval_time': eval_time,
                     'model_path': local_model_path,
                     'MAE': eval_metric['MAE'],
                     'RMSE': eval_metric['RMSE'],
                     'R2': eval_metric['R2']}
    experiment_result = experiment_result.append(result_data, ignore_index=True)
  
  # Print experiment results
  display(experiment_result[experiment_result['experiment'] ==  exp_names[i]])

Experiment: max_encoder_length = 168 (1 Week)
Expected Train(111): ['AU-Cpr', 'AU-Wom', 'CZ-BK2', 'DE-SfN', 'IT-CA1', 'IT-CA2', 'IT-Ro2', 'US-IB2', 'US-Me6', 'US-Ton', 'CA-Ca3', 'US-CRT', 'US-KFS', 'US-Mpj', 'US-Prr', 'US-Ro1', 'US-Tw4', 'BE-Bra', 'CZ-BK1', 'DE-Hai', 'IL-Yat', 'IT-Tor', 'SE-Htm', 'AT-Neu', 'AU-RDF', 'AU-Whr', 'CA-TP1', 'DE-Zrk', 'IT-SRo', 'US-Wkg', 'CA-Ca2', 'CA-TP4', 'US-Bar', 'US-Fmf', 'US-Oho', 'US-SRG', 'US-Ses', 'CH-Lae', 'CZ-RAJ', 'CZ-wet', 'DE-Kli', 'DE-RuR', 'ES-LM2', 'FR-Fon', 'FR-Lam', 'AR-Vir', 'AU-DaS', 'AU-Emr', 'AU-Gin', 'AU-Rig', 'AU-TTE', 'DE-Spw', 'FR-Pue', 'IT-Isp', 'IT-Noe', 'US-Twt', 'US-WPT', 'CA-Cbo', 'US-Vcm', 'BE-Dor', 'BE-Vie', 'CZ-Stn', 'FI-Hyy', 'SE-Nor', 'SE-Ros', 'NL-Loo', 'SE-Lnn', 'AU-DaP', 'AU-GWW', 'AU-Rob', 'AU-Stp', 'US-GLE', 'US-NR1', 'US-Whs', 'CA-Ca1', 'CA-Gro', 'US-AR1', 'US-Rws', 'US-UMd', 'US-Wjs', 'CH-Fru', 'CH-Oe2', 'DE-Tha', 'DK-Sor', 'FR-Bil', 'FR-Hes', 'IT-BCi', 'IT-SR2', 'DE-Hte', 'CA-Oas', 'ES-Amo', 'FI-Sod', 'US-Myb', 'U

MisconfigurationException: ignored

In [None]:
from google.colab import runtime
runtime.unassign()