# Notebook Setup

In [1]:
# True when the kernel runs inside Google Colab, False otherwise.
IN_COLLAB = 'google.colab' in str(get_ipython())

if not IN_COLLAB:
    # Non-Colab checkout of the project.
    # MY_HOME_ABS_PATH = "/root/co2-flux-hourly-gpp-modeling/"
    MY_HOME_ABS_PATH = "/home/ec2-user/SageMaker/co2-flux-hourly-gpp-modeling"
else:
    #TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
    # MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"
    MY_HOME_ABS_PATH = "/content/drive/MyDrive/TFT_baseline"
    # On Colab the project root is under /content/drive, so mount Drive.
    from google.colab import drive
    drive.mount('/content/drive/')

## Import Modules

In [2]:
import os
os.chdir(MY_HOME_ABS_PATH)

import sys
import warnings
warnings.filterwarnings("ignore")
import copy
import json
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import pytorch_lightning as pl
from pytorch_lightning.callbacks import Callback, EarlyStopping, LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
import torch
import torch.nn as nn

from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters, MetricsCallback
from pytorch_forecasting import BaseModel, MAE
from pytorch_forecasting.metrics.point import RMSE
from pytorch_forecasting.data.encoders import NaNLabelEncoder

import optuna
from optuna.integration import PyTorchLightningPruningCallback, TensorBoardCallback

# from sklearn.metrics import r2_score
from timeit import default_timer
from datetime import datetime
import gc
import pickle


# Load local custom modules
# Make sure relative paths below resolve against the project root.
os.chdir(MY_HOME_ABS_PATH)
if IN_COLLAB:
     # On Colab the tools directory is put first on the module search path.
     sys.path.insert(0,os.path.abspath("./code/src/tools"))
else:
    # Elsewhere the credential and tools directories are appended instead.
    sys.path.append('./.cred')
    sys.path.append('./code/src/tools')
    # NOTE(review): same directory as the line above, as an absolute path.
    sys.path.append(os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *
from model_pipeline_lib_for_nbinstance import *

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pl.seed_everything(42)

2023-04-15 04:16:40.944676: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-15 04:16:40.996203: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Global seed set to 42
Global seed set to 42


42

In [3]:
# data/models/tft_model_1yrtrain_tuning_230318_1906/trial_0/epoch=12.ckpt

## Define Local File System Constants

In [4]:
# Directory layout, every entry anchored at the project root.
root_dir = MY_HOME_ABS_PATH
tmp_dir = f"{root_dir}{os.sep}.tmp"
raw_data_dir = tmp_dir  # raw data shares the temporary directory
data_dir = f"{root_dir}{os.sep}data"
cred_dir = f"{root_dir}{os.sep}.cred"
az_cred_file = f"{cred_dir}{os.sep}azblobcred.json"
model_objects_dir = f"{root_dir}{os.sep}code/src/modeling/model_objects"

# Load Train and Test dataset from Azure Storage Blob

In [5]:
# Azure container and blob name of the dataset to load.
container, blob_name = "all-sites-data", "full_2010_2015_v_mvp_raw.parquet"

# Path of the local copy of the blob inside the temporary directory.
local_file = os.sep.join([tmp_dir, blob_name])

In [6]:
# Load the raw dataset. The blob is downloaded from Azure only when no local
# copy exists yet; afterwards the cached parquet file is read directly.
local_file = tmp_dir + os.sep + blob_name
if not os.path.exists(local_file):
    # The cache directory may be missing on a fresh checkout; create it up
    # front so the write below cannot fail after the download has finished.
    os.makedirs(tmp_dir, exist_ok=True)
    azStorageClient = AzStorageClient(az_cred_file)
    file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
    data_df = pd.read_parquet(file_stream, engine='pyarrow')
    data_df.to_parquet(local_file)
else:
    data_df = pd.read_parquet(local_file)

print(f"Data size: {data_df.shape}")

# Convert Dtypes: these columns are used as categorical model inputs, so they
# are cast to string first and then to pandas "category".
cat_cols = ["year", "month", "day", "hour", "MODIS_IGBP", "koppen_main", "koppen_sub",
            "gap_flag_month", "gap_flag_hour"]
for col in cat_cols:
    data_df[col] = data_df[col].astype(str).astype("category")

print(f"Data Columns: {data_df.columns}")
print(f"NA count: {data_df.isna().sum().sum()}")

Data size: (4862712, 51)
Data Columns: Index(['GPP_NT_VUT_REF', 'site_id', 'timestep_idx_local',
       'timestep_idx_global', 'datetime', 'date', 'year', 'month', 'day',
       'hour', 'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA',
       'PA_ERA', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6',
       'b7', 'IGBP', 'lat', 'long', 'koppen_sub', 'koppen_main', 'c3c4',
       'c4_percent', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily',
       'PET', 'Ts', 'ESACCI-sm', 'MODIS_LC', 'NDWI', 'Percent_Snow', 'Fpar',
       'Lai', 'LST_Day', 'LST_Night', 'MODIS_IGBP', 'MODIS_PFT',
       'gap_flag_hour', 'gap_flag_month'],
      dtype='object')
NA count: 0


# Load Three Best Models As of April 8

In [7]:
best_model_path_GPP = f"{MY_HOME_ABS_PATH}/data/models/GPPTFT_5yrTrain_2WkEncode_230410_2310/lightning_logs/version_1/checkpoints/epoch=9-step=74795.ckpt"

In [8]:
best_GPP = TemporalFusionTransformer.load_from_checkpoint(best_model_path_GPP)
print(f"Quantiles: {best_GPP.loss.quantiles}") # [0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98]

Quantiles: [0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98]


# Make prediction for three best models

In [10]:
# The f-prefix is required here: without it the literal text
# "{MY_HOME_ABS_PATH}" would be part of the directory name.
exp_model_dir = f"{MY_HOME_ABS_PATH}/data/models/GPPTFT_5yrTrain_2WkEncode_230410_2310" # TODO: Replace to your model dir
# Relative path; resolved against the working directory set via os.chdir above.
best_model_path = "./data/models/GPPTFT_5yrTrain_2WkEncode_230410_2310/lightning_logs/version_1/checkpoints/epoch=9-step=74795.ckpt"
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)
print(f"Quantiles: {best_tft.loss.quantiles}") # [0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98]

# Experiment constants
VAL_INDEX  = 3            # passed to get_splited_datasets
TEST_INDEX = 4            # passed to get_splited_datasets
SUBSET_LEN = 24*365*5 # 5 year
ENCODER_LEN = 24*14       # encoder window length in time steps
print(f"Training timestemp length = {SUBSET_LEN}.")

# Create dataloaders for model
batch_size = 128
cpu_count = os.cpu_count()  # used as num_workers for the dataloaders

Quantiles: [0.02, 0.1, 0.25, 0.5, 0.75, 0.9, 0.98]
Training timestemp length = 43800.


## Create TS Dataset

In [None]:
# Setup dataset
# Split the full frame into train/val/test frames with the project helper.
# NOTE(review): VAL_INDEX / TEST_INDEX presumably select the validation and
# test folds — confirm against get_splited_datasets.
train_df, val_df, test_df = get_splited_datasets(data_df, VAL_INDEX, TEST_INDEX)
# Subset the three frames using SUBSET_LEN (24*365*5 time steps).
train_df, val_df, test_df = subset_data(train_df, val_df, test_df, SUBSET_LEN)

# Decide sites in use

In [16]:
val_df.head()

Unnamed: 0,GPP_NT_VUT_REF,site_id,timestep_idx_local,timestep_idx_global,datetime,date,year,month,day,hour,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,lat,long,koppen_sub,koppen_main,c3c4,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_IGBP,MODIS_PFT,gap_flag_hour,gap_flag_month
153528,-0.4694,AU-DaP,0,350640,2010-01-01 00:00:00,2010-01-01,2010,1,1,0,24.47,0.0,416.09,2.21,0.0,99.839,0.56131,0.74435,0.31742,0.05892,0.4199,0.02462,0.09036,0.41006,0.28214,0.12812,GRA,-14.0633,131.3181,3,1,C3,97.83,116.0,63.0,231.0,0.46453,-0.00719,301.4853,0.29788,10,0.08345,0.0,0.59,2.0,309.5,297.52,GRA,GRA,0.0,0.0
153529,-0.44756,AU-DaP,1,350641,2010-01-01 01:00:00,2010-01-01,2010,1,1,1,23.791,0.0,409.45,1.018,0.0,99.812,0.51946,0.71901,0.28217,0.06444,0.39286,0.02698,0.08792,0.41804,0.30312,0.13154,GRA,-14.0633,131.3181,3,1,C3,97.83,116.0,63.0,231.0,0.46453,-0.00719,301.4853,0.29788,10,0.08345,0.0,0.59,2.0,309.5,297.52,GRA,GRA,0.0,0.0
153530,-0.43325,AU-DaP,2,350642,2010-01-01 02:00:00,2010-01-01,2010,1,1,2,23.456,0.0,409.45,0.801,0.0,99.76,0.58092,0.7491,0.33465,0.06188,0.44276,0.02548,0.09302,0.41848,0.29022,0.12242,GRA,-14.0633,131.3181,3,1,C3,97.83,116.0,63.0,231.0,0.46453,-0.00719,301.4853,0.29788,10,0.08345,0.0,0.59,2.0,309.5,297.52,GRA,GRA,0.0,0.0
153531,-0.40639,AU-DaP,3,350643,2010-01-01 03:00:00,2010-01-01,2010,1,1,3,23.122,0.0,409.45,0.584,0.0,99.708,0.64758,0.77458,0.38791,0.06068,0.495,0.02542,0.09588,0.43856,0.29074,0.11474,GRA,-14.0633,131.3181,3,1,C3,97.83,116.0,63.0,231.0,0.46453,-0.00719,301.4853,0.29788,10,0.08345,0.0,0.59,2.0,309.5,297.52,GRA,GRA,0.0,0.0
153532,-0.39665,AU-DaP,4,350644,2010-01-01 04:00:00,2010-01-01,2010,1,1,4,22.794,0.0,400.833,0.408,0.0,99.693,0.67512,0.79451,0.4027,0.0567,0.50416,0.02536,0.09438,0.45466,0.27622,0.10274,GRA,-14.0633,131.3181,3,1,C3,97.83,116.0,63.0,231.0,0.46453,-0.00719,301.4853,0.29788,10,0.08345,0.0,0.59,2.0,309.5,297.52,GRA,GRA,0.0,0.0


In [17]:
site_list = val_df["site_id"].unique()

In [18]:
val_df[['site_id','MODIS_IGBP']].drop_duplicates().reset_index(drop=True)

Unnamed: 0,site_id,MODIS_IGBP
0,AU-DaP,GRA
1,AU-Emr,GRA
2,AU-Gin,SAV
3,AU-How,SAV
4,AU-Rig,GRA
5,CA-Ca1,ENF
6,CA-Gro,MF
7,CH-Fru,MF
8,CH-Oe2,CRO
9,DE-Hai,DBF


In [19]:
# AU-DaP	GrassLand GRA

# NL-Loo	SAV	SAV Savanna
# US-Ses	OSH Open Shrublands
# CA-Gro	MF Mixed Forest
# US-Bar	DBF 
# FI-Hyy	ENF Evergreen Needleleaf Forest
# FR_Aur	CRO Cropland

In [20]:
val_AU_DaP_df = val_df[val_df["site_id"] == "AU-DaP"]
# val_NL_Loo_df = val_df[val_df["site_id"] == "NL-Loo"]
# val_US_Ses_df = val_df[val_df["site_id"] == "US-Ses"]
# val_CA_Gro_df = val_df[val_df["site_id"] == "CA-Gro"]
# val_US_Bar_df = val_df[val_df["site_id"] == "US-Bar"]
# val_FI_Hyy_df = val_df[val_df["site_id"] == "FI-Hyy"]
# val_FR_Aur_df = val_df[val_df["site_id"] == "FR-Aur"]


print(val_AU_DaP_df.shape)
# print(val_NL_Loo_df.shape)
# print(val_US_Ses_df.shape)
# print(val_CA_Gro_df.shape)
# print(val_US_Bar_df.shape)
# print(val_FI_Hyy_df.shape)
# print(val_FR_Aur_df.shape)


(32304, 51)


# Convert to TS dataset

In [21]:
# The encoder window is fixed: min and max encoder length are both ENCODER_LEN.
min_encoder_len = ENCODER_LEN

# Training dataset definition. It is also the template handed to
# TimeSeriesDataSet.from_dataset when the per-site datasets are built.
training = TimeSeriesDataSet(
      train_df,
      time_idx="timestep_idx_global",
      target="GPP_NT_VUT_REF",
      group_ids=["site_id"],  # one time series per site
      allow_missing_timesteps=False, # disabled. NOTE(review): a previous comment claimed this was turned on because rows were removed — confirm False is intended.
      min_encoder_length=min_encoder_len,
      max_encoder_length=min_encoder_len,
      min_prediction_length=1,  # single-step prediction (min == max == 1)
      max_prediction_length=1,
      static_categoricals=["MODIS_IGBP","koppen_main","koppen_sub"],
      static_reals=[],
      time_varying_known_categoricals=["month", "day", "hour"],
      time_varying_known_reals=["timestep_idx_global", 
                                'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
                                'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7', 
                                'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily', 'PET', 'Ts', 
                                'ESACCI-sm', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai', 'LST_Day','LST_Night'],
      time_varying_unknown_categoricals=["gap_flag_month", "gap_flag_hour"], 
      time_varying_unknown_reals=["GPP_NT_VUT_REF"],  # the target itself is fed to the encoder
      target_normalizer=None,
      # NOTE(review): 'year' gets an encoder below but is not listed in any
      # variable group above — confirm whether it is still needed.
      categorical_encoders={'MODIS_IGBP': NaNLabelEncoder(add_nan=True),
                            'koppen_main': NaNLabelEncoder(add_nan=True),
                            'koppen_sub': NaNLabelEncoder(add_nan=True),
                            'year': NaNLabelEncoder(add_nan=True),
                            },
      add_relative_time_idx=True,
      add_target_scales=False, # <------- turned off
      add_encoder_length=False, # <------- turned off
    )

In [22]:
# validation = TimeSeriesDataSet.from_dataset(training, val_df, predict=False, stop_randomization=True)

val_AU_DaP = TimeSeriesDataSet.from_dataset(training, val_AU_DaP_df, predict=False, stop_randomization=True)
# val_NL_Loo = TimeSeriesDataSet.from_dataset(training, val_NL_Loo_df, predict=False, stop_randomization=True)
# val_US_Ses = TimeSeriesDataSet.from_dataset(training, val_US_Ses_df, predict=False, stop_randomization=True)
# val_CA_Gro = TimeSeriesDataSet.from_dataset(training, val_CA_Gro_df, predict=False, stop_randomization=True)
# val_US_Bar = TimeSeriesDataSet.from_dataset(training, val_US_Bar_df, predict=False, stop_randomization=True)
# val_FI_Hyy = TimeSeriesDataSet.from_dataset(training, val_FI_Hyy_df, predict=False, stop_randomization=True)
# val_FR_Aur = TimeSeriesDataSet.from_dataset(training, val_FR_Aur_df, predict=False, stop_randomization=True)


# Create Data Loader

In [None]:
del train_df

In [23]:
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers=cpu_count)#, pin_memory=True)
# val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size, num_workers=cpu_count)#, pin_memory=False)


### Define TS Dataset for NO-PAST-GPP models

In [148]:
# (training_NO_PAST_GPP, validation_NO_PAST_GPP, testing_NO_PAST_GPP) = setup_tsdataset_no_past_GPP(train_df, val_df, test_df, ENCODER_LEN)


In [149]:
# # Get small test data
# subtest_df = test_df.loc[test_df['timestep_idx_local'] < ENCODER_LEN+5, ].copy()
# _, _, subtesting = setup_tsdataset(train_df, val_df, subtest_df, ENCODER_LEN)
# print(f"Subset num subtest timesteps: {len(subtest_df)}")

# # create dataloaders for model
# # ref: https://pytorch-lightning.readthedocs.io/en/stable/guides/speed.html#dataloaders
# batch_size = 64  # set this between 32 to 128
# cpu_count = os.cpu_count()

In [156]:

# # Create TFT model from dataset
# tft = TemporalFusionTransformer.from_dataset(
#     training,
#     learning_rate=0.00001,
#     hidden_size=16,  # most important hyperparameter apart from learning rate
#     attention_head_size=1, # Set to up to 4 for large datasets
#     dropout=0.3, # Between 0.1 and 0.3 are good values
#     hidden_continuous_size=16,  # set to <= hidden_size
#     output_size=7,  # 7 quantiles by default
#     loss=QuantileLoss(),
#     logging_metrics=nn.ModuleList([MAE(), RMSE()]),
#     reduce_on_plateau_patience=2, # reduce learning rate if no improvement in validation loss after x epochs
#     optimizer="adam"
# )
# print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

# # configure network and trainer
# early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=6, mode="min", check_finite=True, verbose=False,)
# lr_logger = LearningRateMonitor()  # log the learning rate
# checkpoint_callback = ModelCheckpoint(dirpath=exp_model_dir, save_top_k=3, monitor="val_loss") # save model objects for top k epoch val loss
# logger = TensorBoardLogger(exp_model_dir)  # logging results to a tensorboard

# trainer = pl.Trainer(
#     max_epochs=25,
#     enable_model_summary=True,
#     #gradient_clip_val=2,
#     fast_dev_run=False, 
#     accelerator='gpu',
#     devices="auto", 
#     callbacks=[lr_logger, early_stop_callback],
#     logger=logger,
#     strategy="ddp",
# )

## Prepare Two Validation Data Loaders

In [24]:
import torch.multiprocessing
torch.multiprocessing.set_sharing_strategy('file_system')

In [41]:
# Eval on Validation set
def val_actuals(validation_ts):
    """Create the evaluation dataloader for a time-series dataset.

    Despite its name, this function only returns the dataloader: collecting
    the actual target values is disabled to save memory (see below).

    Args:
        validation_ts: dataset object exposing ``to_dataloader`` (here a
            ``TimeSeriesDataSet`` built with ``from_dataset``).

    Returns:
        The dataloader created with ``train=False`` and the notebook-level
        ``batch_size`` / ``cpu_count`` settings.
    """
    val_dataloader = validation_ts.to_dataloader(train=False, batch_size=batch_size, num_workers=cpu_count)
    # Disabled to save memory:
    # val_actuals = torch.cat([y[0] for x, y in iter(val_dataloader)])
    return val_dataloader

In [28]:
val_loader_AU_DaP = val_actuals(val_AU_DaP) #, val_AU_DaP_actuals

# Obtain Val Prediction

In [30]:
def pred_raw(val_dataloader, model=None):
    """Run raw-mode prediction on a dataloader and report the elapsed time.

    Args:
        val_dataloader: dataloader passed to ``model.predict``.
        model: object with a ``predict`` method. Defaults to the notebook's
            global ``best_GPP`` model, which keeps existing calls unchanged.

    Returns:
        Tuple ``(raw_predictions, x)`` as returned by
        ``model.predict(..., mode="raw", return_x=True)``.
    """
    if model is None:
        # Resolved lazily so the global is only needed when no model is given.
        model = best_GPP
    print(f"Start eval on validation.")
    start = default_timer()
    val_raw_pred, val_x = model.predict(val_dataloader, show_progress_bar=True,
                                        mode="raw", return_x=True)
    eval_time = default_timer() - start
    print(f"Val eval time: {eval_time}")
    return val_raw_pred, val_x

In [55]:
def _dump_and_reload(mapping, path):
    """Copy ``mapping`` into a plain dict, pickle it to ``path`` and load it back."""
    as_dict = {key: mapping[key] for key in mapping.keys()}
    with open(path, 'wb') as handle:
        pickle.dump(as_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # Read back what was just written, so the caller gets exactly the
    # persisted content.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def raw_pred_x_to_dict(model_name, val_or_test, site_name, val_raw_pred, val_x):
    """Persist raw predictions and their inputs as pickled dictionaries.

    Two files are written to the current working directory:
    ``{val_or_test}_raw_pred_{site_name}_{model_name}.pkl`` and
    ``{val_or_test}_x_pred_{site_name}_{model_name}.pkl``.

    Args:
        model_name: label used in the file names.
        val_or_test: split label used as file-name prefix (e.g. "val").
        site_name: site label used in the file names.
        val_raw_pred: mapping-like raw prediction output (supports ``keys()``
            and item access).
        val_x: mapping-like model input (supports ``keys()`` and item access).

    Returns:
        Tuple ``(raw_pred_dict, x_dict)`` of plain dicts as re-loaded from
        the two pickle files.
    """
    # Store raw_predictions
    val_raw_pred_site = _dump_and_reload(
        val_raw_pred, f'{val_or_test}_raw_pred_{site_name}_{model_name}.pkl')

    # Store x
    val_x_site = _dump_and_reload(
        val_x, f'{val_or_test}_x_pred_{site_name}_{model_name}.pkl')

    return val_raw_pred_site, val_x_site


In [37]:
val_raw_AU_DaP, val_x_AU_DaP = pred_raw(val_loader_AU_DaP)

Start eval on validation.


Predict: 100%|██████████| 250/250 [04:43<00:00,  1.14s/ batches]

Val eval time: 283.82525439100027





In [38]:
model_name = "GPPTFT_5yrTrain_2WkEncode_230410_2310"
site_name = "AU_DaP"

# raw_pred_x_to_dict takes (model_name, val_or_test, site_name, raw_pred, x);
# the split label was missing, which raises a TypeError.
raw_AU_DaP, x_AU_DaP = raw_pred_x_to_dict(model_name, "val", site_name, val_raw_AU_DaP, val_x_AU_DaP)

In [46]:
val_raw_AU_DaP.keys()

('prediction',
 'encoder_attention',
 'decoder_attention',
 'static_variables',
 'encoder_variables',
 'decoder_variables',
 'decoder_lengths',
 'encoder_lengths')

In [47]:
val_raw_AU_DaP['encoder_variables'].shape

torch.Size([31968, 336, 1, 37])

In [43]:
# val_loader_AU_DaP = val_actuals(val_AU_DaP)
raw_AU_DaP.keys()

dict_keys(['prediction', 'encoder_attention', 'decoder_attention', 'static_variables', 'encoder_variables', 'decoder_variables', 'decoder_lengths', 'encoder_lengths'])

In [53]:
x_AU_DaP.keys()

dict_keys(['encoder_cat', 'encoder_cont', 'encoder_target', 'encoder_lengths', 'decoder_cat', 'decoder_cont', 'decoder_target', 'decoder_lengths', 'decoder_time_idx', 'groups', 'target_scale'])

In [49]:
del val_AU_DaP_df

In [52]:
del val_loader_AU_DaP

In [50]:
del val_raw_AU_DaP, val_x_AU_DaP 

# Predict on different val sites

Define the functions that do prediction end-to-end

In [None]:
def site_raw_pred(model_name, site_name, val_or_test, df, training):
    """Predict for one site end-to-end and persist the raw output.

    Steps: filter ``df`` by site, build a dataset from the ``training``
    template, create a dataloader, run ``pred_raw`` and store the result via
    ``raw_pred_x_to_dict``.

    Args:
        model_name: label used in the pickle file names.
        site_name: value matched against the ``site_id`` column of ``df``;
            also used in the pickle file names.
        val_or_test: split label used as file-name prefix (e.g. "val").
        df: DataFrame with a ``site_id`` column.
        training: dataset used as template for
            ``TimeSeriesDataSet.from_dataset``.

    Returns:
        Tuple ``(raw_pred_dict, x_dict)`` as returned by
        ``raw_pred_x_to_dict``.

    Raises:
        ValueError: if no row of ``df`` has ``site_id == site_name``.
    """
    # filter with site id
    df_site = df[df["site_id"] == site_name]
    if len(df_site) == 0:
        # Fail early with a clear message instead of a failure further down
        # when the dataset is built from an empty frame.
        raise ValueError(
            f"No rows with site_id == {site_name!r}; "
            "check the spelling (e.g. hyphen vs. underscore)."
        )
    print(f'filtered with {site_name}')

    # create TS dataset
    df_site_ts = TimeSeriesDataSet.from_dataset(training,
                                                df_site,
                                                predict=False, stop_randomization=True)
    print(f'TS dataset created')
    # create dataloader
    site_dataloader = df_site_ts.to_dataloader(train=False, batch_size=batch_size, num_workers=cpu_count)
    print(f'dataloader created')

    # obtain raw prediction and x
    raw_pred, x = pred_raw(site_dataloader)
    print(f'prediction completed')

    # convert to dict and save to pkl
    print(f'saving to pickle')
    raw_pred_dict, x_dict = raw_pred_x_to_dict(model_name, val_or_test, site_name, raw_pred, x)

    # Drop the local references to the large intermediates before returning.
    print(f'deleting used files')
    del df_site, df_site_ts, site_dataloader, raw_pred, x

    return raw_pred_dict, x_dict
    

In [60]:
model_name = "GPPTFT_5yrTrain_2WkEncode_230410_2310"
site_name = "US-AR1"

raw_US_AR1, x_US_AR1 = site_raw_pred(model_name, site_name, "val", val_df, training)

filtered with US-AR1
TS dataset created
dataloader created
Start eval on validation.


Predict: 100%|██████████| 203/203 [03:50<00:00,  1.13s/ batches]


Val eval time: 230.0536716810002
prediction completed
saving to pickle


In [None]:
# AU-DaP	GrassLand GRA
# US-AR1	GrassLand GRA
# US-Bar	DBF 
# FI-Hyy	ENF Evergreen Needleleaf Forest
# FR_Aur	CRO Cropland

In [None]:
model_name = "GPPTFT_5yrTrain_2WkEncode_230410_2310"
# site_id values are hyphenated (e.g. "AU-DaP", "US-Bar"); "FI_Hyy" with an
# underscore would match no rows in the site filter.
site_name = "FI-Hyy"

raw_FI_Hyy, x_FI_Hyy = site_raw_pred(model_name, site_name, "val", val_df, training)

In [61]:
model_name = "GPPTFT_5yrTrain_2WkEncode_230410_2310"
site_name = "US-Bar"

raw_US_Bar, x_US_Bar = site_raw_pred(model_name, site_name, "val", val_df, training)

filtered with US-Bar
TS dataset created
dataloader created
Start eval on validation.


Predict: 100%|██████████| 340/340 [06:25<00:00,  1.13s/ batches]


Val eval time: 385.0881070250016
prediction completed
saving to pickle


OSError: [Errno 28] No space left on device

In [None]:
model_name = "GPPTFT_5yrTrain_2WkEncode_230410_2310"
# site_id values are hyphenated (e.g. "AU-DaP", "US-Bar"); "FR_Aur" with an
# underscore would match no rows in the site filter.
site_name = "FR-Aur"

raw_FR_Aur, x_FR_Aur = site_raw_pred(model_name, site_name, "val", val_df, training)

## Try prediction with data

In [None]:
test_df[['site_id','MODIS_IGBP']].drop_duplicates().reset_index(drop=True)

# (Reference) Model Interpretation

In [None]:
print(f'encoder variables: {best_GPP.encoder_variables}')
encoder_var = best_GPP.encoder_variables
len(best_GPP.encoder_variables)

### Interpret_output
- `reduction`:
  - `"none"`: no averaging over batches
  - `"sum"`: sum the attentions
  - `"mean"`: normalize by encoder lengths
- `attention_prediction_horizon`: which prediction horizon to use for attention

In [None]:
# Interpret the raw prediction output with the three supported reductions.
# NOTE(review): `val_raw_predictions` is not assigned anywhere in the visible
# notebook — it presumably comes from an earlier session; confirm and define
# it (e.g. from a pred_raw result) before running this section.
interpretation_none = best_tft.interpret_output(val_raw_predictions, reduction="none")
interpretation_sum = best_tft.interpret_output(val_raw_predictions, reduction="sum")
interpretation_mean = best_tft.interpret_output(val_raw_predictions, reduction="mean")

In [None]:
# size of val_raw_predictions['encoder_variables']
print(val_raw_predictions['encoder_variables'].shape)

When the encoder length is 24 × 7 = 168, each feature has 168 importance values (one per encoder time step) at each prediction time point.

In [None]:
print(val_raw_predictions['encoder_variables'][0].shape)
val_raw_predictions['encoder_variables'][0]


In [None]:
print(val_raw_predictions['encoder_variables'][0][0].shape)
val_raw_predictions['encoder_variables'][0][0]

In [None]:
print(len(val_raw_predictions['encoder_variables'][0][0][0]))
print(val_raw_predictions['encoder_variables'][0][0].sum())
val_raw_predictions['encoder_variables'][0][0]

### Interpretation reduction="none"

In [None]:
print(interpretation_none['encoder_variables'].shape)
interpretation_none['encoder_variables']

In [None]:
print(interpretation_none['encoder_variables'][0].sum())
interpretation_none['encoder_variables'][0]

### Interpretation reduction="sum"

In [None]:
print(interpretation_sum['encoder_variables'].sum())
print(len(interpretation_sum['encoder_variables']))
interpretation_sum['encoder_variables']

### Interpretation reduction="mean"

In [None]:
print(interpretation_mean['encoder_variables'].sum())
print(len(interpretation_mean['encoder_variables']))
interpretation_mean['encoder_variables']

### Compare with manual normalization

In [None]:
# Sum manually 
interpret_none_df = pd.DataFrame(interpretation_none['encoder_variables'], columns=encoder_var)
interpret_none_df.sum()

### Normalize sum of importance

In [None]:
interpret_none_df.mean()

In [None]:
importance_rank = pd.DataFrame(interpret_none_df.mean(),columns=["importance"]).sort_values(by="importance", ascending=False)
importance_rank

In [None]:
# 19 features cover 80% of total importance
importance_rank[:19].sum()

## Create df importance of the snapshot of one prediction time point

In [None]:
# tensor to list: one row of per-variable importances per encoder time step
# of the first prediction sample ([0] on each step drops its leading axis).
encoder_size_importance = [
    list(step.numpy()[0]) for step in val_raw_predictions['encoder_variables'][0]
]

encoder_size_importance_df = pd.DataFrame(encoder_size_importance, columns=encoder_var)

# Relative encoder index -n ... -1, derived from the actual number of encoder
# steps instead of a hard-coded 168, so it also fits other encoder lengths.
n_encoder_steps = len(encoder_size_importance)
encoder_index = list(range(-n_encoder_steps, 0))
encoder_size_importance_df["encoder_index"] = encoder_index

# Move the index column to the front. The explicit copy makes the result an
# independent frame, so columns can be added to it later without warnings.
columns_order = list(encoder_size_importance_df.columns[-1:]) + list(encoder_size_importance_df.columns[:-1])
encoder_size_imp_df = encoder_size_importance_df[columns_order].copy()
encoder_size_imp_df.head()


In [None]:
encoder_size_imp_df.columns

### Choose the top 15 features by average importance to plot

In [None]:
# Average importance of every model variable over all encoder steps.
# Only the model's encoder variables are ranked (helper columns such as
# 'encoder_index' are excluded), and the mean replaces the hard-coded
# division by 168 so the result is correct for any encoder length.
encoder_size_imp_rank_df = (
    encoder_size_imp_df[list(encoder_var)]
    .mean()
    .to_frame('avg_encoder_imp')
    .sort_values(by=['avg_encoder_imp'], ascending=False)
)
encoder_size_imp_rank_df[:15]

# Create stack area plot

In [None]:
encoder_size_imp_rank_df[:15].index

In [None]:
importance_rank[:7].sum()

color palette in seaborn https://seaborn.pydata.org/generated/seaborn.color_palette.html

Examples
- `pal = sns.light_palette("seagreen")`
- `pal = sns.color_palette("mako")`
- `pal = sns.cubehelix_palette(start=.5, rot=-.75)#, as_cmap=True)`
- `pal = sns.cubehelix_palette(start=.5, rot=-.5)#, as_cmap=True)`


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#set seaborn style
sns.set_theme()
pal = sns.color_palette("crest")
sns.set(rc={'figure.figsize':(30,7), })

#['GPP_NT_VUT_REF', 'SW_IN_ERA', 'gap_flag_hour', 'b4', 'ESACCI-sm',
       # 'year', 'BESS-RSDN', 'relative_time_idx', 'timestep_idx_global',
       # 'LW_IN_ERA', 'PA_ERA', 'b2', 'VPD_ERA', 'b6', 'TA_ERA']

# Features to stack, listed bottom layer first.
labels = ['year','ESACCI-sm','SW_IN_ERA','BESS-RSDN','gap_flag_hour','GPP_NT_VUT_REF','b4']

#create area chart
layers = plt.stackplot(encoder_size_imp_df['encoder_index'], # x axis
                       *[encoder_size_imp_df[name] for name in labels],
                       labels=labels,
                       colors=pal)

# Reverse the order of labels in legend so it matches the visual stacking.
# The handles returned by stackplot are used directly, which avoids the
# deprecated `Legend.legendHandles` attribute and a throw-away legend.
plt.legend(layers[::-1], labels[::-1], loc='upper left', fontsize=14)

#add axis labels
plt.title('Feature Importance by Encoder Relative Time Point', fontsize=20)
plt.ylabel('Importance', fontsize=18)
plt.xlabel('Encoder Relative Index (1hour before pred ~ 168(24*7) hours before pred)', fontsize=18)
plt.tick_params(axis='both', which='major', labelsize=14) # fontsize of both axes


## Add attention to the plot 

In [None]:
val_raw_predictions["encoder_attention"].shape

In [None]:
val_raw_predictions["encoder_attention"][0].shape

In [None]:
val_raw_predictions["encoder_attention"][0][0][0].shape

In [None]:
val_raw_predictions["encoder_attention"][0][0][0].numpy()

In [None]:
# Add attention to feature importance df
encoder_size_imp_df["encoder_attention"] = list(val_raw_predictions["encoder_attention"][0][0][0].numpy())
encoder_size_imp_df.head()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#set seaborn style
sns.set_theme()
pal = sns.color_palette("crest")
sns.set(rc={'figure.figsize':(30,7)})

#['GPP_NT_VUT_REF', 'SW_IN_ERA', 'gap_flag_hour', 'b4', 'ESACCI-sm',
       # 'year', 'BESS-RSDN', 'relative_time_idx', 'timestep_idx_global',
       # 'LW_IN_ERA', 'PA_ERA', 'b2', 'VPD_ERA', 'b6', 'TA_ERA']

# Features to stack, listed bottom layer first.
labels = ['year','ESACCI-sm','SW_IN_ERA','BESS-RSDN','gap_flag_hour','GPP_NT_VUT_REF','b4']

#create area chart
layers = plt.stackplot(encoder_size_imp_df['encoder_index'], # x axis
                       *[encoder_size_imp_df[name] for name in labels],
                       labels=labels,
                       colors=pal)

# Reverse the order of labels in legend so it matches the visual stacking.
# The handles returned by stackplot are used directly, which avoids the
# deprecated `Legend.legendHandles` attribute and a throw-away legend.
plt.legend(layers[::-1], labels[::-1], loc='upper left', fontsize=14)

#add axis labels
plt.title('Feature Importance by Encoder Relative Time Point / Average Attention', fontsize=20)
plt.ylabel('Importance', fontsize=18)
plt.xlabel('Encoder Relative Index (1hour before pred ~ 168(24*7) hours before pred)', fontsize=18)
plt.tick_params(axis='both', which='major', labelsize=14) # fontsize of both axes

# Secondary y axis for the attention curve.
attention_ax = plt.twinx()

attention_ax.plot(encoder_size_imp_df["encoder_index"], encoder_size_imp_df["encoder_attention"], color='w', linewidth=5)
attention_ax.set_ylabel('Average Attention', fontsize=18)
attention_ax.tick_params('y', colors='k')
# Save BEFORE showing: once the figure has been shown, savefig would write an
# empty image.
plt.savefig('fi_avg_attention_GPP_TFT_AU-DaP_78_20100108_0AM_0.png')
plt.show()

### (with different palette)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#set seaborn style
sns.set_theme()
# pal = sns.light_palette("seagreen")
pal = sns.color_palette("mako")
# pal = sns.cubehelix_palette(start=.5, rot=-.75)#, as_cmap=True)`
# pal = sns.cubehelix_palette(start=.5, rot=-.5)#, as_cmap=True)`
# pal = sns.color_palette("crest")
sns.set(rc={'figure.figsize':(30,7)})

#['GPP_NT_VUT_REF', 'SW_IN_ERA', 'gap_flag_hour', 'b4', 'ESACCI-sm',
       # 'year', 'BESS-RSDN', 'relative_time_idx', 'timestep_idx_global',
       # 'LW_IN_ERA', 'PA_ERA', 'b2', 'VPD_ERA', 'b6', 'TA_ERA']

# Features to stack, listed bottom layer first.
labels = ['year','ESACCI-sm','SW_IN_ERA','BESS-RSDN','gap_flag_hour','GPP_NT_VUT_REF','b4']

#create area chart
layers = plt.stackplot(encoder_size_imp_df['encoder_index'], # x axis
                       *[encoder_size_imp_df[name] for name in labels],
                       labels=labels,
                       colors=pal)

# Reverse the order of labels in legend so it matches the visual stacking.
# The handles returned by stackplot are used directly, which avoids the
# deprecated `Legend.legendHandles` attribute and a throw-away legend.
plt.legend(layers[::-1], labels[::-1], loc='upper left', fontsize=14)

#add axis labels
plt.title('Feature Importance by Encoder Relative Time Point / Average Attention', fontsize=20)
plt.ylabel('Importance', fontsize=18)
plt.xlabel('Encoder Relative Index (1hour before pred ~ 168(24*7) hours before pred)', fontsize=18)
plt.tick_params(axis='both', which='major', labelsize=14) # fontsize of both axes

# Secondary y axis for the attention curve.
attention_ax = plt.twinx()

attention_ax.plot(encoder_size_imp_df["encoder_index"], encoder_size_imp_df["encoder_attention"], color='w', linewidth=5)
attention_ax.set_ylabel('Average Attention')
attention_ax.tick_params('y', colors='k')# black is k
# Save BEFORE showing: once the figure has been shown, savefig would write an
# empty image.
plt.savefig('fi_avg_attention_GPP_TFT_AU-DaP_78_20100108_0AM.png')
plt.show()

## Plot without GPP

In [None]:
encoder_size_imp_rank_df[:15].index

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#set seaborn style
sns.set_theme()
# pal = sns.light_palette("seagreen")
# pal = sns.color_palette("mako")
# pal = sns.cubehelix_palette(start=.5, rot=-.75)#, as_cmap=True)`
# pal = sns.cubehelix_palette(start=.5, rot=-.5)#, as_cmap=True)`
pal = sns.color_palette("crest")
sns.set(rc={'figure.figsize':(30,7)})

#['GPP_NT_VUT_REF', 'SW_IN_ERA', 'gap_flag_hour', 'b4', 'ESACCI-sm',
       # 'year', 'BESS-RSDN', 'relative_time_idx', 'timestep_idx_global',
       # 'LW_IN_ERA', 'PA_ERA', 'b2', 'VPD_ERA', 'b6', 'TA_ERA']

# Features to stack (target GPP_NT_VUT_REF left out), bottom layer first.
labels = ['timestep_idx_global', 'LW_IN_ERA', 'PA_ERA', 'b2', 'VPD_ERA', 'b6', 'TA_ERA',
          'year','ESACCI-sm','relative_time_idx', 'SW_IN_ERA','BESS-RSDN','gap_flag_hour','b4']

#create area chart
layers = plt.stackplot(encoder_size_imp_df['encoder_index'], # x axis
                       *[encoder_size_imp_df[name] for name in labels],
                       labels=labels,
                       colors=pal)

# Reverse the order of labels in legend so it matches the visual stacking.
# The handles returned by stackplot are used directly, which avoids the
# deprecated `Legend.legendHandles` attribute and a throw-away legend.
plt.legend(layers[::-1], labels[::-1], loc='upper left', fontsize=14)

#add axis labels
plt.title('Feature Importance by Encoder Relative Time Point / Average Attention', fontsize=20)
plt.ylabel('Importance', fontsize=18)
plt.xlabel('Encoder Relative Index (1hour before pred ~ 168(24*7) hours before pred)', fontsize=18)
plt.tick_params(axis='both', which='major', labelsize=14) # fontsize of both axes

# Secondary y axis for the attention curve.
attention_ax = plt.twinx()

attention_ax.plot(encoder_size_imp_df["encoder_index"], encoder_size_imp_df["encoder_attention"], color='r', linewidth=5)
attention_ax.set_ylabel('Average Attention')
attention_ax.tick_params('y', colors='k')# black is k
# Save BEFORE showing: once the figure has been shown, savefig would write an
# empty image.
plt.savefig('fi_avg_attention_GPP_TFT_AU-DaP_78_20100108_0AM_2.png')
plt.show()

### Prediction time point of the plot

In [None]:
# val_raw_predictions
print(val_x.keys())
val_x["decoder_time_idx"][0].numpy()[0]

In [None]:
val_x["groups"][0].numpy()[0]

In [None]:
val_df[val_df["timestep_idx_global"] == val_x["decoder_time_idx"][0].numpy()[0]].head()


In [None]:
# val_x["decoder_time_idx"]
# Both conditions are combined into ONE mask. Chaining two selections would
# apply a mask built from the full frame to the already-filtered frame, which
# makes pandas warn and reindex the mask.
val_df[(val_df["site_id"] == "AU-DaP") & (val_df["timestep_idx_global"] >= 350808)]


In [None]:
# match AU-DaP = 78
print(val_x['groups'][:10])
val_x['decoder_target'][0].numpy()[0]
val_x['decoder_target'][:10]

In [None]:
# Row of site AU-DaP at the decoder time index of the first prediction sample.
# Both conditions are combined into one mask instead of chaining selections.
val_df[(val_df["timestep_idx_global"] == val_x["decoder_time_idx"][0].numpy()[0]) & (val_df["site_id"] == "AU-DaP")]

### Profile of the plot

- Site: AU-DaP
- group index: 78

- YEAR 2010
- Month 1
- Day 8
- hour 0AM

In [None]:
# # attention
# fig, ax = plt.subplots()
# attention = interpretation_none["attention"][pred_ind].detach().cpu()
# attention = attention / attention.sum(-1).unsqueeze(-1)
# ax.plot(attention)
# ax.set_xlabel("Time index")
# ax.set_ylabel("Attention")
# ax.set_title(f"Attention for Pred Step {i}")
# xticks_locations = range(0, len(attention), 10)
# xticks_labels = range(-len(attention), 0, 10)
# plt.xticks(xticks_locations, xticks_labels)
# plt.show()

In [None]:

# # Interpret Outputs
# interpretation_sum = best_tft.interpret_output(raw_prediction_igbp, reduction="sum")
# best_tft.plot_interpretation(interpretation_sum)

In [None]:
# # Init df
# example_fi_df = pd.DataFrame(columns= ['encoder_timestep'] + known_cat + unknown_cat + known_real + ['relative_time_idx'])

# # Add avg importances to df
# avg_fi_vals = torch.mean(torch.mean(raw_predictions["encoder_variables"].squeeze(), dim=0), dim=0).numpy()
# example_fi_df.loc[len(example_fi_df)] = ['Average'] + list(avg_fi_vals)

# # Get FIs at each step
# for time in range(ENCODER_LEN):
#     feature_means_time = torch.mean(raw_predictions["encoder_variables"].squeeze(), dim=0)[time].numpy()
#     neg_time = time-ENCODER_LEN
#     example_fi_df.loc[len(example_fi_df)] = [neg_time] + list(feature_means_time)

# # Merge in attn vales
# attn_time = torch.mean(raw_predictions["encoder_attention"].squeeze().squeeze(), dim=0)
# example_fi_df['attn_time'] = [np.nan] + list(attn_time.numpy())

In [None]:
#!conda install jinja2 -y

In [None]:
# # Save out
# example_fi_df.to_csv(root_dir + os.sep + 'data/feature_imp_by_time_igbp_dbp.csv', index=False)

# # Display
# example_fi_df.sort_values('attn_time', ascending=False, inplace=True)
# styled_df = example_fi_df.style.background_gradient(cmap='Greens')
# styled_df