# Notebook Setup

In [1]:
# True when this kernel is a Google Colab runtime; the flag gates the Drive
# mount below and the Colab-specific paths chosen in later cells.
IN_COLLAB = 'google.colab' in str(get_ipython())

# TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
# Absolute path of the project checkout; later cells derive the data,
# credential and tool folders from it.
MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"

if IN_COLLAB:
  # Only imported on Colab, where the project lives on a mounted Drive.
  from google.colab import drive
  drive.mount('/content/drive/')

# When False, every cell guarded by `if DOWNLOAD_DATA:` is skipped.
DOWNLOAD_DATA = False

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Import Modules

In [2]:
if DOWNLOAD_DATA:
  # install required modules quietly
  required_packages = ['geopandas','azure-storage-blob']

  for p in required_packages: 
    try:
        __import__(p)
    except ImportError:
        %pip install {p} --quiet

  import os
  os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

  import pandas as pd
  from io import BytesIO

  # Load locale custome modules
  import sys
  if IN_COLLAB:
    os.chdir(MY_HOME_ABS_PATH)
    sys.path.insert(0,os.path.abspath("./code/src/tools"))
  else:
    sys.path.append(os.path.abspath("./code/src/tools"))

  from CloudIO.AzStorageClient import AzStorageClient
  from data_pipeline_lib import *

  pd.set_option('display.max_rows', 500)
  pd.set_option('display.max_columns', None)
  pd.set_option('display.float_format', lambda x: '%.5f' % x)

## Define Local File System Constants

In [3]:
if DOWNLOAD_DATA:
  # Project folders, all anchored at the checkout root.
  root_dir = MY_HOME_ABS_PATH
  tmp_dir, data_dir, cred_dir = (
      os.sep.join((root_dir, leaf)) for leaf in ('.tmp', 'data', '.cred')
  )
  # Credential file handed to the Azure storage client in the download cell.
  az_cred_file = os.sep.join((cred_dir, 'azblobcred.json'))

  # Raw data lives in the temp folder locally, and in a shared Drive folder on Colab.
  raw_data_dir = (
      "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data/half_hourly_data"
      if IN_COLLAB
      else tmp_dir
  )

In [4]:
# Site IDs assigned to each data split.
# NOTE(review): 'US-Vcp' appears in both the training and the validation list.
# If these lists drive the split, the validation site is also trained on -
# confirm which split it belongs to.
train_sites = ['US-NR1', 'US-Me2', 'US-ARM', 'US-Vcp']
valid_sites = ["US-Vcp"]
test_sites = ["US-GLE"]

# Union of all splits in first-seen order. De-duplicated so a site listed in
# more than one split is not selected (or loaded) twice.
selected_sites = list(dict.fromkeys(train_sites + valid_sites + test_sites))

# Load raw dataset from Azure Storage Blob

In [5]:
if DOWNLOAD_DATA:
  # Pieces of the versioned blob name for the target dataset.
  container, ext = "baseline-data", "parquet"
  ver, tag = "0", "mc_raw"
  blob_name_base = f"baseline_all_v_{ver}"
  data_blob_name = f"{blob_name_base}_{tag}.{ext}"

  # Blob that the download cell actually fetches; a literal name, not derived
  # from the pieces above.
  full_blob_name = 'baseline_all_v_1-i-knn_raw.parquet'

  # Local folder where the downloaded dataset is cached as CSV.
  tft_data_dir = os.sep.join((tmp_dir, 'data', 'gpp'))


In [6]:
if DOWNLOAD_DATA:
  def load_data(container, data_blob_name, local_file_path):
    """Return the dataset as a DataFrame, downloading it once and caching it as CSV.

    Args:
      container: Name of the Azure Storage container holding the blob.
      data_blob_name: Name of the parquet blob to download on a cache miss.
      local_file_path: Path of the CSV file used as the local cache.

    Returns:
      pandas.DataFrame read from the cache if it exists, otherwise from the
      downloaded blob (which is then written to the cache).
    """
    if os.path.exists(local_file_path):
      print(f"Loading dataset from {local_file_path}...")
      # The cache is written with the index as its first column. Read it back
      # as the index so a cached load matches a fresh download instead of
      # gaining a spurious 'Unnamed: 0' column.
      return pd.read_csv(local_file_path, index_col=0)

    # Create the cache folder and any missing parents in one call; exist_ok
    # avoids the check-then-create race of separate exists()/mkdir() steps.
    cache_dir = os.path.dirname(local_file_path)
    if cache_dir:
      os.makedirs(cache_dir, exist_ok=True)

    print(f"Loading dataset from {container}/{data_blob_name}...")
    # Initialize the Azure Storage client from the credential file.
    azStorageClient = AzStorageClient(az_cred_file)
    file_stream = azStorageClient.downloadBlob2Stream(container, data_blob_name)
    data_df = pd.read_parquet(file_stream, engine='pyarrow')
    # CSV layout (index column included) is kept as-is in case other readers
    # of this file depend on it.
    data_df.to_csv(local_file_path)
    return data_df

  # Load full dataset
  full_local_file = tft_data_dir + os.sep + 'full.csv'
  full_df = load_data(container, full_blob_name, full_local_file)

  print(f"Data size: {full_df.shape}")
  print(full_df['site_id'].unique())
  print("data peak:")
  display(full_df.head())
  

In [7]:
# import gc
# del full_df
# gc.collect()

# Run TensorFlow TFT (Fix-Param, Test)

https://www.tensorflow.org/guide/migrate/upgrade

In [8]:
import os

# Work from the TFT tool folder so that requirements.txt and the training
# script referenced by the following cells resolve by relative path.
os.chdir(f"{MY_HOME_ABS_PATH}/code/src/tools/tft/")

In [9]:
!pip install tensorflow-gpu
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow-gpu
  Using cached tensorflow-gpu-2.12.0.tar.gz (2.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: tensorflow-gpu
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Building wheel for tensorflow-gpu (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for tensorflow-gpu[0m[31m
[0m[?25h  Running setup.py clean for tensorflow-gpu
Failed to build tensorflow-gpu
Installing collected packages: tensorflow-gpu
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mRunning setup.py install for tensorflow-gpu[0m did not run success

In [10]:
import tensorflow as tf

# Report the TensorFlow version and whether a GPU is visible to it.
print(tf.__version__)
print(tf.test.gpu_device_name())
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

# NOTE(review): TF1-compat switch; presumably required by the TF1-style graph
# code in the TFT library - confirm before removing.
tf.compat.v1.experimental.output_all_intermediates(True)

2.11.0
/device:GPU:0
Num GPUs Available:  1


In [11]:
# Update TensorFlow libs
# !tf_upgrade_v2 \
#     --intree libs/ \
#     --outtree libs_v2/ \
#     --reportfile tree_report.txt

In [12]:
# Train the TFT model for the `gpp` experiment from its fixed parameter set and
# then run the test pass; outputs go under ../../../../.tmp (relative to the
# tft/ working directory set earlier).
# NOTE(review): the two trailing "yes" flags presumably enable the GPU and a
# fresh restart - confirm against the script's argument handling.
%run script_train_fixed_params.py gpp ../../../../.tmp yes yes

Using output folder ../../../../.tmp





Selecting GPU ID=0
*** Training from defined parameters for gpp ***
Loading & splitting data...
Formatting train-valid-test splits.
Raw size: (865920, 50) from ['CA-Cbo' 'CH-Lae' 'ES-LJu' 'ES-LM2' 'FI-Hyy' 'FR-Lam' 'FR-Pue' 'IT-Lav'
 'US-AR1' 'US-ARM' 'US-GLE' 'US-NR1' 'US-SRM' 'US-Seg' 'US-Ton' 'US-UMB'
 'US-Var' 'US-Vcp' 'US-Wkg']
Setting scalers with training data...
IDs:['CA-Cbo', 'CH-Lae', 'ES-LJu', 'ES-LM2', 'FI-Hyy', 'FR-Lam', 'FR-Pue', 'IT-Lav', 'US-AR1', 'US-ARM', 'US-NR1', 'US-SRM', 'US-Seg', 'US-Ton', 'US-UMB', 'US-Var', 'US-Wkg']
Real number input: ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA', 'day', 'hour', 'lat', 'long', 'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA', 'day', 'hour', 'lat', 'long'].
Setting scalers with training data...
Categorical input: ['year', 'month', 'IGBP', 'koppen_main', 'koppen_sub']
Train size: (96408, 50) from ['IT-Lav' 'US-NR1']
Valid size: (52584, 50) from ['US-Vcp']
Test size: (43824, 50) from ['US-GLE']




Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 174, 26)]    0           []                               
                                                                                                  
 tf_op_layer_TemporalFusionTran  [(None, 174, 5)]    0           ['input_1[0][0]']                
 sformer/strided_slice_1 (Tenso                                                                   
 rFlowOpLayer)                                                                                    
                                                                                                  
 tf_op_layer_TemporalFusionTran  [(None, 174, 21)]   0           ['input_1[0][0]']                
 sformer/strided_slice (TensorF                                                               

  super().__init__(name, **kwargs)


                                                                 former/strided_slice_37[0][0]']  
                                                                                                  
 dense_34 (Dense)               (None, 1, 5)         30          ['tf_op_layer_TemporalFusionTrans
                                                                 former/strided_slice_38[0][0]']  
                                                                                                  
 dense_38 (Dense)               (None, 1, 5)         30          ['tf_op_layer_TemporalFusionTrans
                                                                 former/strided_slice_39[0][0]']  
                                                                                                  
 dense_42 (Dense)               (None, 1, 5)         30          ['tf_op_layer_TemporalFusionTrans
                                                                 former/strided_slice_40[0][0]']  
          

  updates = self.state_updates


Cannot load from ../../../../.tmp/saved_models/gpp/fixed/tmp, skipping ...
Using cached validation data
Optimal model found, updating
Model saved to: ../../../../.tmp/saved_models/gpp/fixed/TemporalFusionTransformer.ckpt
*** Running tests ***
Resetting temp folder...
*** TemporalFusionTransformer params ***
# dropout_rate = 0.1
# hidden_layer_size = 5
# learning_rate = 0.001
# max_gradient_norm = 100.0
# minibatch_size = 128
# model_folder = ../../../../.tmp/saved_models/gpp/fixed
# num_heads = 4
# stack_size = 1
# total_time_steps = 174
# num_encoder_steps = 168
# num_epochs = 1
# early_stopping_patience = 5
# multiprocessing_workers = 5
# column_definition = [('site_id', <DataTypes.REAL_VALUED: 0>, <InputTypes.ID: 4>), ('datetime', <DataTypes.DATE: 2>, <InputTypes.TIME: 5>), ('GPP_NT_VUT_REF', <DataTypes.REAL_VALUED: 0>, <InputTypes.TARGET: 0>), ('TA_ERA', <DataTypes.REAL_VALUED: 0>, <InputTypes.KNOWN_INPUT: 2>), ('SW_IN_ERA', <DataTypes.REAL_VALUED: 0>, <InputTypes.KNOWN_INPUT: 2>),

  updates=self.state_updates,


array([[[ 0.85880965, -0.6068796 , -0.7167925 ],
        [ 0.8534388 , -0.60494965, -0.7298975 ],
        [ 0.8442011 , -0.6083681 , -0.7327204 ],
        [ 0.83618486, -0.6153914 , -0.6992468 ],
        [ 0.827056  , -0.624711  , -0.6516527 ],
        [ 0.79628366, -0.6440406 , -0.6390484 ]],

       [[ 0.87123394, -0.6064458 , -0.64013183],
        [ 0.85280216, -0.60984045, -0.6777438 ],
        [ 0.8389711 , -0.6167918 , -0.66355515],
        [ 0.8266261 , -0.6259162 , -0.62817776],
        [ 0.7941154 , -0.645139  , -0.6260783 ],
        [ 0.7759429 , -0.66014236, -0.5618396 ]],

       [[ 0.87071437, -0.61244994, -0.57056564],
        [ 0.8467566 , -0.61935806, -0.5947192 ],
        [ 0.8283281 , -0.62811995, -0.5832678 ],
        [ 0.7921778 , -0.64726   , -0.60050195],
        [ 0.7726371 , -0.6617794 , -0.5449282 ],
        [ 0.7515415 , -0.67759514, -0.47744805]],

       ...,

       [[ 0.81881475, -0.49677107,  0.43027687],
        [ 0.8354517 , -0.51975554,  0.37312955],
 

Model evaluation p10 saved to ../../../../.tmp/results/gpp/fixed/pred_0226_0947_p10.csv.
Model evaluation p50 saved to ../../../../.tmp/results/gpp/fixed/pred_0226_0947_p50.csv.
Model evaluation p90 saved to ../../../../.tmp/results/gpp/fixed/pred_0226_0947_p90.csv.
Model evaluation targets saved to ../../../../.tmp/results/gpp/fixed/pred_0226_0947_targets.csv.
Training completed @ 2023-02-26 09:47:56.714959
Best validation loss = 4.977513691921867
Params:
dropout_rate  =  0.1
hidden_layer_size  =  5
learning_rate  =  0.001
max_gradient_norm  =  100.0
minibatch_size  =  128
model_folder  =  ../../../../.tmp/saved_models/gpp/fixed
num_heads  =  4
stack_size  =  1
total_time_steps  =  174
num_encoder_steps  =  168
num_epochs  =  1
early_stopping_patience  =  5
multiprocessing_workers  =  5
column_definition  =  [('site_id', <DataTypes.REAL_VALUED: 0>, <InputTypes.ID: 4>), ('datetime', <DataTypes.DATE: 2>, <InputTypes.TIME: 5>), ('GPP_NT_VUT_REF', <DataTypes.REAL_VALUED: 0>, <InputTypes.T

# Model Evaluation Visualization

In [13]:
import gc
# Force a full garbage-collection pass after the training run; the cell
# displays the integer that gc.collect() returns.
gc.collect()

673822