# Notebook Setup

In [1]:
# Absolute path of the local checkout of this repository. Later cells chdir
# into it and derive every other path from it.
# To use your own checkout, set the CO2_FLUX_HOME environment variable before
# starting the kernel instead of editing this cell. When the variable is unset
# or empty, the original default below is used.
import os

MY_HOME_ABS_PATH = os.environ.get("CO2_FLUX_HOME") or "/root/co2-flux-hourly-gpp-modeling"

In [3]:
# Detect Google Colab by looking for the package name in the string form of
# the active IPython shell object.
IN_COLLAB = 'google.colab' in str(get_ipython())

if IN_COLLAB:
  # Colab only: mount Google Drive at /content/drive/.
  from google.colab import drive
  drive.mount('/content/drive/')

## Import Modules

In [4]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import pandas as pd
import numpy as np
import sys
import joblib
from io import BytesIO
from sklearn.model_selection import StratifiedKFold

# Make the project's local custom modules importable.
# Change into the repository root first so the relative tools path below is
# resolved against it.
os.chdir(MY_HOME_ABS_PATH)
tools_dir = os.path.abspath("./code/src/tools")
# On Colab the tools directory goes to the front of sys.path; everywhere else
# it goes to the end (inserting at len(sys.path) is the same as appending).
sys.path.insert(0 if IN_COLLAB else len(sys.path), tools_dir)

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *
from model_pipeline_lib import *

# Notebook display settings: no limit on the number of columns shown, and
# floats rendered with five decimal places.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.5f}'.format

2023-04-02 22:58:32.196286: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-04-02 22:58:32.243852: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-04-02 22:58:32.244937: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Global seed set to 42


# Load Raw TFT Set

In [6]:
# Directory layout, all derived from the repository root. os.path.join is used
# so a trailing separator on the root does not produce doubled separators.
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.path.join(root_dir, '.tmp')
model_dir = os.path.join(root_dir, 'data', 'models')
cred_dir = os.path.join(root_dir, '.cred')
az_cred_file = os.path.join(cred_dir, 'azblobcred.json')
preproc_objects_dir = os.path.join(root_dir, 'code', 'src', 'preprocessing', 'preproc_objects')

# File names
split_dict_filename = os.path.join(preproc_objects_dir, "stratified_splits_k5.joblib")
model = "tft"
ext = 'parquet'
ver = 'global-scaler'
# Output blob names for the three splits; they differ only in the split tag.
train_blob_name = f"{model}-full_2010_2015-train-v-{ver}.{ext}"
val_blob_name = f"{model}-full_2010_2015-val-v-{ver}.{ext}"
test_blob_name = f"{model}-full_2010_2015-test-v-{ver}.{ext}"

# Download full data
container = "all-sites-data"
blob_name = "full_2010_2015_v_mvp_raw.parquet"
# NOTE: local_file is defined here but is not passed to get_raw_datasets below.
local_file = os.path.join(tmp_dir, blob_name)

# get_raw_datasets comes from the project libraries imported above; presumably
# it fetches the blob from the container and returns it as a DataFrame -- TODO confirm.
data_df = get_raw_datasets(container, blob_name)

Data size: (4862712, 51)
Data Columns: Index(['GPP_NT_VUT_REF', 'site_id', 'timestep_idx_local',
       'timestep_idx_global', 'datetime', 'date', 'year', 'month', 'day',
       'hour', 'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA',
       'PA_ERA', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6',
       'b7', 'IGBP', 'lat', 'long', 'koppen_sub', 'koppen_main', 'c3c4',
       'c4_percent', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily',
       'PET', 'Ts', 'ESACCI-sm', 'MODIS_LC', 'NDWI', 'Percent_Snow', 'Fpar',
       'Lai', 'LST_Day', 'LST_Night', 'MODIS_IGBP', 'MODIS_PFT',
       'gap_flag_hour', 'gap_flag_month'],
      dtype='object')
NA count: 0


## Load Train/Val/Test Split

In [7]:
# Read the stratified split dictionary written by preprocessing/stratified-splits.ipynb.
split_dict = joblib.load(split_dict_filename)

# Map folds onto train/val/test (for MVP): one fold is validation, one is
# test, and every remaining fold is used for training.
val_fold = split_dict['VAL_INDEX']
test_fold = split_dict['TEST_INDEX']
train_folds = [
    fold
    for fold in range(1, split_dict['NUM_FOLDS'] + 1)
    if fold not in (val_fold, test_fold)
]
# Flatten the per-fold site lists of the training folds into a single list.
train_sites = [site for fold in train_folds for site in split_dict[f"fold_{fold}"]]
val_sites = split_dict[f"fold_{val_fold}"]
test_sites = split_dict[f"fold_{test_fold}"]

# Report absolute and relative split sizes.
total_sites = sum(len(sites) for sites in (train_sites, val_sites, test_sites))
print(f"# train/val/test sites: {len(train_sites)}/{len(val_sites)}/{len(test_sites)}")
print(f"% train/val/test sites: {len(train_sites)/total_sites:.3f}/{len(val_sites)/total_sites:.3f}/{len(test_sites)/total_sites:.3f}")

# train/val/test sites: 78/26/25
% train/val/test sites: 0.605/0.202/0.194


## Data Transformation

In [8]:
# Build the transformer from the frame already in memory: no raw file path is
# supplied, and `data_df` is passed in together with the site split and the
# preprocessing-objects directory.
model_name = 'tft-global-scaler'
raw_data_file_path = None
data_transformer = TFTDataTransformer(
    train_sites,
    val_sites,
    test_sites,
    model_name,
    raw_data_file_path,
    data_df,
    preproc_objects_dir,
)

Data size: (4862712, 50).


In [9]:
# Columns handed to the transformer as categorical features.
categorical_cols = [
    'c3c4', 'koppen_sub', 'koppen_main',
    'year', 'month', 'day', 'hour',
    'MODIS_PFT', 'MODIS_LC', 'MODIS_IGBP', 'IGBP',
]

# Columns handed to the transformer as real-valued features.
realNum_cols = [
    'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
    'EVI', 'NDVI', 'NIRv',
    'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
    'c4_percent',
    'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN',
    'CSIF-SIFdaily', 'PET', 'Ts', 'ESACCI-sm',
    'NDWI', 'Percent_Snow', 'Fpar', 'Lai',
    'LST_Day', 'LST_Night',
]

# Run the transformation, requesting the 'label' categorical encoding type.
data_transformer.data_transform(
    categorical_cols,
    realNum_cols,
    cat_encode_type='label',
)

# Show the first two rows of the resulting training frame.
print("\nTrain data peak:")
display(data_transformer.train_df.head(2))

Data size: (4862712, 50).
Data size after encoding: (4862712, 50)
Number of sites in df: 129
Train Sites: ['AR-SLu' 'AR-Vir' 'AT-Neu' 'AU-ASM' 'AU-Cpr' 'AU-Cum' 'AU-DaS' 'AU-RDF'
 'AU-TTE' 'AU-Wom' 'BE-Bra' 'BE-Dor' 'BE-Lon' 'BE-Vie' 'CA-Ca2' 'CA-Ca3'
 'CA-Cbo' 'CA-TP1' 'CA-TP3' 'CA-TP4' 'CA-TPD' 'CH-Cha' 'CH-Lae' 'CN-Sw2'
 'CZ-BK1' 'CZ-BK2' 'CZ-KrP' 'CZ-RAJ' 'CZ-Stn' 'DE-Geb' 'DE-HoH' 'DE-Hte'
 'DE-Kli' 'DE-Obe' 'DE-RuR' 'DE-SfN' 'DE-Spw' 'ES-LJu' 'ES-LM2' 'FI-Let'
 'FR-Fon' 'FR-Lam' 'FR-Pue' 'IL-Yat' 'IT-CA1' 'IT-CA3' 'IT-Lav' 'IT-Noe'
 'IT-Ro2' 'IT-SRo' 'IT-Tor' 'NL-Hor' 'SE-Htm' 'SE-Lnn' 'SE-Ros' 'US-AR2'
 'US-ARM' 'US-CRT' 'US-Fmf' 'US-IB2' 'US-KFS' 'US-Me6' 'US-Myb' 'US-Prr'
 'US-Ro1' 'US-Rws' 'US-SRG' 'US-SRM' 'US-Syv' 'US-Tw4' 'US-UMB' 'US-UMd'
 'US-Vcm' 'US-Vcp' 'US-WCr' 'US-WPT' 'US-Wjs' 'US-Wkg']
Val Sites: ['AU-DaP' 'AU-Emr' 'AU-Gin' 'AU-How' 'AU-Rig' 'CA-Ca1' 'CA-Gro' 'CH-Fru'
 'CH-Oe2' 'DE-Hai' 'DK-Sor' 'FI-Hyy' 'FR-Aur' 'FR-Hes' 'GF-Guy' 'IT-SR2'
 'NL-Loo' 'SE-Deg' 'SE-N

Unnamed: 0,GPP_NT_VUT_REF,site_id,timestep_idx_local,timestep_idx_global,datetime,year,month,day,hour,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,IGBP,lat,long,koppen_sub,koppen_main,c3c4,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,ESACCI-sm,MODIS_LC,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,MODIS_IGBP,MODIS_PFT,gap_flag_hour,gap_flag_month
0,-0.13057,AR-SLu,0,350640,2010-01-01 00:00:00,0,0,0,0,1.3085,-0.69453,0.82552,0.3132,1.23318,0.00304,-0.09129,-0.01868,-0.24579,-0.24943,-0.43443,-0.24622,-0.21851,0.37241,0.72035,0.23276,6,-33.4648,-66.4598,11,1,0,2.41198,1.99682,0.64589,1.9599,0.4257,-1.0436,1.76195,-1.54503,5,-0.54594,-0.4432,0.1372,-0.10592,1.67861,1.73361,7,8,0.0,0.0
1,-0.66336,AR-SLu,1,350641,2010-01-01 01:00:00,0,0,0,1,1.26106,-0.69453,0.82552,0.21833,-0.2331,0.00403,-0.09129,-0.01868,-0.24579,-0.24943,-0.43443,-0.24622,-0.21851,0.37241,0.72035,0.23276,6,-33.4648,-66.4598,11,1,0,2.41198,1.99682,0.64589,1.9599,0.4257,-1.0436,1.76195,-1.54503,5,-0.54594,-0.4432,0.1372,-0.10592,1.67861,1.73361,7,8,0.0,0.0


## Save Out

In [11]:
# Switch for the upload step: set to False to run the notebook without
# uploading the train/val/test datasets to the Azure container.
final_checkpoint = True

if final_checkpoint:
    data_transformer.upload_train_test_to_azure(
        az_cred_file,
        container,
        train_blob_name,
        val_blob_name,
        test_blob_name,
    )

Uploading train dataset to tft-full_2010_2015-train-v-global-scaler.parquet...
File uploaded to all-sites-data/tft-full_2010_2015-train-v-global-scaler.parquet
Uploading val dataset to tft-full_2010_2015-val-v-global-scaler.parquet...
File uploaded to all-sites-data/tft-full_2010_2015-val-v-global-scaler.parquet
Uploading test dataset to tft-full_2010_2015-test-v-global-scaler.parquet...
File uploaded to all-sites-data/tft-full_2010_2015-test-v-global-scaler.parquet
