Note: Running the 5-year prediction requires 32 GB of RAM.

# 0. Preparation

## (Optional) Mount Google Drive
- Make sure that the granted access is limited to the user's own drive (no access to files in shared folders)

In [2]:
# Runtime detection: the flag is True when the string form of the IPython
# shell object contains 'google.colab'.
IN_COLLAB = 'google.colab' in str(get_ipython())

if not IN_COLLAB:
    # Project root used outside of Colab.
    MY_HOME_ABS_PATH = "/root/co2-flux-hourly-gpp-modeling/"
else:
    #TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
    MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"
    # The project root above lives on Google Drive, so the drive is mounted here.
    from google.colab import drive
    drive.mount('/content/drive/')

## Import libraries

In [21]:
if IN_COLLAB:
    !pip install torch pytorch-lightning pytorch_forecasting azure-storage-blob -q
else:
    !pip install xgboost -q

[0m

In [22]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")

from io import BytesIO
import numpy as np
import pandas as pd

import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import xgboost
from xgboost import XGBRegressor, plot_importance
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
print(xgboost.__version__)

from timeit import default_timer
from datetime import datetime
import gc
import pickle

# Load local custom modules.
# The working directory is switched to the project root first, so the relative
# tools path below is resolved against it.
os.chdir(MY_HOME_ABS_PATH)
_tools_dir = os.path.abspath("./code/src/tools")
if not IN_COLLAB:
    sys.path.append(_tools_dir)
else:
    # On Colab the flag and the project root are also published as environment
    # variables, and the tools directory is put at the front of sys.path.
    os.environ["IN_COLLAB"] = "true"
    os.environ["MY_HOME_ABS_PATH"] = MY_HOME_ABS_PATH
    sys.path.insert(0, _tools_dir)

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *
from model_pipeline_lib import *

# Set every pandas display limit used by this notebook to None.
for _display_option in ('display.max_rows', 'display.max_columns',
                        'display.width', 'display.max_colwidth'):
    pd.set_option(_display_option, None)
# Render floats with five decimal places.
pd.set_option('display.float_format', lambda x: '%.5f' % x)
# NOTE(review): `pl` is not imported explicitly in this notebook; presumably it
# is provided by one of the star imports above -- confirm.
pl.seed_everything(42)

Global seed set to 42


1.7.5


42

# File System Definitions

In [16]:
# Directory layout, all derived from the project root.
# os.path.join is used instead of `root + os.sep + ...` so that a root path
# ending in a separator does not produce a doubled separator in derived paths.
root_dir  = MY_HOME_ABS_PATH
tmp_dir   = os.path.join(root_dir, '.tmp')            # local cache for intermediate results
model_dir = os.path.join(root_dir, 'data', 'models')
cred_dir  = os.path.join(root_dir, '.cred')
az_cred_file = os.path.join(cred_dir, 'azblobcred.json')  # handed to AzStorageClient

# input
container = "all-sites-data"
tft_raw_blob_name = "full_2010_2015_v_mvp_raw.parquet"

# output
ver = "0"
ext = "parquet"

xgboost_raw_blob_name = f"hybrid-2010-2015-xgboost-transformed-v{ver}.{ext}" # Transformed TFT data for XGBoost prediction
hybrid_raw_blob_name = f"hybrid-2010-2015-xgboost-raw-v{ver}.{ext}" # TFT + XGBoost pred

# Data Transform for XGBoost Predictions


In [14]:
# Real-valued feature columns. The order is significant and kept exactly as
# before: this list selects the columns that are handed to scaler.transform.
realNum_cols = (
    ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA']
    + ['EVI', 'NDVI', 'NIRv']
    + ['b%d' % band for band in range(1, 8)]          # 'b1' ... 'b7'
    + ['BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily']
    + ['PET', 'Ts', 'ESACCI-sm', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai']
    + ['LST_Day', 'LST_Night']
)

# Name of the column holding the prediction target.
target_variable = 'GPP_NT_VUT_REF'

def GetTransformedTftDataforXgboostPredict( real_cols):
    """Load the raw TFT dataset and scale its real-valued feature columns.

    The raw blob is fetched with `get_raw_datasets`, reduced to the target
    column plus `real_cols`, and the `real_cols` columns are replaced by the
    output of the saved scaler's `transform`.

    Args:
        real_cols: Names of the real-valued columns to keep and scale. The
            order is passed through unchanged to `scaler.transform`.
            NOTE(review): the saved scaler presumably expects the same columns
            and order it was fitted with -- confirm before passing another list.

    Returns:
        A DataFrame holding `target_variable` followed by the scaled
        `real_cols`, with the index reset to 0..n-1.
    """
    feature_cols = list(real_cols)

    # Load data from Azure
    data_df = get_raw_datasets(container, tft_raw_blob_name)

    # Keep the target plus the requested features. The parameter is used here
    # (not the module-level list) so selection and scaling always agree, and
    # .copy() makes the assignment below write to an independent frame instead
    # of a slice of the loaded data.
    data_df = data_df[[target_variable] + feature_cols].copy()

    # Load scalers
    preproc_objects_dir = root_dir + os.sep + 'code/src/preprocessing/preproc_objects'
    preproc_save_path = preproc_objects_dir + os.sep + 'scaler_cv4.joblib'
    scaler = joblib.load(preproc_save_path)
    print(scaler.mean_)

    # Transform numerical features
    data_df.loc[:, feature_cols] = scaler.transform(data_df[feature_cols])

    # Drop the original index so rows are addressed purely by position.
    data_df = data_df.reset_index(drop=True)
    print(f"Data size: {data_df.shape}")

    return data_df

In [17]:
# Switch for (re)building the scaled dataset and publishing it to blob storage.
# When False this cell does nothing.
generate_xgboost_dataset = False

if generate_xgboost_dataset:

    # Step 1: build the scaled frame (target + real-valued features).
    data_df = GetTransformedTftDataforXgboostPredict(real_cols=realNum_cols)

    # Step 2: serialise to parquet in memory, then rewind the buffer so the
    # upload reads it from the beginning.
    parquet_file = BytesIO()
    data_df.to_parquet(parquet_file, engine="pyarrow")
    parquet_file.seek(0)

    # Step 3: upload, replacing any existing blob of the same name.
    azStorageClient = AzStorageClient(az_cred_file)
    azStorageClient.uploadBlob(
        container,
        xgboost_raw_blob_name,
        parquet_file,
        overwrite=True,
    )

Data size: (4862712, 51)
Data Columns: Index(['GPP_NT_VUT_REF', 'site_id', 'timestep_idx_local',
       'timestep_idx_global', 'datetime', 'date', 'year', 'month', 'day',
       'hour', 'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA',
       'PA_ERA', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6',
       'b7', 'IGBP', 'lat', 'long', 'koppen_sub', 'koppen_main', 'c3c4',
       'c4_percent', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily',
       'PET', 'Ts', 'ESACCI-sm', 'MODIS_LC', 'NDWI', 'Percent_Snow', 'Fpar',
       'Lai', 'LST_Day', 'LST_Night', 'MODIS_IGBP', 'MODIS_PFT',
       'gap_flag_hour', 'gap_flag_month'],
      dtype='object')
NA count: 0
[ 1.10715734e+01  1.66943375e+02  3.15434963e+02  6.03055858e+00
  5.10312538e-02  9.49345273e+01  3.04173728e-01  5.32818064e-01
  1.37392661e-01  9.38595694e-02  2.61081768e-01  6.37356429e-02
  8.99631667e-02  2.56365275e-01  1.82869278e-01  1.08856663e-01
  7.46886623e+01  3.07184435e+01  1.63946036e+02  1

# Get XGBoost Predictions

In [28]:
# Load the trained model from the project's model directory
model_objects_dir = os.path.join(model_dir, 'xgboost_best')
model_save_path = os.path.join(model_objects_dir, 'xgboost_best_model.pkl')
model = joblib.load(model_save_path)
print(f"Model Features: {model.feature_names_in_}\n")

# Load transformed TFT data from Blob
data_df = get_raw_datasets(container, xgboost_raw_blob_name)

# Run XGBoost predictions on exactly the features the model reports
X_data = data_df[model.feature_names_in_]
y_actual = data_df[target_variable]
y_pred = model.predict(X_data)
print(f"# of data count: actual[{len(y_actual)}], predicted[{len(y_pred)}]")

# Evaluate predictions
rmse = np.sqrt(mean_squared_error(y_actual, y_pred))
mae = mean_absolute_error(y_actual, y_pred)
r2 = r2_score(y_actual, y_pred)
print(f"RMSE: {rmse}, MAE: {mae}, R2/NSE: {r2}")

# Cache the predictions locally so the merge step can reuse them
toCache = True
if toCache:
    # The cache directory may not exist on a fresh checkout; create it first
    # so open() below does not fail.
    os.makedirs(tmp_dir, exist_ok=True)
    ts = datetime.now().strftime("%y%m%d_%H%M")
    rfr_pred_file = tmp_dir + os.sep + f"xgboost_predict_{ts}.pkl"
    with open(rfr_pred_file, "wb") as fout:
        pickle.dump(y_pred, fout)
    # Report only after the dump has completed.
    print(f"Xgboost prediction result temporary saved to {rfr_pred_file}.")

Model Features: ['NDVI' 'NIRv' 'SW_IN_ERA']

Data size: (4862712, 30)
Data Columns: Index(['GPP_NT_VUT_REF', 'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA',
       'P_ERA', 'PA_ERA', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5',
       'b6', 'b7', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily',
       'PET', 'Ts', 'ESACCI-sm', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai',
       'LST_Day', 'LST_Night'],
      dtype='object')
NA count: 0
# of data count: actual[4862712], predicted[4862712]
RMSE: 3.6117102351957504, MAE: 1.9056476639337077, R2/NSE: 0.7198960539585142
Xgboost prediction result temporary saved to /root/co2-flux-hourly-gpp-modeling/.tmp/xgboost_predict_230410_2233.pkl.


# Merge XGBoost prediction result with TFT data

In [29]:
# Load TFT data
target_variable = 'GPP_NT_VUT_REF'
data_df = get_raw_datasets(container, tft_raw_blob_name)

# Load XGBoost prediction from the local cache directory. The path is derived
# from tmp_dir so it follows the configured project root instead of a
# hard-coded absolute location.
rfr_pred_file = os.path.join(tmp_dir, "xgboost_predict_230410_2233.pkl") # TODO: Update if needed
with open(rfr_pred_file, "rb") as fin:
    print(f"Load XGBoost prediction result from {rfr_pred_file}.")
    # NOTE: pickle.load executes arbitrary code from the file; only load cache
    # files produced by this notebook.
    rfr_pred = pickle.load(fin)

# Merge data. The predictions are attached by position, which relies on the
# transformed dataset having been built from this same raw dataset without
# reordering rows; at minimum the row counts must agree.
if len(rfr_pred) != len(data_df):
    raise ValueError(
        f"Prediction count ({len(rfr_pred)}) does not match data rows ({len(data_df)})"
    )
data_df['xgboost_pred_gpp'] = rfr_pred

# Sanity check: these metrics should match the ones printed when the
# predictions were generated.
rmse = np.sqrt(mean_squared_error(data_df[target_variable], data_df['xgboost_pred_gpp']))
mae = mean_absolute_error(data_df[target_variable], data_df['xgboost_pred_gpp'])
r2 = r2_score(data_df[target_variable], data_df['xgboost_pred_gpp'])
print(f"RMSE: {rmse}, MAE: {mae}, R2/NSE: {r2}")

# Upload to cloud
toUpload = True
if toUpload:
    # Serialise in memory and rewind so the upload reads from the start.
    parquet_file = BytesIO()
    data_df.to_parquet(parquet_file, engine='pyarrow')
    parquet_file.seek(0)

    azStorageClient = AzStorageClient(az_cred_file)
    azStorageClient.uploadBlob(container, hybrid_raw_blob_name, parquet_file, overwrite=True)

Data size: (4862712, 51)
Data Columns: Index(['GPP_NT_VUT_REF', 'site_id', 'timestep_idx_local',
       'timestep_idx_global', 'datetime', 'date', 'year', 'month', 'day',
       'hour', 'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA',
       'PA_ERA', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6',
       'b7', 'IGBP', 'lat', 'long', 'koppen_sub', 'koppen_main', 'c3c4',
       'c4_percent', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily',
       'PET', 'Ts', 'ESACCI-sm', 'MODIS_LC', 'NDWI', 'Percent_Snow', 'Fpar',
       'Lai', 'LST_Day', 'LST_Night', 'MODIS_IGBP', 'MODIS_PFT',
       'gap_flag_hour', 'gap_flag_month'],
      dtype='object')
NA count: 0
Load XGBoost prediction result from /root/co2-flux-hourly-gpp-modeling/.tmp/xgboost_predict_230410_2233.pkl.
RMSE: 3.6117102351957504, MAE: 1.9056476639337077, R2/NSE: 0.7198960539585142
File uploaded to all-sites-data/hybrid-2010-2015-xgboost-raw-v0.parquet
