Note: 32 GB of RAM is needed to run the 5-year prediction.

# 0. Preparation

## (Optional) Mount Google Drive
- Make sure that the access granted is to the user's own drive only (no access to files in shared folders).

In [None]:
# True when the active IPython shell reports itself as a Google Colab shell.
IN_COLLAB = 'google.colab' in str(get_ipython())

if not IN_COLLAB:
    # Project root used when running outside Colab.
    MY_HOME_ABS_PATH = "/root/co2-flux-hourly-gpp-modeling/"
else:
    #TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
    MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"
    # Mount Google Drive so the project folder above becomes reachable.
    from google.colab import drive
    drive.mount('/content/drive/')

## Import libraries

In [None]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")

from io import BytesIO
import numpy as np
import pandas as pd

import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor

from timeit import default_timer
from datetime import datetime
import gc
import pickle

# Make the project's local custom modules importable.
# The working directory is switched first so the relative tools path
# below resolves against the project root.
os.chdir(MY_HOME_ABS_PATH)
_tools_dir = os.path.abspath("./code/src/tools")
if not IN_COLLAB:
    sys.path.append(_tools_dir)
else:
    # On Colab the settings are also exported as environment variables and
    # the tools directory is placed at the front of the import path.
    os.environ.update({"IN_COLLAB": "true", "MY_HOME_ABS_PATH": MY_HOME_ABS_PATH})
    sys.path.insert(0, _tools_dir)

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *
from model_pipeline_lib import *

# Lift pandas' display limits (rows, columns, total width, cell width).
for _display_opt in ('display.max_rows', 'display.max_columns',
                     'display.width', 'display.max_colwidth'):
    pd.set_option(_display_opt, None)
# Render floats with five decimal places.
pd.set_option('display.float_format', lambda value: '%.5f' % value)
# NOTE(review): `pl` is not imported explicitly in this notebook; it is
# presumably provided by one of the star imports above. Confirm.
pl.seed_everything(42)

2023-04-02 23:21:13.954707: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-02 23:21:13.957010: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-04-02 23:21:14.006146: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-04-02 23:21:14.006944: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Global seed set to 42
Global seed set to 42


42

# File System Definitions

In [10]:
# Local directories and credential file, all derived from the project root.
# os.path.join is used instead of manual `+ os.sep +` concatenation so that a
# root path ending in a separator does not produce a doubled separator
# (e.g. ".../co2-flux-hourly-gpp-modeling//.tmp").
root_dir  = MY_HOME_ABS_PATH
tmp_dir   = os.path.join(root_dir, '.tmp')
model_dir = os.path.join(root_dir, 'data', 'models')
cred_dir  = os.path.join(root_dir, '.cred')
az_cred_file = os.path.join(cred_dir, 'azblobcred.json')

# Input: blob container and the raw TFT dataset stored in it
container = "all-sites-data"
tft_raw_blob_name = "full_2010_2015_v_mvp_raw.parquet"

# Output: version and extension are embedded in the generated blob names
ver = "1"
ext = "parquet"

rfr_raw_blob_name = f"hybrid-2010-2015-rfr-transformed-v{ver}.{ext}" # Transformed TFT data for RFR prediction
hybrid_raw_blob_name = f"hybrid-2010-2015-raw-v{ver}.{ext}" # TFT + RFR pred

# Data Transform for RFR Predictions


In [7]:
# Column holding the value the model predicts.
target_variable = "GPP_NT_VUT_REF"

# Categorical feature columns.
categorical_cols = ["c3c4", "MODIS_PFT", "MODIS_IGBP"]

# Real-valued feature columns. The order is kept exactly as originally
# written because the list is handed to a pre-fitted scaler as a column
# selection.
realNum_cols = [
    "TA_ERA", "SW_IN_ERA", "LW_IN_ERA", "VPD_ERA", "P_ERA", "PA_ERA",
    "EVI", "NDVI", "NIRv",
    "b1", "b2", "b3", "b4", "b5", "b6", "b7",
    "c4_percent",
    "BESS-PAR", "BESS-PARdiff", "BESS-RSDN",
    "CSIF-SIFdaily",
    "PET", "Ts", "ESACCI-sm", "NDWI", "Percent_Snow", "Fpar", "Lai",
    "LST_Day", "LST_Night",
]

def GetTransformedTftDataforRfrPredict(cat_cols, real_cols):
    """Load the raw TFT dataset and transform it into RFR prediction input.

    The categorical columns are replaced by dummy (one-hot) columns and the
    real-valued columns are overwritten with the output of a scaler loaded
    from `scaler_rfr-mvp.joblib`. The function reads the notebook-level
    names `container`, `tft_raw_blob_name` and `root_dir`.

    Args:
        cat_cols: Names of the categorical columns to dummy-encode.
        real_cols: Names of the numeric columns passed to the scaler.

    Returns:
        The transformed DataFrame, with its index reset to a fresh default
        index.
    """
    # Fetch the raw dataset from Azure
    frame = get_raw_datasets(container, tft_raw_blob_name)

    # Restore the saved scaler object
    scaler_path = os.sep.join([
        root_dir,
        'code/src/preprocessing/preproc_objects',
        'scaler_rfr-mvp.joblib',
    ])
    fitted_scaler = joblib.load(scaler_path)
    print(fitted_scaler.mean_)

    # Categorical features: encode, drop the originals, append the dummies.
    # The frame is rebound before concatenating so the raw frame is not kept
    # alive longer than necessary.
    one_hot = pd.get_dummies(frame[cat_cols])
    frame = frame.drop(columns=cat_cols)
    frame = pd.concat([frame, one_hot], axis=1)
    print(f"Data size after encoding: {frame.shape}")

    # Numerical features: replace the values with their scaled counterparts
    scaled_values = fitted_scaler.transform(frame[real_cols])
    frame.loc[:, real_cols] = scaled_values
    print(f"Data size: {frame.shape}")

    # Discard the old index in place
    frame.reset_index(drop=True, inplace=True)
    print(f"Data size: {frame.shape}")

    return frame

In [41]:
# Switch: set to True to rebuild the transformed dataset and upload it.
generate_rfr_dataset = False

if generate_rfr_dataset:
    # Build the transformed feature table from the raw TFT data
    data_df = GetTransformedTftDataforRfrPredict(
        cat_cols=categorical_cols,
        real_cols=realNum_cols,
    )

    # Write the frame to an in-memory parquet buffer, then rewind the buffer
    # so the upload reads it from the beginning
    parquet_file = BytesIO()
    data_df.to_parquet(parquet_file, engine="pyarrow")
    parquet_file.seek(0)

    # Upload the buffer under the transformed-dataset blob name
    azStorageClient = AzStorageClient(az_cred_file)
    azStorageClient.uploadBlob(
        container, rfr_raw_blob_name, parquet_file, overwrite=True
    )

Data size: (4862712, 51)
Data Columns: Index(['GPP_NT_VUT_REF', 'site_id', 'timestep_idx_local',
       'timestep_idx_global', 'datetime', 'date', 'year', 'month', 'day',
       'hour', 'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA',
       'PA_ERA', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6',
       'b7', 'IGBP', 'lat', 'long', 'koppen_sub', 'koppen_main', 'c3c4',
       'c4_percent', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily',
       'PET', 'Ts', 'ESACCI-sm', 'MODIS_LC', 'NDWI', 'Percent_Snow', 'Fpar',
       'Lai', 'LST_Day', 'LST_Night', 'MODIS_IGBP', 'MODIS_PFT',
       'gap_flag_hour', 'gap_flag_month'],
      dtype='object')
NA count: 0
[ 1.09206415e+01  1.65476230e+02  3.15006423e+02  5.93380537e+00
  5.17308451e-02  9.48886824e+01  3.05767371e-01  5.34209899e-01
  1.38198560e-01  9.41709227e-02  2.62089304e-01  6.46183239e-02
  9.07462739e-02  2.56177452e-01  1.81825343e-01  1.07663278e-01
  1.10274090e+01  7.41909515e+01  3.06207843e+01  1

# Get RFR Predictions

In [8]:
# Load model
model_objects_dir = os.path.join(root_dir, 'code', 'src', 'modeling', 'model_objects')
model_save_path = os.path.join(model_objects_dir, 'rfr_mvp_v3_top7_features.pkl')
model = joblib.load(model_save_path)
print(f"Model Features: {model.feature_names_in_}\n")

# Load transformed TFT data from Blob
data_df = get_raw_datasets(container, rfr_raw_blob_name)

# Run RFR Predictions on exactly the features the model was fitted with
X_data = data_df[model.feature_names_in_]
y_actual = data_df[target_variable]
y_pred = model.predict(X_data)
print(f"# of data count: actual[{len(y_actual)}], predicted[{len(y_pred)}]")

# Evaluate predictions
rmse = np.sqrt(mean_squared_error(y_actual, y_pred))
mae = mean_absolute_error(y_actual, y_pred)
r2 = r2_score(y_actual, y_pred)
print(f"RMSE: {rmse}, MAE: {mae}, R2/NSE: {r2}")

# Cache the predictions locally under a timestamped file name
toCache = True
if toCache:
    # The temporary directory is not created anywhere else in this notebook;
    # create it here so open() does not fail on a fresh checkout.
    os.makedirs(tmp_dir, exist_ok=True)
    ts = datetime.now().strftime("%y%m%d_%H%M")
    rfr_pred_file = os.path.join(tmp_dir, f"rfr_predict_{ts}.pkl")
    with open(rfr_pred_file, "wb") as fout:
        pickle.dump(y_pred, fout)
    # Report only after the dump has completed successfully
    print(f"RFR prediction result temporary saved to {rfr_pred_file}.")

Model Features: ['SW_IN_ERA' 'hour' 'VPD_ERA' 'NIRv' 'NDVI' 'EVI' 'TA_ERA']

Data size: (4862712, 75)
Data Columns: Index(['GPP_NT_VUT_REF', 'site_id', 'timestep_idx_local',
       'timestep_idx_global', 'datetime', 'date', 'year', 'month', 'day',
       'hour', 'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA',
       'PA_ERA', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6',
       'b7', 'IGBP', 'lat', 'long', 'koppen_sub', 'koppen_main', 'c4_percent',
       'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily', 'PET', 'Ts',
       'ESACCI-sm', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai', 'LST_Day',
       'LST_Night', 'gap_flag_hour', 'gap_flag_month', 'MODIS_LC', 'c3c4_C3',
       'c3c4_C4', 'c3c4_mix', 'c3c4_rotation', 'c3c4_unknown', 'MODIS_PFT_CRO',
       'MODIS_PFT_DBF', 'MODIS_PFT_EBF', 'MODIS_PFT_ENF', 'MODIS_PFT_GRA',
       'MODIS_PFT_MF', 'MODIS_PFT_Other', 'MODIS_PFT_SA', 'MODIS_PFT_SH',
       'MODIS_IGBP_CRO', 'MODIS_IGBP_CSH', 'MODIS_IGBP_DBF', 'MODIS_IGB

[Parallel(n_jobs=48)]: Using backend ThreadingBackend with 48 concurrent workers.
[Parallel(n_jobs=48)]: Done   6 out of  50 | elapsed:    0.7s remaining:    5.0s
[Parallel(n_jobs=48)]: Done  50 out of  50 | elapsed:    1.1s finished


# of data count: actual[4862712], predicted[4862712]
RMSE: 3.5836375355212176, MAE: 1.9214436397874508, R2/NSE: 0.7242334531705916
RFR prediction result temporary saved to /root/co2-flux-hourly-gpp-modeling/.tmp/rfr_predict_230402_2323.pkl.


# Merge RFR prediction result with TFT data

In [11]:
# Load TFT data
target_variable = 'GPP_NT_VUT_REF'
data_df = get_raw_datasets(container, tft_raw_blob_name)

# Load RFR Prediction
# The cache file is located relative to the project's temporary directory
# rather than through a hard-coded absolute path, so the cell also works when
# the project root differs (e.g. on Colab).
rfr_pred_file = os.path.join(tmp_dir, "rfr_predict_230402_2323.pkl") # TODO: Update if needed
with open(rfr_pred_file, "rb") as fin:
    print(f"Load RFR prediction result from {rfr_pred_file}.")
    # NOTE: pickle.load can execute arbitrary code; only load cache files
    # that were produced by this notebook.
    rfr_pred = pickle.load(fin)

# Merge data
# The predictions are attached by position, so they must cover every row of
# the TFT data. NOTE(review): this also assumes both datasets are in the same
# row order - the sanity check below guards against a mismatch.
if len(rfr_pred) != len(data_df):
    raise ValueError(
        f"RFR prediction count ({len(rfr_pred)}) does not match "
        f"TFT data row count ({len(data_df)})."
    )
data_df['rfr_pred_gpp'] = rfr_pred

# Sanity Check: the metrics should match those reported at prediction time
# RMSE: 3.5836375355212176, MAE: 1.9214436397874508, R2/NSE: 0.7242334531705916
rmse = np.sqrt(mean_squared_error(data_df[target_variable], data_df['rfr_pred_gpp']))
mae = mean_absolute_error(data_df[target_variable], data_df['rfr_pred_gpp'])
r2 = r2_score(data_df[target_variable], data_df['rfr_pred_gpp'])
print(f"RMSE: {rmse}, MAE: {mae}, R2/NSE: {r2}")

# Upload to cloud
toUpload = True
if toUpload:
    # Serialize to an in-memory parquet buffer and rewind it before uploading
    parquet_file = BytesIO()
    data_df.to_parquet(parquet_file, engine='pyarrow')
    parquet_file.seek(0)

    azStorageClient = AzStorageClient(az_cred_file)
    azStorageClient.uploadBlob(container, hybrid_raw_blob_name, parquet_file, overwrite=True)

Data size: (4862712, 51)
Data Columns: Index(['GPP_NT_VUT_REF', 'site_id', 'timestep_idx_local',
       'timestep_idx_global', 'datetime', 'date', 'year', 'month', 'day',
       'hour', 'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA',
       'PA_ERA', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6',
       'b7', 'IGBP', 'lat', 'long', 'koppen_sub', 'koppen_main', 'c3c4',
       'c4_percent', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily',
       'PET', 'Ts', 'ESACCI-sm', 'MODIS_LC', 'NDWI', 'Percent_Snow', 'Fpar',
       'Lai', 'LST_Day', 'LST_Night', 'MODIS_IGBP', 'MODIS_PFT',
       'gap_flag_hour', 'gap_flag_month'],
      dtype='object')
NA count: 0
Load RFR prediction result from /root/co2-flux-hourly-gpp-modeling/.tmp/rfr_predict_230402_2323.pkl.
RMSE: 3.5836375355212176, MAE: 1.9214436397874508, R2/NSE: 0.7242334531705916
File uploaded to all-sites-data/hybrid-2010-2015-raw-v1.parquet
