# 0. Preparation

## (Optoinal) mount google drive
- Make sure that available access is the user's own drive(no access across files in shared folder)

In [28]:
if 'google.colab' in str(get_ipython()):
    IN_COLLAB = True
else:
    IN_COLLAB = False

if IN_COLLAB:
    #TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
    MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"
    from google.colab import drive
    drive.mount('/content/drive/')
else:
    MY_HOME_ABS_PATH = "/root/co2-flux-hourly-gpp-modeling/"

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## pip install

In [44]:
! pip install pytorch-forecasting azure-storage-blob joblib 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Import libraries

In [40]:
import os
import sys
import warnings
warnings.filterwarnings("ignore")

from io import BytesIO
import numpy as np
import pandas as pd

import sklearn
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

from timeit import default_timer
from datetime import datetime
import gc
import pickle

# Load locale custome modules
os.chdir(MY_HOME_ABS_PATH)
if IN_COLLAB:
     os.environ["IN_COLLAB"] = "true"
     os.environ["MY_HOME_ABS_PATH"] = MY_HOME_ABS_PATH
     sys.path.insert(0,os.path.abspath("./code/src/tools"))
else:
    sys.path.append(os.path.abspath("./code/src/tools"))

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *
from model_pipeline_lib import *

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pl.seed_everything(42)

INFO:lightning_fabric.utilities.seed:Global seed set to 42


42

# Load data from Azure blob

In [31]:
# Download full data
root_dir  = MY_HOME_ABS_PATH
tmp_dir   = root_dir + os.sep + '.tmp'
model_dir = root_dir + os.sep + 'data' + os.sep + 'models'
cred_dir = root_dir + os.sep + '.cred'
az_cred_file = cred_dir + os.sep + 'azblobcred.json'

container = "all-sites-data"
tft_raw_blob_name = "full_2010_2015_v_mvp_raw.parquet"

# output
ver = "0"
ext = "parquet"

rfr_raw_blob_name = f"hybrid-2010-2015-rfr-transformed-v{ver}.{ext}" # Transformed TFT data for RFR prediction
hybrid_raw_blob_name = f"hybrid-2010-2015-raw-v{ver}.{ext}" # TFT + RFR pred

# Data Transform for RFR Predictions


In [52]:
# Load data from Azure
data_df = get_raw_datasets(container, tft_raw_blob_name)

# Load scalers
preproc_objects_dir = root_dir + os.sep + 'code/src/preprocessing/preproc_objects'
preproc_save_path = preproc_objects_dir + os.sep + 'scaler_rfr-mvp.joblib'
with open(preproc_save_path,'rb') as fin:
    scaler=pickle.load(fin)

Data size: (4862712, 51)
Data Columns: Index(['GPP_NT_VUT_REF', 'site_id', 'timestep_idx_local',
       'timestep_idx_global', 'datetime', 'date', 'year', 'month', 'day',
       'hour', 'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA',
       'PA_ERA', 'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6',
       'b7', 'IGBP', 'lat', 'long', 'koppen_sub', 'koppen_main', 'c3c4',
       'c4_percent', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily',
       'PET', 'Ts', 'ESACCI-sm', 'MODIS_LC', 'NDWI', 'Percent_Snow', 'Fpar',
       'Lai', 'LST_Day', 'LST_Night', 'MODIS_IGBP', 'MODIS_PFT',
       'gap_flag_hour', 'gap_flag_month'],
      dtype='object')
NA count: 0


In [53]:
categorical_cols = ['c3c4', 'koppen_sub', 'koppen_main',
                    'year', 'month', 'day', 'hour', 'MODIS_PFT', 'MODIS_LC', 'MODIS_IGBP']
realNum_cols = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA', 
                'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
                'c4_percent', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily',
                'PET', 'Ts', 'ESACCI-sm', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai',
                'LST_Day', 'LST_Night']

# Transform numerical features
data_df.loc[:,realNum_cols] = scaler.transform(data_df[realNum_cols])
print(f"Data size: {data_df.shape}")

# Transform categorical features
dummy_df = pd.get_dummies(data_df[categorical_cols])
data_df = data_df.drop(columns=categorical_cols)
data_df = pd.concat([data_df, dummy_df], axis=1)
print(f"Data size after encoding: {data_df.shape}")

data_df.reset_index(inplace=True, drop=True)
print(f"Data size: {data_df.shape}")

AttributeError: ignored

In [None]:
data_cleanup_checkpoint = True

if data_cleanup_checkpoint:
  parquet_file = BytesIO()
  data_df.to_parquet(parquet_file, engine='pyarrow')
  parquet_file.seek(0)

  azStorageClient = AzStorageClient(az_cred_file)
  azStorageClient.uploadBlob(container, rfr_raw_blob_name, parquet_file, overwrite=True)

# Get RFR Predictions

In [9]:
# Load model
model_objects_dir = root_dir + os.sep + 'code/src/modeling/model_objects'
model_save_path = os.path.join(model_objects_dir, 'rfr_mvp_tuned.pkl')
with open(model_save_path, "rb") as fin:
  model = pickle.load(fin)

# Load file 
data_df = get_raw_datasets(container, rfr_raw_blob_name)

In [None]:
# Run RFR Predictions
target_variable = 'GPP_NT_VUT_REF'
drop_cols = ['site_id', 'timestep_idx_local', 'timestep_idx_global', 'index', 'datetime','gap_flag_hour', 'gap_flag_month']

X_data = data_df.drop([target_variable] + drop_cols, axis=1)
y_actual = data_df[target_variable]
y_pred = model.predict(X_data)
print(f"# of data count: actual[{len(y_actual)}], predicted[{len(y_pred)}]")

toCache = True
if toCache:
  ts = datetime.now().strftime("%y%m%d_%H%M")
  rfr_pred_file = tmp_dir + os.sep + f"rfr_predict_{ts}.pkl"
  with open(rfr_pred_file, "wb") as fout:
      print(f"RFR prediction result temporary saved to {rfr_pred_file}.")
      pickle.dump(y_pred, fout)

# Merge RFR prediction result with TFT data

In [None]:
# Load TFT data
data_df = get_raw_datasets(container, tft_raw_blob_name)

# Load RFR Prediction
rfr_pred_file = "" # TODO: Update if needed
with open(rfr_pred_file, "rb") as fin:
    print(f"Load RFR prediction result from {rfr_pred_file}.")
    rfr_pred = pickle.load(fin)

# Merge data
data_df['rfr_pred_gpp'] = rfr_pred

# Upload to cloud
toUpload = True
if toUpload:
  parquet_file = BytesIO()
  data_df.to_parquet(parquet_file, engine='pyarrow')
  parquet_file.seek(0)

  azStorageClient = AzStorageClient(az_cred_file)
  azStorageClient.uploadBlob(container, hybrid_raw_blob_name, parquet_file, overwrite=True)