# Notebook Setup

In [9]:
# TODO: point this at your local checkout of the repository, either by setting
# the CO2_FLUX_HOME environment variable before starting the kernel or by
# editing the fallback path below
# (e.g. "/Users/<you>/.../co2-flux-hourly-gpp-modeling").
import os  # imported here because this cell runs before the main import cell

# The fallback keeps the previous hard-coded value, so existing setups behave the same.
MY_HOME_ABS_PATH = os.environ.get("CO2_FLUX_HOME", "/root/co2-flux-hourly-gpp-modeling")

## Import Modules

In [18]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import sys
import math
import json
import time
import pandas as pd
import numpy as np
from calendar import monthrange
from datetime import datetime
from io import BytesIO
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV


# Load local custom modules.
# The working directory is switched to the repository root first so that the
# relative tools path below resolves against it.
os.chdir(MY_HOME_ABS_PATH)
tools_dir = os.path.abspath("./code/src/tools")
sys.path.append(tools_dir)

from CloudIO.AzStorageClient import AzStorageClient
# NOTE(review): wildcard import hides which names this notebook actually uses
# from data_pipeline_lib — consider importing them explicitly.
from data_pipeline_lib import *

# Pandas display settings: up to 100 rows, every column, floats with 5 decimals.
for option_name, option_value in (
    ('display.max_rows', 100),
    ('display.max_columns', None),
    ('display.float_format', lambda x: '%.5f' % x),
):
    pd.set_option(option_name, option_value)

## Define Local File System Constants

In [11]:
# Local directory layout, all rooted at the repository checkout.
# os.path.join is used instead of manual os.sep concatenation so that a root
# path ending in a separator does not yield doubled separators.
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.path.join(root_dir, '.tmp')
raw_data_dir = tmp_dir  # raw data directory is the same location as tmp_dir
data_dir = os.path.join(root_dir, 'data')
cred_dir = os.path.join(root_dir, '.cred')
az_cred_file = os.path.join(cred_dir, 'azblobcred.json')  # passed to AzStorageClient when loading data

# Load Train, Validation, and Test Datasets from Azure Blob Storage

In [12]:
# Identify the dataset: storage container plus the pieces of the blob names.
container = "all-sites-data"
ext = "parquet"
ver = "mvp"
model = "rfr"

# One blob per split; the names differ only in the split label.
train_blob_name, val_blob_name, test_blob_name = (
    f"{model}-full_2010_2015-{split}-v-{ver}.{ext}"
    for split in ("train", "val", "test")
)

In [19]:
# Show the column dtypes of the training split.
# NOTE(review): `train_df` is first assigned in the "Load splits from Azure"
# cell further down, so on a fresh top-to-bottom run this cell raises
# NameError — it should be moved after that cell.
train_df.dtypes

GPP_NT_VUT_REF                float64
site_id                        object
timestep_idx_local              int64
timestep_idx_global             int64
datetime               datetime64[ns]
index                           int64
TA_ERA                        float64
SW_IN_ERA                     float64
LW_IN_ERA                     float64
VPD_ERA                       float64
P_ERA                         float64
PA_ERA                        float64
EVI                           float64
NDVI                          float64
NIRv                          float64
b1                            float64
b2                            float64
b3                            float64
b4                            float64
b5                            float64
b6                            float64
b7                            float64
lat                           float64
long                          float64
c4_percent                    float64
BESS-PAR                      float64
BESS-PARdiff

In [13]:
# Fetch the three splits from Azure and parse each one into a DataFrame.
azStorageClient = AzStorageClient(az_cred_file)


def _load_split(blob_name):
    """Download one blob from `container` and read it as parquet.

    Returns a `(stream, dataframe)` pair: the object returned by
    `downloadBlob2Stream` and the DataFrame parsed from it with pyarrow.
    """
    stream = azStorageClient.downloadBlob2Stream(container, blob_name)
    return stream, pd.read_parquet(stream, engine='pyarrow')


# Each split is downloaded and parsed before the next download starts.
train_fs, train_df = _load_split(train_blob_name)
val_fs, val_df = _load_split(val_blob_name)
test_fs, test_df = _load_split(test_blob_name)

# Train RFR Model

In [20]:
# Split every dataset into a feature matrix (X) and a target vector (y).
target_variable = 'GPP_NT_VUT_REF'
drop_cols = [
    'site_id', 'timestep_idx_local', 'timestep_idx_global', 'index', 'datetime',
    'lat', 'long', 'gap_flag_hour', 'gap_flag_month',
]

# Columns excluded from the features: the target itself plus the columns listed in drop_cols.
non_feature_cols = [target_variable] + drop_cols

X_train, y_train = train_df.drop(columns=non_feature_cols), train_df[target_variable]
X_val, y_val = val_df.drop(columns=non_feature_cols), val_df[target_variable]
X_test, y_test = test_df.drop(columns=non_feature_cols), test_df[target_variable]

# Optional alternative (disabled): merge train and val into one set for CV.
# X_train = pd.concat([X_train, X_val], axis=0)
# y_train = pd.concat([y_train, y_val], axis=0)

In [None]:
# Hyper-parameter candidates explored by the grid search (2 x 2 x 3 = 12 combinations).
param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[5, 10],
    max_features=[1.0, 'sqrt', 'log2'],
)

# Tune a random forest with 4-fold cross-validation on the training split,
# timing model construction plus the whole search.
# NOTE(review): the saved output under this cell reports 5 folds, which
# disagrees with cv=4 here — the stored output looks stale.
start_time = time.time()
rf_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=4,
    n_jobs=6,
    verbose=1,
)
grid_search.fit(X_train, y_train)
end_time = time.time()

print(f"Elapsed time: {end_time - start_time:.2f} seconds")

Fitting 5 folds for each of 12 candidates, totalling 60 fits


## Evaluate RFR on Test Set

In [None]:
# Predict on the test set with the tuned model.
# GridSearchCV fits clones of the estimator it is given, so `rf_model` itself
# is never fitted and calling predict on it would raise NotFittedError. With
# the default refit=True, the best model retrained on the full training data
# is available as `grid_search.best_estimator_`.
y_pred = grid_search.best_estimator_.predict(X_test)

In [None]:
# Score the test-set predictions: root-mean-squared error, mean absolute error, and R^2.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report each metric on its own line.
for metric_label, metric_value in (("RMSE", rmse), ("MAE", mae), ("R2", r2)):
    print(f"Test {metric_label}: {metric_value}")

In [None]:
from hydroeval import nse

# Nash-Sutcliffe efficiency computed by hand: 1 - MSE / variance of the observations.
# Stored under its own name so it is not overwritten by hydroeval's `nse` function.
nse_manual = 1 - (mean_squared_error(y_test, y_pred) / np.var(y_test))
print(nse_manual)

# Cross-check with hydroeval. Its signature is nse(simulations, evaluation),
# so the predictions go first and the observed values second; NSE is not
# symmetric, so swapping them gives a different (wrong) score.
nse_value = nse(np.array(y_pred), np.array(y_test))
print(nse_value)