## Import Modules

In [2]:
!pip install azure-storage-blob==12.14.1

Collecting azure-storage-blob==12.14.1
  Using cached azure_storage_blob-12.14.1-py3-none-any.whl (383 kB)
Collecting msrest>=0.7.1
  Using cached msrest-0.7.1-py3-none-any.whl (85 kB)
Collecting azure-core<2.0.0,>=1.24.2
  Using cached azure_core-1.26.3-py3-none-any.whl (174 kB)
Collecting isodate>=0.6.0
  Using cached isodate-0.6.1-py2.py3-none-any.whl (41 kB)
Collecting requests-oauthlib>=0.5.0
  Using cached requests_oauthlib-1.3.1-py2.py3-none-any.whl (23 kB)
Collecting oauthlib>=3.0.0
  Using cached oauthlib-3.2.2-py3-none-any.whl (151 kB)
Installing collected packages: oauthlib, isodate, requests-oauthlib, azure-core, msrest, azure-storage-blob
Successfully installed azure-core-1.26.3 azure-storage-blob-12.14.1 isodate-0.6.1 msrest-0.7.1 oauthlib-3.2.2 requests-oauthlib-1.3.1
[notice] A new release of pip is available: 23.0 -> 23.0.1
[notice] To update

In [3]:
import os

# Absolute path of the project checkout. Every other path in this notebook is
# derived from it. Set the MY_HOME_ABS_PATH environment variable to run the
# notebook from a different location; the default is the original value.
MY_HOME_ABS_PATH = os.environ.get("MY_HOME_ABS_PATH") or "/root/co2-flux-hourly-gpp-modeling"

In [12]:
import os
import sys
import json
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load local custom modules.
# The search-path entries below are relative, so the working directory is
# switched to the project root before they are registered.
os.chdir(MY_HOME_ABS_PATH)
sys.path.extend([
    './.cred',
    './code/src/tools',
    './code/src/tools/CloudIO',
])

from azure.storage.blob import BlobServiceClient
from CloudIO.AzStorageClient import AzStorageClient

# Load Train and Test datasets from Azure Storage Blob

In [13]:
# Project directory layout; every path hangs off the project root.
root_dir = MY_HOME_ABS_PATH
tmp_dir, data_dir, cred_dir = (
    os.sep.join((root_dir, subdir)) for subdir in ('.tmp', 'data', '.cred')
)
# Raw data shares the temporary directory.
raw_data_dir = tmp_dir
# JSON file with the Azure Blob Storage credentials.
az_cred_file = os.sep.join((cred_dir, 'azblobcred.json'))

In [14]:
# Azure Blob Storage source of the baseline datasets.
# Hint: AzStorageClient.listBlobs(container) -- presumably lists the blobs that
# are available in the container; confirm against the CloudIO module.
cred_file = az_cred_file
container, ext = "baseline-data", "parquet"

# Blob (file) names inside the container.
# NOTE(review): blob_name_val refers to the *test* blob despite its name.
blob_name_train, blob_name_val = (
    "baseline-train-v-1-i-knn.parquet",
    "baseline-test-v-1-i-knn.parquet",
)

In [15]:
def load_blob_dataframe(blob_name, container_name, cache_dir, cred_path):
    """Return a parquet blob as a DataFrame, going through a local file cache.

    The cached copy lives at ``<cache_dir>/<blob_name>``. On a cache miss the
    blob is fetched with ``AzStorageClient.downloadBlob2Stream``, parsed with
    pyarrow and written to the cache so later runs can skip the download.

    Parameters
    ----------
    blob_name : str
        Name of the blob; also used as the cache file name.
    container_name : str
        Container the blob is downloaded from.
    cache_dir : str
        Directory holding cached parquet files; created if missing.
    cred_path : str
        Credential file handed to ``AzStorageClient``.

    Returns
    -------
    pandas.DataFrame
    """
    # os.path.join inserts the separator that plain string concatenation
    # dropped (which produced "<root>/.tmp<blob_name>" outside the cache dir).
    # Files cached at that old location are not picked up and are downloaded
    # again once.
    cache_path = os.path.join(cache_dir, blob_name)
    if os.path.exists(cache_path):
        return pd.read_parquet(cache_path)

    # The cache directory may not exist on a fresh checkout.
    os.makedirs(cache_dir, exist_ok=True)
    storage_client = AzStorageClient(cred_path)
    file_stream = storage_client.downloadBlob2Stream(container_name, blob_name)
    frame = pd.read_parquet(file_stream, engine='pyarrow')
    frame.to_parquet(cache_path)
    return frame


# Parse the storage connection string. A missing credential file is tolerated
# here because cached data can still be loaded without it.
# NOTE(review): blob_svc_client, AccountName and AccountKey are not used by the
# cells visible in this notebook; they are kept in case other cells rely on
# them. AccountKey is a secret -- never print it.
if os.path.exists(cred_file):
    with open(cred_file, "rb") as f:
        connect_str = json.load(f)['connectionstr']
    blob_svc_client = BlobServiceClient.from_connection_string(connect_str)
    for token in connect_str.split(';'):
        # Split on the first '=' only: account keys may end with '=' padding.
        key, sep, value = token.partition('=')
        if key == "AccountName":
            AccountName = value
        elif key == "AccountKey":
            AccountKey = value

# Download (or load from cache) the train and test files.
train_df = load_blob_dataframe(blob_name_train, container, tmp_dir, az_cred_file)
test_df = load_blob_dataframe(blob_name_val, container, tmp_dir, az_cred_file)

print(f"Train size: {train_df.shape}")
print(f"Test size: {test_df.shape}")

Train size: (631032, 49)
Test size: (234888, 49)


In [16]:
## TEMP: Drop gap-filled rows -> Later rerun pipeline to avoid up front
# Keep only the rows whose hourly gap flag is 0; .copy() detaches each result
# from the frame it was filtered from.
train_df, test_df = (
    frame[frame['gap_flag_hour'] == 0].copy()
    for frame in (train_df, test_df)
)
print(f"Train size: {train_df.shape}")
print(f"Test size: {test_df.shape}")

Train size: (606120, 49)
Test size: (223680, 49)


In [17]:
## Drop Cols and Prep Train/Test

target_variable = 'GPP_NT_VUT_REF'
# Columns excluded from the feature matrix. site_id is only dropped
# temporarily: it still needs to be dummy-encoded in the pipeline.
drop_cols = ['gap_flag_hour', 'gap_flag_month', 'datetime', target_variable, 'site_id']


def _features_and_target(frame):
    """Split ``frame`` into (features without ``drop_cols``, target series)."""
    return frame.drop(columns=drop_cols), frame[target_variable]


# Features and target for the training and test data.
X_train, y_train = _features_and_target(train_df)
X_test, y_test = _features_and_target(test_df)

print(f"Train size: {X_train.shape}")
print(f"Test size: {X_test.shape}")

Train size: (606120, 44)
Test size: (223680, 44)


## Fit Model: Random Forest Regressor (RFR)

In [18]:
# Random forest configuration: 100 trees, fixed seed, all available cores,
# progress logging enabled.
rf_params = dict(n_estimators=100, random_state=42, n_jobs=-1, verbose=1)
rf_regressor = RandomForestRegressor(**rf_params)

# Fit on the training data and report the wall-clock duration.
start_time = time.time()
rf_regressor.fit(X_train, y_train)
end_time = time.time()
fit_time = end_time - start_time
print("Time taken to fit the model: {:.2f} seconds".format(fit_time))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  2.3min


Time taken to fit the model: 360.63 seconds


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  6.0min finished


## Evaluate Model

In [19]:
# Predict on the held-out test features with the fitted forest.
y_pred = rf_regressor.predict(X_test)

# Test-set metrics: RMSE (square root of the MSE), MAE and R-squared.
mse = mean_squared_error(y_test, y_pred)
rmse, mae, r2 = (
    np.sqrt(mse),
    mean_absolute_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

print("RMSE: {:.2f}".format(rmse))
print("MAE: {:.2f}".format(mae))
print("R-squared: {:.2f}".format(r2))


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s


RMSE: 3.76
MAE: 1.97
R-squared: 0.61


[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.6s finished


In [None]:
# NOTE(review): this cell has no execution count and does not match the rest of
# the notebook. The keyword arguments (featuresCol, labelCol, seed) and the
# model returned by fit() look like the pyspark.ml API, but the only
# RandomForestRegressor imported in the visible cells is scikit-learn's and
# train_df is a pandas DataFrame -- confirm whether this is a leftover draft
# before running it.
# Define target variable
target_variable = 'GPP_NT_VUT_REF'

# Train Model
rf = RandomForestRegressor(featuresCol="features", labelCol=target_variable,
                           seed = 42)
model = rf.fit(train_df)

In [None]:
# NOTE(review): unexecuted cell. RegressionEvaluator and F are not imported in
# the visible cells and `model` comes from the unexecuted training cell above
# it; this looks like PySpark code -- confirm before running.
# Evaluate the Model
predictions = model.transform(test_df)

# Compute Evaluation Metrics
# RMSE
evaluator = RegressionEvaluator(labelCol=target_variable, predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse:.5}")

# NSE
# Computed as 1 - sum((observed - predicted)^2) / sum((observed - mean observed)^2).
mean_gpp = predictions.agg(F.avg(target_variable)).first()[0]
nse_formula = 1 - (F.sum((predictions[target_variable] - predictions.prediction)**2) / F.sum((predictions[target_variable] - mean_gpp)**2))
nse = predictions.agg(nse_formula).first()[0]
print(f"Nash-Sutcliffe Efficiency (NSE): {nse:.5}")

In [None]:
# NOTE(review): unexecuted cell. hr_test_df, RegressionEvaluator and F are not
# defined or imported in the visible cells -- confirm where they are meant to
# come from. The metric code repeats the previous evaluation cell, applied to
# hr_predictions instead of predictions.
hr_predictions = model.transform(hr_test_df)

# RMSE
evaluator = RegressionEvaluator(labelCol=target_variable, predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(hr_predictions)
print(f"Root Mean Squared Error (RMSE): {rmse:.5}")

# NSE
# Computed as 1 - sum((observed - predicted)^2) / sum((observed - mean observed)^2).
mean_gpp = hr_predictions.agg(F.avg(target_variable)).first()[0]
nse_formula = 1 - (F.sum((hr_predictions[target_variable] - hr_predictions.prediction)**2) / F.sum((hr_predictions[target_variable] - mean_gpp)**2))
nse = hr_predictions.agg(nse_formula).first()[0]
print(f"Nash-Sutcliffe Efficiency (NSE): {nse:.5}")