# Notebook Setup

In [None]:
# Flag whether this kernel is hosted by Google Colab: the textual form of the
# active IPython shell object contains 'google.colab' in that environment.
IN_COLLAB = 'google.colab' in str(get_ipython())

#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
# Absolute path of the project checkout; the local directory constants and the
# custom-module search path used later in this notebook are derived from it.
MY_HOME_ABS_PATH = "/content/drive/MyDrive/W210/co2-flux-hourly-gpp-modeling"

In [None]:
# Only Colab needs Google Drive mounted; MY_HOME_ABS_PATH points below
# /content/drive, so the project files are reachable only after this mount.
# Outside Colab this cell is a no-op.
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Import Modules

In [None]:
# install required modules quietly
required_packages = ['geopandas', 'pyspark', 'azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

In [None]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import math
import json

from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

import pyspark.pandas as pd
from calendar import monthrange
from datetime import datetime
from io import BytesIO

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

# Load local custom modules
import sys

# The tools directory is given relative to the working directory, so in Colab
# the working directory is moved to the project home before resolving it.
if IN_COLLAB:
  os.chdir(MY_HOME_ABS_PATH)

tools_dir = os.path.abspath("./code/src/tools")
if IN_COLLAB:
  # Put the project tools first on the module search path.
  sys.path.insert(0, tools_dir)
else:
  # Outside Colab the tools directory is added at the end of the search path.
  sys.path.append(tools_dir)

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

# Display settings: show up to 500 rows, do not limit the number of columns,
# and render floats with five decimal places.
# NOTE(review): `pd` is bound to pyspark.pandas above, whose documented option
# set does not appear to include 'display.max_columns' or 'display.float_format'.
# Unless the star import from data_pipeline_lib rebinds `pd` to plain pandas,
# the last two calls may raise -- TODO confirm which module `pd` refers to here.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.5f' % x)

## Initialize Spark Session

In [None]:
# Import SparkSession
from pyspark.sql import SparkSession

# Create (or reuse) a Spark session with master "local[*]". The
# spark.jars.packages setting requests the Hadoop-Azure and Azure Storage
# artifacts; this notebook later reads wasbs:// paths.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config(
        "spark.jars.packages",
        "org.apache.hadoop:hadoop-azure:3.3.1,com.microsoft.azure:azure-storage:8.6.6",
    )
    .getOrCreate()
)

# Check Spark Session Information
spark

## Define Local File System Constants

In [None]:
# Local directory layout, all derived from the user-configured project home.
# os.path.join is used instead of manual `+ os.sep +` concatenation so that a
# home path configured with a trailing separator does not yield doubled
# separators in the derived paths.
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.path.join(root_dir, '.tmp')               # scratch directory
raw_data_dir = tmp_dir                                 # default raw-data location
data_dir = os.path.join(root_dir, 'data')
cred_dir = os.path.join(root_dir, '.cred')             # credential files directory
az_cred_file = os.path.join(cred_dir, 'azblobcred.json')  # passed to AzStorageClient

# In Colab the raw data is read from a shared Drive folder instead of .tmp.
if IN_COLLAB:
  raw_data_dir = "/content/drive/MyDrive/CO2_flux_gpp_modeling/DS_capstone_23Spring_CO2/Data/half_hourly_data"

# Load Train and Test dataset from Azure Storage Blob

In [None]:
# Define target dataset
# Blob container name; it forms the "<container>@<account>" part of the
# wasbs:// URLs built when the train and test datasets are loaded.
container = "baseline-data"
# NOTE(review): `ext` is not referenced in the visible cells -- confirm it is used later.
ext = "parquet"
# Dataset version, interpolated into the blob names below.
ver = "1"
# NOTE(review): this name uses underscores while the train/test names use
# hyphens, and it is not referenced in the visible cells -- confirm intended.
blob_name_base = f"baseline_all_v_{ver}"
# Blob names of the train and test splits for the selected version.
train_blob_name_base = f"baseline-train-v-{ver}"
test_blob_name_base = f"baseline-test-v-{ver}"

In [None]:
# Initialize the Azure Storage client from the local credential file.
azStorageClient = AzStorageClient(az_cred_file)
# getSparkSessionKeys() is a project helper not shown here. From how its result
# is indexed in this notebook it appears to be
# (Spark conf key, Spark conf value, storage account name) -- TODO confirm
# against CloudIO.AzStorageClient. Avoid printing it: it presumably holds a secret.
sessionkeys = azStorageClient.getSparkSessionKeys()
# Register the key/value pair on the active Spark session configuration.
spark.conf.set(sessionkeys[0],sessionkeys[1])

In [None]:
# Load train dataset
# The wasbs:// URL is assembled from the container, the third element of
# sessionkeys (used as the account host prefix) and the train blob name.
train_blob_path = f"wasbs://{container}@{sessionkeys[2]}.blob.core.windows.net/{train_blob_name_base}"
print(f"Loading train dataset from {train_blob_path}...")
train_df = spark.read.format("parquet").load(train_blob_path)

# Report the dataset size, then preview the first five rows without truncating values.
print(f"Data loaded: {train_df.count()} rows x {len(train_df.columns)} columns.")
print("Train data peak:")
train_df.show(n=5, truncate=False)

Loading train dataset from wasbs://baseline-data@mids23spring.blob.core.windows.net/baseline-train-v-1...
Data loaded: 1485926 rows x 49 columns.
Train data peak:
+--------------+------+---------+---------+-------+-----+------+----+-----+---+----+------------------+------------------+------------------+------+------+------+------+------+------+---+------+---------+-------+------+----------+----------------+----------------+----------------+----------------+----------------+-----------------+------------------+------------------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------+-----------------+-----------------+----------------+-------------+-------------+--------------+---------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Load test dataset
# The wasbs:// URL is assembled from the container, the third element of
# sessionkeys (used as the account host prefix) and the test blob name.
test_blob_path = f"wasbs://{container}@{sessionkeys[2]}.blob.core.windows.net/{test_blob_name_base}"
print(f"Loading test dataset from {test_blob_path}...")
test_df = spark.read.format("parquet").load(test_blob_path)

# Report the dataset size, then preview the first five rows without truncating values.
print(f"Data loaded: {test_df.count()} rows x {len(test_df.columns)} columns.")
print("Test data peak:")
test_df.show(n=5, truncate=False)

Loading test dataset from wasbs://baseline-data@mids23spring.blob.core.windows.net/baseline-test-v-1...
Data loaded: 548544 rows x 49 columns.
Test data peak:
+--------------+------+---------+---------+-------+-----+-------+----+-----+---+----+------------------+------------------+------------------+------+------+-----+------+------+------+------+------+---------+--------+--------+----------+--------+---------+-------+-----------+-----------+------------+-------------+------------+------------+-------------+-------------+------------+----------+----------+------------+------------+-----------+----------------+-------------+-------------+--------------+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# Hourly subsets: keep only the rows whose 'minute' column equals 0.
on_the_hour = col('minute') == 0
hr_train_df = train_df.filter(on_the_hour)
hr_test_df = test_df.filter(on_the_hour)

# Random Forest Regressor (Default)

In [None]:
# Define target variable
target_variable = 'GPP_NT_VUT_REF'

# Train Model
# Random forest with library-default hyperparameters; only the input columns
# and the seed (fixed for repeatable runs) are set explicitly.
# NOTE(review): the model is fitted on the full train_df, not on the hourly
# subset hr_train_df -- confirm this is intended.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol=target_variable,
    seed=42,
)
model = rf.fit(train_df)

In [None]:
# Evaluate the Model on the full test set
predictions = model.transform(test_df)

# Compute Evaluation Metrics
# RMSE
evaluator = RegressionEvaluator(
    labelCol=target_variable,
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse:.5}")

# NSE = 1 - sum((observed - predicted)^2) / sum((observed - mean(observed))^2)
mean_gpp = predictions.agg(F.avg(target_variable)).first()[0]
observed = predictions[target_variable]
squared_error = (observed - predictions.prediction)**2
squared_deviation = (observed - mean_gpp)**2
nse_formula = 1 - (F.sum(squared_error) / F.sum(squared_deviation))
nse = predictions.agg(nse_formula).first()[0]
print(f"Nash-Sutcliffe Efficiency (NSE): {nse:.5}")

Root Mean Squared Error (RMSE): 4.9831
Nash-Sutcliffe Efficiency (NSE): 0.58013


In [None]:
# Evaluate the same model on the hourly (minute == 0) test subset
hr_predictions = model.transform(hr_test_df)

# RMSE
evaluator = RegressionEvaluator(
    labelCol=target_variable,
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(hr_predictions)
print(f"Root Mean Squared Error (RMSE): {rmse:.5}")

# NSE = 1 - sum((observed - predicted)^2) / sum((observed - mean(observed))^2)
mean_gpp = hr_predictions.agg(F.avg(target_variable)).first()[0]
observed = hr_predictions[target_variable]
squared_error = (observed - hr_predictions.prediction)**2
squared_deviation = (observed - mean_gpp)**2
nse_formula = 1 - (F.sum(squared_error) / F.sum(squared_deviation))
nse = hr_predictions.agg(nse_formula).first()[0]
print(f"Nash-Sutcliffe Efficiency (NSE): {nse:.5}")

Root Mean Squared Error (RMSE): 4.9916
Nash-Sutcliffe Efficiency (NSE): 0.57983


# Model Evaluation Visualization