# Notebook Setup

In [None]:
# Project root directory: later cells chdir into it and build the data,
# credential and temp paths from it.
# TODO: change the fallback below to match your own local settings, or set the
# MY_HOME_ABS_PATH environment variable to override it without editing the notebook.
import os
MY_HOME_ABS_PATH = os.environ.get(
    "MY_HOME_ABS_PATH",
    "/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling",
)

In [None]:
# Flag whether this kernel is a Google Colab runtime; the flag is reused by
# later cells to pick paths and sys.path ordering.
IN_COLLAB = 'google.colab' in str(get_ipython())

if IN_COLLAB:
  # On Colab, mount Google Drive so files under /content/drive/ are reachable.
  from google.colab import drive
  drive.mount('/content/drive/')

## Import Modules

In [None]:
# install required modules quietly
required_packages = ['azure-storage-blob']

for p in required_packages: 
  try:
      __import__(p)
  except ImportError:
      %pip install {p} --quiet

import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import pandas as pd
import numpy as np
import sys
import joblib
from io import BytesIO
from sklearn.model_selection import StratifiedKFold

# Load local custom modules.
# Work from the project root so the relative tools path below resolves correctly.
os.chdir(MY_HOME_ABS_PATH)
_tools_dir = os.path.abspath("./code/src/tools")
if not IN_COLLAB:
  sys.path.append(_tools_dir)
else:
  # On Colab the tools directory goes to the front of the module search path.
  sys.path.insert(0, _tools_dir)

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

# Display settings: show every column and print floats with 5 decimals.
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.set_option('display.max_columns', None)

# Define Constants

In [None]:
# Directory layout, all derived from the project root.
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.sep.join([root_dir, '.tmp'])
data_dir = os.sep.join([root_dir, 'data'])
cred_dir = os.sep.join([root_dir, '.cred'])
az_cred_file = os.sep.join([cred_dir, 'azblobcred.json'])

# Raw input data lives in the temp dir locally, or on the mounted Drive on Colab.
raw_data_dir = "/content/drive/MyDrive/W210/Data/half_hourly_data" if IN_COLLAB else tmp_dir

# Input CSV files
monthly_data_filename = os.sep.join([data_dir, "monthly-mvp.csv"])
site_metadata_filename = os.sep.join([data_dir, 'site-metadata.csv'])

# Azure blob container and blob naming
container = "all-sites-data"
ext = "parquet"
ver = "mvp"
model = "tft"
# NOTE(review): the base name uses underscores while the train/val/test names
# use hyphens; left as-is because these strings identify existing blobs.
blob_name_base = f"{model}_full_2010_2015_v_{ver}"
train_blob_name = f"{model}_full_2010_2015-train-v-{ver}.{ext}"
val_blob_name = f"{model}_full_2010_2015-val-v-{ver}.{ext}"
test_blob_name = f"{model}_full_2010_2015-test-v-{ver}.{ext}"

In [None]:
# Define features and target variables of the data pipelines
target_variable = 'GPP_NT_VUT_REF'
target_variable_qc = 'NEE_VUT_REF_QC'
hourly_features = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
                     'datetime', 'year', 'month', 'day', 'hour', 'date',
                     'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7']
metadata_features = ['site_id', 'filename', 'lat', 'long', 'koppen_sub', 'koppen_main', 'IGBP',
                     'c3c4', 'c4_percent', 'monthly_data_available']

# Columns fed to the KNN imputer: the hourly features plus the target, minus
# time/identifier/categorical columns (categoricals are the same per site).
# The target is referenced through `target_variable` so the imputed column
# cannot drift from the configured target name.
imp_exclude_cols = ['date', 'datetime', 'year', 'month', 'hour', 'day', 'minute', 'site_id', 'IGBP']
imp_cols = [col for col in hourly_features + [target_variable] if col not in imp_exclude_cols]

# Stage 1: Trim and Merge Site Metadata

In [None]:
# Resampling settings passed to the data pipeline
resample = True
time_col = 'datetime'
duration = 'H'

# Date range the sequences are filtered to, and the missing-data threshold
start_date = '2010-01-01'
end_date = '2015-12-31'
missing_thresh = 0.2

# Imputation switches
impute = True
impute_method = 'knn'
impute_global = True

# Imputer parameters (only relevant when imputation is enabled)
# NOTE(review): presumably k is the neighbour count for the KNN imputer and
# n_fit the number of rows it is fitted on; the meaning of c is not visible
# here. Confirm against data_pipeline_lib.
k = 5
weights = 'uniform'
n_fit = 20000
c = -1

In [None]:
# Build the hourly preparation pipeline from the metadata/monthly CSVs,
# the feature lists, the target name and the raw data directory.
prep_hourly = PrepareAllSitesHourly(
    site_metadata_filename,
    monthly_data_filename,
    hourly_features,
    metadata_features,
    target_variable,
    raw_data_dir,
)

# Run the pipeline; arguments are positional, so their order must be preserved.
data_df = prep_hourly.all_sites_all_sources(
    imp_cols,
    resample,
    impute,
    impute_method,
    impute_global,
    k,
    weights,
    n_fit,
    time_col,
    duration,
    start_date,
    end_date,
    missing_thresh,
    c,
)

In [None]:
# If any NaN is left in the data, show where: first the rows containing NaNs
# counted per site/year/month, then the NaN count of every column.
nan_counts_per_column = data_df.isna().sum()
if nan_counts_per_column.sum() != 0:
  rows_with_nan = data_df[data_df.isna().any(axis=1)]
  display(rows_with_nan.groupby(['site_id', 'year', 'month']).count())
  display(pd.DataFrame(nan_counts_per_column).T)

In [None]:
# Get the memory usage of the dataframe in bytes.
# deep=True makes pandas inspect the contents of object-dtype columns; without
# it those columns are counted by pointer size only and the total is under-reported.
memory_usage = data_df.memory_usage(deep=True).sum()
memory_usage_gb = memory_usage / 1_000_000_000  # decimal gigabytes
print(f"The dataframe uses {memory_usage_gb:.2f} GB of memory.")

# CHECKPOINT: Save full raw data

In [None]:
# Checkpoint: serialize data_df to parquet in memory and upload it to Azure
# Blob Storage, overwriting any existing blob of the same name.
# ref: https://stackoverflow.com/a/54666079
data_cleanup_checkpoint = True
tag = "raw"
blob_name = "{}_{}.{}".format(blob_name_base, tag, ext)

if data_cleanup_checkpoint:
  # Write into an in-memory buffer, then rewind it so the upload reads from the start.
  parquet_file = BytesIO()
  data_df.to_parquet(parquet_file, engine='pyarrow')
  parquet_file.seek(0)

  azStorageClient = AzStorageClient(az_cred_file)
  azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

In [None]:
# (Optional) Load data_df from the Azure checkpoint, using a local parquet
# file under tmp_dir as a cache.
load_data_checkpoint = False

if load_data_checkpoint:
    # Drop the reference to any previously built frame before loading a new one.
    data_df = None
    local_file = tmp_dir + os.sep + blob_name
    if os.path.exists(local_file):
        # Cache hit: a previous run already downloaded this blob.
        data_df = pd.read_parquet(local_file)
    else:
        # Cache miss (e.g. first run on this machine): download the blob and
        # store a local copy. Create tmp_dir first so the write cannot fail
        # on a missing directory.
        os.makedirs(tmp_dir, exist_ok=True)
        azStorageClient = AzStorageClient(az_cred_file)
        file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
        data_df = pd.read_parquet(file_stream, engine='pyarrow')
        data_df.to_parquet(local_file)

    print(f"Data size: {data_df.shape}")

# Train/Val/Test Split

In [None]:
# Load site metadata (only the columns needed for the split)
site_metadata_df = pd.read_csv(site_metadata_filename, usecols=['site_id', 'filename', 'IGBP'])

# Only focus on target sites: drop rows with any missing value
print(f"size:{site_metadata_df.shape}")
site_metadata_df = site_metadata_df.dropna()

# Group IGBP classes into coarser categories. The mapping is applied in one
# assignment rather than chained `inplace` replaces on a column selection,
# which are not guaranteed to modify the parent frame.
print(site_metadata_df.IGBP.unique())
site_metadata_df['gen_IGBP'] = site_metadata_df['IGBP'].replace(
    {'WSA': 'SAV', 'CSH': 'SHB', 'OSH': 'SHB'}
)
# Exclude 'WAT' sites from the split
site_metadata_df = site_metadata_df[site_metadata_df['gen_IGBP'] != 'WAT']
print(site_metadata_df.gen_IGBP.unique())

# Get available sites in the datasets
available_sites = data_df['site_id'].unique()
site_data_df = site_metadata_df.loc[site_metadata_df['site_id'].isin(available_sites)]
print(f"available sites: {site_data_df.shape}")

# Conduct k-fold splitting of sites, stratified by the grouped IGBP class
n = 5
skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=42) # Fixed random state for reproducibility
folds = skf.split(site_data_df['site_id'], site_data_df['gen_IGBP'])

site_splits = []
for i, (train_index, test_index) in enumerate(folds):
  print(f"Fold {i+1}:")
  # Use a dedicated name for the fold subset: reusing `data_df` here would
  # overwrite the full dataset and break any re-run of this cell.
  fold_df = site_data_df[['site_id', 'gen_IGBP']].iloc[test_index]
  sites = list(fold_df.site_id.unique())
  print(f"  Count={test_index.shape}")
  print(f"  IGBP ={np.sort(fold_df.gen_IGBP.unique())}")
  print(f"  Sites={sites}")
  print("")

  site_splits.append(sites)

# print all sites
print(site_splits)

# Terminate Runtime

In [None]:
# Release the Colab runtime once the notebook is done. Skipped outside Colab,
# where importing google.colab would raise ImportError.
if IN_COLLAB:
  from google.colab import runtime
  runtime.unassign()