# Notebook Setup

In [6]:
# TODO: change the fallback path below to match your own local checkout, or
# export MY_HOME_ABS_PATH in the environment before starting the kernel.
import os  # needed here because this cell runs before the main import cell

# Absolute path of the repository root. Later cells chdir into it and build the
# data, credential and preprocessing paths from it. When the MY_HOME_ABS_PATH
# environment variable is set it takes precedence over the hard-coded fallback,
# so the notebook can run on another machine without editing this cell.
MY_HOME_ABS_PATH = os.environ.get(
    "MY_HOME_ABS_PATH",
    "/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling",
)
#MY_HOME_ABS_PATH = "/root/co2-flux-hourly-gpp-modeling"

In [7]:
# Detect Google Colab by looking for 'google.colab' in the string form of the
# active IPython shell. On Colab, also mount Google Drive at /content/drive/.
IN_COLLAB = 'google.colab' in str(get_ipython())
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

## Import Modules

In [8]:
# install required modules quietly
# required_packages = ['azure-storage-blob']

# for p in required_packages: 
#   try:
#       __import__(p)
#   except ImportError:
#       %pip install {p} --quiet

import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import pandas as pd
import numpy as np
import sys
import joblib
from io import BytesIO
from sklearn.model_selection import StratifiedKFold

# Load local custom modules: switch the working directory to the repository
# root, then make the project's tools directory importable.
os.chdir(MY_HOME_ABS_PATH)
_tools_dir = os.path.abspath("./code/src/tools")
if IN_COLLAB:
    # On Colab the tools directory is placed at the front of the search path.
    sys.path.insert(0, _tools_dir)
else:
    # Elsewhere it is appended after the existing entries.
    sys.path.append(_tools_dir)

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

# Notebook-wide pandas display settings: never truncate the column list, and
# render floats with exactly five decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.5f}'.format)

# Define Constants

In [9]:
# Directory layout, every entry derived from the repository root.
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.sep.join((root_dir, '.tmp'))
data_dir = os.sep.join((root_dir, 'data'))
cred_dir = os.sep.join((root_dir, '.cred'))
az_cred_file = os.sep.join((cred_dir, 'azblobcred.json'))
preproc_objects_dir = os.sep.join((root_dir, 'code', 'src', 'preprocessing', 'preproc_objects'))

# Raw data location: the local .tmp directory by default, a Google Drive
# folder when running on Colab.
raw_data_dir = "/content/drive/MyDrive/W210/Data/half_hourly_data" if IN_COLLAB else tmp_dir

# input files
site_metadata_filename = os.sep.join((data_dir, 'site-metadata.csv'))
monthly_data_filename = os.sep.join((data_dir, "monthly-mvp.csv"))

# File: Azure container plus the pieces the blob names are assembled from.
container = "all-sites-data"
ext = "parquet"
ver = "mvp"
model = "rfr"
blob_name_base = f"{model}-full_2010_2015_v_{ver}"
# One blob per split; the three names differ only in the split label.
train_blob_name, val_blob_name, test_blob_name = (
    f"{model}-full_2010_2015-{split}-v-{ver}.{ext}"
    for split in ("train", "val", "test")
)

In [10]:
# Target variable and feature lists consumed by the data pipeline.
target_variable = 'GPP_NT_VUT_REF'
#target_variable_qc = 'NEE_VUT_REF_QC'
hourly_features = [
    # *_ERA columns
    'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
    # time columns
    'datetime', 'year', 'month', 'day', 'hour', 'date',
    # EVI / NDVI / NIRv and the b1-b7 columns
    'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
]
metadata_features = [
    'site_id', 'filename', 'lat', 'long', 'koppen_sub', 'koppen_main', 'IGBP',
    'c3c4', 'c4_percent', 'monthly_data_available',
]

# Columns handed to the KNN imputer: the hourly features plus the target, minus
# the time/identifier columns below (categoricals are the same within a site,
# so only real-valued columns are used).
imp_exclude_cols = ['date', 'datetime', 'year', 'month', 'hour', 'day', 'minute', 'site_id', 'IGBP']
imp_cols = list(filter(
    lambda name: name not in imp_exclude_cols,
    hourly_features + ['GPP_NT_VUT_REF'],
))

# Stage 1: Trim and Merge Site Metadata

In [11]:
# Define input params for the all-sites preparation step.
impute = impute_global = True
impute_method = 'knn'
resample = False # <----------- set to false for RF run
time_col, duration = 'datetime', 'H'

# Filter sequence to date range
start_date, end_date = '2010-01-01', '2015-12-31'
missing_thresh = 0.2

# Impute params (if used)
# NOTE(review): k / weights / n_fit look like KNN-imputer settings and c is
# passed through as-is; confirm their meaning in data_pipeline_lib.
k, weights = 5, 'uniform'
n_fit = 20000
c = -1

In [12]:
# Build the all-sites preparation helper from the input files, the feature
# definitions and the raw data directory.
prep_hourly = PrepareAllSitesHourly(
    site_metadata_filename,
    monthly_data_filename,
    hourly_features,
    metadata_features,
    target_variable,
    raw_data_dir,
)

# Run it with the parameters defined above. Arguments are positional, so their
# order must match the library signature. The result is the combined dataframe
# used by every later cell.
data_df = prep_hourly.all_sites_all_sources(
    imp_cols,
    resample,
    impute,
    impute_method,
    impute_global,
    k,
    weights,
    n_fit,
    time_col,
    duration,
    start_date,
    end_date,
    missing_thresh,
    c,
)

Sites with missing monthly data: 43
1. AR-SLu: (10800, 28)
2. AR-Vir: (16992, 28)
3. AT-Neu: (26304, 28)
4. AU-ASM: (37248, 28)
SKIP: AU-Ade does not have sufficient data in desired time period
6. AU-Cpr: (36192, 28)
7. AU-Cum: (18864, 28)
8. AU-DaP: (28200, 28)
9. AU-DaS: (40392, 28)
SKIP: AU-Dry does not have sufficient data in desired time period
11. AU-Emr: (20448, 28)
SKIP: AU-Fog does not have sufficient data in desired time period
13. AU-Gin: (24336, 28)
14. AU-How: (43824, 28)
15. AU-RDF: (14904, 28)
16. AU-Rig: (31824, 28)
17. AU-Stp: (40632, 28)
18. AU-TTE: (21288, 28)
SKIP: AU-Tum is missing hourly data.
SKIP: AU-Wac does not have sufficient data in desired time period
21. AU-Whr: (27024, 28)
22. AU-Wom: (41928, 28)
SKIP: AU-Ync does not have sufficient data in desired time period
SKIP: BR-Sa1 is missing hourly data.
SKIP: BR-Sa3 does not have sufficient data in desired time period
SKIP: CA-Man does not have sufficient data in desired time period
SKIP: CA-NS4 does not have s

KeyboardInterrupt: 

In [None]:
# If any value is NaN, show two views:
#   1. non-null counts per (site_id, year, month) over the rows holding a NaN
#   2. the NaN count of every column, laid out as a single wide row
_nan_per_column = data_df.isna().sum()
if _nan_per_column.sum() != 0:
  _rows_with_nan = data_df[data_df.isna().any(axis=1)]
  display(_rows_with_nan.groupby(['site_id', 'year', 'month']).count())
  display(pd.DataFrame(_nan_per_column).T)

In [None]:
# Get the memory usage of the dataframe in bytes.
# deep=True makes pandas measure the actual contents of object-dtype columns
# (e.g. strings); the default shallow count only covers the references to
# those objects, so it under-reports the real footprint.
memory_usage = data_df.memory_usage(deep=True).sum()
# Convert to decimal gigabytes (1 GB = 10**9 bytes).
memory_usage_gb = memory_usage / 1_000_000_000
print(f"The dataframe uses {memory_usage_gb:.2f} GB of memory.")

# CHECKPOINT: Save full raw data

In [None]:
# Checkpoint: write data_df to an in-memory parquet buffer and upload that
# buffer to the Azure Storage container, replacing any existing blob.
# ref: https://stackoverflow.com/a/54666079
data_cleanup_checkpoint = True
tag = "raw"
blob_name = "{}_{}.{}".format(blob_name_base, tag, ext)

if data_cleanup_checkpoint:
  # Serialize into memory, then rewind so the upload reads from the start.
  parquet_file = BytesIO()
  data_df.to_parquet(parquet_file, engine='pyarrow')
  parquet_file.seek(0)

  azStorageClient = AzStorageClient(az_cred_file)
  azStorageClient.uploadBlob(
      container,
      blob_name,
      parquet_file,
      overwrite=True,
  )

In [None]:
# (Optional) Load data_df from the Azure checkpoint, using a parquet copy under
# tmp_dir as a local cache. Relies on blob_name from the checkpoint-save cell.
load_data_checkpoint = False

if load_data_checkpoint:
    # Drop the reference to any frame already loaded before reading a new one.
    data_df = None
    local_file = tmp_dir + os.sep + blob_name
    if os.path.exists(local_file):
        # Cache hit: reuse the copy written by an earlier run.
        data_df = pd.read_parquet(local_file)
    else:
        # Cache miss: first load on this machine, or the cached copy was
        # removed. Download from Azure, then store a local copy.
        azStorageClient = AzStorageClient(az_cred_file)
        file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
        data_df = pd.read_parquet(file_stream, engine='pyarrow')
        # tmp_dir may not exist yet (e.g. a fresh checkout or a Colab session);
        # create it so the cache write cannot fail after the download.
        os.makedirs(tmp_dir, exist_ok=True)
        data_df.to_parquet(local_file)

    print(f"Data size: {data_df.shape}")

## Load Train/Val/Test Split

In [None]:
# Define folds dict name, path
folds_dict_fname = 'folds_dict_k5.joblib'
folds_dict_path = os.path.join(preproc_objects_dir, folds_dict_fname)

# Load the existing folds dictionary. This notebook has no code to create a new
# split, so a missing file is reported here instead of surfacing later as a
# NameError on `folds`.
if os.path.exists(folds_dict_path):
    # NOTE: joblib.load unpickles the file; only load files from a trusted source.
    folds_dict = joblib.load(folds_dict_path)
    folds = folds_dict['folds']
else:
    raise FileNotFoundError(
        f"Folds dictionary not found at {folds_dict_path}; "
        "generate it before running the train/val/test split."
    )

In [None]:
# Assign folds to train-val-test splits (for MVP)
VAL_INDEX = 3
TEST_INDEX = 4

# `site_splits` was never defined in this notebook; the fold assignment loaded
# from the folds dictionary is the only candidate, so it is bound here.
# NOTE(review): assumes `folds` can be indexed by fold number 0..n-1 and that
# each entry is a collection of site ids. Confirm against the code that wrote
# folds_dict_k5.joblib.
site_splits = folds

# Training sites are all sites from the folds that are not held out. The fold
# count comes from the data itself, not from `k`, which is the KNN imputer's
# neighbour count and only equals the fold count by coincidence.
_held_out = (VAL_INDEX, TEST_INDEX)
train_sites = [
    site
    for fold_idx in range(len(site_splits))
    if fold_idx not in _held_out
    for site in site_splits[fold_idx]
]
val_sites = site_splits[VAL_INDEX]
test_sites = site_splits[TEST_INDEX]

## Data Transformation

In [None]:
# No raw-data file path is supplied (None); the transformer is handed the
# in-memory data_df together with the train/val/test site assignments.
raw_data_file_path = None
data_transformer = TFTDataTransformer(
    train_sites,
    val_sites,
    test_sites,
    raw_data_file_path,
    data_df,
)

In [None]:
# Columns treated as categorical; they are encoded with the 'dummy' scheme below.
categorical_cols = [
    'IGBP', 'c3c4', 'koppen_sub', 'koppen_main',
    'year', 'month', 'day', 'hour',
    'MODIS_PFT', 'MODIS_LC',
]

# Columns treated as real-valued, assembled from a few groups for readability.
realNum_cols = (
    ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA']
    + ['EVI', 'NDVI', 'NIRv']
    + [f'b{band}' for band in range(1, 8)]  # 'b1' .. 'b7'
    + ['c4_percent', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily']
    + ['PET', 'Ts', 'ESACCI-sm', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai']
    + ['LST_Day', 'LST_Night']
)

data_transformer.data_transform(
    categorical_cols,
    realNum_cols,
    cat_encode_type='dummy',
)

# Show the first two rows of the transformed training frame as a sanity check.
print("\nTrain data peak:")
display(data_transformer.train_df.head(2))

## FINISH: Upload train and test to Azure Blob Storage

In [None]:
# Set to False to skip uploading the transformed splits.
final_checkpoint = True

if final_checkpoint:
  # Pass the credential file, the container and one blob name per split to the
  # transformer's Azure upload helper.
  data_transformer.upload_train_test_to_azure(
      az_cred_file,
      container,
      train_blob_name,
      val_blob_name,
      test_blob_name,
  )

# Terminate Runtime

In [None]:
# Release the Colab runtime once the pipeline has finished.
# The google.colab package only exists on Colab, so the import is guarded:
# on any other machine this cell becomes a no-op instead of raising ImportError
# at the end of a Run All.
try:
    from google.colab import runtime
except ImportError:
    runtime = None  # not running on Colab: there is no runtime to release

if runtime is not None:
    runtime.unassign()