# Notebook Setup

In [2]:
# TODO: point this at your local checkout of the repository.
# Either export CO2_FLUX_HOME before starting the kernel, or edit the fallback
# path below. Example of a local (macOS) checkout:
#   "/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling"
import os

# An unset or empty CO2_FLUX_HOME falls back to the original default path.
MY_HOME_ABS_PATH = os.environ.get("CO2_FLUX_HOME") or "/root/co2-flux-hourly-gpp-modeling"

In [3]:
# Detect Google Colab from the string form of the active IPython shell.
IN_COLLAB = 'google.colab' in str(get_ipython())

if IN_COLLAB:
  # Only importable on Colab: mount Google Drive under /content/drive/.
  from google.colab import drive
  drive.mount('/content/drive/')

## Import Modules

In [4]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import pandas as pd
import numpy as np
import sys
import joblib
from io import BytesIO
from sklearn.model_selection import StratifiedKFold

# Load local custom modules.
# The relative path below is resolved after switching to the repository root.
os.chdir(MY_HOME_ABS_PATH)
tools_dir = os.path.abspath("./code/src/tools")

if IN_COLLAB:
    # On Colab the project tools go to the front of the module search path.
    sys.path.insert(0, tools_dir)
else:
    sys.path.append(tools_dir)

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

def _format_float_5dp(value):
    """Render a float with exactly five digits after the decimal point."""
    return '%.5f' % value


# Show every column when displaying a frame, and print floats with 5 decimals.
pd.set_option('display.float_format', _format_float_5dp)
pd.set_option('display.max_columns', None)

# Define Constants

In [5]:
# --- Directory layout (everything hangs off the repository root) ---
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.sep.join([root_dir, '.tmp'])
data_dir = os.sep.join([root_dir, 'data'])
cred_dir = os.sep.join([root_dir, '.cred'])
az_cred_file = os.sep.join([cred_dir, 'azblobcred.json'])
preproc_objects_dir = os.sep.join([root_dir, 'code', 'src', 'preprocessing', 'preproc_objects'])

# Raw site data is read from Google Drive on Colab, otherwise from the tmp directory.
raw_data_dir = "/content/drive/MyDrive/W210/Data/half_hourly_data" if IN_COLLAB else tmp_dir

# --- Input files ---
site_metadata_filename = os.sep.join([data_dir, 'site-metadata.csv'])
monthly_data_filename = os.sep.join([data_dir, "monthly-mvp-v2.csv"])
split_dict_filename = os.sep.join([preproc_objects_dir, "stratified_splits_k5.joblib"])

# --- Output blob naming ---
container = "all-sites-data"
model = "tft"
ver = "slim-features-v1"
tag = "raw"
ext = "parquet"

# Full (unsplit) dataset checkpoint.
blob_name_base = f"{model}-full_2010_2015_v_{ver}"
blob_name = f"{blob_name_base}_{tag}.{ext}"

# One blob per split.
train_blob_name = f"{model}-full_2010_2015-train-v-{ver}.{ext}"
val_blob_name = f"{model}-full_2010_2015-val-v-{ver}.{ext}"
test_blob_name = f"{model}-full_2010_2015-test-v-{ver}.{ext}"

In [6]:
# Target and feature selection for the data pipelines.
target_variable = 'GPP_NT_VUT_REF'

# Hourly features, grouped by kind and concatenated in this order.
_era_features = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA']
# Unused date features are removed after preprocessing.
_time_features = ['datetime', 'year', 'month', 'day', 'hour', 'date']
_vegetation_features = ['EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7']
hourly_features = _era_features + _time_features + _vegetation_features

# Remaining feature groups and switches.
monthly_features = ['ESACCI-sm', 'BESS-RSDN']
metadata_features = ['site_id', 'filename', 'koppen_main', 'monthly_data_available']
msc_features = None
precip_sum_features = False

# Columns fed to the KNN imputer: hourly features plus the target, minus the
# date/time parts and identifiers. Only real-valued columns are used, because
# categorical values are the same within a site.
imp_exclude_cols = ['date', 'datetime', 'year', 'month', 'hour', 'day', 'minute', 'site_id', 'IGBP']
imp_cols = [col for col in [*hourly_features, target_variable] if col not in imp_exclude_cols]

# Stage 1: Trim and Merge Site Metadata

In [7]:
# Resampling settings
resample = True
time_col = 'datetime'
duration = 'H'

# Imputation switches
impute = True
impute_method = 'knn'
impute_global = True

# Date range the sequences are filtered to, and the missing-data threshold
# (presumably the maximum tolerated missing fraction per site; confirm in the pipeline lib)
start_date, end_date = '2010-01-01', '2015-12-31'
missing_thresh = 0.2

# Imputer parameters (only relevant when imputation is used)
k, weights = 5, 'uniform'
n_fit, c = 20000, -1

In [None]:
# Configure the all-sites preparation object with the input files, the feature
# lists, the target variable and the raw-data directory.
prep_hourly = PrepareAllSitesHourly(
    site_metadata_filename,
    monthly_data_filename,
    hourly_features,
    metadata_features,
    target_variable,
    raw_data_dir,
    msc_features=msc_features,
    precip_sum_features=precip_sum_features,
    monthly_features=monthly_features,
)

# Run the preparation for every site. Arguments are positional, so their order
# must stay exactly as the method expects.
data_df = prep_hourly.all_sites_all_sources(
    imp_cols,
    resample, impute, impute_method, impute_global,
    k, weights, n_fit,
    time_col, duration,
    start_date, end_date, missing_thresh,
    c,
)

Sites with missing monthly data: 43
1. AR-SLu: (10800, 27)
2. AR-Vir: (20448, 27)
3. AT-Neu: (26304, 27)
4. AU-ASM: (37944, 27)
AU-Ade has less than 1 year of remaining sequences
AU-Cpr has too many gaps, missing % = 0.20757180156657964
7. AU-Cum: (19296, 27)
8. AU-DaP: (32304, 27)
9. AU-DaS: (43824, 27)
AU-Dry has too many gaps, missing % = 0.2732749178532311
11. AU-Emr: (22464, 27)
AU-Fog has less than 1 year of remaining sequences
13. AU-Gin: (28200, 27)
14. AU-How: (43824, 27)
15. AU-RDF: (16008, 27)
16. AU-Rig: (35064, 27)
17. AU-Stp: (43824, 27)
18. AU-TTE: (21528, 27)
SKIP: AU-Tum is missing hourly data.
AU-Wac has less than 1 year of remaining sequences
21. AU-Whr: (27024, 27)
22. AU-Wom: (43296, 27)
AU-Ync has too many gaps, missing % = 0.3057644110275689
SKIP: BR-Sa1 is missing hourly data.
BR-Sa3 has less than 1 year of remaining sequences
CA-Man has less than 1 year of remaining sequences
CA-NS4 has less than 1 year of remaining sequences
CA-NS7 has less than 1 year of rema

In [None]:
# Drop the date-part columns that are no longer needed: 'year', 'date', 'day'.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second run no longer raises KeyError because the
# columns are already gone.
data_df = data_df.drop(columns=['year', 'date', 'day'], errors='ignore')

In [None]:
# Report remaining missing values, if any: first the rows containing NaNs,
# counted per group, then the NaN total of every column.
na_counts = data_df.isna().sum()
if na_counts.sum() != 0:
  # 'year' is dropped from data_df earlier in this notebook, so grouping on it
  # unconditionally raised KeyError exactly when there were NaNs to inspect.
  # Group only by the keys that are still present.
  group_keys = [key for key in ['site_id', 'year', 'month'] if key in data_df.columns]
  na_rows = data_df[data_df.isna().any(axis=1)]
  if group_keys:
    display(na_rows.groupby(group_keys).count())
  else:
    # No grouping key left at all: fall back to overall non-null counts.
    display(na_rows.count().to_frame().T)
  display(pd.DataFrame(na_counts).T)

In [None]:
# Report the dataframe's memory footprint (bytes summed over the entries
# returned by memory_usage()), expressed in decimal gigabytes.
BYTES_PER_GB = 1_000_000_000
memory_usage = data_df.memory_usage().sum()
memory_usage_gb = memory_usage / BYTES_PER_GB
print(f"The dataframe uses {memory_usage_gb:.2f} GB of memory.")

### Checkpoint: Save full raw data

In [None]:
# Upload data_df checkpoint to Azure Storage Blob
# The frame is written as parquet into an in-memory buffer (no local file is
# created) and that buffer is uploaded under `blob_name` in `container`.
# ref: https://stackoverflow.com/a/54666079
# Set to False to skip the upload when re-running the notebook.
data_cleanup_checkpoint = True

if data_cleanup_checkpoint:
    parquet_file = BytesIO()
    data_df.to_parquet(parquet_file, engine='pyarrow')
    # Rewind so the upload reads the buffer from its first byte.
    parquet_file.seek(0)

    # overwrite=True is passed, so an existing blob of the same name is presumably replaced.
    azStorageClient = AzStorageClient(az_cred_file)
    azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

## Load from Checkpoint

In [None]:
# (Optional) Load data_df from the Azure checkpoint instead of rebuilding it.
# The blob is cached under tmp_dir after the first download, so later runs read
# the local parquet file and do not contact Azure.
load_data_checkpoint = False

if load_data_checkpoint:
    # Drop the reference to any frame already loaded before reading a new one.
    data_df = None
    local_file = tmp_dir + os.sep + blob_name
    if os.path.exists(local_file):
        # Cached by a previous run.
        data_df = pd.read_parquet(local_file)
    else:
        # No local copy yet (e.g. first run on this machine): download and cache.
        azStorageClient = AzStorageClient(az_cred_file)
        file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
        data_df = pd.read_parquet(file_stream, engine='pyarrow')
        # to_parquet does not create missing parent directories; without this
        # the cache write fails after the download when tmp_dir does not exist.
        os.makedirs(tmp_dir, exist_ok=True)
        data_df.to_parquet(local_file)

    print(f"Data size: {data_df.shape}")

## Load Train/Val/Test Split

In [None]:
# Load the split dictionary produced by preprocessing/stratified-splits.ipynb.
split_dict = joblib.load(split_dict_filename)

# For the MVP, one fold is held out for validation, one for test, and every
# other fold (numbered 1..NUM_FOLDS) is used for training.
held_out_folds = (split_dict['VAL_INDEX'], split_dict['TEST_INDEX'])
train_folds = [fold for fold in range(1, split_dict['NUM_FOLDS'] + 1) if fold not in held_out_folds]

train_sites = []
for fold in train_folds:
    train_sites.extend(split_dict[f"fold_{fold}"])
val_sites = split_dict[f"fold_{split_dict['VAL_INDEX']}"]
test_sites = split_dict[f"fold_{split_dict['TEST_INDEX']}"]

# Summarise the split as site counts and as fractions of all sites.
total_sites = len(train_sites) + len(val_sites) + len(test_sites)
print(f"# train/val/test sites: {len(train_sites)}/{len(val_sites)}/{len(test_sites)}")
print(f"% train/val/test sites: {len(train_sites)/total_sites:.3f}/{len(val_sites)/total_sites:.3f}/{len(test_sites)/total_sites:.3f}")

## Data Transformation

In [None]:
# Build the transformer from the site lists and the in-memory frame.
# No raw data file path is supplied (None); data_df is passed directly instead.
raw_data_file_path = None
# NOTE(review): model_name says 'rfr-...' while the blob naming constant `model`
# is "tft" — confirm this label is intentional and not a copy-paste leftover.
model_name = 'rfr-mvp-v2-slim'
data_transformer = TFTDataTransformer(train_sites, val_sites, test_sites, model_name, raw_data_file_path, data_df, preproc_objects_dir)

In [None]:
# Columns handed to the transformer as categorical vs. real-valued.
# NOTE(review): 'day' is listed although an earlier cell drops it from data_df,
# and several names here are not among the selected features; presumably the
# transformer ignores columns that are absent — confirm.
categorical_cols = ['c3c4', 'koppen_sub', 'koppen_main', 'month', 'day', 'hour',
                    'MODIS_PFT', 'MODIS_LC', 'MODIS_IGBP', 'hemisphere_NS', 'lat_band']

# msc_features may be None (it is, in this notebook's feature cell); iterating
# over None raised TypeError, so treat None as "no MSC features".
msc_suffixes = ["_szn_mean", "_amp_msc", "_min_msc"]
msc_cols = [f"{feat}{suffix}" for feat in (msc_features or []) for suffix in msc_suffixes]

realNum_cols = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
                'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
                'c4_percent', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily',
                'PET', 'Ts', 'ESACCI-sm', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai',
                'LST_Day', 'LST_Night', 'prcp', 'prcp-lag3'] + msc_cols

# Categorical columns are encoded with the 'dummy' scheme.
data_transformer.data_transform(categorical_cols, realNum_cols, cat_encode_type='dummy')

print("\nTrain data peak:")
display(data_transformer.train_df.head(2))

In [None]:
# Final checkpoint: upload the transformer's train/val/test data to Azure,
# each under its own blob name in `container`. Set to False to skip the upload.
final_checkpoint = True

if final_checkpoint:
    data_transformer.upload_train_test_to_azure(az_cred_file, container, train_blob_name, val_blob_name, test_blob_name)