# Notebook Setup

In [2]:
#TODO: CHANGE THIS BASED ON YOUR OWN LOCAL SETTINGS
# Absolute path of the local repository checkout. Export CO2_FLUX_HOME before
# starting the kernel to override it per machine without editing this cell;
# when the variable is unset the fallback below is used.
#MY_HOME_ABS_PATH = "/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling"
import os

MY_HOME_ABS_PATH = os.environ.get("CO2_FLUX_HOME", "/root/co2-flux-hourly-gpp-modeling")

In [3]:
# Flag whether this kernel is hosted on Google Colab, judged by the string
# form of the active IPython shell object.
IN_COLLAB = 'google.colab' in str(get_ipython())

if IN_COLLAB:
  # Colab only: mount Google Drive under /content/drive/.
  from google.colab import drive
  drive.mount('/content/drive/')

## Import Modules

In [4]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import pandas as pd
import numpy as np
import sys
import joblib
from io import BytesIO
from sklearn.model_selection import StratifiedKFold

# Load local custom modules.
# The working directory is switched to the repository root first, so the
# relative tools path below resolves against it.
os.chdir(MY_HOME_ABS_PATH)
tools_dir = os.path.abspath("./code/src/tools")
if not IN_COLLAB:
    sys.path.append(tools_dir)
else:
    # On Colab the tools directory goes to the front of the search path.
    sys.path.insert(0, tools_dir)

from CloudIO.AzStorageClient import AzStorageClient
# NOTE(review): wildcard import; PrepareAllSitesHourly and TFTDataTransformer
# used later presumably come from this module -- confirm and import explicitly.
from data_pipeline_lib import *

def _format_float(value):
    """Render a float with five decimal places for DataFrame display."""
    return '%.5f' % value


# Show every column of wide frames and use fixed five-decimal float output.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', _format_float)

# Define Constants

In [5]:
# Directory layout, all anchored at the repository root.
# os.path.join is used instead of manual `+ os.sep +` concatenation so a root
# path that already ends with a separator does not produce doubled separators.
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.path.join(root_dir, '.tmp')
raw_data_dir = tmp_dir
data_dir = os.path.join(root_dir, 'data')
cred_dir = os.path.join(root_dir, '.cred')
az_cred_file = os.path.join(cred_dir, 'azblobcred.json')
preproc_objects_dir = os.path.join(root_dir, 'code', 'src', 'preprocessing', 'preproc_objects')

# On Colab the raw data is read from the mounted Google Drive instead of .tmp.
if IN_COLLAB:
    raw_data_dir = "/content/drive/MyDrive/W210/Data/half_hourly_data"

# Input files
site_metadata_filename = os.path.join(data_dir, 'site-metadata.csv')
monthly_data_filename = os.path.join(data_dir, "monthly-mvp-v2.csv")
split_dict_filename = os.path.join(preproc_objects_dir, "stratified_splits_k5.joblib")

# Output file naming
# NOTE(review): the raw checkpoint name uses "_v_" while the train/val/test
# names use "-v-"; left as-is because existing blobs already carry these names.
container = "all-sites-data"
ext = "parquet"
ver = "mvp-v2-knn"
model = "rfr"
tag = "raw"
blob_name_base = f"{model}-full_2010_2015_v_{ver}"
blob_name = f"{blob_name_base}_{tag}.{ext}"
train_blob_name = f"{model}-full_2010_2015-train-v-{ver}.{ext}"
val_blob_name = f"{model}-full_2010_2015-val-v-{ver}.{ext}"
test_blob_name = f"{model}-full_2010_2015-test-v-{ver}.{ext}"

In [6]:
# Define features and target variables of the data pipelines
target_variable = 'GPP_NT_VUT_REF'
hourly_features = ['TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
                     'datetime', 'year', 'month', 'day', 'hour', 'date',
                     'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7']
msc_features = ['TA_ERA', 'SW_IN_ERA', 'P_ERA', 'EVI', 'NDVI', 'NIRv', 'b4']
metadata_features = ['site_id', 'filename', 'koppen_sub', 'koppen_main',
                     'c3c4', 'c4_percent', 'monthly_data_available', 'hemisphere_NS', 'lat_band']
precip_sum_features = True

# Define the features to use in KNN imputer, only using real values as cat are same per site
imp_exclude_cols = ['date', 'datetime', 'year', 'month', 'hour', 'day', 'minute', 'site_id', 'IGBP']
# The target is appended via `target_variable` rather than a repeated string
# literal, so the imputation columns follow any change of target.
imp_cols = [col for col in hourly_features + [target_variable] if col not in imp_exclude_cols]

# Stage 1: Trim and Merge Site Metadata

In [7]:
# Date window kept for each site, and the gap tolerance applied to it
start_date, end_date = '2010-01-01', '2015-12-31'
missing_thresh = 0.2

# Time axis / resampling settings
time_col = 'datetime'
duration = 'H'
resample = False

# Imputation switches
impute = True
impute_method = 'knn'
impute_global = True

# Imputer parameters (if used); all are forwarded to the pipeline call below.
# NOTE(review): the exact meaning of n_fit and c is defined by the pipeline
# library, not visible here -- confirm before tuning.
k = 5
weights = 'uniform'
n_fit = 20000
c = -1

In [8]:
# Configure the all-sites hourly preparation object from the input files and
# the feature lists defined above.
prep_hourly = PrepareAllSitesHourly(
    site_metadata_filename,
    monthly_data_filename,
    hourly_features,
    metadata_features,
    target_variable,
    raw_data_dir,
    msc_features=msc_features,
    precip_sum_features=precip_sum_features,
)

# Run the pipeline over every site; arguments are positional, so their order
# must match the library signature.
data_df = prep_hourly.all_sites_all_sources(
    imp_cols, resample, impute, impute_method, impute_global,
    k, weights, n_fit, time_col, duration,
    start_date, end_date, missing_thresh, c,
)

Sites with missing monthly data: 43
1. AR-SLu: (10800, 51)
2. AR-Vir: (16992, 51)
3. AT-Neu: (26304, 51)
4. AU-ASM: (37248, 51)
AU-Ade has less than 1 year of remaining sequences
AU-Cpr has too many gaps, missing % = 0.20757180156657964
7. AU-Cum: (18864, 51)
8. AU-DaP: (28200, 51)
9. AU-DaS: (40392, 51)
AU-Dry has too many gaps, missing % = 0.2732749178532311
11. AU-Emr: (20448, 51)
AU-Fog has less than 1 year of remaining sequences
13. AU-Gin: (24336, 51)
14. AU-How: (43824, 51)
15. AU-RDF: (14904, 51)
16. AU-Rig: (31824, 51)
17. AU-Stp: (40632, 51)
18. AU-TTE: (21288, 51)
SKIP: AU-Tum is missing hourly data.
AU-Wac has less than 1 year of remaining sequences
21. AU-Whr: (27024, 51)
22. AU-Wom: (41928, 51)
AU-Ync has too many gaps, missing % = 0.3057644110275689
SKIP: BR-Sa1 is missing hourly data.
BR-Sa3 has less than 1 year of remaining sequences
CA-Man has less than 1 year of remaining sequences
CA-NS4 has less than 1 year of remaining sequences
CA-NS7 has less than 1 year of rema

In [9]:
# Drop year column
# errors='ignore' makes the cell safe to re-run: once 'year' is gone a second
# execution is a no-op instead of raising KeyError.
data_df.drop(columns='year', inplace=True, errors='ignore')

In [10]:
# Report where missing values remain, if any.
nan_row_mask = data_df.isna().any(axis=1)
if nan_row_mask.any():
  nan_rows = data_df[nan_row_mask]
  # The 'year' column is dropped from data_df earlier in the notebook, so
  # grouping on it by name would raise KeyError. Rebuild the key from the
  # timestamp column when 'year' is absent.
  if 'year' in nan_rows.columns:
    year_key = 'year'
  else:
    year_key = pd.to_datetime(nan_rows[time_col]).dt.year.rename('year')
  # Per site/year/month: non-null counts among the rows that contain a NaN.
  display(nan_rows.groupby(['site_id', year_key, 'month']).count())
  # Per column: total number of NaNs in the whole frame.
  display(pd.DataFrame(data_df.isna().sum()).T)

In [11]:
# Report the dataframe's memory footprint in decimal gigabytes.
# memory_usage() is called with its defaults here.
BYTES_PER_GB = 1_000_000_000
memory_usage = data_df.memory_usage().sum()
memory_usage_gb = memory_usage / BYTES_PER_GB
print(f"The dataframe uses {memory_usage_gb:.2f} GB of memory.")

The dataframe uses 2.78 GB of memory.


### Checkpoint: Save full raw data

In [12]:
# Upload data_df checkpoint to Azure Storage Blob
# ref: https://stackoverflow.com/a/54666079
data_cleanup_checkpoint = True

if data_cleanup_checkpoint:
    # Serialize into an in-memory buffer and upload from there. The context
    # manager closes the buffer once the upload call returns, so the parquet
    # copy of the frame does not stay resident in the kernel afterwards.
    with BytesIO() as parquet_file:
        data_df.to_parquet(parquet_file, engine='pyarrow')
        parquet_file.seek(0)  # rewind so the upload reads from the start

        azStorageClient = AzStorageClient(az_cred_file)
        azStorageClient.uploadBlob(container, blob_name, parquet_file, overwrite=True)

File uploaded to all-sites-data/rfr-full_2010_2015_v_mvp-v2-knn_raw.parquet


## Load from Checkpoint

In [13]:
# (Optional) Load data_df from Azure checkpoint
load_data_checkpoint = False

if load_data_checkpoint:
    # Drop the reference to any frame already loaded before reading a new one.
    data_df = None
    local_file = os.path.join(tmp_dir, blob_name)
    if os.path.exists(local_file):
        # A local cache from an earlier run exists: read it instead of downloading.
        data_df = pd.read_parquet(local_file)
    else:
        # No local cache yet (e.g. first run on this machine): download the
        # blob, then cache it locally for subsequent runs.
        azStorageClient = AzStorageClient(az_cred_file)
        file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
        data_df = pd.read_parquet(file_stream, engine='pyarrow')
        # Ensure the cache directory exists; otherwise the write below fails.
        os.makedirs(tmp_dir, exist_ok=True)
        data_df.to_parquet(local_file)

    print(f"Data size: {data_df.shape}")

## Load Train/Val/Test Split

In [14]:
# Load split dict from preprocessing/stratified-splits.ipynb
# joblib.load unpickles the file, so only load split files from a trusted source.
split_dict = joblib.load(split_dict_filename)

# Assign folds to train-val-test splits (for MVP): every fold that is neither
# the validation fold nor the test fold contributes its sites to training.
held_out_folds = (split_dict['VAL_INDEX'], split_dict['TEST_INDEX'])
train_folds = [fold for fold in range(1, split_dict['NUM_FOLDS'] + 1) if fold not in held_out_folds]
train_sites = []
for fold in train_folds:
    train_sites.extend(split_dict[f"fold_{fold}"])
val_sites = split_dict[f"fold_{split_dict['VAL_INDEX']}"]
test_sites = split_dict[f"fold_{split_dict['TEST_INDEX']}"]

# Summarize the split as absolute counts and as shares of all assigned sites.
n_train, n_val, n_test = len(train_sites), len(val_sites), len(test_sites)
total_sites = n_train + n_val + n_test
print(f"# train/val/test sites: {n_train}/{n_val}/{n_test}")
print(f"% train/val/test sites: {n_train/total_sites:.3f}/{n_val/total_sites:.3f}/{n_test/total_sites:.3f}")

# train/val/test sites: 78/26/25
% train/val/test sites: 0.605/0.202/0.194


## Data Transformation

In [15]:
# Build the transformer from the in-memory frame; no raw data file path is
# supplied (None) because data_df is passed directly.
model_name = 'rfr-mvp-v2'
raw_data_file_path = None
data_transformer = TFTDataTransformer(
    train_sites,
    val_sites,
    test_sites,
    model_name,
    raw_data_file_path,
    data_df,
    preproc_objects_dir,
)

Data size: (4577688, 74).


In [16]:
# Columns handed to the transformer as categoricals.
categorical_cols = [
    'c3c4', 'koppen_sub', 'koppen_main', 'month', 'day', 'hour',
    'MODIS_PFT', 'MODIS_LC', 'MODIS_IGBP', 'hemisphere_NS', 'lat_band',
]

# One derived column name per (msc feature, suffix) pair, feature-major order.
msc_suffixes = ["_szn_mean", "_amp_msc", "_min_msc"]
msc_derived_cols = [f"{feat}{suffix}" for feat in msc_features for suffix in msc_suffixes]

# Columns handed to the transformer as real-valued: the fixed list followed by
# the derived msc columns.
realNum_cols = [
    'TA_ERA', 'SW_IN_ERA', 'LW_IN_ERA', 'VPD_ERA', 'P_ERA', 'PA_ERA',
    'EVI', 'NDVI', 'NIRv', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'b7',
    'c4_percent', 'BESS-PAR', 'BESS-PARdiff', 'BESS-RSDN', 'CSIF-SIFdaily',
    'PET', 'Ts', 'ESACCI-sm', 'NDWI', 'Percent_Snow', 'Fpar', 'Lai',
    'LST_Day', 'LST_Night', 'prcp', 'prcp-lag3',
] + msc_derived_cols

data_transformer.data_transform(categorical_cols, realNum_cols, cat_encode_type='dummy')

# Preview the first two rows of the resulting training frame.
print("\nTrain data peak:")
display(data_transformer.train_df.head(2))

Data size: (4577688, 74).
Data size after encoding: (4577688, 102)
Number of sites in df: 128
Train Sites: ['AR-SLu' 'AR-Vir' 'AT-Neu' 'AU-ASM' 'AU-Cum' 'AU-DaS' 'AU-RDF' 'AU-TTE'
 'AU-Wom' 'BE-Bra' 'BE-Dor' 'BE-Lon' 'BE-Vie' 'CA-Ca2' 'CA-Ca3' 'CA-Cbo'
 'CA-TP1' 'CA-TP3' 'CA-TP4' 'CA-TPD' 'CH-Cha' 'CH-Lae' 'CN-Sw2' 'CZ-BK1'
 'CZ-BK2' 'CZ-KrP' 'CZ-RAJ' 'CZ-Stn' 'DE-Geb' 'DE-HoH' 'DE-Hte' 'DE-Kli'
 'DE-Obe' 'DE-RuR' 'DE-SfN' 'DE-Spw' 'ES-LJu' 'ES-LM2' 'FI-Let' 'FR-Fon'
 'FR-Lam' 'FR-Pue' 'IL-Yat' 'IT-CA1' 'IT-CA3' 'IT-Lav' 'IT-Noe' 'IT-Ro2'
 'IT-SRo' 'IT-Tor' 'NL-Hor' 'SE-Htm' 'SE-Lnn' 'SE-Ros' 'US-AR2' 'US-ARM'
 'US-CRT' 'US-Fmf' 'US-IB2' 'US-KFS' 'US-Me6' 'US-Myb' 'US-Prr' 'US-Ro1'
 'US-Rws' 'US-SRG' 'US-SRM' 'US-Syv' 'US-Tw4' 'US-UMB' 'US-UMd' 'US-Vcm'
 'US-Vcp' 'US-WCr' 'US-WPT' 'US-Wjs' 'US-Wkg']
Val Sites: ['AU-DaP' 'AU-Emr' 'AU-Gin' 'AU-How' 'AU-Rig' 'CA-Ca1' 'CA-Gro' 'CH-Fru'
 'CH-Oe2' 'DE-Hai' 'DK-Sor' 'FI-Hyy' 'FR-Aur' 'FR-Hes' 'GF-Guy' 'IT-SR2'
 'NL-Loo' 'SE-Deg' 'SE-Nor' 'US-

Unnamed: 0,GPP_NT_VUT_REF,site_id,timestep_idx_local,timestep_idx_global,datetime,index,TA_ERA,SW_IN_ERA,LW_IN_ERA,VPD_ERA,P_ERA,PA_ERA,EVI,NDVI,NIRv,b1,b2,b3,b4,b5,b6,b7,TA_ERA_szn_mean,TA_ERA_amp_msc,TA_ERA_min_msc,SW_IN_ERA_szn_mean,SW_IN_ERA_amp_msc,SW_IN_ERA_min_msc,P_ERA_szn_mean,P_ERA_amp_msc,P_ERA_min_msc,EVI_szn_mean,EVI_amp_msc,EVI_min_msc,NDVI_szn_mean,NDVI_amp_msc,NDVI_min_msc,NIRv_szn_mean,NIRv_amp_msc,NIRv_min_msc,b4_szn_mean,b4_amp_msc,b4_min_msc,prcp_week_sum,prcp_month_sum,c4_percent,BESS-PAR,BESS-PARdiff,BESS-RSDN,CSIF-SIFdaily,PET,Ts,prcp,prcp-lag3,ESACCI-sm,NDWI,Percent_Snow,Fpar,Lai,LST_Day,LST_Night,gap_flag_hour,gap_flag_month,koppen_sub,koppen_main,month,day,hour,MODIS_LC,c3c4_C3,c3c4_C4,c3c4_mix,c3c4_rotation,c3c4_unknown,MODIS_PFT_CRO,MODIS_PFT_DBF,MODIS_PFT_EBF,MODIS_PFT_ENF,MODIS_PFT_GRA,MODIS_PFT_MF,MODIS_PFT_Other,MODIS_PFT_SA,MODIS_PFT_SH,MODIS_IGBP_CRO,MODIS_IGBP_CSH,MODIS_IGBP_DBF,MODIS_IGBP_EBF,MODIS_IGBP_ENF,MODIS_IGBP_GRA,MODIS_IGBP_MF,MODIS_IGBP_OSH,MODIS_IGBP_SAV,MODIS_IGBP_URB,MODIS_IGBP_WAT,MODIS_IGBP_WET,MODIS_IGBP_WSA,hemisphere_NS_North,hemisphere_NS_South,lat_band_lat_band_2,lat_band_lat_band_3,lat_band_lat_band_4,lat_band_lat_band_5
0,-0.13057,AR-SLu,0,350640,2010-01-01 00:00:00,0,1.29671,-0.67229,0.81718,0.32487,1.19668,0.00966,-0.09524,-0.02366,-0.24887,-0.24622,-0.43414,-0.248,-0.22049,0.36961,0.73076,0.23363,1.85332,-0.38903,1.28096,1.67357,-1.03735,1.68537,-0.38324,0.2009,-1.57409,-0.44657,-1.0319,-0.25913,-0.3928,-1.16402,0.11314,-0.5149,-1.20482,-0.31786,-0.19059,-0.67987,0.05636,0.321,0.321,2.36922,2.00473,0.64667,1.96634,0.40973,-1.07532,1.76888,-0.11856,0.4152,-1.54625,-0.56224,-0.44318,0.13015,-0.1149,1.69276,1.7308,0,0.0,7,2,1.0,1.0,0.0,7,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0
1,-0.66336,AR-SLu,1,350641,2010-01-01 01:00:00,1,1.25024,-0.67229,0.81718,0.23047,-0.2299,0.01066,-0.09524,-0.02366,-0.24887,-0.24622,-0.43414,-0.248,-0.22049,0.36961,0.73076,0.23363,1.85332,-0.38903,1.28096,1.67357,-1.03735,1.68537,-0.38324,0.2009,-1.57409,-0.44657,-1.0319,-0.25913,-0.3928,-1.16402,0.11314,-0.5149,-1.20482,-0.31786,-0.19059,-0.67987,0.05636,0.321,0.321,2.36922,2.00473,0.64667,1.96634,0.40973,-1.07532,1.76888,-0.11856,0.4152,-1.54625,-0.56224,-0.44318,0.13015,-0.1149,1.69276,1.7308,0,0.0,7,2,1.0,1.0,1.0,7,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0


In [17]:
# Toggle for uploading the train/val/test datasets to Azure under the blob
# names defined in the constants cell.
final_checkpoint = True

if final_checkpoint:
    data_transformer.upload_train_test_to_azure(
        az_cred_file,
        container,
        train_blob_name,
        val_blob_name,
        test_blob_name,
    )

Uploading train dataset to rfr-full_2010_2015-train-v-mvp-v2-knn.parquet...
File uploaded to all-sites-data/rfr-full_2010_2015-train-v-mvp-v2-knn.parquet
Uploading val dataset to rfr-full_2010_2015-val-v-mvp-v2-knn.parquet...
File uploaded to all-sites-data/rfr-full_2010_2015-val-v-mvp-v2-knn.parquet
Uploading test dataset to rfr-full_2010_2015-test-v-mvp-v2-knn.parquet...
File uploaded to all-sites-data/rfr-full_2010_2015-test-v-mvp-v2-knn.parquet
