# Notebook Setup

In [28]:
# Absolute path of the local checkout of this repository.
# Set the CO2_FLUX_HOME environment variable before starting the kernel to
# point at your own checkout; without it the original default below is used,
# so existing setups keep working without editing the notebook.
import os

MY_HOME_ABS_PATH = os.environ.get("CO2_FLUX_HOME", "/root/co2-flux-hourly-gpp-modeling")

In [29]:
# Flag whether the notebook runs on Google Colab, judged by whether the repr of
# the active IPython shell mentions 'google.colab'.
IN_COLLAB = 'google.colab' in str(get_ipython())
if IN_COLLAB:
  # Only importable on Colab, hence the guarded import; mounts Google Drive
  # at /content/drive/.
  from google.colab import drive
  drive.mount('/content/drive/')

## Import Modules

In [30]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import pandas as pd
import numpy as np
import sys
import joblib
from io import BytesIO
from sklearn.model_selection import StratifiedKFold

# Load local custom modules.
# The tools directory is given relative to the project root, so the working
# directory is switched to the root before the path is made absolute.
os.chdir(MY_HOME_ABS_PATH)
_tools_dir = os.path.abspath("./code/src/tools")
if not IN_COLLAB:
  sys.path.append(_tools_dir)
else:
  # On Colab the tools directory goes to the front of the search path
  # (presumably so project modules win over preinstalled ones -- TODO confirm).
  sys.path.insert(0, _tools_dir)

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

# Display settings: show every column and print floats with five decimals.
pd.options.display.max_columns = None
pd.options.display.float_format = lambda x: '%.5f' % x

# Define Constants

In [31]:
# Paths -- everything is derived from the project root. os.path.join is used
# instead of manual separator concatenation so a root with a trailing
# separator does not produce doubled separators.
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.path.join(root_dir, '.tmp')
raw_data_dir = tmp_dir  # raw data shares the temporary directory
data_dir = os.path.join(root_dir, 'data')
cred_dir = os.path.join(root_dir, '.cred')
az_cred_file = os.path.join(cred_dir, 'azblobcred.json')
preproc_objects_dir = os.path.join(root_dir, 'code', 'src', 'preprocessing', 'preproc_objects')

# Input files
site_metadata_filename = os.path.join(data_dir, 'site-metadata.csv')

# Azure container and blob name; the blob name resolves to
# "full_2010_2015_v_<ver>_<tag>.<ext>".
container = "all-sites-data"
ext = "parquet"
ver = "mvp"
tag = "raw"
blob_name_base = f"full_2010_2015_v_{ver}"
blob_name = f"{blob_name_base}_{tag}.{ext}"

## Load the Data DF Checkpoint from Pipeline

In [32]:
# (Optional) Load data_df from the Azure checkpoint.
# The parquet blob is downloaded once and cached under tmp_dir; later runs read
# the cached local copy instead of hitting Azure again.
load_data_checkpoint = True

if load_data_checkpoint:
    data_df = None  # reset first so a failed load cannot leave an old frame bound
    local_file = os.path.join(tmp_dir, blob_name)
    if os.path.exists(local_file):
        data_df = pd.read_parquet(local_file)
    else:
        # The cache directory may not exist on a fresh checkout; create it
        # before downloading so writing the local copy cannot fail afterwards.
        os.makedirs(tmp_dir, exist_ok=True)
        azStorageClient = AzStorageClient(az_cred_file)
        file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
        data_df = pd.read_parquet(file_stream, engine='pyarrow')
        data_df.to_parquet(local_file)

    print(f"Data size: {data_df.shape}")

Data size: (4613880, 52)


# Stratified Split

In [33]:
# Load site metadata and build the generalized IGBP label used for stratifying.
#  - dropna(): keep only rows with all three columns present
#    (original note: only focus on target sites)
#  - gen_IGBP: IGBP with WSA folded into SAV and CSH/OSH folded into SHB
#  - WAT sites are excluded
# The label is built with assign()/replace() on a new frame rather than
# `df[col].replace(..., inplace=True)`: that chained in-place form does not
# update the parent frame under pandas Copy-on-Write.
igbp_groups = {'WSA': 'SAV', 'CSH': 'SHB', 'OSH': 'SHB'}
site_metadata_df = (
    pd.read_csv(site_metadata_filename, usecols=['site_id', 'filename', 'IGBP'])
    .dropna()
    .assign(gen_IGBP=lambda df: df['IGBP'].replace(igbp_groups))
)
site_metadata_df = site_metadata_df[site_metadata_df['gen_IGBP'] != 'WAT']

# Get available sites in the datasets
available_sites = data_df['site_id'].unique()
site_data_df = site_metadata_df.loc[site_metadata_df['site_id'].isin(available_sites)]
print(f"available sites: {site_data_df.shape}")

# Conduct k-fold splitting, stratified on the generalized IGBP label.
n = 5
skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=42) # Add random state for reproducibility
folds = skf.split(site_data_df['site_id'], site_data_df['gen_IGBP'])

# Each fold is defined by its held-out (test) sites; the train indices are not needed.
site_splits = []
split_dict = {}
for i, (_, test_index) in enumerate(folds):
    split_df = site_data_df[['site_id', 'gen_IGBP']].iloc[test_index]
    sites = list(split_df.site_id.unique())
    split_dict[f"fold_{i+1}"] = sites
    site_splits.append(sites)
    print(f"Fold {i+1}:")
    print(f"  Count={test_index.shape}")
    print(f"  IGBP ={np.sort(split_df.gen_IGBP.unique())}")
    print(f"  Sites={sites}\n")


# print all sites
print(site_splits)

# Add val, test indices to split_dict before saving.
# The fold keys are 1-based (fold_1 .. fold_n), so the indices are 1-based too
# and use the same values (4 and 5) as the manually assigned split dict.
# NOTE(review): confirm that consumers of the saved dict expect 1-based indices.
split_dict['VAL_INDEX'] = 4
split_dict['TEST_INDEX'] = 5
split_dict['NUM_FOLDS'] = n
print("\n")
print(split_dict)

available sites: (129, 4)
Fold 1:
  Count=(26,)
  IGBP =['CRO' 'DBF' 'EBF' 'ENF' 'GRA' 'MF' 'SAV' 'SHB' 'WET']
  Sites=['AR-SLu', 'AU-ASM', 'AU-Cpr', 'AU-Cum', 'AU-RDF', 'CA-TP3', 'CA-TPD', 'CN-Sw2', 'DE-SfN', 'NL-Hor', 'US-Me6', 'US-Syv', 'US-WCr', 'US-AR2', 'US-Tw4', 'US-UMB', 'US-Vcp', 'CH-Cha', 'CZ-BK1', 'CZ-KrP', 'DE-Obe', 'ES-LJu', 'FI-Let', 'FR-Lam', 'IT-Lav', 'SE-Lnn']

Fold 2:
  Count=(26,)
  IGBP =['CRO' 'DBF' 'EBF' 'ENF' 'GRA' 'MF' 'SAV' 'SHB' 'WET']
  Sites=['CZ-BK2', 'DE-Spw', 'FR-Pue', 'IT-CA3', 'IT-Noe', 'IT-Ro2', 'US-IB2', 'US-Myb', 'US-SRM', 'CA-Ca3', 'US-CRT', 'US-Fmf', 'US-KFS', 'US-Prr', 'US-UMd', 'US-Wjs', 'BE-Bra', 'BE-Lon', 'CH-Lae', 'CZ-RAJ', 'DE-HoH', 'DE-Kli', 'DE-RuR', 'IL-Yat', 'IT-Tor', 'SE-Htm']

Fold 3:
  Count=(26,)
  IGBP =['CRO' 'DBF' 'EBF' 'ENF' 'GRA' 'MF' 'SAV' 'SHB' 'WET']
  Sites=['AR-Vir', 'AT-Neu', 'AU-DaS', 'AU-TTE', 'AU-Wom', 'CA-TP1', 'IT-CA1', 'IT-SRo', 'US-WPT', 'US-Wkg', 'CA-Ca2', 'CA-Cbo', 'CA-TP4', 'US-ARM', 'US-Ro1', 'US-Rws', 'US-SRG', 

In [38]:
# Manually assigned split dict, copied from Mary's initial run (for now).
# Each fold_<k> entry lists the site ids held out in that fold; the trailing
# entries record the validation fold index, the test fold index and the
# number of folds.
split_dict = {
    'fold_1': [
        'AR-SLu', 'AU-ASM', 'AU-Cpr', 'AU-Cum', 'AU-RDF', 'CA-TP3', 'CA-TPD',
        'CN-Sw2', 'DE-SfN', 'NL-Hor', 'US-Me6', 'US-Syv', 'US-WCr', 'US-AR2',
        'US-Tw4', 'US-UMB', 'US-Vcp', 'CH-Cha', 'CZ-BK1', 'CZ-KrP', 'DE-Obe',
        'ES-LJu', 'FI-Let', 'FR-Lam', 'IT-Lav', 'SE-Lnn',
    ],
    'fold_2': [
        'CZ-BK2', 'DE-Spw', 'FR-Pue', 'IT-CA3', 'IT-Noe', 'IT-Ro2', 'US-IB2',
        'US-Myb', 'US-SRM', 'CA-Ca3', 'US-CRT', 'US-Fmf', 'US-KFS', 'US-Prr',
        'US-UMd', 'US-Wjs', 'BE-Bra', 'BE-Lon', 'CH-Lae', 'CZ-RAJ', 'DE-HoH',
        'DE-Kli', 'DE-RuR', 'IL-Yat', 'IT-Tor', 'SE-Htm',
    ],
    'fold_3': [
        'AR-Vir', 'AT-Neu', 'AU-DaS', 'AU-TTE', 'AU-Wom', 'CA-TP1', 'IT-CA1',
        'IT-SRo', 'US-WPT', 'US-Wkg', 'CA-Ca2', 'CA-Cbo', 'CA-TP4', 'US-ARM',
        'US-Ro1', 'US-Rws', 'US-SRG', 'US-Vcm', 'BE-Dor', 'BE-Vie', 'CZ-Stn',
        'DE-Geb', 'ES-LM2', 'FR-Fon', 'SE-Ros', 'DE-Hte',
    ],
    'fold_4': [
        'AU-DaP', 'AU-Emr', 'AU-Gin', 'AU-How', 'AU-Rig', 'US-GLE', 'US-NR1',
        'US-Twt', 'CA-Ca1', 'CA-Gro', 'US-AR1', 'US-Bar', 'US-Mpj', 'US-Ses',
        'CH-Fru', 'CH-Oe2', 'DE-Hai', 'DK-Sor', 'FI-Hyy', 'FR-Aur', 'FR-Hes',
        'GF-Guy', 'IT-SR2', 'SE-Deg', 'SE-Nor', 'NL-Loo',
    ],
    'fold_5': [
        'AU-Stp', 'AU-Whr', 'CA-Oas', 'DE-Lnf', 'ES-Amo', 'FI-Sod', 'IT-CA2',
        'US-Ton', 'US-Var', 'US-Whs', 'US-Ho1', 'US-Oho', 'US-Seg', 'CH-Dav',
        'CZ-Lnz', 'CZ-wet', 'DE-Gri', 'DE-Tha', 'ES-LM1', 'FR-Bil', 'FR-FBn',
        'IT-BCi', 'IT-MBo', 'IT-Ren', 'RU-Fyo',
    ],
    'VAL_INDEX': 4,
    'TEST_INDEX': 5,
    'NUM_FOLDS': 5,
}

In [39]:
# Persist the split dict as a joblib file in the preprocessing objects
# directory. The call is the cell's last expression, so the list of written
# file names returned by joblib.dump is shown as the cell output.
joblib.dump(
    value=split_dict,
    filename=os.path.join(preproc_objects_dir, 'stratified_splits_k5.joblib'),
)

['/root/co2-flux-hourly-gpp-modeling/code/src/preprocessing/preproc_objects/stratified_splits_k5.joblib']