# Notebook Setup

In [2]:
# Project root directory. To point the notebook at your own checkout, export
# CO2_FLUX_HOME before launching Jupyter (or edit the fallback below), e.g.
#   "/Users/jetcalz07/Desktop/MIDS/W210_Capstone/co2-flux-hourly-gpp-modeling"
import os  # imported here because this cell runs before the main import cell

MY_HOME_ABS_PATH = os.environ.get("CO2_FLUX_HOME", "/root/co2-flux-hourly-gpp-modeling")

In [3]:
# Detect Google Colab from the string form of the active IPython shell and,
# when running there, mount Google Drive at /content/drive/.
IN_COLLAB = 'google.colab' in str(get_ipython())
if IN_COLLAB:
  from google.colab import drive
  drive.mount('/content/drive/')

## Import Modules

In [4]:
import os
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"
import pandas as pd
import numpy as np
import sys
import joblib
from io import BytesIO
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans

# Load local custom modules.
# Switch to the project root first so the relative tools path resolves there.
os.chdir(MY_HOME_ABS_PATH)
_tools_dir = os.path.abspath("./code/src/tools")
if not IN_COLLAB:
  sys.path.append(_tools_dir)
else:
  # On Colab the tools directory is placed at the front of the search path.
  sys.path.insert(0, _tools_dir)

from CloudIO.AzStorageClient import AzStorageClient
from data_pipeline_lib import *

# Display options: show every column, render floats with five decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda value: '%.5f' % value)

# Define Constants

In [5]:
# Paths (all derived from the project root).
# os.path.join is used instead of manual `+ os.sep +` concatenation so a root
# with a trailing separator does not produce doubled separators.
root_dir = MY_HOME_ABS_PATH
tmp_dir = os.path.join(root_dir, '.tmp')
raw_data_dir = tmp_dir  # raw downloads share the temp directory
data_dir = os.path.join(root_dir, 'data')
cred_dir = os.path.join(root_dir, '.cred')
az_cred_file = os.path.join(cred_dir, 'azblobcred.json')
preproc_objects_dir = os.path.join(root_dir, 'code', 'src', 'preprocessing', 'preproc_objects')

# input files
site_metadata_filename = os.path.join(data_dir, 'site-metadata.csv')

# Azure container, file names
container = "all-sites-data"
ext = "parquet"
ver = "mvp"
tag = "raw"
# Resulting blob name: full_2010_2015_v_<ver>_<tag>.<ext>
blob_name_base = f"full_2010_2015_v_{ver}"
blob_name = f"{blob_name_base}_{tag}.{ext}"

## Load the Data DF Checkpoint from Pipeline

In [6]:
# (Optional) Load data_df from the local parquet cache, downloading the
# checkpoint from Azure (and caching it) when the local file is missing.
load_data_checkpoint = True

if load_data_checkpoint:
    # Reset first so a failed load never leaves a previous frame bound to data_df.
    data_df = None
    local_file = os.path.join(tmp_dir, blob_name)
    if os.path.exists(local_file):
        data_df = pd.read_parquet(local_file)
    else:
        azStorageClient = AzStorageClient(az_cred_file)
        file_stream = azStorageClient.downloadBlob2Stream(container, blob_name)
        data_df = pd.read_parquet(file_stream, engine='pyarrow')
        # Make sure the cache directory exists; writing the parquet file
        # fails on a fresh checkout where it has not been created yet.
        os.makedirs(tmp_dir, exist_ok=True)
        data_df.to_parquet(local_file)

    print(f"Data size: {data_df.shape}")

Data size: (4862712, 51)


# Stratified Split V1 (Stratified on IGBP)

In [7]:
# Hard-coded V1 partition of site IDs into five splits (per the section
# heading, stratified on IGBP). Each inner list holds the site_id values of
# one split; its position in the outer list is the split number printed by
# the summary cell that follows.
SITE_SPLITS =[
  ['AR-SLu', 'AU-ASM', 'AU-Cpr', 'AU-Cum', 'AU-RDF', 'CA-TP3', 'CA-TPD', 'CN-Sw2',
    'DE-SfN', 'NL-Hor', 'US-Me6', 'US-Syv', 'US-WCr', 'US-AR2', 'US-Tw4', 'US-UMB', 
    'US-Vcp', 'CH-Cha', 'CZ-BK1', 'CZ-KrP', 'DE-Obe', 'ES-LJu', 'FI-Let', 'FR-Lam', 
    'IT-Lav', 'SE-Lnn'], 
  ['CZ-BK2', 'DE-Spw', 'FR-Pue', 'IT-CA3', 'IT-Noe', 'IT-Ro2', 'US-IB2', 'US-Myb',
    'US-SRM', 'CA-Ca3', 'US-CRT', 'US-Fmf', 'US-KFS', 'US-Prr', 'US-UMd', 'US-Wjs',
    'BE-Bra', 'BE-Lon', 'CH-Lae', 'CZ-RAJ', 'DE-HoH', 'DE-Kli', 'DE-RuR', 'IL-Yat', 
    'IT-Tor', 'SE-Htm'], 
  ['AR-Vir', 'AT-Neu', 'AU-DaS', 'AU-TTE', 'AU-Wom', 'CA-TP1', 'IT-CA1', 'IT-SRo',
    'US-WPT', 'US-Wkg', 'CA-Ca2', 'CA-Cbo', 'CA-TP4', 'US-ARM', 'US-Ro1', 'US-Rws',
    'US-SRG', 'US-Vcm', 'BE-Dor', 'BE-Vie', 'CZ-Stn', 'DE-Geb', 'ES-LM2', 'FR-Fon', 
    'SE-Ros', 'DE-Hte'],
  ['AU-DaP', 'AU-Emr', 'AU-Gin', 'AU-How', 'AU-Rig', 'US-GLE', 'US-NR1', 'US-Twt',
    'CA-Ca1', 'CA-Gro', 'US-AR1', 'US-Bar', 'US-Mpj', 'US-Ses', 'CH-Fru', 'CH-Oe2',
    'DE-Hai', 'DK-Sor', 'FI-Hyy', 'FR-Aur', 'FR-Hes', 'GF-Guy', 'IT-SR2', 'SE-Deg',
    'SE-Nor', 'NL-Loo'],
  ['AU-Stp', 'AU-Whr', 'CA-Oas', 'DE-Lnf', 'ES-Amo', 'FI-Sod', 'IT-CA2', 'US-Ton',
    'US-Var', 'US-Whs', 'US-Ho1', 'US-Oho', 'US-Seg', 'CH-Dav', 'CZ-Lnz', 'CZ-wet',
    'DE-Gri', 'DE-Tha', 'ES-LM1', 'FR-Bil', 'FR-FBn', 'IT-BCi', 'IT-MBo', 'IT-Ren',
    'RU-Fyo']
]

In [8]:
# Check GPP summary by split: mean and variance of GPP_NT_VUT_REF over the
# rows of data_df whose site_id belongs to each V1 split.
for i, split_sites in enumerate(SITE_SPLITS):
    print(f"\nSplit {i}")
    in_split = data_df['site_id'].isin(split_sites)
    print(data_df.loc[in_split, 'GPP_NT_VUT_REF'].agg(['mean', 'var']))


Split 0
mean    3.46445
var    47.18681
Name: GPP_NT_VUT_REF, dtype: float64

Split 1
mean    3.15823
var    42.43168
Name: GPP_NT_VUT_REF, dtype: float64

Split 2
mean    3.58570
var    51.63695
Name: GPP_NT_VUT_REF, dtype: float64

Split 3
mean    3.64023
var    50.66586
Name: GPP_NT_VUT_REF, dtype: float64

Split 4
mean    3.08400
var    40.30168
Name: GPP_NT_VUT_REF, dtype: float64


The splits above show a fairly wide range of GPP variance (roughly 40 to 52), which is likely problematic. The GPP means by split are not far apart, but could be closer.

# Stratified Split V2 (Use clustering on IGBP, GPP_Mean, GPP_Var)

In [9]:
# Load Site data
site_metadata_df = pd.read_csv(site_metadata_filename, usecols=['site_id', 'IGBP', 'GPP_var', 'GPP_mean', 'GPP_var_bins', 'GPP_mean_bins'])

# only focus on target sites: rows with complete metadata whose site_id
# appears in data_df
site_metadata_df = site_metadata_df.dropna()
available_sites = data_df['site_id'].unique()
site_data_df = site_metadata_df.loc[site_metadata_df['site_id'].isin(available_sites)].copy()
print(f"available sites: {site_data_df.shape}")

# Group IGBP: fold WSA into SAV and CSH/OSH into SHB, then drop WAT sites.
# Done as one assignment rather than chained `col.replace(..., inplace=True)`
# calls, which warn on recent pandas and do nothing under Copy-on-Write.
site_data_df['gen_IGBP'] = site_data_df['IGBP'].replace({'WSA': 'SAV', 'CSH': 'SHB', 'OSH': 'SHB'})
site_data_df = site_data_df.loc[site_data_df['gen_IGBP'] != 'WAT']

# Make site_id the index (the site_id column itself is kept)
site_index = site_data_df['site_id'].values
site_data_df = site_data_df.set_index(site_index)

available sites: (128, 6)


In [13]:
# Encode categoricals to use in clustering
cat_features = ['gen_IGBP']
num_features = ['GPP_mean', 'GPP_var']

# One-hot encode the categorical column(s); the encoder returns a sparse
# matrix, so densify it and label rows by site and columns by category.
encoder = OneHotEncoder()
cat_feature_encoded = encoder.fit_transform(site_data_df[cat_features].to_numpy())
cat_feature_df = pd.DataFrame(
    data=cat_feature_encoded.toarray(),
    index=site_index,
    columns=encoder.get_feature_names_out(cat_features),
)

# Recombine all together: indicator columns first, numeric columns after
numeric_feature_df = site_data_df[num_features]
cluster_df = pd.concat([cat_feature_df, numeric_feature_df], axis=1)
print(cluster_df.shape)
cluster_df.head()

(128, 11)


Unnamed: 0,gen_IGBP_CRO,gen_IGBP_DBF,gen_IGBP_EBF,gen_IGBP_ENF,gen_IGBP_GRA,gen_IGBP_MF,gen_IGBP_SAV,gen_IGBP_SHB,gen_IGBP_WET,GPP_mean,GPP_var
AR-SLu,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,6.8966,88.14843
AR-Vir,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,10.67907,195.83335
AT-Neu,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,6.63636,146.69835
AU-ASM,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.74776,3.32712
AU-Cum,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.2106,24.54087


In [14]:
# Cluster sites on above criteria
# NOTE(review): features go into KMeans unscaled; GPP_var spans a far larger
# range than the 0/1 indicator columns (see the preview above), so it likely
# dominates the distance -- confirm this is intended.
n = 5

# Fit on the explicit feature columns only. This cell adds 'cluster' and
# 'site_id' columns to cluster_df, so selecting the features by name keeps
# the cell safe to re-run without feeding those columns back into KMeans.
feature_cols = list(cat_feature_df.columns) + num_features
kmeans = KMeans(n_clusters=n, random_state=42)
kmeans.fit(cluster_df[feature_cols])
cluster_labels = kmeans.labels_
cluster_df['cluster'] = cluster_labels

# Get the value counts
unique, counts = np.unique(cluster_labels, return_counts=True)
for val, count in zip(unique, counts):
    print(f'{val}: {count}')

# View df: move the site index into a 'site_id' column (only on the first
# run; on a re-run the column already exists)
if 'site_id' not in cluster_df.columns:
    cluster_df.reset_index(inplace=True, names=['site_id'])
cluster_df.head(2)

0: 33
1: 4
2: 19
3: 29
4: 43


Unnamed: 0,site_id,gen_IGBP_CRO,gen_IGBP_DBF,gen_IGBP_EBF,gen_IGBP_ENF,gen_IGBP_GRA,gen_IGBP_MF,gen_IGBP_SAV,gen_IGBP_SHB,gen_IGBP_WET,GPP_mean,GPP_var,cluster
0,AR-SLu,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,6.8966,88.14843,2
1,AR-Vir,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,10.67907,195.83335,1


In [18]:
# Conduct k-fold splitting of sites, stratified on the cluster label
n = 5
skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=42) # Add random state for reproducibility
folds = skf.split(cluster_df['site_id'], cluster_df['cluster'])

site_splits = []
split_dict = {}
site_cluster_df = cluster_df[['site_id', 'cluster']]
for i, (train_index, test_index) in enumerate(folds):
    # The held-out rows of each fold define that fold's list of sites.
    split_df = site_cluster_df.iloc[test_index]
    sites = split_df['site_id'].unique().tolist()
    site_splits.append(sites)
    split_dict[f"fold_{i+1}"] = sites

    print(f"Fold {i+1}:")
    print(f"  Count={test_index.shape}")
    print(f"  Cluster ={np.sort(split_df.cluster.unique())}")
    print(f"  Sites={sites}\n")
    # Number of sites per cluster inside this fold
    for val, count in zip(*np.unique(split_df.cluster, return_counts=True)):
        print(f'Cluster {val}: {count}')


# print all sites
print(site_splits)

# Add val, test indices to split_dict before saving
split_dict.update({'VAL_INDEX': 3, 'TEST_INDEX': 4, 'NUM_FOLDS': n})
print("\n")
print(split_dict)

Fold 1:
  Count=(26,)
  Cluster =[0 1 2 3 4]
  Sites=['AR-SLu', 'AU-Cum', 'AU-DaP', 'AU-Rig', 'CA-TPD', 'CN-Sw2', 'ES-Amo', 'FI-Sod', 'IT-SRo', 'US-GLE', 'US-WPT', 'US-AR1', 'US-AR2', 'US-CRT', 'US-Oho', 'US-UMB', 'US-Vcp', 'BE-Lon', 'CH-Cha', 'CZ-KrP', 'CZ-Lnz', 'DE-Gri', 'DE-Hai', 'DE-Obe', 'ES-LM1', 'SE-Lnn']

Cluster 0: 6
Cluster 1: 1
Cluster 2: 4
Cluster 3: 6
Cluster 4: 9
Fold 2:
  Count=(26,)
  Cluster =[0 1 2 3 4]
  Sites=['AU-Emr', 'AU-Gin', 'AU-How', 'CZ-BK2', 'DE-Lnf', 'DE-Spw', 'FR-Pue', 'US-NR1', 'US-Ton', 'US-Whs', 'US-Bar', 'US-Seg', 'BE-Bra', 'CH-Dav', 'CZ-RAJ', 'DE-RuR', 'FI-Hyy', 'FI-Let', 'FR-FBn', 'FR-Fon', 'GF-Guy', 'IT-BCi', 'IT-Ren', 'RU-Fyo', 'SE-Deg', 'SE-Htm']

Cluster 0: 6
Cluster 1: 1
Cluster 2: 4
Cluster 3: 6
Cluster 4: 9
Fold 3:
  Count=(26,)
  Cluster =[0 1 2 3 4]
  Sites=['AT-Neu', 'AU-RDF', 'AU-Stp', 'IT-CA1', 'IT-Noe', 'IT-Ro2', 'US-IB2', 'US-Me6', 'US-Syv', 'US-Twt', 'US-KFS', 'US-Ro1', 'US-Rws', 'US-SRG', 'US-Ses', 'US-Tw4', 'BE-Vie', 'CZ-wet', 'DE-Ho



In [16]:
# Save out as dict, unless a file is already present at the target path
save_split_path = os.path.join(preproc_objects_dir, 'clustered_stratified_splits_k5.joblib')
if os.path.exists(save_split_path):
    # An existing file is never overwritten; only a message is printed.
    print("Path exists, rename to avoid overwriting")
else:
    print("Saving split dict")
    joblib.dump(split_dict, save_split_path)

Path exists, rename to avoid overwriting


In [19]:
# Reload the split definition from disk and report, for each fold, the mean
# and variance of GPP_NT_VUT_REF over the rows of data_df from its sites.
split_dict = joblib.load(save_split_path)

for i in range(1, n+1):
    print(f"\nSplit {i}")
    fold_sites = split_dict[f'fold_{i}']
    fold_rows = data_df['site_id'].isin(fold_sites)
    print(data_df.loc[fold_rows, 'GPP_NT_VUT_REF'].agg(['mean', 'var']))


Split 1
mean    3.35864
var    49.99133
Name: GPP_NT_VUT_REF, dtype: float64

Split 2
mean    3.63331
var    47.49071
Name: GPP_NT_VUT_REF, dtype: float64

Split 3
mean    3.36472
var    44.55653
Name: GPP_NT_VUT_REF, dtype: float64

Split 4
mean    3.18848
var    45.79553
Name: GPP_NT_VUT_REF, dtype: float64

Split 5
mean    3.45568
var    46.72548
Name: GPP_NT_VUT_REF, dtype: float64


## Stratified Splits V3 (Balance GPP Mean by binning)
This approach ignores IGBP. I think V2 is the better approach.

In [20]:
# Load Site data
site_metadata_df = pd.read_csv(site_metadata_filename, usecols=['site_id', 'IGBP', 'GPP_var', 'GPP_mean', 'GPP_var_bins', 'GPP_mean_bins'])

# only focus on target sites: rows with complete metadata whose site_id
# appears in data_df
site_metadata_df = site_metadata_df.dropna()
available_sites = data_df['site_id'].unique()
# .copy() so the column assignment below acts on an independent frame
# instead of a slice of site_metadata_df
site_data_df = site_metadata_df.loc[site_metadata_df['site_id'].isin(available_sites)].copy()
print(f"available sites: {site_data_df.shape}")

# Group IGBP: fold WSA into SAV and CSH/OSH into SHB, then drop WAT sites.
# Done as one assignment rather than chained `col.replace(..., inplace=True)`
# calls, which warn on recent pandas and do nothing under Copy-on-Write.
site_data_df['gen_IGBP'] = site_data_df['IGBP'].replace({'WSA': 'SAV', 'CSH': 'SHB', 'OSH': 'SHB'})
site_data_df = site_data_df.loc[site_data_df['gen_IGBP'] != 'WAT'].copy()

# site_id stays a regular column in this variant; the index is not replaced.

available sites: (128, 6)


In [21]:
# Conduct k-fold splitting of sites, stratified on the GPP mean bin
n = 5
skf = StratifiedKFold(n_splits=n, shuffle=True, random_state=42) # Add random state for reproducibility
folds = skf.split(site_data_df['site_id'], site_data_df['GPP_mean_bins'])

site_splits = []
split_dict = {}
fold_view_df = site_data_df[['site_id', 'GPP_mean_bins', 'gen_IGBP']]
for i, (train_index, test_index) in enumerate(folds):
    # The held-out rows of each fold define that fold's list of sites.
    split_df = fold_view_df.iloc[test_index]
    sites = split_df['site_id'].unique().tolist()
    site_splits.append(sites)
    split_dict[f"fold_{i+1}"] = sites

    print(f"Fold {i+1}:")
    print(f"  Count={test_index.shape}")
    print(f"  gen_IGBP ={np.sort(split_df.gen_IGBP.unique())}")
    print(f"  Sites={sites}\n")


# print all sites
print(site_splits)

# Add val, test indices to split_dict before saving
split_dict.update({'VAL_INDEX': 3, 'TEST_INDEX': 4, 'NUM_FOLDS': n})
print("\n")
print(split_dict)

Fold 1:
  Count=(26,)
  gen_IGBP =['CRO' 'DBF' 'EBF' 'ENF' 'GRA' 'MF' 'SAV' 'SHB' 'WET']
  Sites=['AR-SLu', 'AR-Vir', 'AU-DaP', 'AU-DaS', 'AU-RDF', 'AU-Rig', 'AU-Stp', 'CA-Oas', 'CA-TP3', 'DE-SfN', 'FR-Pue', 'US-GLE', 'US-Me6', 'US-WCr', 'CA-Ca1', 'US-ARM', 'US-Ses', 'US-UMB', 'BE-Lon', 'CZ-Stn', 'DE-HoH', 'ES-LJu', 'FR-Fon', 'IT-Lav', 'RU-Fyo', 'NL-Loo']

Fold 2:
  Count=(26,)
  gen_IGBP =['CRO' 'DBF' 'EBF' 'ENF' 'GRA' 'MF' 'SAV' 'WET']
  Sites=['AU-Cum', 'AU-How', 'AU-TTE', 'CA-TPD', 'CZ-BK2', 'IT-CA1', 'IT-CA2', 'US-IB2', 'US-SRM', 'US-WPT', 'CA-Ca2', 'US-AR2', 'US-Fmf', 'US-UMd', 'US-Vcm', 'BE-Vie', 'CH-Cha', 'CH-Fru', 'CH-Lae', 'CZ-RAJ', 'CZ-wet', 'DE-Gri', 'FI-Let', 'IT-BCi', 'IT-Ren', 'SE-Lnn']

Fold 3:
  Count=(26,)
  gen_IGBP =['CRO' 'DBF' 'ENF' 'GRA' 'MF' 'SAV' 'WET']
  Sites=['AU-Gin', 'CA-TP1', 'CN-Sw2', 'DE-Spw', 'FI-Sod', 'IT-Ro2', 'IT-SRo', 'US-Myb', 'US-NR1', 'US-Syv', 'US-Wkg', 'CA-Ca3', 'US-CRT', 'US-Oho', 'US-Ro1', 'US-SRG', 'US-Seg', 'US-Tw4', 'BE-Bra', 'BE-Dor', 'C

In [22]:
# Mean and variance of GPP_NT_VUT_REF per fold of the in-memory split_dict.
for i in range(1, n+1):
    print(f"\nSplit {i}")
    fold_sites = split_dict[f'fold_{i}']
    fold_rows = data_df['site_id'].isin(fold_sites)
    print(data_df.loc[fold_rows, 'GPP_NT_VUT_REF'].agg(['mean', 'var']))


Split 1
mean    3.33880
var    45.20372
Name: GPP_NT_VUT_REF, dtype: float64

Split 2
mean    3.81129
var    54.71842
Name: GPP_NT_VUT_REF, dtype: float64

Split 3
mean    3.11827
var    42.16004
Name: GPP_NT_VUT_REF, dtype: float64

Split 4
mean    3.59823
var    51.18516
Name: GPP_NT_VUT_REF, dtype: float64

Split 5
mean    3.13887
var    40.28240
Name: GPP_NT_VUT_REF, dtype: float64
