# Data Splitting

## Notebook Setup

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GroupShuffleSplit
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings

# Pandas behaviour / display settings used throughout this notebook.
pd.options.mode.chained_assignment = None                # turn off the chained-assignment check
pd.set_option('display.float_format', '{:.2f}'.format)   # render floats with two decimals
pd.set_option('display.max_columns', None)               # no limit on displayed columns
pd.set_option('display.max_rows', None)                  # no limit on displayed rows

# Ignore Warnings
for ignored_category in (FutureWarning, UserWarning):
    warnings.simplefilter('ignore', category=ignored_category)

## Import Data

In [None]:
# Load the gcb_v4 parquet extract into a DataFrame.
gcb_path = "/work/data/Global_Coral_Bleaching_DB/gcb_v4.parquet"
gcb = pd.read_parquet(gcb_path)
#gcb_class = pd.read_parquet("/work/data/Global_Coral_Bleaching_DB/gcb_v5.parquet")

## Drop Imputed Bleaching Data, Non-Hard Coral Data, and Measurements pre-1998

In [None]:
# Keep only rows that are hard coral, have a non-imputed bleaching value,
# and were recorded in 1998 or later. The three conditions are row-wise,
# so combining them into one mask selects the same rows in the same order.
is_hard_coral = gcb["Substrate_Name"].eq("Hard Coral")
is_not_imputed = gcb["Bleached_Value_Imputed"].eq(False)
is_1998_or_later = gcb["Year"].ge(1998)
gcb = gcb[is_hard_coral & is_not_imputed & is_1998_or_later]

## Drop Data Leakage Columns

In [None]:
# Columns removed because (per the section heading) they would leak the
# bleaching outcome into the feature set.
leakage_columns = [
    'S1', 'S2', 'S3', 'S4',
    'Bleaching_Level',
    'Bleaching_Prevalence_Score',
    'Severity_ID',
    'Severity_Code',
    'Bleaching_Prevalence_Score_ID',
    'Percent_Bleached',
]
gcb.drop(columns=leakage_columns, inplace=True)

# gcb_class.drop(columns=['S1','S2','S3','S4','Bleaching_Level','Bleaching_Prevalence_Score',
#                   'Severity_ID','Severity_Code','Bleaching_Prevalence_Score_ID',
#                   'Percent_Bleached','Percent_Bleached_Value'], inplace=True)

## Perform Split

In [None]:
# PROVINCE values that make up the "SEAA" regional subset.
SEAA_PROVINCE = [
    'Western Coral Triangle',
    'Eastern Coral Triangle',
    'Sunda Shelf',
    'Tropical Southwestern Pacific',
    'Northeast Australian Shelf',
    'Sahul Shelf',
    'South China Sea',
    'Andaman',
    'Java Transitional',
]

# PROVINCE values that make up the "CARB" regional subset.
CARB_PROVINCE = [
    'Tropical Northwestern Atlantic',
]

In [None]:
# Carve the two regional subsets out of the full frame by PROVINCE membership.
in_seaa = gcb['PROVINCE'].isin(SEAA_PROVINCE)
in_carb = gcb['PROVINCE'].isin(CARB_PROVINCE)

SEAA = gcb[in_seaa]
CARB = gcb[in_carb]

In [None]:
# (Disabled) Group split that keeps all observations for a site in a single split to prevent data leakage.
# The commented-out code below is not executed.
# gss = GroupShuffleSplit(n_splits=split_size, train_size=tr_size, random_state=rand_state)
# for train_idx, holdout_idx in gss.split(gcb, groups=gcb.Site_ID):
#     train_df = gcb.iloc[train_idx]
#     holdout_df = gcb.iloc[holdout_idx]
    # train_df_class = gcb_class.iloc[train_idx]
    # holdout_df_class = gcb_class.iloc[holdout_idx]
# Split configuration shared by every region.
tr_size = 0.7      # fraction of rows used for training
val_size = 0.2     # fraction of rows used for validation
rand_state = 42    # seed passed to the shuffle and the splits
split_size = 1     # only referenced by the commented-out GroupShuffleSplit code

def custom_shuffle_split(df, tr_size, val_size, rand_state):
    """Randomly split ``df`` row-wise into train, validation and holdout frames.

    The rows are shuffled once with ``DataFrame.sample`` and then divided by
    two successive ``train_test_split`` calls: the first takes ``tr_size`` of
    the rows for training, the second divides the remainder into validation
    and holdout.

    Rows are assigned to splits individually; nothing in this function groups
    rows (for example by site), so related rows can land in different splits.

    Args:
        df: DataFrame to split.
        tr_size: Fraction of all rows for the training set, strictly between
            0 and 1.
        val_size: Fraction of all rows for the validation set, greater than 0.
            ``tr_size + val_size`` must be below 1 so the holdout set is not
            empty.
        rand_state: Seed used for the shuffle and for both splits.

    Returns:
        A ``(train_df, val_df, holdout_df)`` tuple.

    Raises:
        ValueError: If the fractions do not leave room for three non-empty
            splits.
    """
    # Fail early with a readable message instead of a ZeroDivisionError or an
    # opaque scikit-learn error further down.
    if not (0 < tr_size < 1 and 0 < val_size and tr_size + val_size < 1):
        raise ValueError(
            "tr_size and val_size must be positive fractions with "
            f"tr_size + val_size < 1 (got tr_size={tr_size!r}, val_size={val_size!r})"
        )

    # Shuffle the data
    shuffled_data = df.sample(frac=1, random_state=rand_state)

    # First cut: training rows versus everything else.
    train_df, remaining_data = train_test_split(
        shuffled_data, train_size=tr_size, random_state=rand_state
    )

    # val_size is a fraction of the whole frame, so rescale it to a fraction
    # of the rows left after the training cut.
    val_share_of_remaining = val_size / (1 - tr_size)
    holdout_df, val_df = train_test_split(
        remaining_data, test_size=val_share_of_remaining, random_state=rand_state
    )

    return train_df, val_df, holdout_df

In [None]:
# Apply the same fractions and seed to the two regional subsets and to the full frame.
split_args = (tr_size, val_size, rand_state)

train_df_SEAA, val_df_SEAA, holdout_df_SEAA = custom_shuffle_split(SEAA, *split_args)
train_df_CARB, val_df_CARB, holdout_df_CARB = custom_shuffle_split(CARB, *split_args)
train_df_GLOB, val_df_GLOB, holdout_df_GLOB = custom_shuffle_split(gcb, *split_args)

## Validation of Distributions

In [None]:
# Validate Depth
#
# One panel per region; each panel overlays the Depth_m density of the
# train / val / holdout splits so their distributions can be compared by eye.

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 4))

alpha = 0.05

# (label, colour) for the three splits, in drawing order.
split_styles = (('train', 'b'), ('val', 'g'), ('holdout', 'r'))

# Region name -> (train, val, holdout) frames, in panel order.
depth_splits = {
    'SEAA': (train_df_SEAA, val_df_SEAA, holdout_df_SEAA),
    'CARB': (train_df_CARB, val_df_CARB, holdout_df_CARB),
    'GLOB': (train_df_GLOB, val_df_GLOB, holdout_df_GLOB),
}

for panel, (region, region_frames) in zip(axes, depth_splits.items()):
    for (label, color), split_frame in zip(split_styles, region_frames):
        # ``fill`` replaces seaborn's deprecated ``shade`` keyword.
        sns.kdeplot(split_frame['Depth_m'], color=color, fill=True, alpha=alpha, label=label, ax=panel)
    panel.set_title(f'{region}_Depth')
    panel.legend()

plt.tight_layout()
plt.show()

In [None]:
# Validate longitude / latitude
#
# Top row: Longitude_Degrees, bottom row: Latitude_Degrees. One column per
# region; each panel overlays the train / val / holdout densities.

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(12, 6))

alpha = 0.05

# (label, colour) for the three splits, in drawing order.
split_styles = (('train', 'b'), ('val', 'g'), ('holdout', 'r'))

# (column to plot, suffix used in the panel title), one entry per grid row.
coordinate_rows = (('Longitude_Degrees', 'Long'), ('Latitude_Degrees', 'Lat'))

# Region name -> (train, val, holdout) frames, one entry per grid column.
coordinate_splits = {
    'SEAA': (train_df_SEAA, val_df_SEAA, holdout_df_SEAA),
    'CARB': (train_df_CARB, val_df_CARB, holdout_df_CARB),
    'GLOB': (train_df_GLOB, val_df_GLOB, holdout_df_GLOB),
}

for row_idx, (coordinate, title_suffix) in enumerate(coordinate_rows):
    for col_idx, (region, region_frames) in enumerate(coordinate_splits.items()):
        panel = axes[row_idx, col_idx]
        for (label, color), split_frame in zip(split_styles, region_frames):
            # ``fill`` replaces seaborn's deprecated ``shade`` keyword.
            sns.kdeplot(split_frame[coordinate], color=color, fill=True, alpha=alpha, label=label, ax=panel)
        panel.set_title(f'{region}_{title_suffix}')
        panel.legend()

plt.tight_layout()
plt.show()

In [None]:
# Validate Time of Measurement
#
# 3x3 grid of bar charts showing the number of rows per Date_Month value.
# Rows of the grid are regions (SEAA, CARB, GLOB); columns are splits
# (train, val, holdout), following the insertion order of ``frames``.
#
# NOTE(review): the original note here read "split by northern/southern
# hemisphere first, then month"; no hemisphere split is done below.
# NOTE(review): bars are coloured by grid row (region), while the KDE plots in
# this notebook use blue/green/red for train/val/holdout. Confirm this is intended.

fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(12, 12))

frames = {
    'train_SEAA': train_df_SEAA, 'val_SEAA': val_df_SEAA, 'holdout_SEAA': holdout_df_SEAA,
    'train_CARB': train_df_CARB, 'val_CARB': val_df_CARB, 'holdout_CARB': holdout_df_CARB,
    'train_GLOB': train_df_GLOB, 'val_GLOB': val_df_GLOB, 'holdout_GLOB': holdout_df_GLOB,
}

colors = ['blue', 'green', 'red']
alpha = 0.3

for counter, (df_name, df) in enumerate(frames.items()):
    # Fill the grid row by row: three panels per row.
    i, j = divmod(counter, 3)
    panel = axes[i, j]

    month_counts = df.groupby('Date_Month')['Date_Month'].count()
    sns.barplot(x=month_counts.index, y=month_counts.values, ax=panel, color=colors[i], alpha=alpha)
    panel.set_title(df_name)

    # Label the bars with integer month values.
    panel.set_xticks(range(len(month_counts.index)))
    panel.set_xticklabels(month_counts.index.astype(int))

plt.tight_layout()
plt.show()

## Create X & y Training, Validation, and Holdout DFs

In [None]:
def X_y_split(train, val, holdout, target='Percent_Bleached_Value'):
    """Separate features and target for the train, validation and holdout frames.

    Args:
        train: Training DataFrame containing the ``target`` column.
        val: Validation DataFrame containing the ``target`` column.
        holdout: Holdout DataFrame containing the ``target`` column.
        target: Name of the column to use as ``y``. Defaults to
            ``'Percent_Bleached_Value'``.

    Returns:
        A 6-tuple ``(X_train, y_train, X_val, y_val, X_holdout, y_holdout)``.
        Each ``X_*`` is the input frame without the ``target`` column and
        each ``y_*`` is that column as a Series. The inputs are not modified.

    Raises:
        KeyError: If ``target`` is missing from any of the frames.
    """
    pieces = []
    for frame in (train, val, holdout):
        pieces.append(frame.drop(columns=target))  # features
        pieces.append(frame[target])               # target
    return tuple(pieces)

# Feature / target pairs for each region's train, validation and holdout splits.
(X_train_SEAA, y_train_SEAA,
 X_val_SEAA, y_val_SEAA,
 X_holdout_SEAA, y_holdout_SEAA) = X_y_split(train_df_SEAA, val_df_SEAA, holdout_df_SEAA)

(X_train_CARB, y_train_CARB,
 X_val_CARB, y_val_CARB,
 X_holdout_CARB, y_holdout_CARB) = X_y_split(train_df_CARB, val_df_CARB, holdout_df_CARB)

(X_train_GLOB, y_train_GLOB,
 X_val_GLOB, y_val_GLOB,
 X_holdout_GLOB, y_holdout_GLOB) = X_y_split(train_df_GLOB, val_df_GLOB, holdout_df_GLOB)


# # Create X & y Training and Holdout DFs for Classification
# X_train_class = train_df_class.drop(columns='Bleached_Class')
# X_holdout_class = holdout_df_class.drop(columns='Bleached_Class')

# y_train_class = train_df_class.Bleached_Class
# y_holdout_class = holdout_df_class.Bleached_Class

In [None]:
# Write out SEAA parquet files to DeepNote
#
# File stem -> table to write, listed in write order. The y Series are
# wrapped in single-column frames so they can be written as parquet.
seaa_deepnote_tables = {
    'train_df_SEAA': train_df_SEAA,
    'holdout_df_SEAA': holdout_df_SEAA,
    'val_df_SEAA': val_df_SEAA,
    'X_train_SEAA': X_train_SEAA,
    'y_train_SEAA': y_train_SEAA.to_frame('y_train'),
    'X_val_SEAA': X_val_SEAA,
    'y_val_SEAA': y_val_SEAA.to_frame('y_val'),
    'X_holdout_SEAA': X_holdout_SEAA,
    'y_holdout_SEAA': y_holdout_SEAA.to_frame('y_holdout'),
}

for file_stem, table in seaa_deepnote_tables.items():
    table.to_parquet(f'/work/data/Xy_Data/{file_stem}.parquet')


In [None]:
# Write out CARB parquet files to DeepNote
#
# File stem -> table to write, listed in write order. The y Series are
# wrapped in single-column frames so they can be written as parquet.
carb_deepnote_tables = {
    'train_df_CARB': train_df_CARB,
    'holdout_df_CARB': holdout_df_CARB,
    'val_df_CARB': val_df_CARB,
    'X_train_CARB': X_train_CARB,
    'y_train_CARB': y_train_CARB.to_frame('y_train'),
    'X_val_CARB': X_val_CARB,
    'y_val_CARB': y_val_CARB.to_frame('y_val'),
    'X_holdout_CARB': X_holdout_CARB,
    'y_holdout_CARB': y_holdout_CARB.to_frame('y_holdout'),
}

for file_stem, table in carb_deepnote_tables.items():
    table.to_parquet(f'/work/data/Xy_Data/{file_stem}.parquet')

In [None]:
# Write out GLOB parquet files to DeepNote
#
# File stem -> table to write, listed in write order. The y Series are
# wrapped in single-column frames so they can be written as parquet.
glob_deepnote_tables = {
    'train_df_GLOB': train_df_GLOB,
    'holdout_df_GLOB': holdout_df_GLOB,
    'val_df_GLOB': val_df_GLOB,
    'X_train_GLOB': X_train_GLOB,
    'y_train_GLOB': y_train_GLOB.to_frame('y_train'),
    'X_val_GLOB': X_val_GLOB,
    'y_val_GLOB': y_val_GLOB.to_frame('y_val'),
    'X_holdout_GLOB': X_holdout_GLOB,
    'y_holdout_GLOB': y_holdout_GLOB.to_frame('y_holdout'),
}

for file_stem, table in glob_deepnote_tables.items():
    table.to_parquet(f'/work/data/Xy_Data/{file_stem}.parquet')


In [None]:
# Write out SEAA parquet files to S3
#
# File stem -> table to write, listed in write order. The y Series are
# wrapped in single-column frames so they can be written as parquet.
seaa_s3_tables = {
    'train_df_SEAA': train_df_SEAA,
    'holdout_df_SEAA': holdout_df_SEAA,
    'val_df_SEAA': val_df_SEAA,
    'X_train_SEAA': X_train_SEAA,
    'y_train_SEAA': y_train_SEAA.to_frame('y_train'),
    'X_val_SEAA': X_val_SEAA,
    'y_val_SEAA': y_val_SEAA.to_frame('y_val'),
    'X_holdout_SEAA': X_holdout_SEAA,
    'y_holdout_SEAA': y_holdout_SEAA.to_frame('y_holdout'),
}

for file_stem, table in seaa_s3_tables.items():
    table.to_parquet(f'/datasets/s3/data/Xy_Data/{file_stem}.parquet')


In [None]:
# Write out CARB parquet files to S3
#
# File stem -> table to write, listed in write order. The y Series are
# wrapped in single-column frames so they can be written as parquet.
carb_s3_tables = {
    'train_df_CARB': train_df_CARB,
    'holdout_df_CARB': holdout_df_CARB,
    'val_df_CARB': val_df_CARB,
    'X_train_CARB': X_train_CARB,
    'y_train_CARB': y_train_CARB.to_frame('y_train'),
    'X_val_CARB': X_val_CARB,
    'y_val_CARB': y_val_CARB.to_frame('y_val'),
    'X_holdout_CARB': X_holdout_CARB,
    'y_holdout_CARB': y_holdout_CARB.to_frame('y_holdout'),
}

for file_stem, table in carb_s3_tables.items():
    table.to_parquet(f'/datasets/s3/data/Xy_Data/{file_stem}.parquet')

In [None]:
# Write out GLOB parquet files to S3
#
# File stem -> table to write, listed in write order. The y Series are
# wrapped in single-column frames so they can be written as parquet.
glob_s3_tables = {
    'train_df_GLOB': train_df_GLOB,
    'holdout_df_GLOB': holdout_df_GLOB,
    'val_df_GLOB': val_df_GLOB,
    'X_train_GLOB': X_train_GLOB,
    'y_train_GLOB': y_train_GLOB.to_frame('y_train'),
    'X_val_GLOB': X_val_GLOB,
    'y_val_GLOB': y_val_GLOB.to_frame('y_val'),
    'X_holdout_GLOB': X_holdout_GLOB,
    'y_holdout_GLOB': y_holdout_GLOB.to_frame('y_holdout'),
}

for file_stem, table in glob_s3_tables.items():
    table.to_parquet(f'/datasets/s3/data/Xy_Data/{file_stem}.parquet')

In [None]:
# ## Write out parquet files for classification

# # Write out parquet files to DeepNote
# train_df_class.to_parquet('/work/data/Xy_Data/train_df_class.parquet')
# holdout_df_class.to_parquet('/work/data/Xy_Data/holdout_df_class.parquet')

# X_train_class.to_parquet('/work/data/Xy_Data/X_train_class.parquet')
# y_train_class.to_frame('y_train_class').to_parquet('/work/data/Xy_Data/y_train_class.parquet')

# X_holdout_class.to_parquet('/work/data/Xy_Data/X_holdout_class.parquet')
# y_holdout_class.to_frame('y_holdout_class').to_parquet('/work/data/Xy_Data/y_holdout_class.parquet')

# # Write out parquet files to S3
# train_df_class.to_parquet('/datasets/s3/data/Xy_Data/train_df_class.parquet')
# holdout_df_class.to_parquet('/datasets/s3/data/Xy_Data/holdout_df_class.parquet')

# X_train_class.to_parquet('/datasets/s3/data/Xy_Data/X_train_class.parquet')
# y_train_class.to_frame('y_train_class').to_parquet('/datasets/s3/data/Xy_Data/y_train_class.parquet')

# X_holdout_class.to_parquet('/datasets/s3/data/Xy_Data/X_holdout_class.parquet')
# y_holdout_class.to_frame('y_holdout_class').to_parquet('/datasets/s3/data/Xy_Data/y_holdout_class.parquet')