In [None]:
# Import Standard Libraries
import os
import pandas as pd
import geopandas as gpd
import numpy as np

# Import Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import plotly.express as px
import plotly.graph_objects as go

# Import ML Libraries
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, silhouette_samples

# Import custom functions
import env_functions as ef
import s3_functions as sf
import common_functions as cf

In [ ]:
# Determine the environment and get appropriate vars
deepnote, env_vars = ef.load_env_vars()

# Iterate through the vars and set them as global vars
for var_name, var in env_vars.items():
    globals()[var_name] = var

# If not in the DeepNote environment, create a dict for aws creds
#   that were located in the environment file.  This will be passed
#   to all aws s3 functions.
if not deepnote:
    aws_env_vars = {
        'access_key_id': aws_access_key_id,
        'secret_access_key': aws_secret_access_key,
        'bucket_name': s3_bucket_name
    }

In [None]:
# Pandas Configs
pd.set_option('mode.chained_assignment', None)
pd.options.display.float_format = '{:.2f}'.format
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Ignore Warnings
import warnings
warnings.simplefilter('ignore', category=FutureWarning)
warnings.simplefilter('ignore', category=UserWarning)

# MapBox Token for Plotly Maps
px.set_mapbox_access_token(os.environ.get("MAPBOX_TOKEN"))

In [None]:
# Read in Data
if deepnote:
    gcb = pd.read_parquet("/work/data/Global_Coral_Bleaching_DB/gcb_v4.parquet")
else:
    gcb = pd.read_parquet(sf.load_from_s3(file_path="data/Global_Coral_Bleaching_DB/gcb_v4.parquet", **aws_env_vars))

gcb = gcb[(gcb["Substrate_Name"] == "Hard Coral")]

# Drop data leakage columns
gcb.drop(columns=['S1','S2','S3','S4','Bleaching_Level','Bleaching_Prevalence_Score',
                  'Severity_ID','Severity_Code','Bleaching_Prevalence_Score_ID',
                  'Percent_Bleached'], inplace=True)

ecoregions = gcb['Ecoregion_Name']
# Drop other columns that are not needed
gcb.drop(columns=['Sample_ID', 'Site_ID', 'Reef_ID', 'Date_Day', 'Date_Month',
                  'Date_Year', 'Ocean_Name','Realm_Name',
                  'Ecoregion_Name', 'Country_Name', 'State_Island_Province_Name',
                  'City_Town_Name_1', 'City_Town_Name_2', 'City_Town_Name_3',
                  'City_Town_Name_4', 'Data_Source', 'Bleached_Value_Imputed', 'Date',
                  'Month_Name', 'Month_Year', 'Country_Code', 'Exposure_Cat',
                  'Country_Name_Cat', 'Year', 'Exposure', 'Substrate_Name'], inplace=True)

# Drop y-values
#gcb.drop(columns=['Percent_Bleached_Value'], inplace=True)

In [None]:
# Define 2 regions are create separate dataframes
SEAA_PROVINCE = ['Western Coral Triangle', 'Eastern Coral Triangle', 'Sunda Shelf', 'Tropical Southwestern Pacific',
                 'Northeast Australian Shelf', 'Sahul Shelf', 'South China Sea', 'Andaman', 'Java Transitional']

CARB_PROVINCE = ['Tropical Northwestern Atlantic']

# Create seperate dataframes for each region
SEAA = gcb[gcb['PROVINCE'].isin(SEAA_PROVINCE)]
CARB = gcb[gcb['PROVINCE'].isin(CARB_PROVINCE)]

gcb.drop(columns=['PROVINCE'], inplace=True)
SEAA.drop(columns=['PROVINCE'], inplace=True)
CARB.drop(columns=['PROVINCE'], inplace=True)

### Determine the Number of Clusters

In [None]:
# Global
range_n_clusters = range(2,40)

sil_score = []
for n_clusters in range_n_clusters:
    # Initialize the clusterer with n_clusters value
    clusterer = KMeans(n_clusters=n_clusters, random_state=0)
    cluster_labels = clusterer.fit_predict(gcb)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed clusters
    silhouette_avg = silhouette_score(gcb, cluster_labels)
    sil_score.append(silhouette_avg)

fig = px.line(x=range_n_clusters, y=sil_score, markers=True, labels={'x':'Number of clusters, k', 'y':'Silhouette score'})
fig.update_layout(xaxis_title='Number of clusters, k', yaxis_title='Silhouette score', font=dict(size=18))
fig.show()

In [None]:
inertia = []
k_range = range(1, 40) 
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=0).fit(gcb)
    inertia.append(kmeans.inertia_)
  
fig = px.line(x=k_range, y=inertia, markers=True, labels={'x':'Number of clusters, k', 'y':'Inertia'})
fig.update_layout(title='Elbow Method For Optimal k (Global)', xaxis_title='Number of clusters, k', yaxis_title='Inertia')
fig.show()

In [None]:
# k means using 3 clusters and k-means++ initialization
kmeans = KMeans(n_clusters = 3, init='k-means++')
kmeans.fit(gcb)
pred = kmeans.predict(gcb)

gcb['cluster'] = pred
gcb['cluster'] = gcb['cluster'].astype(str)
gcb = gcb.merge(ecoregions, left_index=True, right_index=True)
gcb['cluster'].value_counts()
#gcb['cluster'].value_counts()

cluster
1    27962
0     5745
2      401
Name: count, dtype: int64

In [None]:
fig = px.scatter_geo(gcb, lat="Latitude_Degrees", lon="Longitude_Degrees",
                     color="cluster", projection="orthographic",hover_name='Ecoregion_Name')

fig.show()

### Cluster on each region

In [None]:
# SEAA Region
range_n_clusters = range(2,40) 

sil_score = []
for n_clusters in range_n_clusters:
    # Initialize the clusterer with n_clusters value
    clusterer = KMeans(n_clusters=n_clusters, random_state=0)
    cluster_labels = clusterer.fit_predict(SEAA)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed clusters
    silhouette_avg = silhouette_score(SEAA, cluster_labels)
    sil_score.append(silhouette_avg)

fig = px.line(x=range_n_clusters, y=sil_score, markers=True, labels={'x':'Number of clusters, k', 'y':'Silhouette score'})
fig.update_layout(xaxis_title='Number of clusters, k', yaxis_title='Silhouette score', font=dict(size=18))
fig.show()

In [None]:
inertia = []
k_range = range(1, 40) 
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=0).fit(SEAA)
    inertia.append(kmeans.inertia_)
  
fig = px.line(x=k_range, y=inertia, markers=True, labels={'x':'Number of clusters, k', 'y':'Inertia'})
fig.update_layout(title='Elbow Method For Optimal k (SEAA)', xaxis_title='Number of clusters, k', yaxis_title='Inertia')
fig.show()

In [None]:
# k means using 5 clusters and k-means++ initialization
kmeans = KMeans(n_clusters = 5, init='k-means++')
kmeans.fit(SEAA)
pred = kmeans.predict(SEAA)

SEAA['cluster'] = pred
SEAA['cluster'] = SEAA['cluster'].astype(str)
SEAA = SEAA.merge(ecoregions, left_index=True, right_index=True)
SEAA['cluster'].value_counts()
#SEAA['cluster'].value_counts()

cluster
0    11794
3     1369
2      215
1      182
4      120
Name: count, dtype: int64

In [None]:
fig = px.scatter_geo(SEAA, lat="Latitude_Degrees", lon="Longitude_Degrees",
                     color="cluster",hover_name='Ecoregion_Name')

fig.show()

In [None]:
# CARB Region
range_n_clusters = range(2,40) #[2, 3, 4, 5, 6, 7, 8, 9, 10]

sil_score = []
for n_clusters in range_n_clusters:
    # Initialize the clusterer with n_clusters value
    clusterer = KMeans(n_clusters=n_clusters, random_state=0)
    cluster_labels = clusterer.fit_predict(CARB)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed clusters
    silhouette_avg = silhouette_score(CARB, cluster_labels)
    sil_score.append(silhouette_avg)


fig = px.line(x=range_n_clusters, y=sil_score, markers=True, labels={'x':'Number of clusters, k', 'y':'Silhouette score'})
fig.update_layout(xaxis_title='Number of clusters, k', yaxis_title='Silhouette score', font=dict(size=18))
fig.show()

In [None]:
inertia = []
k_range = range(1, 40) 
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=0).fit(CARB)
    inertia.append(kmeans.inertia_)
  
fig = px.line(x=k_range, y=inertia, markers=True, labels={'x':'Number of clusters, k', 'y':'Inertia'})
fig.update_layout(title='Elbow Method For Optimal k (CARB)', xaxis_title='Number of clusters, k', yaxis_title='Inertia')
fig.show()

In [None]:
# k means using 2,13 clusters and k-means++ initialization
kmeans = KMeans(n_clusters = 2, init='k-means++')
kmeans.fit(CARB)
pred = kmeans.predict(CARB)

CARB['cluster'] = pred
CARB['cluster'] = CARB['cluster'].astype(str)
CARB = CARB.merge(ecoregions, left_index=True, right_index=True)
CARB['cluster'].value_counts()

cluster
1    9237
0    4031
Name: count, dtype: int64

In [None]:
fig = px.scatter_geo(CARB, lat="Latitude_Degrees", lon="Longitude_Degrees",
                     color="cluster", size="Percent_Bleached_Value",hover_name="Ecoregion_Name")

fig.show()

The clustering successfully identified USA territories

In [None]:
SEAA.columns

Index(['Depth_m', 'Latitude_Degrees', 'Longitude_Degrees', 'Distance_to_Shore',
       'Turbidity', 'Cyclone_Frequency', 'ClimSST', 'Temperature_Kelvin',
       'Temperature_Mean', 'Temperature_Minimum', 'Temperature_Maximum',
       'Temperature_Kelvin_Standard_Deviation', 'Windspeed', 'SSTA',
       'SSTA_Standard_Deviation', 'SSTA_Mean', 'SSTA_Minimum', 'SSTA_Maximum',
       'SSTA_Frequency', 'SSTA_Frequency_Standard_Deviation',
       'SSTA_FrequencyMax', 'SSTA_FrequencyMean', 'SSTA_DHW',
       'SSTA_DHW_Standard_Deviation', 'SSTA_DHWMax', 'SSTA_DHWMean', 'TSA',
       'TSA_Standard_Deviation', 'TSA_Minimum', 'TSA_Maximum', 'TSA_Mean',
       'TSA_Frequency', 'TSA_Frequency_Standard_Deviation', 'TSA_FrequencyMax',
       'TSA_FrequencyMean', 'TSA_DHW', 'TSA_DHW_Standard_Deviation',
       'TSA_DHWMax', 'TSA_DHWMean', 'Percent_Bleached_Value', 'AG.CON.FERT.ZS',
       'AG.LND.AGRI.K2', 'AG.LND.AGRI.ZS', 'AG.LND.FRST.K2', 'AG.LND.FRST.ZS',
       'AG.LND.IRIG.AG.ZS', 'EN.FSH.THRD.N

In [None]:
site_info = ['Percent_Bleached_Value', 'Latitude_Degrees', 'Longitude_Degrees', 
             'Distance_to_Shore', 'Turbidity', 'Temperature_Kelvin', 'Temperature_Mean', 
             'Temperature_Maximum', 'Temperature_Minimum', 'Windspeed']

fish_info = ['ER.FSH.CAPT.MT', 'ER.FSH.PROD.MT', 'ER.FSH.AQUA.MT', 'EN.FSH.THRD.NO']

agr_info = ['NV.AGR.TOTL.KD.ZG', 'NV.AGR.TOTL.ZS', 'AG.LND.FRST.ZS', 'AG.LND.IRIG.AG.ZS',
            'AG.LND.FRST.K2', 'AG.LND.AGRI.ZS', 'AG.LND.AGRI.K2', 'AG.CON.FERT.ZS']

weather_info = ['Latitude_Degrees', 'Longitude_Degrees', 'Cyclone_Frequency', 'ClimSST', 
                'Temperature_Kelvin', 'Temperature_Mean', 'Temperature_Minimum', 
                'Temperature_Maximum', 'Temperature_Kelvin_Standard_Deviation', 'Windspeed',
                'SSTA', 'SSTA_Standard_Deviation', 'SSTA_Mean', 'SSTA_Minimum', 'SSTA_Maximum',
                'SSTA_Frequency', 'SSTA_Frequency_Standard_Deviation',
                'SSTA_FrequencyMax', 'SSTA_FrequencyMean', 'SSTA_DHW',
                'SSTA_DHW_Standard_Deviation', 'SSTA_DHWMax', 'SSTA_DHWMean', 'TSA',
                'TSA_Standard_Deviation', 'TSA_Minimum', 'TSA_Maximum', 'TSA_Mean',
                'TSA_Frequency', 'TSA_Frequency_Standard_Deviation', 'TSA_FrequencyMax',
                'TSA_FrequencyMean', 'TSA_DHW', 'TSA_DHW_Standard_Deviation',
                'TSA_DHWMax', 'TSA_DHWMean']

In [None]:
GLOB_site_fish_ag = gcb[list(set(site_info + fish_info + agr_info))]

In [None]:
# GLOB_site_fish_ag Region
range_n_clusters = range(2,40) #[2, 3, 4, 5, 6, 7, 8, 9, 10]

sil_score = []
for n_clusters in range_n_clusters:
    # Initialize the clusterer with n_clusters value
    clusterer = KMeans(n_clusters=n_clusters, random_state=0, n_init=10)
    cluster_labels = clusterer.fit_predict(GLOB_site_fish_ag)

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed clusters
    silhouette_avg = silhouette_score(GLOB_site_fish_ag, cluster_labels)
    sil_score.append(silhouette_avg)


fig = px.line(x=range_n_clusters, y=sil_score, markers=True, labels={'x':'Number of clusters, k', 'y':'Silhouette score'})
fig.update_layout(xaxis_title='Number of clusters, k', yaxis_title='Silhouette score', font=dict(size=18))
fig.show()

In [None]:
inertia = []
k_range = range(1, 40) 
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(GLOB_site_fish_ag)
    inertia.append(kmeans.inertia_)
  
fig = px.line(x=k_range, y=inertia, markers=True, labels={'x':'Number of clusters, k', 'y':'Inertia'})
fig.update_layout(title='Elbow Method For Optimal k (GLOB_site_fish_ag)', xaxis_title='Number of clusters, k', yaxis_title='Inertia')
fig.show()

In [None]:
# k means using 6 clusters and k-means++ initialization
kmeans = KMeans(n_clusters = 6, init='k-means++', n_init='auto')
kmeans.fit(GLOB_site_fish_ag)
pred = kmeans.predict(GLOB_site_fish_ag)

GLOB_site_fish_ag['cluster'] = pred
GLOB_site_fish_ag['cluster'] = GLOB_site_fish_ag['cluster'].astype(str)
GLOB_site_fish_ag = GLOB_site_fish_ag.merge(ecoregions, left_index=True, right_index=True)
GLOB_site_fish_ag['cluster'].value_counts()

cluster
0    23890
4     4330
3     4303
2     1034
1      302
5      249
Name: count, dtype: int64

In [None]:
fig = px.scatter_geo(GLOB_site_fish_ag, lat="Latitude_Degrees", lon="Longitude_Degrees",
                     color="cluster", size="Percent_Bleached_Value",hover_name="Ecoregion_Name")

fig.show()

In [None]:
GLOB_site_weather = gcb[list(set(site_info + weather_info))]

In [None]:
inertia = []
k_range = range(1, 40) 
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(GLOB_site_weather)
    inertia.append(kmeans.inertia_)
  
fig = px.line(x=k_range, y=inertia, markers=True, labels={'x':'Number of clusters, k', 'y':'Inertia'})
fig.update_layout(title='Elbow Method For Optimal k (GLOB_site_weather)', xaxis_title='Number of clusters, k', yaxis_title='Inertia')
fig.show()

In [None]:
# k means using 10 clusters and k-means++ initialization
kmeans = KMeans(n_clusters = 10, init='k-means++', n_init='auto')
kmeans.fit(GLOB_site_weather)
pred = kmeans.predict(GLOB_site_weather)

GLOB_site_weather['cluster'] = pred
GLOB_site_weather['cluster'] = GLOB_site_weather['cluster'].astype(str)
GLOB_site_weather = GLOB_site_weather.merge(ecoregions, left_index=True, right_index=True)
GLOB_site_weather['cluster'].value_counts()

cluster
5    26489
0     3840
6     2172
3      629
4      564
7      188
9       86
8       73
2       45
1       22
Name: count, dtype: int64

In [None]:
fig = px.scatter_geo(GLOB_site_weather, lat="Latitude_Degrees", lon="Longitude_Degrees",
                     color="cluster", size="Percent_Bleached_Value",hover_name="Ecoregion_Name")

fig.show()

In [None]:
GLOB_site_weather.groupby('cluster')['Percent_Bleached_Value'].mean()


cluster
0    15.379447
1    16.028636
2    20.820000
3    17.030570
4    14.310469
5    12.832369
6    16.031564
7    18.010076
8    23.301712
9    16.019897
Name: Percent_Bleached_Value, dtype: float64

In [None]:
cluster_interest = GLOB_site_weather[GLOB_site_weather['cluster'] == '8']
cluster_interest.describe()

Unnamed: 0,TSA_DHWMax,Distance_to_Shore,SSTA_FrequencyMax,TSA_DHWMean,Temperature_Maximum,Temperature_Mean,Temperature_Kelvin_Standard_Deviation,SSTA_FrequencyMean,TSA_Minimum,SSTA_Minimum,...,Turbidity,TSA,TSA_DHW_Standard_Deviation,Temperature_Kelvin,TSA_Frequency,TSA_FrequencyMax,TSA_DHW,SSTA_Frequency_Standard_Deviation,TSA_Frequency_Standard_Deviation,Latitude_Degrees
count,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,...,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0,73.0
mean,9.632603,148830.388219,22.739726,0.420548,304.476027,299.819315,1.385068,3.342466,-5.751918,-3.097884,...,0.048458,-0.904658,1.157397,300.607671,3.219178,6.506849,0.873699,4.354247,1.343425,-6.083307
std,5.209337,6046.055416,7.19959,1.099469,1.079014,1.478821,0.463208,2.935507,1.178217,0.510548,...,0.020561,1.217798,1.292437,1.841663,3.76846,2.972668,4.784356,1.848305,0.527424,11.165637
min,4.03,136210.66,14.0,0.15,300.38,292.96,0.96,1.0,-10.23,-5.465556,...,0.0264,-6.07,0.61,290.1,0.0,3.0,0.0,2.76,0.87,-26.173
25%,6.92,146399.14,17.0,0.21,304.05,299.94,0.96,1.0,-6.62,-3.362222,...,0.0353,-1.6,0.86,299.98,0.0,5.0,0.0,2.76,1.12,-13.801528
50%,10.05,148186.12,20.0,0.23,304.05,300.3,1.2,3.0,-5.58,-2.86,...,0.0353,-0.86,0.86,300.61,1.0,7.0,0.0,3.93,1.23,-3.873361
75%,10.1,149777.99,29.0,0.34,305.32,300.3,1.72,5.0,-4.67,-2.81,...,0.0644,-0.35,1.06,301.01,7.0,7.0,0.0,6.01,1.27,-3.860389
max,40.71,171234.19,47.0,9.58,305.98,301.7,2.81,19.0,-4.21,-2.42,...,0.126711,2.76,11.18,303.97,21.0,26.0,40.45,10.98,3.82,27.889


In [None]:
GLOB_site_weather.describe()

Unnamed: 0,TSA_DHWMax,Distance_to_Shore,SSTA_FrequencyMax,TSA_DHWMean,Temperature_Maximum,Temperature_Mean,Temperature_Kelvin_Standard_Deviation,SSTA_FrequencyMean,TSA_Minimum,SSTA_Minimum,...,Turbidity,TSA,TSA_DHW_Standard_Deviation,Temperature_Kelvin,TSA_Frequency,TSA_FrequencyMax,TSA_DHW,SSTA_Frequency_Standard_Deviation,TSA_Frequency_Standard_Deviation,Latitude_Degrees
count,34108.0,34108.0,34108.0,34108.0,34108.0,34108.0,34108.0,34108.0,34108.0,34108.0,...,34108.0,34108.0,34108.0,34108.0,34108.0,34108.0,34108.0,34108.0,34108.0,34108.0
mean,10.382085,3795.68698,23.736286,0.573582,305.152932,300.463061,1.650495,5.201232,-6.450333,-3.479399,...,0.075372,-0.970373,1.408405,301.391394,2.053798,8.021595,1.251191,4.81622,1.750975,7.96075
std,6.079852,13666.991943,6.781766,1.150129,1.293308,1.53232,0.708189,2.37577,2.372,0.600845,...,0.061347,1.655848,1.328966,1.989437,2.946582,4.232664,3.509799,1.286091,0.771854,15.71239
min,0.0,3.2,0.0,0.0,300.38,290.88,0.79,0.0,-19.54,-8.9675,...,0.0,-11.97,0.0,287.04,0.0,0.0,0.0,0.0,0.0,-28.8645
25%,6.77,132.0375,19.0,0.27,304.45,299.79,1.18,3.6,-7.59,-3.715556,...,0.039642,-1.82,0.893333,300.36,0.0,5.2,0.0,3.86,1.244375,-3.809701
50%,8.98,504.88,23.0,0.39,305.11,300.8075,1.35,5.0,-5.69,-3.358889,...,0.057,-0.71,1.14,301.74,1.0,7.0,0.0,4.71,1.564286,11.230556
75%,12.0,1945.39,28.0,0.6,305.79,301.6,2.07,6.0,-4.8,-3.106667,...,0.0841,0.12,1.6,302.76,3.0,9.0,1.18,5.73,2.07,20.0505
max,146.795,299218.5,52.0,13.15,313.14,303.52,6.08,21.0,0.0,0.0,...,1.284467,5.9,24.77,310.44,29.0,52.0,52.45,18.51,16.39,36.75


In [None]:
# large distance to shore, colder temp, low turbidity