# Explore the Global Coral Bleaching Database

## Notebook Setup

### Import Libraries

In [None]:
# Import Standard Libraries
import os
import pandas as pd
import geopandas as gpd
import numpy as np

# Import Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

### Import Data

In [None]:
# Load the two inputs used throughout the notebook (absolute paths under /work/data):
#   gcb  - the Global Coral Bleaching DB table, stored as parquet
#   meow - the MEOW shapefile, read as a GeoDataFrame; later cells use its
#          PROVINCE / ECOREGION columns and its geometry
# NOTE(review): "MEOW" presumably stands for Marine Ecoregions of the World - confirm.
gcb = pd.read_parquet("/work/data/Global_Coral_Bleaching_DB/gcb_v4.parquet")
meow = gpd.read_file("/work/data/MEOW/meow_ecos.shp")

In [None]:
# Peek at 20 randomly chosen rows (no seed is set, so the selection changes on every run)
gcb.sample(20)

### Setup Notebook Params

In [None]:
# Pandas Configs
# Turn off the chained-assignment check and show every row/column with 2-decimal floats.
pd.options.mode.chained_assignment = None
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Ignore Warnings
# Filters are installed in the same order as before: FutureWarning, then UserWarning.
import warnings
for ignored_category in (FutureWarning, UserWarning):
    warnings.simplefilter('ignore', category=ignored_category)

# MapBox Token for Plotly Maps
# Read from the MAPBOX_TOKEN environment variable; None is passed through when it is unset.
mapbox_token = os.environ.get("MAPBOX_TOKEN")
px.set_mapbox_access_token(mapbox_token)

## Exploration

In [None]:
# Top 20 countries represented in the data, ranked by their number of rows
# (value_counts sorts in descending order by default)
gcb.Country_Name.value_counts().head(20)

In [None]:
# Choropleth of Countries Obs Counts
# One row per Country_Code holding the number of rows recorded for that code.
rows_by_country = gcb.groupby('Country_Code').size()
obs_per_country = rows_by_country.reset_index(name='Observation_Counts')

# Shade each country (matched on its ISO-3 code) by its observation count.
fig = px.choropleth(
    data_frame=obs_per_country,
    locations='Country_Code',
    locationmode='ISO-3',
    color='Observation_Counts',
    color_continuous_scale=px.colors.sequential.Plasma,
    labels=dict(Observation_Counts='Number of Observations'),
    title='Global Distribution of Observations by Country Code',
)

fig.show()

In [None]:
# Choropleth of Countries Site Counts
# One row per Country_Code holding the number of distinct Site_ID values seen for it.
distinct_sites_by_country = gcb.groupby('Country_Code')['Site_ID'].nunique()
sites_per_country = distinct_sites_by_country.reset_index(name='Unique_Site_Counts')

# Shade each country (matched on its ISO-3 code) by its distinct-site count.
fig = px.choropleth(
    data_frame=sites_per_country,
    locations='Country_Code',
    locationmode='ISO-3',
    color='Unique_Site_Counts',
    color_continuous_scale=px.colors.sequential.Viridis,
    labels=dict(Unique_Site_Counts='Number of Unique Sites'),
    title='Number of Unique Sites per Country',
)

fig.show()

In [None]:
# Map of Site Locations
# Plot a random subset of at most 10,000 rows. The sample size is clamped to the
# table length because DataFrame.sample raises ValueError when n exceeds the
# number of rows (sampling is without replacement by default).
site_sample = gcb.sample(n=min(10000, len(gcb)))

# Marker colour encodes Temperature_Mean and marker size encodes Percent_Bleached_Value.
fig = px.scatter_geo(site_sample, lat="Latitude_Degrees", lon="Longitude_Degrees", hover_name="Country_Name",
                     color="Temperature_Mean", size="Percent_Bleached_Value", projection="orthographic")

fig.show()

In [None]:
# Country names for the Top 20 sampled sites represented in the data
# Rank sites by their number of rows and keep the 20 largest Site_IDs ...
rows_per_site = gcb.groupby('Site_ID').size()
top_site_ids = rows_per_site.sort_values(ascending=False).head(20).index

# ... then list each country that hosts at least one of those sites (first occurrence kept).
in_top_sites = gcb.Site_ID.isin(top_site_ids)
gcb.loc[in_top_sites, 'Country_Name'].drop_duplicates()

In [None]:
# Count of samples over time
# Tag every row with its calendar month; the column is stored on gcb itself.
gcb['Month_Year'] = gcb['Date'].dt.to_period('M')

# Rows per month, with the monthly period converted to a timestamp for the x axis.
monthly_sizes = gcb.groupby('Month_Year').size()
obs_per_mo = monthly_sizes.reset_index(name="Counts")
obs_per_mo['Month_Year'] = obs_per_mo['Month_Year'].dt.to_timestamp()

fig = px.line(
    data_frame=obs_per_mo,
    x='Month_Year',
    y='Counts',
    labels={'Month_Year': 'Month/Year', 'Counts': 'Count of Observations'},
    title='Monthly Observation Counts',
)

fig.show()

In [None]:
# Count of Obs by Month
# Month number (1-12) -> three-letter label, in calendar order.
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_short_names = dict(enumerate(month_labels, start=1))

# Rows per Date_Month, plus a readable label column.
rows_by_month = gcb.groupby('Date_Month').size()
obs_by_month = rows_by_month.reset_index(name='Observation_Counts')
obs_by_month['Month_Name'] = obs_by_month['Date_Month'].map(month_short_names)

fig = px.bar(
    data_frame=obs_by_month,
    x='Month_Name',
    y='Observation_Counts',
    labels={'Month_Name': 'Month', 'Observation_Counts': 'Number of Observations'},
    title='Number of Observations by Month',
)

# Pin the category order to the calendar order of the labels.
fig.update_layout(xaxis=dict(categoryorder='array', categoryarray=list(month_short_names.values())))

fig.show()

In [None]:
# Sites with 75%+ Bleaching 2015-2020
# Keep rows with Percent_Bleached_Value >= 75 whose Date_Year lies in 2015..2020 (both ends included).
severe_recent = (gcb.Percent_Bleached_Value >= 75) & gcb.Date_Year.between(2015, 2020, inclusive='both')

# Take an explicit copy: the column rewrite below then acts on an independent frame
# rather than on a filtered slice of gcb, so it no longer depends on the
# chained-assignment warning being disabled.
obs_bleached_5yr = gcb[severe_recent].copy()

# Store the year as a string (via int, which drops any decimal part) so that
# Plotly Express colours it as a discrete category with one legend entry per year.
obs_bleached_5yr['Date_Year'] = obs_bleached_5yr['Date_Year'].astype('int').astype('str')

fig = px.scatter_geo(obs_bleached_5yr, lat="Latitude_Degrees", lon="Longitude_Degrees", hover_name="Country_Name",
                     color="Date_Year", projection="orthographic",
                     title="Sites with 75%+ Bleaching 2015-2020")

fig.update_layout(legend_title="Year")
# Remove the marker outlines.
fig.update_traces(marker=dict(line=dict(width=0)))

fig.show()

In [None]:
# Display the filtered frame; display.max_rows / max_columns are set to None in the
# setup cell, so the output is not truncated.
obs_bleached_5yr

In [None]:
# Coral Triangle Region with Sites of 75%+ Bleaching between 2015-2020

# PROVINCE values used to select the Coral Triangle, shared by both layers below
ct_provinces = ['Western Coral Triangle', 'Eastern Coral Triangle']

# Define the Coral Triangle region: matching MEOW rows plus their GeoJSON-style mapping
coral_triangle = meow[meow.PROVINCE.isin(ct_provinces)]
coral_triangle_geo = coral_triangle.geometry.__geo_interface__

# Plot the sites that fall in those provinces
# NOTE(review): Date_Year is cast to str where obs_bleached_5yr is built, so
# color_continuous_scale probably has no visible effect here - confirm.
ct_sites = obs_bleached_5yr[obs_bleached_5yr.PROVINCE.isin(ct_provinces)]
fig_scatter = px.scatter_mapbox(
    ct_sites,
    lat="Latitude_Degrees",
    lon="Longitude_Degrees",
    hover_name="Country_Name",
    color="Date_Year",
    size="Percent_Bleached_Value",
    size_max=15,
    color_continuous_scale=px.colors.cyclical.IceFire,
    center=dict(lat=0, lon=138.5),
    zoom=2.65,
    title="Sites with 75%+ Bleaching 2015-2020",
)

# Create the Coral Triangle map: one translucent polygon per row, coloured by ECOREGION
fig_choropleth = px.choropleth_mapbox(
    coral_triangle,
    geojson=coral_triangle_geo,
    locations=coral_triangle.index,
    color='ECOREGION',
    opacity=0.25,
    mapbox_style="carto-positron",
    center=dict(lat=2, lon=138.5),
    zoom=2.65,
    title='Coral Triangle Region',
)

# Add the site plots to the Coral Triangle map (only the traces are carried over,
# so the choropleth figure's own layout is the one that is shown)
fig_choropleth.add_traces(fig_scatter.data)

fig_choropleth.update_layout(title_text="Coral Triangle Region with Sites of 75%+ Bleaching 2015-2020")

fig_choropleth.show()

In [None]:
# Correlation Matrix
# Columns left out of the matrix, and why:
#   SSTA_Mean: Always 0
#   S1, S2, S3, S4: Composite for Percent_Bleached_Value
#   Severity_ID, Bleaching_Prevalence_Score_ID: Estimates for Percent_Bleached_Value
#   Site_ID, Sample_ID: Relational Lookup Keys
gcb_numeric_drop_cols = ['SSTA_Mean', 'Severity_ID', 'Bleaching_Prevalence_Score_ID',
                         'Site_ID', 'Sample_ID', 'S1', 'S2', 'S3', 'S4']

# Keep only float64 / int64 / int8 columns, minus the exclusions above
numeric_dtypes = ['float64', 'int64', 'int8']
gcb_numeric = gcb.select_dtypes(include=numeric_dtypes).drop(columns=gcb_numeric_drop_cols)

# Pairwise Pearson correlations between the remaining columns
corr_matrix = gcb_numeric.corr(method='pearson')

# Draw the annotated heatmap on a large canvas (adjust the figure size as needed)
plt.figure(figsize=(50, 40))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=True, fmt=".2f", square=True, cbar=True)

# Slant the x tick labels and right-align them under their ticks
plt.xticks(rotation=45, ha='right')
plt.title('Pearson Correlation Matrix')

plt.show()

In [None]:
# Bleaching by Country
# A per-country, per-year average was tried first, but it turned out to be very site
# dependent, especially for larger countries (United States, Mexico) or widely
# distributed countries (Indonesia, Micronesia).
#
# Grouping by ecoregion (Ecoregion_Name + Date_Year) worked better; the version kept
# here groups by MEOW province instead.

# Bleaching by MEOW Province: mean Percent_Bleached_Value per (PROVINCE, Date_Year)
province_year_groups = gcb.groupby(['PROVINCE', 'Date_Year'])
province_year_groups['Percent_Bleached_Value'].mean().reset_index()

In [None]:
# Keep rows whose Substrate_Name is "Hard Coral" and whose Bleached_Value_Imputed flag is False.
# NOTE(review): this cell reads a `Year` column while other cells use `Date_Year` -
# confirm both exist in gcb_v4 and mean the same thing.
is_hard_coral = gcb["Substrate_Name"] == "Hard Coral"
not_imputed = gcb["Bleached_Value_Imputed"].eq(False)
gcb_filtered = gcb[is_hard_coral & not_imputed]

# Mean Percent_Bleached_Value per Year. It is printed explicitly because a notebook
# only auto-displays the last expression of a cell, so a bare mid-cell expression
# would be computed and then discarded.
print(gcb_filtered.groupby(['Year'])['Percent_Bleached_Value'].mean())

# Yearly trend of Percent_Bleached_Value for the filtered rows
sns.lineplot(data=gcb_filtered, x="Year", y="Percent_Bleached_Value")

In [None]:
# List the column names available on the filtered frame
gcb_filtered.columns

In [None]:
# Line plot of SSTA against Year for the filtered rows
# (seaborn aggregates the multiple rows that share a Year; the default estimator is the mean)
sns.lineplot(data=gcb_filtered, x="Year", y="SSTA")

In [None]:
# Line plot of SSTA_Minimum against Year for the filtered rows
sns.lineplot(data=gcb_filtered, x="Year", y="SSTA_Minimum")

In [None]:
# Line plot of SSTA_Maximum against Year for the filtered rows
sns.lineplot(data=gcb_filtered, x="Year", y="SSTA_Maximum")

In [None]:
# Number of rows per Year whose Percent_Bleached_Value is exactly 0
gcb[gcb['Percent_Bleached_Value'] == 0].groupby(['Year']).size()

In [None]:
# Total number of rows per Year
gcb.groupby(['Year']).size()

In [None]:
# Number of distinct Site_ID values per Year
gcb.groupby('Year')['Site_ID'].nunique()

In [None]:
# Average number of rows per distinct site in each Year
# (row count per Year divided by the distinct Site_ID count for that Year)
gcb.groupby(['Year']).size()/gcb.groupby('Year')['Site_ID'].nunique()

In [None]:
# Number of rows for each (Year, PROVINCE) combination
gcb.groupby(['Year','PROVINCE']).size()

In [None]:
# The 100 (Site_ID, Year) combinations with the most rows, largest first
gcb.groupby(['Site_ID','Year']).size().sort_values(ascending=False).head(100)

In [None]:
# Number of rows per Year for the single site with Site_ID 9530
# NOTE(review): presumably chosen from the (Site_ID, Year) ranking in the previous cell - confirm.
gcb[gcb.Site_ID == 9530].groupby('Year').size()

In [None]:
# The 200 (Site_ID, Year) combinations with the most rows where Percent_Bleached_Value is exactly 0,
# largest first
gcb[gcb['Percent_Bleached_Value'] == 0][['Site_ID','Year']].groupby(['Site_ID','Year']).size().sort_values(ascending=False).head(200)