In [5]:
import os
import pandas as pd
import numpy as np

# Import custom functions
import env_functions as ef
import s3_functions as sf

# Vis Imports
import seaborn as sns

In [6]:
# Silence the chained-assignment (SettingWithCopy) check for this notebook
pd.set_option('mode.chained_assignment', None)
# Render floats with two decimal places
pd.set_option('display.float_format', '{:.2f}'.format)
# Show every column and every row when a frame is displayed (no truncation)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [7]:
# Ask the env helper whether we are running in DeepNote and for the variables
#   that belong to the detected environment.
deepnote, env_vars = ef.load_env_vars()

# Publish every loaded variable as a notebook-level global, keyed by its name.
globals().update(env_vars)

# Outside DeepNote, bundle the aws creds (loaded above from the environment
#   file) into one dict that is unpacked into every aws s3 function call.
if not deepnote:
    aws_env_vars = dict(
        access_key_id=aws_access_key_id,
        secret_access_key=aws_secret_access_key,
        bucket_name=s3_bucket_name,
    )

Loading dotenv file


In [8]:
# Read the raw bleaching database: from the mounted dataset path when running
# in DeepNote, otherwise from whatever sf.load_from_s3 returns for the S3 key.
gcb_df = pd.read_csv(
    "/datasets/s3/data/Global_Coral_Bleaching_DB/Global_Coral_Bleaching_DB_v3.csv"
    if deepnote
    else sf.load_from_s3(file_path="data/Global_Coral_Bleaching_DB/Global_Coral_Bleaching_DB_v3.csv", **aws_env_vars),
    low_memory=False,
)

### Impute Data

#### Simple Imputation - Columns Missing a Handful of Values

In this section we fill in some columns that were missing only a handful of values (1-10), and we discard all rows that are missing the crucial column 'SSTA'. Dropping those rows dramatically cleans up the dataframe while discarding only about 0.4% of its rows. It should also be noted that rows missing 'SSTA' were also missing other crucial oceanic information.

In [9]:
# Fix the observation missing the French Polynesian country name
gcb_df.Country_Name.fillna("French Polynesia", inplace=True)

# Only 3 missing records for Ecoregion, all are on the northern coast of Honshu Island, Japan
gcb_df.Ecoregion_Name.fillna("Honshu, Japan", inplace=True)

# If Substrate_Name is missing, it's hard coral
gcb_df.Substrate_Name.fillna("Hard Coral", inplace=True)

# If Distance_to_Shore is missing, it's a FL Key site, 62m from shore
gcb_df.Distance_to_Shore.fillna(62, inplace=True)

# 'SSTA' is a column that seems to track with other important oceanic metrics.
# There are 259 rows that are missing 'SSTA' and the dataset is about 63k rows.
# This means that we are only discarding about 0.4% of the total dataframe while retaining
# a bulk of the information that we need for a quality analysis.
gcb_df.dropna(subset=['SSTA'], inplace=True)

#### Not So Simple Imputation - Depth_m & SSTA_Minimum

After performing the simple imputations above, we still have a couple of important columns that could be cleaned up: Depth_m and SSTA_Minimum.

In [10]:
def impute_depth(row, df=None):
    '''
    Function to impute Depth_m for a single observation.

    An existing depth is returned unchanged.  A missing depth is replaced by
    the median Depth_m of all rows sharing the observation's Reef_ID; if that
    median is missing as well, the median Depth_m of the whole dataframe is
    returned instead.

    param row: Pandas dataframe row object with 'Depth_m' and 'Reef_ID' entries
    param df: dataframe the medians are computed from; when omitted the
        notebook-level gcb_df is used, so gcb_df.apply(impute_depth, axis=1)
        keeps working unchanged
    return: depth
    '''

    depth = row['Depth_m']

    # Nothing to impute when the observation already carries a depth
    if not pd.isna(depth):
        return depth

    # Only reach for the notebook global when the caller supplied no frame
    if df is None:
        df = gcb_df

    # A missing Reef_ID never compares equal to anything, so the selection is
    # empty and its median comes back as NaN
    reef_median = df.loc[df['Reef_ID'] == row['Reef_ID'], 'Depth_m'].median()
    if not pd.isna(reef_median):
        return reef_median

    # If the reef median is also NaN, use the overall median
    return df['Depth_m'].median()

In [11]:
def impute_SSTAMin(row, df=None):
    '''
    Function to impute SSTA_Minimum for a single observation.

    An existing value is returned unchanged.  A missing value is replaced by
    the median SSTA_Minimum of all rows sharing the observation's Ocean_Name;
    if that median is missing as well, the median SSTA_Minimum of the whole
    dataframe is returned instead.

    param row: Pandas dataframe row object with 'SSTA_Minimum' and 'Ocean_Name' entries
    param df: dataframe the medians are computed from; when omitted the
        notebook-level gcb_df is used, so gcb_df.apply(impute_SSTAMin, axis=1)
        keeps working unchanged
    return: SSTA_Minimum
    '''

    sstaValue = row['SSTA_Minimum']

    # Nothing to impute when the observation already carries a value
    if not pd.isna(sstaValue):
        return sstaValue

    # Only reach for the notebook global when the caller supplied no frame
    if df is None:
        df = gcb_df

    # Median over the rows recorded for the same ocean; NaN when none of them
    # has a value
    ocean_median = df.loc[df['Ocean_Name'] == row['Ocean_Name'], 'SSTA_Minimum'].median()
    if not pd.isna(ocean_median):
        return ocean_median

    # If the ocean median is also NaN, use the overall median
    return df['SSTA_Minimum'].median()

In [12]:
# Vectorised form of the rule implemented by impute_depth / impute_SSTAMin:
# keep an existing value, otherwise use the median of the row's group
# (Reef_ID for depth, Ocean_Name for SSTA_Minimum), otherwise the overall median.
# Each group median is computed once instead of re-filtering the whole frame
# for every row that needs a value.
# groupby drops missing keys and map leaves unmatched keys as NaN, so rows with
# a missing group key fall through to the overall median.
reef_depth_median = gcb_df['Reef_ID'].map(gcb_df.groupby('Reef_ID')['Depth_m'].median())
gcb_df['Depth_m'] = (
    gcb_df['Depth_m']
    .fillna(reef_depth_median)
    .fillna(gcb_df['Depth_m'].median())
)

ocean_ssta_min_median = gcb_df['Ocean_Name'].map(gcb_df.groupby('Ocean_Name')['SSTA_Minimum'].median())
gcb_df['SSTA_Minimum'] = (
    gcb_df['SSTA_Minimum']
    .fillna(ocean_ssta_min_median)
    .fillna(gcb_df['SSTA_Minimum'].median())
)

### Write Out

In [13]:
# Uncomment to write out parquet file, which is used in the feature building notebook
# NOTE(review): `write_to_s3` is not defined or imported anywhere in the visible
#   notebook (s3_functions is imported as `sf`), so the else branch presumably
#   needs `sf.write_to_s3` -- confirm the helper's name before uncommenting.
# if deepnote:
#     gcb_df.to_parquet("/work/data/Global_Coral_Bleaching_DB/gcb_v3.parquet")
#     gcb_df.to_parquet("/datasets/s3/data/Global_Coral_Bleaching_DB/gcb_v3.parquet")
# else:
#     write_to_s3(file_path="data/Global_Coral_Bleaching_DB/gcb_v3.parquet", data=gcb_df, **aws_env_vars)

### Validate the Data

In [14]:
# How much data is missing from the columns?

#gcb_df.isna().sum() # Total NaN per col
#gcb_df.isna().mean() # Percent NaN per col

# Number of observations with no bleaching value
#len(gcb_df[gcb_df["Percent_Bleached_Value"].isna()])

# Columns where more than 10% of the observations are NaN, followed by the
# fraction of missing values in each of them
gcb_cols = gcb_df.columns[gcb_df.isna().mean().gt(0.1)]
gcb_df.loc[:, gcb_cols].isna().mean()

Reef_ID                         0.20
Bleaching_Level                 0.30
Percent_Bleached                0.86
Percent_Bleaching_Old_Method    1.00
S1                              0.30
S2                              0.30
S3                              0.30
S4                              0.31
Bleaching_Prevalence_Score      1.00
Bleaching_Prevalence_Score_ID   1.00
Severity_Code                   0.89
Severity_ID                     0.89
bleach_intensity                1.00
Number_Bleached_Colonies        1.00
Percent_Hard_Coral              1.00
Percent_Macroalgae              1.00
Site_Name                       0.87
City_Town_Name_2                0.40
City_Town_Name_3                0.87
City_Town_Name_4                1.00
Sample_Comments                 0.95
Site Comments                   0.96
Cover Comments                  1.00
dtype: float64