# Coral Bleaching
## Preprocessing

 

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from imblearn.under_sampling import RandomUnderSampler

In [2]:
# Read the cleaned (v2) coral bleaching data from the interim data folder
cleaned_csv = '../data/interim/coral_bleaching_cleaned_v2.csv'
df = pd.read_csv(cleaned_csv)

# Column names, dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34251 entries, 0 to 34250
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Latitude_Degrees            34251 non-null  float64
 1   Longitude_Degrees           34251 non-null  float64
 2   Ocean_Name                  34251 non-null  object 
 3   Realm_Name                  34251 non-null  object 
 4   Country_Name                34251 non-null  object 
 5   State_Island_Province_Name  34187 non-null  object 
 6   Distance_to_Shore           34249 non-null  float64
 7   Turbidity                   34251 non-null  float64
 8   Cyclone_Frequency           34251 non-null  float64
 9   Date_Year                   34251 non-null  int64  
 10  Depth_m                     32643 non-null  float64
 11  Percent_Bleaching           34251 non-null  float64
 12  Windspeed                   34251 non-null  float64
 13  SSTA                        342

### Drop Unnecessary Columns and NaNs
The current data still has a few categorical features and some missing values that need to be handled. Since there are only two values missing from Distance_to_Shore, I will drop these missing values. However, I will impute missing Depth_m values based on medians by Realm_Name. 

I have decided to use Realm_Name as the regional feature, which will ultimately be converted using one-hot encoding. The remaining spatial features - Latitude_Degrees, Longitude_Degrees, Ocean_Name, Country_Name, State_Island_Province_Name - can be removed. Additionally, Date and Date_Year can be removed. 

In [3]:
# Realm_Name is retained: it is needed for the depth imputation and is
# one-hot encoded later. The other location and date features are discarded.
columns_to_drop = [
    'Latitude_Degrees',
    'Longitude_Degrees',
    'Ocean_Name',
    'Country_Name',
    'Date',
    'State_Island_Province_Name',
    'Date_Year',
]

df_2 = (
    df
    .drop(columns=columns_to_drop)
    # remove the observations that have no distance-to-shore value
    .dropna(subset=['Distance_to_Shore'])
)
df_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34249 entries, 0 to 34250
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Realm_Name             34249 non-null  object 
 1   Distance_to_Shore      34249 non-null  float64
 2   Turbidity              34249 non-null  float64
 3   Cyclone_Frequency      34249 non-null  float64
 4   Depth_m                32641 non-null  float64
 5   Percent_Bleaching      34249 non-null  float64
 6   Windspeed              34249 non-null  float64
 7   SSTA                   34249 non-null  float64
 8   SSTA_Frequency         34249 non-null  float64
 9   SSTA_DHW               34249 non-null  float64
 10  TSA                    34249 non-null  float64
 11  TSA_Frequency          34249 non-null  float64
 12  TSA_DHW                34249 non-null  float64
 13  Temperature_C          34249 non-null  float64
 14  Temperature_Maximum_C  34249 non-null  float64
 15  Bl

In [4]:
# Share of each Bleaching_indicator class (normalize=True gives proportions, not counts)
df_2['Bleaching_indicator'].value_counts(normalize=True)

0    0.707875
1    0.292125
Name: Bleaching_indicator, dtype: float64

### Undersampling & Train/Test Split

Due to the imbalance in Bleaching_indicator (70/30), I will use undersampling to work with a more balanced target variable. The data can then be split into training and test sets. 

In [5]:
# Separate the target from the features
target_col = 'Bleaching_indicator'
X = df_2.drop(columns=[target_col])
y = df_2[target_col]

# Counteract the class imbalance by randomly discarding some of the
# Bleaching_indicator == 0 observations
rus = RandomUnderSampler(random_state=5)
X_resample, y_resample = rus.fit_resample(X, y)

# NOTE(review): resampling happens before the split, so the held-out test set
# is balanced as well and does not reflect the original ~70/30 class mix.
# Consider splitting first and undersampling the training portion only.

# Hold out 20% of the resampled data for testing
X_train, X_test, y_train, y_test = train_test_split(
    X_resample,
    y_resample,
    test_size=0.2,
    random_state=5,
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
y_train.value_counts(normalize=True), y_test.value_counts(normalize=True)

(16008, 16) (16008,) (4002, 16) (4002,)


(0    0.501687
 1    0.498313
 Name: Bleaching_indicator, dtype: float64,
 1    0.506747
 0    0.493253
 Name: Bleaching_indicator, dtype: float64)

### Impute Missing Depth Values

As noted previously, missing Depth_m values will be imputed using the medians grouped by Realm. The code below establishes the medians from the training data, then fills missing values in both training and test data. 

In [6]:
# Median depth per realm, computed from the training split only so the
# imputation values contain no information from the test split.
# Selecting Depth_m before aggregating avoids computing (and possibly failing
# on) medians for every other column.
realm_depth_median = X_train.groupby('Realm_Name')['Depth_m'].median()
realm_depth_median

Realm_Name
Central Indo-Pacific            6.000
Eastern Indo-Pacific            1.800
Temperate Australasia           6.000
Temperate Northern Atlantic    21.675
Temperate Northern Pacific      5.000
Tropical Atlantic               8.300
Tropical Eastern Pacific        8.900
Western Indo-Pacific            5.500
Name: Depth_m, dtype: float64

In [7]:
def fill_missing_depth(X, medians, depth_col='Depth_m', group_col='Realm_Name'):
    """Fill missing depth values with the median of each row's group.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing `depth_col` and `group_col`. It is modified in place.
    medians : pandas.Series or dict
        Lookup from group value (e.g. realm name) to the fill value.
    depth_col : str, default 'Depth_m'
        Column whose missing entries are filled.
    group_col : str, default 'Realm_Name'
        Column holding the key used to look up the fill value.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in as `X`.

    Raises
    ------
    KeyError
        If a row that needs filling belongs to a group absent from `medians`.
    """
    # Positional boolean mask, so the fill does not depend on index labels
    missing = X[depth_col].isna().to_numpy()
    if not missing.any():
        return X

    groups = X.loc[missing, group_col]

    # Fail loudly rather than silently leaving NaNs for unknown groups
    unknown = groups[~groups.isin(list(medians.keys()))]
    if not unknown.empty:
        raise KeyError(
            f"No median available for {group_col} value(s): {list(unknown.unique())}"
        )

    # Address the column by name instead of a hard-coded position
    X.loc[missing, depth_col] = groups.map(medians).to_numpy()
    return X

In [8]:
# Impute both splits with the medians learned from the training data
X_train, X_test = (
    fill_missing_depth(split, realm_depth_median)
    for split in (X_train, X_test)
)

In [9]:
# Confirm the imputation left no missing depths in either split
# (the test line previously re-checked X_train by mistake)
print(f"Missing train data depths: {X_train.isnull()['Depth_m'].sum()}")
print(f"Missing test data depths: {X_test.isnull()['Depth_m'].sum()}")

Missing train data depths: 0
Missing test data depths: 0


### Scaling (if necessary) 

While I am mostly considering using various forms of decision trees that do not require scaling, I will scale the numeric data in case it could be useful for a regression model. 

In [10]:
# Select only numeric features
scale_columns = ['Distance_to_Shore', 'Turbidity', 'Cyclone_Frequency', 'Depth_m',
                 'Percent_Bleaching', 'Windspeed', 'SSTA', 'SSTA_Frequency', 'SSTA_DHW',
                 'TSA', 'TSA_Frequency', 'TSA_DHW', 'Temperature_C',
                 'Temperature_Maximum_C', 'Exposure_cat']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[scale_columns])
X_test_scaled = scaler.transform(X_test[scale_columns])

# convert to df, reinstating original index and column names
X_train_scaled = pd.DataFrame(X_train_scaled, index=X_train.index, columns=scale_columns)
X_test_scaled = pd.DataFrame(X_test_scaled, index=X_test.index, columns=scale_columns)

### One-Hot Encoding Realm_Name


In [11]:
# add Realm_Name back to scaled data
X_train_scaled['Realm_Name'] = X_train['Realm_Name']
X_test_scaled['Realm_Name'] = X_test['Realm_Name']

# Create dummies/one-hot encode for Realm_Name in all X data
X_train = pd.get_dummies(X_train, columns=['Realm_Name'], prefix='Realm')
X_test = pd.get_dummies(X_test, columns=['Realm_Name'], prefix='Realm')

X_train_scaled = pd.get_dummies(X_train_scaled, columns=['Realm_Name'], prefix='Realm')
X_test_scaled = pd.get_dummies(X_test_scaled, columns=['Realm_Name'], prefix='Realm')


In [12]:
# save train and test data
datapath = '../data/processed/'

X_train.to_csv(str(datapath)+'X_train_data.csv')
X_train_scaled.to_csv(str(datapath)+'X_train_scaled_data.csv')
y_train.to_csv(str(datapath)+'y_train_data.csv')

X_test.to_csv(str(datapath)+'X_test_data.csv')
X_test_scaled.to_csv(str(datapath)+'X_test_scaled_data.csv')
y_test.to_csv(str(datapath)+'y_test_data.csv')


# index is maintained, remember to reassign in pd.read_csv 
# pd.read_csv([file], index_col = 'Unnamed: 0')