# Species Distribution Model (SDM)

## Import Libraries

In [89]:
#Libraries
import pandas as pd
import numpy as np
import datetime
import plotly.express as px

#Preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, ShuffleSplit, StratifiedKFold, StratifiedShuffleSplit, TimeSeriesSplit
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Modeling & Metrics
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

## Load the Dataset

In [90]:
# Load the species data CSV and print a concise summary of the resulting frame
df = pd.read_csv(filepath_or_buffer='all_species.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1949 entries, 0 to 1948
Columns: 160 entries, Datetime to Abundance (ind/m2)
dtypes: float64(153), int64(2), object(5)
memory usage: 2.4+ MB


In [91]:
# Order the rows by survey year, move the previous row labels into an 'index'
# column (reset index for proper min/max time), and retroactively remove
# Zone F, which is absent in the training set.
df = (
    df.sort_values(by='Year')
      .reset_index()
      .loc[lambda frame: frame['Zone'] != 'F']
)
df

Unnamed: 0,index,Datetime,Year,Datetime.1,Tide,Weather Condition,Water temperature (ºC),Zone,Supratidal/Middle Intertidal,Substrate,...,Callionymus lira (peixe-pau lira),Oncidiella celtica,Doriopsilla areolata (nudibrânquio),Scorpaena sp. (Rascasso),Lipophrys pholis (ad.),Diplodus cervinus,Gobiusculus flavescens,Sessile Coverage,Total Mobile Species,Abundance (ind/m2)
0,1948,11/28/2011,2011,11,0.6,Clear sky,16.0,D,Medium,Puddle,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,66.45,4.0,0.05
1,1923,12/12/2011,2011,12,0.9,Clear sky,16.0,E,Medium,Rock,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,95.00,8.0,0.00
2,1922,12/12/2011,2011,12,0.9,Clear sky,16.0,E,Medium,Rock,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.25,0.0,0.00
3,1921,12/12/2011,2011,12,0.9,Clear sky,16.0,E,Medium,Rock,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,4.0,0.00
4,1920,12/12/2011,2011,12,0.9,Clear sky,16.0,E,Medium,Rock,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1938,41,6/5/2020,2020,6,0.6,Sunny,19.0,E,Medium,Puddle/Rock/Sand,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.05,2.0,0.10
1944,47,5/8/2020,2020,5,0.4,Sunny,17.0,D,Medium,Rock,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,99.65,2.0,0.15
1945,48,5/8/2020,2020,5,0.4,Sunny,17.0,D,Medium,Rock,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.00,2.0,0.15
1946,49,5/8/2020,2020,5,0.4,Sunny,17.0,D,Medium,Rock,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,98.50,2.0,0.15


In [92]:
# Summary statistics for the environmental measurements and the abundance fields
df.loc[
    :,
    [
        'Tide',
        'Water temperature (ºC)',
        'Sessile Coverage',
        'Total Mobile Species',
        'Abundance (ind/m2)',
    ],
].describe()

Unnamed: 0,Tide,Water temperature (ºC),Sessile Coverage,Total Mobile Species,Abundance (ind/m2)
count,1879.0,1879.0,1879.0,1875.0,1879.0
mean,0.729159,16.821767,52.711895,3.851733,0.21554
std,0.178576,2.266078,34.378994,12.931326,0.657095
min,0.3,11.0,0.0,0.0,0.0
25%,0.6,15.0,21.025,0.0,0.0
50%,0.7,17.0,56.5,0.0,0.0
75%,0.9,19.0,84.9,3.0,0.15
max,1.4,22.0,123.5,254.0,12.7


**Initial Observations**
- `Tide` has a nearly equal mean and median with a majority of values spread within 2 (TODO: How was tide measured?), indicating a possible normal distribution.
- `Water temperature (ºC)` may have a similar distribution to `Tide`. Are observed min and max values related for these features due to an event?
- `Sessile Coverage` may need to be plotted to confirm if the distribution is normal. Is there a time factor, like seasonality?
- `Total Mobile Species` and the related field `Abundance (ind/m2)` have a relatively large range of sample values. Double check that these values appear to be correlated.

**Note**: This evaluation is not only about the shape of each distribution: all numeric columns are later standardized — the mean of each feature is removed and the result is scaled — using scikit-learn's preprocessing module (`StandardScaler`).


## Data Preprocessing

#### Categorical Features

In [59]:
# Frequency of each weather label before any clean-up.
# NOTE(review): X is first assigned in the "Train/Test split" section further
# down, so this cell fails on a fresh top-to-bottom run — confirm the cell order.
X.loc[:, "Weather Condition"].value_counts()

Clear sky        1385
Cloudy            345
Sunny             103
Rain               83
Fairly Cloudy      33
Name: Weather Condition, dtype: int64

In [62]:
# Collapse the rare "Sunny and Windy" label into "Sunny".
# The result is assigned back to the column: calling replace(..., inplace=True)
# on a column selection is a chained in-place operation, which pandas warns
# about and which leaves X unchanged when copy-on-write is active.
X["Weather Condition"] = X["Weather Condition"].replace(to_replace="Sunny and Windy", value="Sunny")

Only 9 values observed as 'Sunny and Windy', so collapse with Sunny.

In [63]:
# Re-count the weather labels to confirm which categories remain
X["Weather Condition"].value_counts(dropna=True)

Clear sky        1385
Cloudy            345
Sunny             103
Rain               83
Fairly Cloudy      33
Name: Weather Condition, dtype: int64

#### Datetime

In [93]:
# Convert the Datetime column to datetime64[ns] so dates can be compared and subtracted
df['Datetime'] = df['Datetime'].astype('datetime64[ns]')

In [94]:
# Earliest and latest survey dates, used to place the train/test cutoff.
# Computed with min()/max() rather than from the first and last rows: the frame
# is ordered by 'Year' only, so the row order within a year does not guarantee
# that the boundary rows hold the extreme dates.
min_date = df['Datetime'].min()
max_date = df['Datetime'].max()
print("Min:", min_date, "Max:", max_date)

Min: 2011-11-28 00:00:00 Max: 2020-11-16 00:00:00


### Train/Test split

The train percentage was chosen based on a Discord discussion regarding the "Abundance measure inconsistency around September 2015":

"From the information of that year’s report, there was a damage to the pier holding the sand at the protected area in the storm of 2014 and continued in 2015, this caused the increase in sand in the rocky shore and therefore the decrease in abundance. The pier was re-established in the Summer of 2016."

In [None]:
# Define X and y
# NOTE: numeric_features and categorical_features are assigned in the
# "Building a Pipeline" section; that cell must have been run before this one.
X = df[['Datetime'] + numeric_features + categorical_features].sort_values(by='Datetime')
# Take the target in X's row order. X has just been re-sorted by date, and
# estimators pair the rows of X and y by position rather than by index label,
# so leaving y in df's order would mismatch features and targets.
y = df.loc[X.index, 'Abundance (ind/m2)']

In [95]:
# Code adapted from https://www.rasgoml.com/feature-engineering-tutorials/scikit-learn-time-series-split
# The cutoff sits `train_percent` of the way through the observed time span.
train_percent = 0.5
time_between = max_date - min_date
train_cutoff = min_date + time_between * train_percent
train_cutoff

Timestamp('2016-05-23 00:00:00')

In [96]:
# Chronological split: rows dated up to and including the cutoff are used for
# training, later rows are held out for testing.
train_df = df[df['Datetime'] <= train_cutoff]
test_df = df[df['Datetime'] > train_cutoff]

# Feature/target pairs used by the modelling and evaluation cells below, which
# reference X_train, y_train, X_test and y_test. The whole frame minus the
# target is passed as X: the pipeline's ColumnTransformer picks its feature
# columns by name and, with its default remainder setting, drops the rest.
target_column = 'Abundance (ind/m2)'
X_train, y_train = train_df.drop(columns=target_column), train_df[target_column]
X_test, y_test = test_df.drop(columns=target_column), test_df[target_column]

# Sanity checks: zones present in each split and the date range each one covers
print("Train:", train_df.Zone.unique())
print("Test:", test_df.Zone.unique())
print("Train:", train_df['Datetime'].min(), train_df['Datetime'].max())
print("Test:", test_df['Datetime'].min(), test_df['Datetime'].max())


Train: ['D' 'E' 'B' 'A']
Test: ['B' 'A' 'D' 'E']
Train: 2011-11-28 00:00:00 2016-05-10 00:00:00
Test: 2016-06-07 00:00:00 2020-11-16 00:00:00


### Building a Pipeline
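Note: Numeric and categorical features are both handled here: numeric columns are median-imputed and standardized, while categorical columns are imputed with a constant and one-hot encoded.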

In [64]:
# Columns routed through each branch of the preprocessor
numeric_features = ['Tide', 'Water temperature (ºC)', 'Sessile Coverage']
categorical_features = ['Weather Condition', 'Zone']

# Numeric branch: fill gaps with the column median, then standardise
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the constant 'missing', then one-hot
# encode; handle_unknown="ignore" keeps unseen categories from raising
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Apply each branch to its own column group
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

## Model Selection

#### Linear Regressor

In [None]:
# Linear model: the shared preprocessor followed by a LinearRegression step
linear_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('linear', LinearRegression()),
])

# Train on the training split
linear_pipeline.fit(X_train, y_train)

#### Random Forest Regressor

In [None]:
# Random forest model: the shared preprocessor followed by a seeded
# RandomForestRegressor step
forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    (
        'forest',
        RandomForestRegressor(
            n_estimators=300,
            max_features='sqrt',
            max_depth=5,
            random_state=18,
        ),
    ),
])

# Train on the training split
forest_pipeline.fit(X_train, y_train)

### Model Evaluation
Evaluate the trained models using appropriate metrics such as mean squared error (MSE) and mean absolute error (MAE).  Compare the performance of different models.

In [None]:
# Predict the test split with the linear pipeline and show its mean squared error
linear_y_preds = linear_pipeline.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=linear_y_preds)

In [None]:
# Predict the test split with the forest pipeline and keep its mean squared error
forest_y_preds = forest_pipeline.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=forest_y_preds)
mse

In [None]:
# Square root of the forest MSE computed in the previous cell
rmse = mse ** (1 / 2)
rmse

In [None]:
# Plot the density of the actual test-split values against the density of the
# random forest predictions for the same rows.
# NOTE(review): `plt` and `sns` are not imported in the notebook's import cell;
# this cell needs `import matplotlib.pyplot as plt` and `import seaborn as sns`.
fig, ax = plt.subplots(figsize=(5, 7))

# y_test is used for the actual curve: forest_y_preds was produced from X_test,
# so the full-dataset `y` is not the matching set of observations.
# kdeplot replaces distplot(hist=False), which seaborn has deprecated.
sns.kdeplot(y_test, color="r", label="Actual Value", ax=ax)
sns.kdeplot(forest_y_preds, color="b", label="Fitted Values", ax=ax)

ax.set_title('Actual vs Fitted Values for Abundance')
ax.legend()  # draw the legend so the two curve labels are actually shown

plt.show();

### Feature Importances
Determine features that influence the abundance of mobile species.

In [None]:
import eli5

In [None]:
# Rebuild the preprocessor's output column order: the numeric features come
# first ("num" branch), followed by the one-hot encoded columns ("cat" branch).
onehot_columns = list(
    forest_pipeline
    .named_steps['preprocessor']
    .named_transformers_['cat']
    .named_steps['encoder']
    .get_feature_names_out(input_features=categorical_features)
)
numeric_features_list = [*numeric_features, *onehot_columns]

In [None]:
# Distinct target values in the test split, rendered as strings
target_names = pd.unique(y_test).astype(str)

In [None]:
# Explain the fitted forest step, limited to the top 3 features and labelled
# with the reconstructed feature names
eli5.explain_weights(
    forest_pipeline['forest'],
    top=3,
    feature_names=numeric_features_list,
)

[Source](https://towardsdatascience.com/extracting-feature-importances-from-scikit-learn-pipelines-18c79b4ae09a)

### Test Model

### Conclusion
Summarize your findings, discuss any insights gained from the analysis, and suggest future steps for improvement if applicable.