In this notebook, I will...
   - calculate baseline error
   - run random forest regressors on abundance 
   - load regional dfs, impute, add island col as ID, rbind all 5 dfs 
   - predict out for each group 

#### Scoring models

We evaluated the performance of the abundance models based on mean average error (accuracy) and mean absolute percent error (MAPE).

## Import modules

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, confusion_matrix, auc
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn import *
from sklearn.metrics import r2_score
from sklearn.metrics import classification_report
import pickle
import requests
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [2]:
os.chdir('C:/Users/linds/OneDrive/Documents/samoa_corals_data')

## Import data

In [3]:
coral_types=['columnar']
target_types=['binary','percent']
df = dict()

for i in range(0,len(coral_types)):
    for j in range(0,len(target_types)):
        df[str(coral_types[i])+'_'+str(target_types[j])]=pd.read_csv(str(coral_types[i])+'_'+str(target_types[j])+'.csv')
        del df[str(coral_types[i])+'_'+str(target_types[j])]['Unnamed: 0'] # artifact indexing column
# Access the data as, e.g., df['scler_percent']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(df['columnar_percent'].drop(['Clmnr_R','lat','lon','ID'], axis=1), 
                                                    df['columnar_percent']['Clmnr_R'], 
                                                    test_size = 0.3, random_state = 30)


print('Columnar Training Features Shape:', X_train.shape)
print('Columnar Training Labels Shape:', y_train.shape)
print('Columnar Testing Features Shape:', X_test.shape)
print('Columnar Testing Labels Shape:', y_test.shape)

# The baseline predictions are the averages

baseline_preds = np.array([y_train.mean()] * len(y_train))
baseline_errors = abs(baseline_preds - y_train)
print('Baseline prediction error: ', round(np.mean(baseline_errors), 2))

Columnar Training Features Shape: (2214, 9)
Columnar Training Labels Shape: (2214,)
Columnar Testing Features Shape: (949, 9)
Columnar Testing Labels Shape: (949,)
Baseline prediction error:  0.11


### Train random forest regressor

In [5]:
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 30)

# Train the model on training data
rf.fit(X_train, y_train)

# Predicting to the test data
predictions = rf.predict(X_test)

# Calculate the absolute errors
errors = abs(predictions - y_test)

# Print out the mean absolute error (mae)
print('Mean Absolute Error: ', round(np.mean(errors), 2))

Mean Absolute Error:  0.17


In [8]:
df['columnar_percent'].Clmnr_R.value_counts()

0     3153
20       9
40       1
Name: Clmnr_R, dtype: int64

### Tune the model

In [None]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
bootstrap = [True, False] # Create the random grid

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'bootstrap': bootstrap}

print(random_grid)

# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()

# Random search of parameters, using 10 fold cross validation, 
# Search across n_iter * cv different combinations, and use all available cores
rf_best = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 40, cv = 10, verbose=2, random_state=42, n_jobs = -1)

# Fit the random search model
rf_best.fit(X_train, y_train)
print(rf_best.best_params_)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'bootstrap': [True, False]}
Fitting 10 folds for each of 40 candidates, totalling 400 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [None]:
#Predicting on the test set
predictions = rf_best.predict(X_test)

# Calculate the absolute errors
errors = abs(predictions - y_test)

# Print out the mean absolute error (mae)
print('Mean Absolute Error: ', round(np.mean(errors), 2))

### Pickle the model

In [8]:
with open('columnar_abundance_model.pkl', 'wb') as fid:
    pickle.dump(rf, fid, 2)