# Random Forest with Earth Engine

## Importar pacotes

In [None]:
import geopandas as gpd
import pandas as pd

import seaborn as sns
import numpy as np
import math
import matplotlib

import ee
import geemap

In [None]:
# Authenticate with Earth Engine first, then initialize the client library
# against the 'ee-curuai' project. Later cells that build ee.* objects rely on
# this cell having run.
ee.Authenticate()
ee.Initialize(project='ee-curuai')

# Importar dados

In [None]:
# Load the cleaned sample table and drop its 'Unnamed: 0' column
# (presumably a row index written by an earlier to_csv -- verify).
# NOTE(review): both paths below are absolute and machine-specific; the
# commented-out line appears to be the same file on a different machine.
# df = pd.read_csv("C:/Users/LarissaVieiraValadão/Downloads/clean_min_data.csv").drop(['Unnamed: 0'],axis=1)
df = pd.read_csv('C:/Users/l_v_v/Documents/GitHub/py6s_harmonize_sample/datasets/Landsat Sampling/Merged Landsat Data/Drop Outlier Data/clean_min_data.csv').drop(['Unnamed: 0'],axis=1)
# Last expression of the cell: shows the available column names.
df.columns

Index(['CHLOROPHYLL', 'CHLOROPHYLL_A', 'CHLOROPHYLL_B', 'CLOUD_COVER',
       'DEPTH_CLASS', 'DOC', 'ID', 'LATITUDE', 'LOCATION', 'LONGITUDE',
       'MISSION', 'N_TOTAL', 'N_TOTAL_DISSOLVED', 'POC', 'P_ORGANIC',
       'P_TOTAL', 'SAMPLE_SITE', 'SAMPLING_DEPTH', 'SILICA', 'SPM', 'TOC',
       'TOTAL_DEPTH', 'TURBIDITY', 'WATER_PERIOD', 'blue_max', 'blue_mean',
       'blue_median', 'blue_min', 'blue_stdDev', 'count_pixel', 'datetime',
       'dif_date_point', 'green_max', 'green_mean', 'green_median',
       'green_min', 'green_stdDev', 'img_date', 'nir_max', 'nir_mean',
       'nir_median', 'nir_min', 'nir_stdDev', 'red_max', 'red_mean',
       'red_median', 'red_min', 'red_stdDev', 'system_index',
       'dif_date_point_abs', 'satellite'],
      dtype='object')

In [None]:
# Keep the target (SPM), the four per-band mean columns and the two context
# columns, take an independent copy, and discard rows with missing values.
df_subset = (
    df.loc[:, [
        'SPM',
        'blue_mean',
        'green_mean',
        'nir_mean',
        'red_mean',
        'datetime',
        'WATER_PERIOD',
    ]]
    .copy()
    .dropna()
)
# Last expression of the cell: per-column count of remaining missing values.
df_subset.isna().sum()

SPM             0
blue_mean       0
green_mean      0
nir_mean        0
red_mean        0
datetime        0
WATER_PERIOD    0
dtype: int64

In [None]:
# Columns removed from df_subset when building the predictor matrix X:
# the target itself plus the two non-band columns.
drop_columns_X_model = [
    'SPM',
    'datetime',
    'WATER_PERIOD',
]

# Funções de cálculo de métricas

In [None]:
def model_metrics(y_true, y_pred):
    """Compare predictions with observations using several regression metrics.

    Parameters
    ----------
    y_true : observed target values.
    y_pred : predicted target values.

    Returns
    -------
    dict
        Keys 'r2', 'mae', 'mse', 'mape' and 'exp_var', each holding the
        result of the matching ``sklearn.metrics`` function called as
        ``metric(y_true, y_pred)``.
    """
    # Imported locally: scikit-learn is not among the notebook's top-level imports.
    from sklearn.metrics import (
        explained_variance_score,
        mean_absolute_error,
        mean_absolute_percentage_error,
        mean_squared_error,
        r2_score,
    )

    # (label, scorer) pairs in the order the result keys are emitted.
    scorers = (
        ('r2', r2_score),
        ('mae', mean_absolute_error),
        ('mse', mean_squared_error),
        ('mape', mean_absolute_percentage_error),
        ('exp_var', explained_variance_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}

In [None]:
def cv_model_metrics(model, X, y, n_cv=5):
    """Estimate regression metrics for ``model`` with shuffled hold-out splits.

    Parameters
    ----------
    model : estimator to evaluate (anything ``cross_validate`` accepts).
    X : predictors.
    y : observed target values.
    n_cv : number of random train/test splits (default 5). Each split holds
        out 15% of the rows for testing; ``random_state=0`` makes the splits
        reproducible.

    Returns
    -------
    dict
        Keys 'r2', 'mae', 'mse', 'mape' and 'exp_var', each the mean of the
        per-split test score. The error metrics (mae, mse, mape) are reported
        as positive values. 'r2' and 'exp_var' keep their sign, so a model
        that performs worse than predicting the mean shows up as negative
        instead of being folded to a positive number.
    """
    from sklearn.model_selection import ShuffleSplit
    from sklearn.model_selection import cross_validate

    cv = ShuffleSplit(n_splits=n_cv, test_size=0.15, random_state=0)

    # Result key -> scikit-learn scorer name.
    scoring = {
        'r2': 'r2',
        'mae': 'neg_mean_absolute_error',
        'mse': 'neg_mean_squared_error',
        'mape': 'neg_mean_absolute_percentage_error',
        'exp_var': 'explained_variance',
    }

    # One cross-validation pass evaluates every metric on the same fitted
    # models, instead of refitting the model once per metric.
    scores = cross_validate(model, X, y, cv=cv, scoring=scoring)
    mean_scores = {name: scores['test_' + name].mean() for name in scoring}

    # The 'neg_*' scorers return negated errors; flip them back to positive.
    # r2 and explained variance are left signed on purpose.
    for name in ('mae', 'mse', 'mape'):
        mean_scores[name] = -mean_scores[name]

    return mean_scores

## Separar teste e treino

In [None]:
# Separate predictors and target.
# X: every df_subset column except those listed in drop_columns_X_model.
X = df_subset.drop(columns=drop_columns_X_model).copy()
# y: the SPM column, copied so later edits do not touch df_subset.
y = df_subset.loc[:, 'SPM'].copy()

In [None]:
# Partition the samples into a training set (random < 0.7) and a validation
# set (random >= 0.7) using a seeded random column.
# NOTE(review): `sample` is not defined anywhere in the visible part of this
# notebook, so this cell fails unless it is created elsewhere. The counts in
# the original comment (3481 training / 1519 validation) cannot be confirmed
# from here.
sample = sample.randomColumn(seed = 1)
# The two filters read the 'random' property and are complementary, so every
# feature lands in exactly one of the two sets.
training = sample.filter(ee.Filter.lt('random', 0.7))
validation = sample.filter(ee.Filter.gte('random', 0.7))

In [None]:
# Train a random forest with hyperparameter tuning ...
# Candidate tree counts built with ee.List.sequence(5, 200, 5): start 5, end 200, step 5.
# Note that `train` / `train_acc` below use the same name for their parameter.
num_trees = ee.List.sequence(5, 200, 5)

def train(num_trees):
    """Fit a smileRandomForest classifier built with ``num_trees`` trees.

    Training uses the module-level ``training`` collection, with 'cropland'
    as the class property and the band names of the module-level
    ``composite`` as input properties.

    Returns the trained classifier.
    """
    forest = ee.Classifier.smileRandomForest(num_trees)
    return forest.train(
        features=training,
        classProperty='cropland',
        inputProperties=composite.bandNames(),
    )


def train_acc(num_trees):
    """Return the accuracy of a ``num_trees``-tree forest from its own confusion matrix.

    The classifier is trained exactly as in ``train`` (same ``training``
    collection, 'cropland' class property, ``composite`` band names). The
    accuracy is taken from the classifier's ``confusionMatrix()``, not from
    the validation set.
    """
    forest = ee.Classifier.smileRandomForest(num_trees)
    fitted = forest.train(
        features=training,
        classProperty='cropland',
        inputProperties=composite.bandNames(),
    )
    matrix = fitted.confusionMatrix()
    return matrix.accuracy()



# Train one classifier per candidate tree count by mapping `train` over the
# `num_trees` list; the result is a list of trained classifiers.
classifiers = num_trees.map(train)

In [None]:
# Pick the classifier that returns the highest accuracy on the validation set.
def get_accuracy(classifier):
    """Return the accuracy of ``classifier`` on the module-level ``validation`` set.

    The validation features are classified, then the actual 'cropland'
    property is compared with the predicted 'classification' property
    through an error matrix.
    """
    classified = validation.classify(classifier)
    matrix = classified.errorMatrix('cropland', 'classification')
    return matrix.accuracy()

# Score every candidate classifier on the validation set and show the list.
accuracies = classifiers.map(get_accuracy)
display(accuracies)

# Pick the most accurate classifier
# argmax() gives the position of the largest accuracy; .get(0) extracts that
# position so it can index into `classifiers`.
i = ee.Array(accuracies).argmax().get(0)
# NOTE(review): the value returned by List.get may need wrapping in
# ee.Classifier(...) before classifier methods can be called on it -- confirm.
classifier = classifiers.get(i)