<a href="https://colab.research.google.com/github/lgiesen/forest_height/blob/main/notebooks/random_forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

# Mount Google Drive at /content/drive; force_remount=True remounts it even
# when a previous mount is still active in this runtime.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# the zipped data is uploaded in the root_path folder
# NOTE(review): root_path is relative, so it resolves against the current
# working directory (presumably /content on Colab, next to the drive mount) — confirm.
root_path = 'drive/MyDrive/Colab Notebooks/data/'
# sub-folders of root_path for the images and the masks
path_images = f'{root_path}images/'
path_masks = f'{root_path}masks/'
# GitHub owner and repository name of the project
user = "lgiesen"
repo = "forest_height"
# clone the project repository into the current working directory;
# IPython substitutes {user} and {repo} with the Python variables above
!git clone https://github.com/{user}/{repo}.git

Cloning into 'forest_height'...
remote: Enumerating objects: 302, done.[K
remote: Counting objects: 100% (165/165), done.[K
remote: Compressing objects: 100% (132/132), done.[K
remote: Total 302 (delta 91), reused 77 (delta 32), pack-reused 137[K
Receiving objects: 100% (302/302), 19.80 MiB | 24.61 MiB/s, done.
Resolving deltas: 100% (157/157), done.


Load dataset

In [6]:
import pandas as pd
# Load the train/test inputs (X_*) and targets (y_*) from the pickles shipped
# in the cloned forest_height repository.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# from a trusted source.
X_train = pd.read_pickle("forest_height/data/X_train.pkl")
y_train = pd.read_pickle("forest_height/data/y_train.pkl")
X_test = pd.read_pickle("forest_height/data/X_test.pkl")
y_test = pd.read_pickle("forest_height/data/y_test.pkl")

Random Search

In [34]:
import numpy as np

# --- Hyperparameter search space for the random forest ---

# Number of trees in random forest
# (alternative, wider range: [int(x) for x in np.linspace(start=200, stop=2000, num=10)])
n_estimators = [100, 200, 300, 400, 500]
# Number of features to consider at every split.
# 1.0 means "use all features". It replaces the former 'auto' option, which
# had the same meaning for regressors but was deprecated in scikit-learn 1.1
# and removed in 1.3 (where it raises an error).
max_features = [1.0, 'sqrt', 'log2']
# Maximum number of levels in tree: 11 values spread evenly over [1, 100],
# truncated to integers
max_depth = [int(x) for x in np.linspace(1, 100, num=11)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'criterion': ['absolute_error'],
               'bootstrap': bootstrap}

In [None]:
%%time
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
# initialize model
rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(
    estimator = rf,
    param_distributions = random_grid,
    # scoring="neg_mean_absolute_error", # strategy to evaluate the performance
    n_iter = 50,
    cv = 5, # k-fold cross-validation
    verbose=3, # the higher, the more messages
    random_state=42,
    n_jobs = -1, # use all processors
    return_train_score=True)

# train model
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [None]:
cv_results = rf_random.cv_results_
# Print the mean cross-validated test score of every sampled candidate next to
# its parameters. The score is shown as-is: the search above is configured
# without `scoring`, so it is the estimator's default score (R^2 for
# regressors) and must not be passed through sqrt(-score), which only makes
# sense for a negated squared-error scorer.
for mean_score, params in zip(cv_results["mean_test_score"], cv_results["params"]):
    print(mean_score, params)

In [None]:
# Tabular view of the search results (one row per evaluated candidate)
df = pd.DataFrame(cv_results)

In [None]:
# Parameter combination with the best mean cross-validated score
rf_random.best_params_

# TODO: feature importance

In [None]:
import matplotlib.pyplot as plt
# get importance: the search object itself has no coefficients; the importances
# live on the best estimator that was refit on the whole training set
importance = rf_random.best_estimator_.feature_importances_
# summarize feature importance
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i, v))
# plot feature importance (one bar per feature index)
plt.bar(list(range(len(importance))), importance)
plt.show()

In [None]:
# Horizontal bar chart of the feature importances, sorted ascending.
# The importances are taken from the best estimator of the randomized search,
# which is already fitted; the plain `rf` object is not fitted at this point.
importances = rf_random.best_estimator_.feature_importances_
sort = importances.argsort()
# Label the bars with the column names when X_train is a DataFrame, otherwise
# fall back to the positional feature index.
feature_names = getattr(X_train, "columns", range(len(importances)))
labels = [str(feature_names[i]) for i in sort]
plt.barh(labels, importances[sort])
plt.xlabel("Feature Importance")

Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search
# param_grid = {
#     'bootstrap': [True],
#     'max_depth': [80, 90, 100, 110],
#     'max_features': [2, 3],
#     'min_samples_leaf': [3, 4, 5],
#     'min_samples_split': [8, 10, 12],
#     'n_estimators': [100, 200, 300, 1000]
# }

# Create a base model
rf = RandomForestRegressor()

# Instantiate the grid search model.
# NOTE(review): the narrowed param_grid above is commented out, so the search
# reuses random_grid and evaluates every combination in it — confirm this is intended.
grid_search = GridSearchCV(rf, random_grid, cv=3, n_jobs=-1, verbose=3)

In [None]:
%%time
# Fit the grid search to the data
# (cross-validates every parameter combination on the training set)
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score in the grid search
grid_search.best_params_

In [None]:
# Best estimator found by the grid search (refit on the full training set)
best_grid = grid_search.best_estimator_
y_pred = best_grid.predict(X_test)
# Score the tuned model on the held-out test set. The regressor's score method
# returns R^2. The previous call used an `evaluate` helper and
# `test_features` / `test_labels` variables that are not defined in this notebook.
grid_accuracy = best_grid.score(X_test, y_test)

In [None]:
%%time
from sklearn.ensemble import RandomForestRegressor
# initialize model
rf = RandomForestRegressor(rf_random.best_params_, random_state=42)

# train model
rf.fit(X_train, y_train)

Save model

In [None]:
import joblib
# save model: serialize the trained forest `rf` to random_forest.joblib
# (relative path, i.e. inside the current working directory)
joblib.dump(rf, 'random_forest.joblib')
# load model with:
# rf = joblib.load("random_forest.joblib")

In [None]:
# Reshape X_test into a single column; -1 lets the number of rows be inferred.
# NOTE(review): assumes X_test is a NumPy array with one feature per sample.
# With several feature columns this would flatten them into extra rows and no
# longer match the shape the model was trained on — TODO confirm.
X_test = X_test.reshape(-1, 1)

In [None]:
# Score the randomized-search model on the test set. With no `scoring`
# configured this is the best estimator's R^2. The previous call used an
# `evaluate` helper that is not defined in this notebook.
base_accuracy = rf_random.score(X_test, y_test)

In [None]:
# Predict the test set with the search object, which delegates to its refit best estimator
y_pred = rf_random.predict(X_test)

Visualize prediction

In [None]:
# Run the repository's visualisation script inside this kernel so that the names
# it defines become available here (presumably including plot() — TODO confirm)
%run forest_height/src/visualize_data.py

In [None]:
# Show the first prediction and the first test input.
# NOTE(review): plot() is not defined in this notebook; presumably it comes
# from forest_height/src/visualize_data.py — confirm its expected input.
plot(y_pred[0])
plot(X_test[0])

In [None]:
# Imported locally so this cell does not depend on a later cell having run
from sklearn.metrics import (mean_absolute_error,
                             mean_absolute_percentage_error,
                             mean_squared_error)

# Predictions of the retrained forest on the test set
ypred_rf = rf.predict(X_test)

# Error metrics; RMSE is derived from the MSE
mse_rf = mean_squared_error(y_test, ypred_rf)
rmse_rf = mse_rf ** (1/2)
mae_rf = mean_absolute_error(y_test, ypred_rf)
mape_rf = mean_absolute_percentage_error(y_test, ypred_rf)

print(mape_rf)
print(mae_rf)
print(rmse_rf)

In [10]:
from sklearn.metrics import (mean_absolute_error,
                             mean_absolute_percentage_error,
                             mean_squared_error)


def evaluate_model(y_test, y_pred):
    """Compute, print and return the regression error metrics of a prediction.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_test``.

    Returns:
        The tuple ``(mae, mse, rmse, mape)``.
    """
    mse = mean_squared_error(y_test, y_pred)
    # RMSE is the square root of the MSE computed above
    rmse = mse ** (1/2)
    mae = mean_absolute_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    print(f'MAE: {mae}; MSE: {mse}; RMSE: {rmse}; MAPE: {mape}')
    return mae, mse, rmse, mape

In [None]:
# Print the error metrics of the current predictions and keep them as variables
mae, mse, rmse, mape = evaluate_model(y_test, y_pred)

In [None]:
from matplotlib import pyplot as plt

# Overlay the true values (green) and the predictions (red) against the test inputs
for heights, colour in ((y_test, 'green'), (y_pred, 'red')):
    plt.scatter(X_test, heights, color=colour)

plt.title('Random Forest Regression')
plt.xlabel('Pixel')
plt.ylabel('Forest Height')
plt.show()

In [None]:
# Remove the training-data names from the namespace so the objects can be
# garbage-collected once nothing else references them
del X_train, y_train