# Random Forest Model, Trained on Limited Dataset
## Model Dataset Limited by SOCAT Sampling Locations

(to be used as a baseline)

In [1]:
# Standard imports
import os
import datetime
from pathlib import Path
from collections import defaultdict
import scipy
import random
import numpy as np
import xarray as xr
import pandas as pd
import joblib
import pickle

# Machine learning libraries
import sklearn            # machine-learning libary with many algorithms implemented
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

# Python file with supporting functions
import model_utils

2023-01-12 13:01:48.265095: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Directory that fitted models are saved to and re-loaded from.
global_model_path = (
    '/home/julias/MLEE-final-project/models/saved_models/recon_models'
)

Reference: scikit-learn `RandomForestRegressor` documentation — https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

# Load Split Datasets 

## Load Split Datasets

In [3]:
# Directory holding the split NetCDF datasets.
split_data_dir = '/home/julias/MLEE-final-project/proc_data/global_split_datasets'


def load_split(name):
    """Read one split dataset and return it as a DataFrame without NaN rows.

    Parameters
    ----------
    name : str
        File stem of the dataset inside ``split_data_dir`` (e.g. ``'X_train'``).

    Returns
    -------
    pandas.DataFrame
        The dataset converted with ``to_dataframe()`` and passed through
        ``dropna()``.
    """
    # The context manager closes the NetCDF file handle once the data has
    # been copied into the DataFrame.
    with xr.open_dataset(os.path.join(split_data_dir, name + '.nc')) as ds:
        return ds.to_dataframe().dropna()


# NOTE(review): NaN rows are dropped from each file independently, so this
# assumes every X/y pair has its NaNs in the same rows -- confirm, otherwise
# features and targets would become misaligned.
X_df = load_split('X')
y_df = load_split('y')
X_train_df = load_split('X_train')
y_train_df = load_split('y_train')
X_test_df = load_split('X_test')
y_test_df = load_split('y_test')

## Check that data was saved and loaded properly:

In [None]:
# Display the training features to check that they loaded as expected.
X_train_df

In [None]:
# Display the test targets to check that they loaded as expected.
y_test_df

## Create Numpy Arrays for Original Data

In [4]:
# Feature tables -> NumPy arrays.
X = X_df.to_numpy()
X_train = X_train_df.to_numpy()
X_test = X_test_df.to_numpy()

# Target tables -> flattened 1-D NumPy arrays.
y = np.ravel(y_df.to_numpy())
y_train = np.ravel(y_train_df.to_numpy())
y_test = np.ravel(y_test_df.to_numpy())

# Basic RF Model

### Build Basic RF Model

In [5]:
# Small, shallow forest used as a quick first model.
# random_state is fixed so that re-running the notebook fits the same forest
# (the search estimator further down is seeded the same way).
RF_basic = RandomForestRegressor(
    n_estimators=100,
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=4,
    random_state=0,
)

In [None]:
# Fit the basic forest on the training split.
RF_model_basic = RF_basic.fit(X_train, y_train)

In [None]:
# Persist the fitted basic model so later cells can re-load it without refitting.
joblib.dump(
    RF_model_basic,
    os.path.join(global_model_path, 'RF_model_basic.h5'),
)

### Re-load Basic Model for Analysis

In [None]:
# Re-load the saved basic model.
# joblib files are pickle-based: only load files you trust.
RF_model_basic = joblib.load(
    os.path.join(global_model_path, 'RF_model_basic.h5')
)

In [None]:
# Score of the basic model on the training split.
train_score_basic = RF_model_basic.score(X_train, y_train)
# Print the value computed in this cell (the original referenced `train_score`,
# which is not defined at this point in the notebook).
print('Train Score: %.5f' % train_score_basic)

In [None]:
# Score of the basic model on the held-out test split.
test_score_basic = RF_model_basic.score(X_test, y_test)
# Print the value computed in this cell (the original referenced `test_score`,
# which is not defined at this point in the notebook).
print('Test Score: %.5f' % test_score_basic)

In [None]:
# Mean squared error of the basic model's predictions on the test split.
y_pred_basic = RF_model_basic.predict(X_test)
mse_basic = mean_squared_error(y_test, y_pred_basic)
# Print the value computed in this cell (the original referenced `mse`,
# which is not defined at this point in the notebook).
print('MSE: %.2f' % mse_basic)

In [None]:
# NOTE(review): this cell originally scored `RF_prelim_load_test`, a name that
# is not defined in the visible notebook -- presumably an earlier name for the
# re-loaded basic model; confirm. It now scores `RF_model_basic` instead.
train_score_basic = RF_model_basic.score(X_train, y_train)
print('Train Score: %.5f' % train_score_basic)

# Select Approach and Parameters

## Approach

- As tested with the basic model above, the RF model requires the data to be provided as NumPy arrays.
- For consistency, and to facilitate comparison with other models, we will continue using the original (not normalized) data.

## Parameters

Use a cross-validation search to optimize the RF model

In [None]:
# --- Candidate values for each hyperparameter --------------------------------

# Forest size: 100, 200, 300, 400 trees
n_estimators = np.arange(100, 500, 100)

# Features considered at each split (1.0 replaces the deprecated 'auto')
max_features = [1.0, 'sqrt']

# Maximum tree depth: 10, 20, 30, 40 levels
max_depth = np.arange(10, 50, 10)

# Minimum samples needed to split an internal node
min_samples_split = [5, 10, 15]

# Minimum samples that must remain in each leaf
min_samples_leaf = [2, 8, 12]

# bootstrap is left at its default (True)

# --- Search space handed to the randomized search ----------------------------
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [None]:
# Show the search space that will be sampled by the randomized search.
print(random_grid)

In [None]:
# Base estimator for the search; seeded so each candidate forest is reproducible.
reg0 = RandomForestRegressor(random_state=0)

# Randomized search over `random_grid`: 5 sampled combinations, 3-fold CV each.
# verbose=2 prints progress for every fit; n_jobs=-1 uses all processors.
# random_state fixes which combinations are sampled, so re-running the
# notebook evaluates the same candidates.
rf_search = RandomizedSearchCV(
    estimator=reg0,
    param_distributions=random_grid,
    n_iter=5,
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=0,
)

# fit() returns the fitted search object itself.
rf_parameters = rf_search.fit(X_train, y_train)

print("The best hyperparameters: \n", rf_search.best_params_)

# RF Model

This model will serve as the baseline for comparison with other algorithms.

## Build RF Model

Using best parameters found above

In [None]:
# Hard-coded hyperparameters used to build the final RF model.
best_params = dict(
    n_estimators=400,
    min_samples_split=5,
    min_samples_leaf=12,
    max_features='sqrt',
    max_depth=30,
)

In [None]:
# Final RF model, built from the selected hyperparameters.
# random_state matches the seed of the estimator used in the hyperparameter
# search, so refitting this baseline gives the same forest every time.
RF = RandomForestRegressor(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    max_features=best_params['max_features'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    random_state=0,
)

In [None]:
# Fit the tuned forest on the training split.
RF_model = RF.fit(X_train, y_train)

In [None]:
# Persist the tuned model. The original cell dumped `RF_model_basic` here, so
# 'RF_model.h5' would have contained the basic model rather than the tuned one.
joblib.dump(RF_model, os.path.join(global_model_path, 'RF_model.h5'))

## Re-load model for analysis

In [None]:
# Re-load the saved RF model.
# joblib files are pickle-based: only load files you trust.
RF_model = joblib.load(
    os.path.join(global_model_path, 'RF_model.h5')
)

In [None]:
# Score the RF model on the training split and report it.
train_score = RF_model.score(X_train, y_train)
print('Train Score: %.5f' % (train_score,))

In [None]:
# Score the RF model on the held-out test split and report it.
test_score = RF_model.score(X_test, y_test)
print('Test Score: %.5f' % (test_score,))

In [None]:
# Predict on the test split, then report the mean squared error.
y_pred = RF_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print('MSE: %.2f' % (mse,))