# Random Forest Model, Trained on Limited Dataset
## Model Dataset Limited by SOCAT Sampling Locations

(to be used as a baseline)

In [3]:
# Standard imports
import os
import datetime
from pathlib import Path
from collections import defaultdict
import scipy
import random
import numpy as np
import xarray as xr
import pandas as pd
import joblib
import pickle

# Machine learning libraries
import sklearn            # machine-learning libary with many algorithms implemented
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

# Python file with supporting functions
import model_utils

In [4]:
global_model_path = '/home/julias/MLEE-final-project/models/saved_models/recon_models'

https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

# Load Split Datasets 

## Load Split Datasets

In [5]:
X_df = xr.open_dataset('/home/julias/MLEE-final-project/proc_data/global_split_datasets/X.nc').to_dataframe().dropna()
y_df = xr.open_dataset('/home/julias/MLEE-final-project/proc_data/global_split_datasets/y.nc').to_dataframe().dropna() 
X_train_df = xr.open_dataset('/home/julias/MLEE-final-project/proc_data/global_split_datasets/X_train.nc').to_dataframe().dropna() 
y_train_df = xr.open_dataset('/home/julias/MLEE-final-project/proc_data/global_split_datasets/y_train.nc').to_dataframe().dropna()
X_test_df = xr.open_dataset('/home/julias/MLEE-final-project/proc_data/global_split_datasets/X_test.nc').to_dataframe().dropna() 
y_test_df = xr.open_dataset('/home/julias/MLEE-final-project/proc_data/global_split_datasets/y_test.nc').to_dataframe().dropna() 

## Check that data was saved and loaded properly:

In [9]:
X_train_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SSS,SST,MLD,Chl,XCO2,T0,T1,A,B,C
xlon,ylat,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
-179.5,-77.5,1982-03-15,33.997498,0.640288,26.234657,0.331255,340.962250,0.292600,0.956235,-0.976296,-0.001889,0.216431
-179.5,-77.5,1982-04-15,34.180756,-1.755514,59.944748,0.641632,341.075439,-0.234491,0.972118,-0.976296,-0.001889,0.216431
-179.5,-77.5,1982-05-15,34.445652,-1.776272,220.441910,0.057961,341.193176,-0.683919,0.729558,-0.976296,-0.001889,0.216431
-179.5,-77.5,1982-06-15,34.508339,-1.809904,381.795532,0.017642,341.310730,-0.959933,0.280231,-0.976296,-0.001889,0.216431
-179.5,-77.5,1982-08-15,34.562683,-1.823900,429.841278,0.005047,341.545685,-0.720667,-0.693281,-0.976296,-0.001889,0.216431
...,...,...,...,...,...,...,...,...,...,...,...,...
179.5,62.5,1996-01-15,32.590897,-1.597806,27.985397,0.074352,361.389465,0.966848,0.255353,0.887011,0.004029,0.461731
179.5,62.5,1998-12-15,32.283058,0.631731,25.075340,0.897956,367.086853,0.962309,-0.271958,0.887011,0.004029,0.461731
179.5,62.5,1999-12-15,32.390461,0.080004,26.077244,0.703162,368.669678,0.962309,-0.271958,0.887011,0.004029,0.461731
179.5,62.5,2010-02-15,32.757904,-1.240300,27.023640,0.085494,389.416168,0.702527,0.711657,0.887011,0.004029,0.461731


In [10]:
y_test_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pCO2
xlon,ylat,time,Unnamed: 3_level_1
-179.5,-77.5,1982-02-15,165.219524
-179.5,-77.5,1982-07-15,332.735048
-179.5,-77.5,1982-12-15,150.877219
-179.5,-77.5,1983-05-15,311.177084
-179.5,-77.5,1983-10-15,338.556662
...,...,...,...
179.5,62.5,1985-01-15,380.706079
179.5,62.5,1987-02-15,391.417257
179.5,62.5,1990-01-15,431.635894
179.5,62.5,1992-12-15,406.267746


## Create Numpy Arrays for Original Data

In [11]:
X = X_df.to_numpy()         
y = y_df.to_numpy().ravel() 
X_train = X_train_df.to_numpy() 
y_train = y_train_df.to_numpy().ravel()
X_test = X_test_df.to_numpy()  
y_test = y_test_df.to_numpy().ravel()

# Basic RF Model

In [12]:
RF_basic = RandomForestRegressor(n_estimators=100,max_depth=5,min_samples_split=5,min_samples_leaf=4)

In [None]:
RF_model_basic = RF_basic.fit(X_train,y_train)

In [None]:
train_score_basic = RF_model_basic.score(X_train,y_train)
print('Train Score: %.5f' % train_score)

In [None]:
test_score_basic = RF_model_basic.score(X_test,y_test)
print('Test Score: %.5f' % test_score)

In [None]:
y_pred_basic = RF_model_basic.predict(X_test)
mse_basic = mean_squared_error(y_test, y_pred_basic)
print('MSE: %.2f' % mse)

In [None]:
joblib.dump(RF_model_basic, os.path.join(global_model_path,'RF_model_basic.h5')) 

In [None]:
RF_prelim_load_test = joblib.load(os.path.join(global_model_path,'RF_model_basic.h5'))

In [None]:
train_score = RF_prelim_load_test.score(X_train,y_train)
print('Train Score: %.5f' % train_score)

# Select Approach and Parameters

## Approach

- As tested in basic model above, RF requires that the data is input using numpy. 
- For consistency and to faciliate comparison with other models, will continue using the original (not normalized) data.

## Parameters

Use a cross-validation search to optimize the RF model

In [32]:
# Number of trees in random forest
n_estimators = np.arange(100,500,100) #Test 100, 200, 300, 400

# Number of features to consider at every split
max_features = [1.0, 'sqrt'] #1.0 is the same as auto, auto is depreciated 

# Maximum number of levels in tree
max_depth = np.arange(10,50,10)

# Minimum number of samples required to split a node
min_samples_split = [5, 10, 15]

# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 8, 12]

# Use default bootstrap=True

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf
              }

In [33]:
print(random_grid)

{'n_estimators': array([100, 200, 300, 400]), 'max_features': [1.0, 'sqrt'], 'max_depth': array([10, 20, 30, 40]), 'min_samples_split': [5, 10, 15], 'min_samples_leaf': [2, 8, 12]}


In [35]:
reg0 = RandomForestRegressor(random_state=0)

rf_search = RandomizedSearchCV(estimator = reg0, param_distributions = random_grid, 
                                n_iter = 5, cv = 3, verbose=2, n_jobs = -1)
# Verbose =2 will display computations and scores, n_jobs = -1 will use all processors

rf_parameters = rf_search.fit(X_train,y_train)

print("The best hyperparameters: \n",rf_search.best_params_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=12, min_samples_split=5, n_estimators=300; total time= 1.3min
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=12, min_samples_split=5, n_estimators=300; total time= 1.3min
[CV] END max_depth=40, max_features=sqrt, min_samples_leaf=12, min_samples_split=5, n_estimators=300; total time= 1.3min
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=12, min_samples_split=5, n_estimators=400; total time= 1.7min
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=12, min_samples_split=5, n_estimators=400; total time= 1.8min
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=12, min_samples_split=5, n_estimators=400; total time= 1.8min
The best hyperparameters: 
 {'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 12, 'max_features': 'sqrt', 'max_depth': 30}
[CV] END max_depth=20, max_features=1.0, min_samples_leaf=8, min_samples_sp

# RF Model

Will serve as baseline for comparison to other algorithms

## Build RF Model

Using best parameters found above

In [36]:
best_params = {'n_estimators': 400, 'min_samples_split': 5, 'min_samples_leaf': 12, 'max_features': 'sqrt', 'max_depth': 30}

In [46]:
RF = RandomForestRegressor(n_estimators=best_params['n_estimators'],
                           max_depth=best_params['max_depth'],
                           max_features=best_params['max_features'],
                           min_samples_split=best_params['min_samples_split'],
                           min_samples_leaf=best_params['min_samples_leaf'])

In [47]:
RF_model = RF.fit(X_train,y_train)

In [48]:
joblib.dump(RF_model_basic, os.path.join(global_model_path,'RF_model.h5')) 

['/home/julias/MLEE-final-project/models/saved_models/recon_models/RF_model.h5']

## Re-load model for analysis

In [49]:
RF_model = joblib.load(os.path.join(global_model_path,'RF_model.h5'))

In [50]:
train_score = RF_model.score(X_train,y_train)
print('Train Score: %.5f' % train_score)

Train Score: 0.63388


In [51]:
test_score = RF_model.score(X_test,y_test)
print('Test Score: %.5f' % test_score)

Test Score: 0.59993


In [52]:
y_pred = RF_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print('MSE: %.2f' % mse)

MSE: 641.36
