# Random Forest 

(to be used as a baseline)

In [None]:
# Standard imports
import os
import datetime
from pathlib import Path
from collections import defaultdict
import scipy
import random
import numpy as np
import xarray as xr
import pandas as pd
import joblib
import pickle

# Machine learning libraries
import sklearn            # machine-learning libary with many algorithms implemented
#import xgboost as xgb     # extreme gradient boosting (XGB)
#from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Python file with supporting functions
import model_utils

In [None]:
# Directory where fitted reconstruction models are saved / reloaded.
# The default is the original project location; set the RECON_MODEL_PATH
# environment variable to run the notebook from another machine or checkout
# without editing this cell.
recon_model_path = os.environ.get(
    'RECON_MODEL_PATH',
    '/home/julias/MLEE-final-project/models/saved_models/recon_models',
)

# Load Split Datasets and Create Versions for Experimentation

## Load Split Datasets

In [25]:
# Directory holding the pre-split NetCDF files. The default is the original
# project location; override with the SPLIT_DATA_DIR environment variable.
split_data_dir = Path(os.environ.get(
    'SPLIT_DATA_DIR',
    '/home/julias/MLEE-final-project/proc_data/split_datasets',
))


def load_split(name):
    """Read ``<split_data_dir>/<name>.nc`` into a DataFrame without NaN rows.

    The dataset is opened in a ``with`` block so the NetCDF file handle is
    released as soon as the values have been converted to a DataFrame.
    """
    with xr.open_dataset(split_data_dir / f'{name}.nc') as ds:
        return ds.to_dataframe().dropna()


def load_aligned_pair(x_name, y_name):
    """Load a predictor/target pair and keep only rows present in both.

    ``dropna()`` runs on each file independently, so a row could be removed
    from one frame but not the other. Filtering both frames to the shared
    index keeps ``X`` and ``y`` row-aligned when they are later converted to
    plain numpy arrays. When the indexes already match, the frames are
    returned unchanged.
    """
    x_frame, y_frame = load_split(x_name), load_split(y_name)
    if x_frame.index.equals(y_frame.index):
        return x_frame, y_frame
    # Boolean masks preserve each frame's original row order.
    x_frame = x_frame[x_frame.index.isin(y_frame.index)]
    y_frame = y_frame[y_frame.index.isin(x_frame.index)]
    return x_frame, y_frame


X_df, y_df = load_aligned_pair('X', 'y')
X_train_df, y_train_df = load_aligned_pair('X_train', 'y_train')
X_test_df, y_test_df = load_aligned_pair('X_test', 'y_test')

Check that data was saved and loaded properly:

In [26]:
# Display the training predictors to check that the data loaded as expected.
X_train_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SSS,SST,MLD,Chl,XCO2,T0,T1,A,B,C
xlon,ylat,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
-179.5,-77.5,1982-03-15,33.997498,0.640288,26.234657,0.331255,340.962250,0.292600,0.956235,-0.976296,-0.001889,0.216431
-179.5,-77.5,1982-04-15,34.180756,-1.755514,59.944748,0.641632,341.075439,-0.234491,0.972118,-0.976296,-0.001889,0.216431
-179.5,-77.5,1982-05-15,34.445652,-1.776272,220.441910,0.057961,341.193176,-0.683919,0.729558,-0.976296,-0.001889,0.216431
-179.5,-77.5,1982-06-15,34.508339,-1.809904,381.795532,0.017642,341.310730,-0.959933,0.280231,-0.976296,-0.001889,0.216431
-179.5,-77.5,1982-08-15,34.562683,-1.823900,429.841278,0.005047,341.545685,-0.720667,-0.693281,-0.976296,-0.001889,0.216431
...,...,...,...,...,...,...,...,...,...,...,...,...
179.5,62.5,1996-01-15,32.590897,-1.597806,27.985397,0.074352,361.389465,0.966848,0.255353,0.887011,0.004029,0.461731
179.5,62.5,1998-12-15,32.283058,0.631731,25.075340,0.897956,367.086853,0.962309,-0.271958,0.887011,0.004029,0.461731
179.5,62.5,1999-12-15,32.390461,0.080004,26.077244,0.703162,368.669678,0.962309,-0.271958,0.887011,0.004029,0.461731
179.5,62.5,2010-02-15,32.757904,-1.240300,27.023640,0.085494,389.416168,0.702527,0.711657,0.887011,0.004029,0.461731


In [27]:
# Display the test targets to check that the data loaded as expected.
y_test_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pCO2
xlon,ylat,time,Unnamed: 3_level_1
-179.5,-77.5,1982-02-15,165.219524
-179.5,-77.5,1982-07-15,332.735048
-179.5,-77.5,1982-12-15,150.877219
-179.5,-77.5,1983-05-15,311.177084
-179.5,-77.5,1983-10-15,338.556662
...,...,...,...
179.5,62.5,1985-01-15,380.706079
179.5,62.5,1987-02-15,391.417257
179.5,62.5,1990-01-15,431.635894
179.5,62.5,1992-12-15,406.267746


### Create Numpy Arrays for Original Data

In [43]:
# Predictors: one 2-D array per DataFrame (rows x feature columns).
X_original, X_train_original, X_test_original = (
    frame.to_numpy() for frame in (X_df, X_train_df, X_test_df)
)

# Targets: flattened to 1-D with ravel().
y_original, y_train_original, y_test_original = (
    frame.to_numpy().ravel() for frame in (y_df, y_train_df, y_test_df)
)

## Create Normalized Dataframes

*Note: Done here, as opposed to in test/train split, so that I can save the original train/test datasets and later determine whether normalization led to improvement. (My group has not historically normalized data before training.)*

In [None]:
#X_df_norm = (X_df - X_df.mean())/X_df.std() 
#y_df_norm = (y_df - y_df.mean())/y_df.std()
#X_train_df_norm = (X_train_df - X_train_df.mean())/X_train_df.std()
#y_train_df_norm = (y_train_df - y_train_df.mean())/y_train_df.std()
#X_test_df_norm = (X_test_df - X_test_df.mean())/X_test_df.std()
#y_test_df_norm = (y_test_df - y_test_df.mean())/y_test_df.std()

**Changed method (used below):** normalize only the physical driver columns.
- This way, the time and lat/lon transformations (T0, T1, A, B, C) are not normalized.
- Note that the index coordinates (xlon, ylat, time) do not appear to be passed to the ML algorithms, so T0, T1, A, B, C are the only time and space inputs (as desired: we do not want two forms of time and space input).

In [49]:
# Columns that get z-scored. T0, T1, A, B, C are left untouched.
feature_cols = ['SSS', 'SST', 'MLD', 'Chl', 'XCO2']
target_cols = ['pCO2']


def standardize(df, columns, mean, std):
    """Return a copy of ``df`` with ``columns`` rescaled to ``(x - mean) / std``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to rescale; it is copied, never modified.
    columns : list of str
        Columns to rescale. All other columns are carried over unchanged.
    mean, std : pandas.Series
        Per-column statistics, indexed by column name.
    """
    scaled = df.copy()
    scaled[columns] = (df[columns] - mean) / std
    return scaled


# Statistics are fitted on the training split only and then applied to every
# split, so all data handed to a model trained on the training set is on the
# same scale, and no information from the test set leaks into preprocessing.
X_mean, X_std = X_train_df[feature_cols].mean(), X_train_df[feature_cols].std()
y_mean, y_std = y_train_df[target_cols].mean(), y_train_df[target_cols].std()

# Each *_norm frame is an independent copy: plain assignment
# (`X_df_norm = X_df`) only creates a second name for the same object, which
# would overwrite the raw DataFrames with normalised values.
X_df_norm = standardize(X_df, feature_cols, X_mean, X_std)
X_train_df_norm = standardize(X_train_df, feature_cols, X_mean, X_std)
X_test_df_norm = standardize(X_test_df, feature_cols, X_mean, X_std)

y_df_norm = standardize(y_df, target_cols, y_mean, y_std)
y_train_df_norm = standardize(y_train_df, target_cols, y_mean, y_std)
y_test_df_norm = standardize(y_test_df, target_cols, y_mean, y_std)

In [51]:
# Display the normalised training predictors to check the rescaled columns.
X_train_df_norm

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,SSS,SST,MLD,Chl,XCO2,T0,T1,A,B,C
xlon,ylat,time,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
-179.5,-77.5,1982-03-15,-0.265890,-1.488140,-0.692089,0.127524,-1.566749,0.292600,0.956235,-0.976296,-0.001889,0.216431
-179.5,-77.5,1982-04-15,-0.102081,-1.720083,-0.150905,0.678477,-1.560679,-0.234491,0.972118,-0.976296,-0.001889,0.216431
-179.5,-77.5,1982-05-15,0.134703,-1.722093,2.425728,-0.357605,-1.554364,-0.683919,0.729558,-0.976296,-0.001889,0.216431
-179.5,-77.5,1982-06-15,0.190737,-1.725349,5.016110,-0.429174,-1.548060,-0.959933,0.280231,-0.976296,-0.001889,0.216431
-179.5,-77.5,1982-08-15,0.239313,-1.726704,5.787440,-0.451532,-1.535459,-0.720667,-0.693281,-0.976296,-0.001889,0.216431
...,...,...,...,...,...,...,...,...,...,...,...,...
179.5,62.5,1996-01-15,-1.523210,-1.704815,-0.663983,-0.328508,-0.471215,0.966848,0.255353,0.887011,0.004029,0.461731
179.5,62.5,1998-12-15,-1.798378,-1.488968,-0.710701,1.133481,-0.165658,0.962309,-0.271958,0.887011,0.004029,0.461731
179.5,62.5,1999-12-15,-1.702374,-1.542382,-0.694616,0.787700,-0.080769,0.962309,-0.271958,0.887011,0.004029,0.461731
179.5,62.5,2010-02-15,-1.373927,-1.670204,-0.679423,-0.308730,1.031888,0.702527,0.711657,0.887011,0.004029,0.461731


## Create Numpy Arrays 

In [None]:
# Predictors: one 2-D array per normalised DataFrame.
X_n, X_train_n, X_test_n = (
    frame.to_numpy()
    for frame in (X_df_norm, X_train_df_norm, X_test_df_norm)
)

# Targets: flattened to 1-D with ravel().
y_n, y_train_n, y_test_n = (
    frame.to_numpy().ravel()
    for frame in (y_df_norm, y_train_df_norm, y_test_df_norm)
)

## Building and Training the RF Model

In [None]:
# try using cross-validation to get the best hyperparameters

In [None]:
# Candidate hyperparameter values for the random-forest search.

# Number of trees in the forest: 100, 150, 200, 250, 300
n_estimators = [int(x) for x in np.linspace(start=100, stop=300, num=5)]

# Number of features to consider at every split.
# 1.0 means "all features", which is what 'auto' meant for regressors; 'auto'
# was deprecated in scikit-learn 1.1 and removed in 1.3, where it raises an
# error.
max_features = [1.0, 'sqrt']

# Maximum number of levels in a tree: 5, 10, ..., 55, plus None (unlimited)
max_depth = [int(x) for x in np.linspace(5, 55, num=11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [5, 10, 15, 25]

# Minimum number of samples required at each leaf node
min_samples_leaf = [4, 8, 12, 16]

# Whether each tree is trained on a bootstrap sample (True) or on the whole
# training set (False)
bootstrap = [True, False]

# Collect the candidates in the dict format expected by the search
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Print the candidate hyperparameter values for a quick visual check.
print(random_grid)

In [None]:
# Baseline random-forest regressor; the fixed seed makes the fitted trees
# reproducible.
reg0 = RandomForestRegressor(random_state=0)

# Randomised hyperparameter search: 5 sampled settings, each scored with
# 3-fold cross-validation, using all available cores. random_state fixes
# which settings are sampled so a re-run evaluates the same candidates.
rf_random0 = RandomizedSearchCV(estimator=reg0, param_distributions=random_grid,
                                n_iter=5, cv=3, verbose=2, n_jobs=-1,
                                random_state=0)

# `X_train` / `y_train` are never defined in this notebook; the training
# arrays that exist are X_train_n / y_train_n (normalised) and
# X_train_original / y_train_original (raw).
# NOTE(review): the normalised arrays are used here because they are built
# immediately before this section; swap in the *_original arrays to train
# the baseline on un-normalised data.
rf_tas = rf_random0.fit(X_train_n, y_train_n)

print("The best hyperparameters: \n", rf_tas.best_params_)

## Testing the RF Model

In [13]:
# name everything with XGB
# will be Notebook B in 3_code

In [14]:
# name everything with NN
# will be Notebook C in 3_code

In [None]:
# Write the model to <recon_model_path>/NN_model.h5.
# NOTE(review): `model` is not defined anywhere in the visible part of this
# notebook, and the file name refers to an NN model although this notebook is
# about the random forest. Confirm whether this cell belongs here.
nn_model_file = os.path.join(recon_model_path, 'NN_model.h5')
model.save(nn_model_file)

#### 3.c.ii. Testing the NN Model

In [None]:
# Reload the saved model from disk before working with the test data.
# NOTE(review): `load_model` is not imported in the visible import cell.
# Confirm where it is expected to come from.
nn_model_file = os.path.join(recon_model_path, 'NN_model.h5')
model = load_model(nn_model_file)