# Compare RF, XGB, and NN Models

In [None]:
# Standard imports
import os
import datetime
from pathlib import Path
from collections import defaultdict
import scipy
import random
import numpy as np
import xarray as xr
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import pickle

# Machine learning libraries
import sklearn            # machine-learning libary with many algorithms implemented
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import *
from tensorflow.keras import Sequential

# Python file with supporting functions
import model_utils

In [1]:
# Directories the saved models are loaded from: recon_model_path for the models
# trained on the limited dataset, global_model_path for the global ones.
recon_model_path, global_model_path = (
    '/home/julias/MLEE-final-project/models/saved_models/recon_models',
    '/home/julias/MLEE-final-project/models/saved_models/global_models',
)

# Import Datasets

## Import Model Dataset Limited by SOCAT Sampling Locations

In [None]:
def _open_limited_split(nc_path):
    """Open one split file, sort it by time/xlon/ylat and order dims as (time, ylat, xlon)."""
    opened = xr.open_dataset(nc_path)
    ordered = opened.sortby(['time','xlon','ylat'])
    return ordered.transpose('time','ylat','xlon')


# The unsplit X.nc / y.nc loads are left disabled.
#X_ds_limited = _open_limited_split('/home/julias/MLEE-final-project/proc_data/split_datasets/X.nc')
#y_ds_limited = _open_limited_split('/home/julias/MLEE-final-project/proc_data/split_datasets/y.nc')
X_train_ds_limited = _open_limited_split('/home/julias/MLEE-final-project/proc_data/split_datasets/X_train.nc')
y_train_ds_limited = _open_limited_split('/home/julias/MLEE-final-project/proc_data/split_datasets/y_train.nc')
X_test_ds_limited = _open_limited_split('/home/julias/MLEE-final-project/proc_data/split_datasets/X_test.nc')
y_test_ds_limited = _open_limited_split('/home/julias/MLEE-final-project/proc_data/split_datasets/y_test.nc')

In [None]:
# Flatten each limited xarray dataset into a DataFrame and drop rows containing NaN.
# NOTE(review): features and targets are filtered independently; this assumes
# both share the same NaN rows so they stay aligned — confirm.
#X_df_limited = X_ds_limited.to_dataframe().dropna()
#y_df_limited = y_ds_limited.to_dataframe().dropna()
(
    X_train_df_limited,
    y_train_df_limited,
    X_test_df_limited,
    y_test_df_limited,
) = (
    split_ds.to_dataframe().dropna()
    for split_ds in (
        X_train_ds_limited,
        y_train_ds_limited,
        X_test_ds_limited,
        y_test_ds_limited,
    )
)

In [None]:
# NumPy versions of the limited splits: feature frames are converted as-is,
# target frames are additionally flattened with ravel().
#X_limited = X_df_limited.to_numpy()
#y_limited = y_df_limited.to_numpy().ravel()
X_train_limited, X_test_limited = (
    feature_frame.to_numpy()
    for feature_frame in (X_train_df_limited, X_test_df_limited)
)
y_train_limited, y_test_limited = (
    target_frame.to_numpy().ravel()
    for target_frame in (y_train_df_limited, y_test_df_limited)
)

## Import Global Model Dataset and Reduced Version (2007-2017) Used for NN 

In [None]:
def _load_global_frame(nc_path):
    """Read one global-dataset NetCDF file into a DataFrame with NaN rows dropped.

    The dataset is only needed long enough to build the table, so it is opened
    in a ``with`` block; this closes the underlying file instead of leaving
    the handle open for the rest of the session.
    """
    with xr.open_dataset(nc_path) as dataset:
        return dataset.to_dataframe().dropna()


# The unsplit X.nc / y.nc loads are left disabled.
#X_df_global = _load_global_frame('/home/julias/MLEE-final-project/proc_data/global_split_datasets/X.nc')
#y_df_global = _load_global_frame('/home/julias/MLEE-final-project/proc_data/global_split_datasets/y.nc')
X_train_df_global = _load_global_frame('/home/julias/MLEE-final-project/proc_data/global_split_datasets/X_train.nc')
y_train_df_global = _load_global_frame('/home/julias/MLEE-final-project/proc_data/global_split_datasets/y_train.nc')
X_test_df_global = _load_global_frame('/home/julias/MLEE-final-project/proc_data/global_split_datasets/X_test.nc')
y_test_df_global = _load_global_frame('/home/julias/MLEE-final-project/proc_data/global_split_datasets/y_test.nc')

In [None]:
# NumPy versions of the global splits: feature frames are converted as-is,
# target frames are additionally flattened with ravel().
#X_global = X_df_global.to_numpy()
#y_global = y_df_global.to_numpy().ravel()
X_train_global, X_test_global = (
    features.to_numpy() for features in (X_train_df_global, X_test_df_global)
)
y_train_global, y_test_global = (
    target.to_numpy().ravel() for target in (y_train_df_global, y_test_df_global)
)

In [None]:
# First time label kept in the reduced dataset; the slice below is open-ended.
slice_time_start = '2007'


def _open_reduced_split(nc_path):
    """Open a global split, sort/transpose it, and keep times from slice_time_start on."""
    ordered = xr.open_dataset(nc_path).sortby(['time','xlon','ylat'])
    laid_out = ordered.transpose('time','ylat','xlon')
    return laid_out.sel(time=slice(slice_time_start,None))


# The unsplit X.nc / y.nc loads are left disabled.
#X_ds_reduced = _open_reduced_split('/home/julias/MLEE-final-project/proc_data/global_split_datasets/X.nc')
#y_ds_reduced = _open_reduced_split('/home/julias/MLEE-final-project/proc_data/global_split_datasets/y.nc')
X_train_ds_reduced = _open_reduced_split('/home/julias/MLEE-final-project/proc_data/global_split_datasets/X_train.nc')
y_train_ds_reduced = _open_reduced_split('/home/julias/MLEE-final-project/proc_data/global_split_datasets/y_train.nc')
X_test_ds_reduced = _open_reduced_split('/home/julias/MLEE-final-project/proc_data/global_split_datasets/X_test.nc')
y_test_ds_reduced = _open_reduced_split('/home/julias/MLEE-final-project/proc_data/global_split_datasets/y_test.nc')

In [None]:
# Flatten each reduced xarray dataset into a DataFrame and drop rows containing NaN.
# NOTE(review): features and targets are filtered independently; this assumes
# both share the same NaN rows so they stay aligned — confirm.
#X_df_reduced = X_ds_reduced.to_dataframe().dropna()
#y_df_reduced = y_ds_reduced.to_dataframe().dropna()
(
    X_train_df_reduced,
    y_train_df_reduced,
    X_test_df_reduced,
    y_test_df_reduced,
) = (
    reduced_ds.to_dataframe().dropna()
    for reduced_ds in (
        X_train_ds_reduced,
        y_train_ds_reduced,
        X_test_ds_reduced,
        y_test_ds_reduced,
    )
)

# Import Models

## Import Models Trained on Limited Dataset

### RF Limited

Note that while an "optimized" RF model was created, the basic one is used here because it had a higher test score and a comparable training score.

In [None]:
# Load the saved RF model for the limited dataset. Despite the .h5 suffix the
# file is read with joblib, which unpickles it — only load trusted model files.
RF_limited = joblib.load(os.path.join(recon_model_path,'RF_model_basic.h5'))

In [None]:
# Predict with the RF model on the limited test features (NumPy array form).
y_pred_RF_limited = RF_limited.predict(X_test_limited)

### XGB Limited

The highest-parameter XGB model (`XGB_prelim_highest_param`) is used.

In [None]:
# Load the saved XGB model for the limited dataset. Despite the .h5 suffix the
# file is read with joblib, which unpickles it — only load trusted model files.
XGB_limited = joblib.load(os.path.join(recon_model_path,'XGB_prelim_highest_param.h5'))

In [None]:
# Predict with the XGB model on the limited test features; the DataFrame
# (not the NumPy array) is passed here.
y_pred_XGB_limited = XGB_limited.predict(X_test_df_limited)

### NN Limited

The optimized NN trained on the original dataframes is used.

In [None]:
# Load the saved Keras NN for the limited dataset via tensorflow.keras load_model.
NN_limited = load_model(os.path.join(recon_model_path,'NN_model1.h5'))

In [None]:
# Predict with the NN on the limited test features, passed as a DataFrame.
# NOTE(review): the Keras output is presumably shaped (n_samples, 1) rather
# than 1-D like the RF/XGB predictions — confirm before computing metrics.
y_pred_NN_limited = NN_limited.predict(X_test_df_limited)

## Import Models Trained on Global Dataset

### RF Global

In [None]:
# Load the saved RF model for the global dataset, mirroring how the other
# joblib-serialized models in this notebook are loaded.
# TODO(review): the original cell had no right-hand side; the filename below is
# assumed to match the limited RF model's name — confirm it exists in
# global_model_path. joblib.load unpickles, so only load trusted model files.
RF_global = joblib.load(os.path.join(global_model_path, 'RF_model_basic.h5'))

In [None]:
# Predict with the global RF model on the global test features (NumPy array form).
y_pred_RF_global = RF_global.predict(X_test_global)

### XGB Global

The highest-parameter XGB model (`XGB_prelim_highest_param`) is used.

In [None]:
# Load the saved XGB model for the global dataset. Despite the .h5 suffix the
# file is read with joblib, which unpickles it — only load trusted model files.
XGB_global = joblib.load(os.path.join(global_model_path,'XGB_prelim_highest_param.h5'))

In [None]:
# Predict with the global XGB model on the global test features; the DataFrame
# (not the NumPy array) is passed here.
y_pred_XGB_global = XGB_global.predict(X_test_df_global)

### NN Global (Reduced)

The semi-optimized NN is used; further optimization was not possible. Note that this NN was trained on a reduced global dataset (2007-2017), so it should be evaluated against the reduced test split.

In [None]:
# Load the saved Keras NN for the global (reduced) dataset via tensorflow.keras load_model.
NN_global = load_model(os.path.join(global_model_path,'NN_model_semioptimized.h5'))

In [None]:
# The NN was trained on the reduced dataset (times from slice_time_start on)
# and its predictions are plotted against y_test_df_reduced further down, so
# predict on the matching reduced test features. Predicting on X_test_df_global
# yields a different number of rows whenever the global test split contains
# earlier times, which breaks that comparison.
y_pred_NN_global = NN_global.predict(X_test_df_reduced)

# Comparison of Models Trained on Limited Dataset

In [None]:
# One panel per model trained on the limited dataset, stacked vertically.
fig, ax = plt.subplots(nrows=3, figsize=(8, 12))

# (model label, observed test target, model prediction) for each panel, top to bottom.
limited_panels = (
    ("RF", y_test_limited, y_pred_RF_limited),
    ("XGB", y_test_df_limited, y_pred_XGB_limited),
    ("NN", y_test_df_limited, y_pred_NN_limited),
)

for axis, (model_name, observed, predicted) in zip(ax, limited_panels):
    # Both series are drawn against the sample position, not a time axis.
    sample_index = range(len(observed))
    axis.plot(sample_index, observed, label="original")
    axis.plot(sample_index, predicted, label="predicted")
    # Title spells pCO2 with the letter O (the original used a zero: "pC02").
    axis.set_title(f"pCO2 Test and Predicted Data for {model_name} Limited Model")
    axis.legend()

# Comparison of Models Trained on Global Dataset

In [None]:
# One panel per model trained on the global dataset, stacked vertically.
fig, ax = plt.subplots(nrows=3, figsize=(8, 12))

# (model label, observed test target, model prediction) for each panel, top to bottom.
# NOTE(review): the NN panel compares against the reduced test target, so
# y_pred_NN_global needs one prediction per row of y_test_df_reduced — confirm.
global_panels = (
    ("RF", y_test_global, y_pred_RF_global),
    ("XGB", y_test_df_global, y_pred_XGB_global),
    ("NN", y_test_df_reduced, y_pred_NN_global),
)

for axis, (model_name, observed, predicted) in zip(ax, global_panels):
    # Both series are drawn against the sample position, not a time axis.
    sample_index = range(len(observed))
    axis.plot(sample_index, observed, label="original")
    axis.plot(sample_index, predicted, label="predicted")
    # Title spells pCO2 with the letter O (the original used a zero: "pC02").
    axis.set_title(f"pCO2 Test and Predicted Data for {model_name} Global Model")
    axis.legend()

# Comparison of RF Models

In [None]:
# Two stacked panels for the RF comparison.
# TODO: this cell only creates the empty figure; nothing is plotted on ax yet.
fig, ax = plt.subplots(nrows=2, figsize=(8,12)) 


# Comparison of XGB Models

# Comparison of NN Models