### Description

This notebook trains the random forest models and creates the reconstructions. Both the trained models and the reconstructions are saved to the output directories defined below, and they can also be found on this project's figshare page.

### Inputs

In [None]:
# =========================================
# Project directories
# =========================================
# Root of the project; point this at your own copy before running
root_dir = "/local/data/artemis/workspace/jfs2167/recon_eval"

# Every location below is derived from root_dir
reference_output_dir = root_dir + "/references"                   # reference pickles / seeds
data_output_dir = root_dir + "/data/processed"                    # per-member input dataframes
model_output_dir = root_dir + "/models/trained"                   # fitted models
recon_output_dir = root_dir + "/models/reconstructions"           # reconstructions
other_output_dir = root_dir + "/models/performance_metrics"       # best params and metrics

# =========================================
# Parallelism
# =========================================
# Number of cores available for model training (passed as n_jobs to the random forest)
jobs = 1

### Modules

In [None]:
# standard imports
import os
import datetime
from pathlib import Path
from collections import defaultdict
import scipy
import random
import numpy as np
import xarray as xr
import pandas as pd
import joblib
import pickle

# machine learning libraries
import sklearn            # machine-learning library with many algorithms implemented
from sklearn.ensemble import RandomForestRegressor # random forest regressor (RFR)
from sklearn.model_selection import GridSearchCV

# Python file with supporting functions
import pre

### Predefined values

In [None]:
# Locations of the reference files produced earlier in the project
path_LET = f"{reference_output_dir}/members_LET_dict.pickle"
path_seeds = f"{reference_output_dir}/random_seeds.npy"
path_loc = f"{reference_output_dir}/members_seed_loc_dict.pickle"

# mems_dict maps each ensemble to the list of members to process;
# seed_loc_dict[ens][member] is the column of random_seeds used for that member.
with open(path_LET, 'rb') as let_handle, open(path_loc, 'rb') as loc_handle:
    mems_dict = pickle.load(let_handle)
    seed_loc_dict = pickle.load(loc_handle)

# 2-D array of seeds, indexed below as random_seeds[row, seed_loc]
random_seeds = np.load(path_seeds)

In [None]:
# =========================================
# Inputs for the modeling process
# =========================================

# Fractions of the data held out for validation and for testing
# (both are handed to pre.train_val_test_split)
val_prop = 0.2
test_prop = 0.2

# Hyperparameter grid searched by GridSearchCV when tuning the random forest
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[30, 40],
)

# Columns fed to the models as predictors ...
features_sel = [
    'sst_detrend', 'sst_anom',
    'sss', 'sss_anom',
    'mld_clim_log',
    'chl_log', 'chl_anom',
    'xco2',
    'A', 'B', 'C',
    'T0', 'T1',
]
# ... and the column being predicted
target_sel = ['pCO2']

In [None]:
# Load previously tuned hyperparameters (one entry per ensemble) for this approach.
# Skip this cell when running cross validation to find the best parameters for the
# first time: the file read here is only written by the saving cell further down.

# `approach` must be defined before it is used in the paths below; the training
# cell assigns the same value again, so defining it here is harmless and lets
# the notebook run top to bottom without a NameError.
approach = "rf"

approach_output_dir = f"{other_output_dir}/{approach}"
param_fname = f"{approach_output_dir}/{approach}_best_params_dict.pickle"

# NOTE: pickle can execute arbitrary code on load; only read files you produced yourself.
with open(param_fname, 'rb') as handle:
    best_params = pickle.load(handle)

### Load data, mask it, train/val/test split, run models

In [None]:
# Train one random forest per ensemble member, score it, and write out the
# trained model plus the reconstruction built from its predictions.

# best_params = {} # Uncomment if running cross validation to find best params
approach = "rf"
K_folds = 3  # number of cross-validation folds used when tuning

# Error metrics keyed as [ensemble][member]
test_performance = defaultdict(dict)
unseen_performance = defaultdict(dict)

print(datetime.datetime.now())
for ens, mem_list in mems_dict.items():
    print(ens)
    # Set to true if you want to tune parameters with the first member from each ensemble
    first_mem = False
    for member in mem_list:
        print(member)
        # Column of random_seeds assigned to this member
        seed_loc = seed_loc_dict[ens][member]

        # Load this member's dataframe
        file_path = f"{data_output_dir}/{ens}/member_{member}/data_clean_2D_mon_{ens}_{member}_1x1_198201-201701.pkl"
        df = pd.read_pickle(file_path)

        # Rows usable for reconstruction: no missing feature/target/net_mask value and a
        # target below 816. 816 represents 3 standard deviations above mean in SOCAT data --
        # observations above this threshold are unrealistic to observe in real-world data
        required_cols = features_sel + target_sel + ['net_mask']
        rows_complete = df[required_cols].notna().all(axis=1)
        target_below_cap = (df[target_sel] < 816).to_numpy().ravel()
        recon_sel = rows_complete & target_below_cap

        # Split usable rows by socat_mask: 1 -> used for training/testing, 0 -> "unseen"
        sel = recon_sel & (df['socat_mask'] == 1)
        unseen_sel = recon_sel & (df['socat_mask'] == 0)

        # Numpy views of the training population, then train/val/test split
        X = df.loc[sel, features_sel].to_numpy()
        y = df.loc[sel, target_sel].to_numpy().ravel()

        N = len(X)
        train_val_idx, train_idx, val_idx, test_idx = pre.train_val_test_split(
            N, test_prop, val_prop, random_seeds, seed_loc
        )
        (X_train_val, X_train, X_val, X_test,
         y_train_val, y_train, y_val, y_test) = pre.apply_splits(
            X, y, train_val_idx, train_idx, val_idx, test_idx
        )

        # Optional hyperparameter search (once per ensemble), seeded from row 2 of random_seeds
        if first_mem:
            tuner = GridSearchCV(
                RandomForestRegressor(random_state=random_seeds[2, seed_loc], n_jobs=jobs),
                rf_param_grid,
                scoring='neg_mean_squared_error',
                cv=K_folds,
                return_train_score=False,
                refit=False,
            )
            tuner.fit(X_train_val, y_train_val)
            best_params[ens] = tuner.best_params_
            first_mem = False

        # Final model for this member: fit on train+validation data, seeded from row 3
        model = RandomForestRegressor(
            random_state=random_seeds[3, seed_loc],
            n_jobs=jobs,
            **best_params[ens],
        )
        model.fit(X_train_val, y_train_val)

        # Persist the fitted model
        pre.save_model(model, model_output_dir, approach, ens, member)

        # Metrics on the held-out test rows
        test_performance[ens][member] = pre.evaluate_test(y_test, model.predict(X_test))

        # Same metrics on the rows where socat_mask == 0
        X_unseen = df.loc[unseen_sel, features_sel].to_numpy()
        y_pred_unseen = model.predict(X_unseen)
        y_unseen = df.loc[unseen_sel, target_sel].to_numpy().ravel()
        unseen_performance[ens][member] = pre.evaluate_test(y_unseen, y_pred_unseen)

        # Reconstruction: predictions wherever a row was usable, NaN everywhere else
        y_pred_seen = model.predict(X)
        df['pCO2_recon'] = np.nan
        df.loc[unseen_sel, ['pCO2_recon']] = y_pred_unseen
        df.loc[sel, ['pCO2_recon']] = y_pred_seen

        recon_cols = ['net_mask', 'socat_mask', 'pCO2', 'pCO2_recon']
        DS_recon = df[recon_cols].to_xarray()
        pre.save_recon(DS_recon, recon_output_dir, approach, ens, member)


print(datetime.datetime.now())

In [None]:
# Write the best parameters and both performance dictionaries to disk

# Make sure the per-approach output folder exists
approach_output_dir = f"{other_output_dir}/{approach}"
Path(approach_output_dir).mkdir(parents=True, exist_ok=True)

param_fname = f"{approach_output_dir}/{approach}_best_params_dict.pickle"
test_perform_fname = f"{approach_output_dir}/{approach}_test_performance_dict.pickle"
unseen_perform_fname = f"{approach_output_dir}/{approach}_unseen_performance_dict.pickle"

# (destination, object, extra pickle.dump options); only the parameter
# dictionary is written with the highest pickle protocol
pickle_targets = [
    (param_fname, best_params, {'protocol': pickle.HIGHEST_PROTOCOL}),
    (test_perform_fname, test_performance, {}),
    (unseen_perform_fname, unseen_performance, {}),
]
for out_path, payload, dump_options in pickle_targets:
    with open(out_path, 'wb') as handle:
        pickle.dump(payload, handle, **dump_options)

In [None]:
# Flatten the nested [ensemble][member] metric dictionaries into dataframes
# with one row per (ensemble, member) pair
test_rows = {}
for ens_key, member_metrics in test_performance.items():
    for member_key, metrics in member_metrics.items():
        test_rows[(ens_key, member_key)] = metrics
test_df = pd.DataFrame.from_dict(test_rows, orient='index')

unseen_rows = {}
for ens_key, member_metrics in unseen_performance.items():
    for member_key, metrics in member_metrics.items():
        unseen_rows[(ens_key, member_key)] = metrics
unseen_df = pd.DataFrame.from_dict(unseen_rows, orient='index')

# Label the two index levels
index_level_names = ["model", "member"]
test_df.index.names = index_level_names
unseen_df.index.names = index_level_names

# Store the dataframes next to the pickled dictionaries
test_df_fname = f"{approach_output_dir}/{approach}_test_performance_df.pickle"
unseen_df_fname = f"{approach_output_dir}/{approach}_unseen_performance_df.pickle"

test_df.to_pickle(test_df_fname)
unseen_df.to_pickle(unseen_df_fname)