### Description

This notebook trains the feed-forward neural network models and creates the reconstructions. Both the trained models and the reconstructions are saved to the output directories defined below and can be found on this project's figshare page.

### Inputs

In [None]:
# -----------------------------------------------------------------
# Directory layout
# Edit `root_dir` so that it points at your local copy of the project;
# every other directory used by this notebook is derived from it.
# -----------------------------------------------------------------
root_dir = "/local/data/artemis/workspace/jfs2167/recon_eval"

# Reference objects (loaded in the "Predefined values" section)
reference_output_dir = f"{root_dir}/references"
# Processed per-member input data
data_output_dir = f"{root_dir}/data/processed"
# Trained models, reconstructions and performance metrics
model_output_dir = f"{root_dir}/models/trained"
recon_output_dir = f"{root_dir}/models/reconstructions"
other_output_dir = f"{root_dir}/models/performance_metrics"

### Modules

In [None]:
# standard imports
import os
import datetime
from pathlib import Path
from collections import defaultdict
import scipy
import random
import numpy as np
import xarray as xr
import pandas as pd
import joblib
import pickle

# machine learning libraries
import sklearn            # machine-learning libary with many algorithms implemented
from sklearn.model_selection import GridSearchCV

import tensorflow as tf
import keras

# Python file with supporting functions
import pre

### Predefined values

In [None]:
# Load the reference objects stored under `reference_output_dir`.
# NOTE: pickle.load can execute arbitrary code -- only load reference files
# that you created yourself or obtained from a trusted source.

# Dictionary iterated below as {ensemble: members}
path_LET = f"{reference_output_dir}/members_LET_dict.pickle"
with open(path_LET, mode='rb') as pickle_file:
    mems_dict = pickle.load(pickle_file)

# Random seeds handed to the train/val/test split
path_seeds = f"{reference_output_dir}/random_seeds.npy"
random_seeds = np.load(path_seeds)

# Nested dictionary indexed below as seed_loc_dict[ensemble][member]
path_loc = f"{reference_output_dir}/members_seed_loc_dict.pickle"
with open(path_loc, mode='rb') as pickle_file:
    seed_loc_dict = pickle.load(pickle_file)

In [None]:
# -----------------------------------------------------------------
# Inputs for the modeling process
# -----------------------------------------------------------------

# Proportions passed to the train/validate/test split
val_prop = 0.2
test_prop = 0.2

# Predictor columns fed into the models (order defines the column order of X)
features_sel = [
    'sst_detrend', 'sst_anom',
    'sss', 'sss_anom',
    'mld_clim_log',
    'chl_log', 'chl_anom',
    'xco2',
    'A', 'B', 'C',
    'T0', 'T1',
]

# Target column the models are trained to predict
target_sel = ['pCO2']

### Load data, mask it, train/val/test split, run models

In [None]:
# Label used in output paths, and the number of models trained per member.
approach = "nn"
num_runs = 5

# Performance containers, later filled as dict[ensemble][member][run] = metrics.
# Several models are trained per member; the validation metrics are what is used
# afterwards to decide which run to pick as the best.
val_dict, test_dict, unseen_dict = (defaultdict(dict) for _ in range(3))

# Pre-create an empty {run: metrics} slot for every ensemble member
for ens, members in mems_dict.items():
    for member in members:
        for perf_dict in (val_dict, test_dict, unseen_dict):
            perf_dict[ens][member] = {}

In [None]:
# Main training loop. For every ensemble member:
#   1. read the member's processed dataframe,
#   2. build row filters for usable rows, SOCAT-sampled rows and unsampled ("unseen") rows,
#   3. split the SOCAT-sampled rows into train/validation/test sets,
#   4. standardize all feature matrices with the training-set statistics,
#   5. train `num_runs` networks; for each run save the model, record validation/test/unseen
#      metrics and save the reconstruction.
# Timestamps are printed before and after so the total run time can be read off the output.
print(datetime.datetime.now())
for ens, mem_list in mems_dict.items():
    print(ens)
    for member in mem_list:
        print(member)
        seed_loc = seed_loc_dict[ens][member] # Figure out which column to use for seeds

        # Data file path
        data_dir = f"{data_output_dir}/{ens}/member_{member}"
        fname = f"data_clean_2D_mon_{ens}_{member}_1x1_198201-201701.pkl"
        file_path = f"{data_dir}/{fname}"

        # Read in data, create some selection filters, produce a reduced dataframe
        df = pd.read_pickle(file_path)
        # 816 represents 3 standard deviations above mean in SOCAT data -- observations above this threshold are unrealistic to observe in real-world data
        # recon_sel: rows with no NaN in any feature/target/net_mask column and a target below the threshold
        recon_sel = (~df[features_sel+target_sel+['net_mask']].isna().any(axis=1)) & (df[target_sel] < 816).to_numpy().ravel()
        sel = (recon_sel & (df['socat_mask'] == 1))         # usable rows where socat_mask == 1
        unseen_sel = (recon_sel & (df['socat_mask'] == 0))  # usable rows where socat_mask == 0

        # Convert dataframe to numpy arrays, train/val/test split
        X = df.loc[sel,features_sel].to_numpy()
        y = df.loc[sel,target_sel].to_numpy().ravel()

        N = X.shape[0]
        train_val_idx, train_idx, val_idx, test_idx = pre.train_val_test_split(N, test_prop, val_prop, random_seeds, seed_loc)
        X_train_val, X_train, X_val, X_test, y_train_val, y_train, y_val, y_test = pre.apply_splits(X, y, train_val_idx, train_idx, val_idx, test_idx)

        X_unseen = df.loc[unseen_sel,features_sel].to_numpy()
        y_unseen = df.loc[unseen_sel,target_sel].to_numpy().ravel()

        # Standardize data based on the training data.
        # The statistics are computed once and reused for every split so that no
        # information from the validation/test/unseen rows enters the scaling.
        # NOTE(review): a feature that is constant over the training rows has zero
        # standard deviation and would produce inf/NaN here -- confirm this cannot occur.
        train_mean = np.mean(X_train, axis=0)
        train_std = np.std(X_train, axis=0)
        X_train_s = (X_train - train_mean)/train_std
        X_val_s = (X_val - train_mean)/train_std
        X_test_s = (X_test - train_mean)/train_std
        X_unseen_s = (X_unseen - train_mean)/train_std
        X_s = (X - train_mean)/train_std

        # Fit the model on train data
        for i in range(num_runs):
            print(i)

            # A new network is built and fitted for every run
            models = pre.build_nn_vf(num_features=len(features_sel))
            models.fit(X_train_s, y_train, epochs=200, batch_size=1000, verbose=0)

            y_pred_val = models.predict(X_val_s).ravel()
            y_pred_test = models.predict(X_test_s).ravel()
            # Large batch sizes for the two big prediction sets
            y_pred_unseen = models.predict(X_unseen_s, batch_size=int(1e6)).ravel()
            y_pred_seen = models.predict(X_s, batch_size=int(1e5)).ravel()

            # update this function to handle multiple runs for NN
            pre.save_model(models, model_output_dir, approach, ens, member, run=i)

            val_dict[ens][member][i] = pre.evaluate_test(y_val,y_pred_val)
            test_dict[ens][member][i] = pre.evaluate_test(y_test,y_pred_test)
            unseen_dict[ens][member][i] = pre.evaluate_test(y_unseen,y_pred_unseen)

            # Create the reconstruction and save it.
            # The column is reset to NaN on every run, so rows outside `recon_sel` stay NaN.
            df['pCO2_recon'] = np.nan
            df.loc[unseen_sel,['pCO2_recon']] = y_pred_unseen
            df.loc[sel,['pCO2_recon']] = y_pred_seen
            DS_recon = df[['net_mask','socat_mask','pCO2', 'pCO2_recon']].to_xarray()

            # update this function to handle multiple runs for NN
            pre.save_recon(DS_recon, recon_output_dir, approach, ens, member, run=i)

print(datetime.datetime.now())

In [None]:
# Flag, for every ensemble member, which run wins under each of three selection
# rules. All rules are evaluated on the validation metrics only; the winning run
# indices are then flagged (value 1) in the validation, test and unseen dictionaries.
#   sel_min_bias     : run with the smallest absolute validation bias
#   sel_min_mse      : run with the smallest validation MSE
#   sel_min_bias_mse : among the runs whose MSE is within the two lowest values,
#                      the run with the smallest absolute bias
for ens, members in val_dict.items():
    for mem, runs in members.items():
        if not runs:
            continue  # no runs recorded for this member

        min_bias_idx = min(runs.items(), key=lambda x: np.abs(x[1]['bias']))[0]
        min_mse_idx = min(runs.items(), key=lambda x: np.abs(x[1]['mse']))[0]

        # The second-smallest MSE is the cut-off. When only one run exists, fall back
        # to that run's MSE instead of raising IndexError on position 1.
        mse_sorted = sorted(value['mse'] for value in runs.values())
        mse_threshold = mse_sorted[min(1, len(mse_sorted) - 1)]
        options = [(key, value['bias']) for key, value in runs.items() if value['mse'] <= mse_threshold]
        min_bias_mse_idx = sorted(options, key=lambda x: np.abs(x[1]))[0][0]

        # Same flags, same run indices, in all three metric dictionaries
        for perf_dict in (val_dict, test_dict, unseen_dict):
            perf_dict[ens][mem][min_bias_idx]['sel_min_bias'] = 1
            perf_dict[ens][mem][min_mse_idx]['sel_min_mse'] = 1
            perf_dict[ens][mem][min_bias_mse_idx]['sel_min_bias_mse'] = 1

In [None]:
# Convert the nested performance dictionaries into pandas data frames
def _performance_frame(perf_dict):
    """Flatten a {ensemble: {member: {run: metrics}}} dictionary into a DataFrame.

    Each (ensemble, member, run) combination becomes one row whose columns are
    the keys of the metrics dictionary; the index levels are named
    "model", "member" and "run".
    """
    flat = {
        (ens, member, run): metrics
        for ens, members in perf_dict.items()
        for member, runs in members.items()
        for run, metrics in runs.items()
    }
    frame = pd.DataFrame.from_dict(flat, orient="index")
    frame.index.names = ["model", "member", "run"]
    return frame


val_df = _performance_frame(val_dict)
test_df = _performance_frame(test_dict)
unseen_df = _performance_frame(unseen_dict)

In [None]:
# Write the performance metrics to disk, once as the raw nested dictionaries and
# once as the flattened data frames, in a sub-directory named after the approach.
approach_output_dir = f"{other_output_dir}/{approach}"

# Output file names: nested dictionaries
val_dict_fname = f"{approach_output_dir}/{approach}_val_performance_dict.pickle"
test_dict_fname = f"{approach_output_dir}/{approach}_test_performance_dict.pickle"
unseen_dict_fname = f"{approach_output_dir}/{approach}_unseen_performance_dict.pickle"

# Output file names: data frames
val_df_fname = f"{approach_output_dir}/{approach}_val_performance_df.pickle"
test_df_fname = f"{approach_output_dir}/{approach}_test_performance_df.pickle"
unseen_df_fname = f"{approach_output_dir}/{approach}_unseen_performance_df.pickle"

# Make sure the target directory exists before anything is written
Path(approach_output_dir).mkdir(parents=True, exist_ok=True)

for dict_fname, perf_dict in (
    (val_dict_fname, val_dict),
    (test_dict_fname, test_dict),
    (unseen_dict_fname, unseen_dict),
):
    with open(dict_fname, 'wb') as handle:
        pickle.dump(perf_dict, handle)

for df_fname, perf_df in (
    (val_df_fname, val_df),
    (test_df_fname, test_df),
    (unseen_df_fname, unseen_df),
):
    perf_df.to_pickle(df_fname)