In [None]:
# Standard imports
import os
import datetime
from pathlib import Path
from collections import defaultdict
import scipy
import random
import numpy as np
import xarray as xr
import pandas as pd
import matplotlib.pyplot as plt
import joblib
import pickle

# Machine learning libraries
import sklearn            # machine-learning libary with many algorithms implemented
import xgboost as xgb     # extreme gradient boosting (XGB)
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error

# Python file with supporting functions
import model_utils

In [None]:
# Destination directory handed to the model-saving helper further down in this notebook.
recon_model_path = '/home/julias/MLEE-final-project/models/saved_models/recon_models'

### To Set and Track Seeds for Reproducibility (Referencing Group Standard):

In [None]:
# Ensemble / member identifiers; they key into the seed-location dictionary loaded below.
ens, member = 'CESM', '009'
reference_output_dir = '/home/julias/MLEE-final-project/pickle_files'

# Shared seed table (indexed elsewhere in this notebook as random_seeds[row, seed_loc]).
path_seeds = os.path.join(reference_output_dir, 'random_seeds.npy')
random_seeds = np.load(path_seeds)

# Nested dictionary: seed_loc_dict[ensemble][member] -> second index into random_seeds.
# NOTE: pickle.load can execute arbitrary code; only load pickles from a trusted source.
path_loc = os.path.join(reference_output_dir, 'cesm_seed_loc_dict.pickle')
with open(path_loc, 'rb') as handle:
    seed_loc_dict = pickle.load(handle)
seed_loc = seed_loc_dict[ens][member]

# for next project, where train and test datasets use all CESM members
#path_cesm = f"{reference_output_dir}/cesm_members_dict.pickle"
#with open(path_cesm,'rb') as handle:
#    cesm_mems_dict = pickle.load(handle)

In [None]:
# random_seeds

In [None]:
# seed_loc_dict # using CESM 009, so seed_loc should be 32

In [None]:
# seed_loc #confirmed 32

# Load Split Datasets and Create Versions for Experimentation

## Load Split Datasets

In [None]:
# Directory holding the NetCDF files written by the train/test split notebook.
SPLIT_DIR = Path('/home/julias/MLEE-final-project/proc_data/split_datasets')


def load_split_frame(name):
    """Read ``SPLIT_DIR/<name>.nc`` and return it as a DataFrame with NaN rows removed.

    The dataset is opened in a ``with`` block so the file handle is released as
    soon as the values have been materialized into the DataFrame.
    """
    with xr.open_dataset(SPLIT_DIR / f'{name}.nc') as dataset:
        return dataset.to_dataframe().dropna()


def load_aligned_pair(features_name, target_name):
    """Load a feature/target pair and keep only the rows present in both frames.

    ``dropna()`` runs separately on features and target, so a NaN in only one
    of them would leave the two frames with different rows and every
    sample/label pair after that point would be shifted. Restricting both to
    the shared index keeps them row-aligned. When the indexes already match
    the frames are returned unchanged.

    Raises:
        ValueError: if the two frames share no rows at all.
    """
    features = load_split_frame(features_name)
    target = load_split_frame(target_name)
    if features.index.equals(target.index):
        return features, target
    shared_rows = features.index.intersection(target.index)
    if shared_rows.empty:
        raise ValueError(
            f"'{features_name}' and '{target_name}' have no rows in common after dropping NaNs"
        )
    return features.loc[shared_rows], target.loc[shared_rows]


X_df, y_df = load_aligned_pair('X', 'y')
X_train_df, y_train_df = load_aligned_pair('X_train', 'y_train')
X_test_df, y_test_df = load_aligned_pair('X_test', 'y_test')

Check that data was saved and loaded properly:

In [None]:
X_train_df

In [None]:
y_test_df

### Create Numpy Arrays for Original Data

In [None]:
# Snapshot the un-normalized data as NumPy arrays.
# copy=True matters: without it to_numpy() may hand back a view of the frame's
# internal buffer, and the frames are modified later in this notebook, which
# could silently change these "original" arrays as well.
X_original = X_df.to_numpy(copy=True)
y_original = y_df.to_numpy(copy=True).ravel()            # single target column flattened to 1-D
X_train_original = X_train_df.to_numpy(copy=True)
y_train_original = y_train_df.to_numpy(copy=True).ravel()
X_test_original = X_test_df.to_numpy(copy=True)
y_test_original = y_test_df.to_numpy(copy=True).ravel()

## Create Normalized Dataframes

*Note: Done here, as opposed to in test/train split, so that I can save the original train/test datasets and later determine whether normalization led to improvement. (My group has not historically normalized data before training.)*

In [None]:
#X_df_norm = (X_df - X_df.mean())/X_df.std() 
#y_df_norm = (y_df - y_df.mean())/y_df.std()
#X_train_df_norm = (X_train_df - X_train_df.mean())/X_train_df.std()
#y_train_df_norm = (y_train_df - y_train_df.mean())/y_train_df.std()
#X_test_df_norm = (X_test_df - X_test_df.mean())/X_test_df.std()
#y_test_df_norm = (y_test_df - y_test_df.mean())/y_test_df.std()

**Revised method (used below):** normalize only the physical predictor columns and the target.
- This way, the time (T0, T1) and lat/lon (A, B, C) conversions are not normalized.
- Note that the index coordinates do not seem to be passed to the ML algorithms, so T0, T1, A, B, C are the only time and space inputs (as desired: we don't want two forms of time and space input).

In [None]:
# Columns that are z-scored. Every other column (T0, T1, A, B, C) is left untouched.
NORM_FEATURES = ['SSS', 'SST', 'MLD', 'Chl', 'XCO2']
NORM_TARGET = ['pCO2']


def zscore_columns(frame, columns, stats_from=None):
    """Return a *copy* of ``frame`` with ``columns`` standardized to zero mean, unit std.

    Args:
        frame: DataFrame to normalize; it is never modified.
        columns: list of column names to standardize.
        stats_from: optional DataFrame whose mean/std are used instead of
            ``frame``'s own (e.g. the training split when scaling the test split).

    Returns:
        A new DataFrame; columns not listed in ``columns`` are copied as-is.
    """
    reference = frame if stats_from is None else stats_from
    scaled = frame.copy()
    scaled[columns] = (frame[columns] - reference[columns].mean()) / reference[columns].std()
    return scaled


# Previously `X_df_norm = X_df` only created a second name for the same object,
# so normalizing "the copy" also overwrote the original frames that later cells
# still treat as un-normalized. zscore_columns() works on a real copy instead.
X_df_norm = zscore_columns(X_df, NORM_FEATURES)
y_df_norm = zscore_columns(y_df, NORM_TARGET)
X_train_df_norm = zscore_columns(X_train_df, NORM_FEATURES)
y_train_df_norm = zscore_columns(y_train_df, NORM_TARGET)

# The test split is scaled with the TRAINING mean/std: a model fitted on
# train-scaled inputs must see test inputs on the same scale, and using the test
# split's own statistics would leak information from the held-out data.
X_test_df_norm = zscore_columns(X_test_df, NORM_FEATURES, stats_from=X_train_df)
y_test_df_norm = zscore_columns(y_test_df, NORM_TARGET, stats_from=y_train_df)

In [None]:
X_train_df_norm

## Create Numpy Arrays 

In [None]:
# Feature matrices of the normalized frames (full / train / test).
X_n, X_train_n, X_test_n = (
    frame.to_numpy() for frame in (X_df_norm, X_train_df_norm, X_test_df_norm)
)

# Matching targets, flattened to 1-D vectors.
y_n, y_train_n, y_test_n = (
    frame.to_numpy().ravel() for frame in (y_df_norm, y_train_df_norm, y_test_df_norm)
)

# Preliminary XGB Model

***Goal: Compare auto-XGB with lowest end of group XGB parameter matrix***

## Build XGB Model

In [None]:
# "Auto" XGB baseline: library-default hyperparameters, with console messages silenced (verbosity=0).
XGB_model = xgb.XGBRegressor(verbosity=0) 

In [None]:
# Fit the default-parameter baseline on the training DataFrames.
XGB_model.fit(X_train_df, y_train_df)

In [None]:
# Lowest corner of the group's hyperparameter grid (fewest trees, shallowest depth).
low_end_params = {'n_estimators': 3000, 'max_depth': 5}
model = XGBRegressor(**low_end_params)
model.fit(X_train_df, y_train_df)

In [None]:
# Score on the data the model was trained on (the regressor's default `score`, i.e. R^2).
# This measures fit to the training set only, not generalization.
train_score = model.score(X_train_df, y_train_df) 
train_score

In [None]:
# 10-fold cross-validation of the low-end model on the training split.
# NOTE(review): `scores` is computed but never displayed or used in a visible cell.
scores = cross_val_score(model, X_train_df, y_train_df,cv=10)

In [None]:
# Predict on the held-out test features and report the mean squared error against the test targets.
y_pred = model.predict(X_test_df)
mse = mean_squared_error(y_test_df, y_pred)
print("MSE: %.2f" % mse)

In [None]:
len(y_train_df.pCO2)

In [None]:
len(y_pred)

In [None]:
y_test_df

In [None]:
# Overlay the test targets and the model predictions, sample by sample.
# Uses the explicit fig/ax interface, labels both axes, and fixes the title
# typo ("pC02" with a zero -> "pCO2").
x_ax = range(len(y_test_df))
fig, ax = plt.subplots()
ax.plot(x_ax, y_test_df, label="original")
ax.plot(x_ax, y_pred, label="predicted")
ax.set_title("pCO2 test and predicted data")
ax.set_xlabel("test sample index")
ax.set_ylabel("pCO2")
ax.legend()
plt.show()

# XGB Model with Normalized Data 

# Select Approach & Parameters

## Approach

Based on preliminary experimentation with dataframes, numpy, and normalized data, will train the XGB model on *(TODO: state the selected input representation; this sentence was left unfinished)*.

## Parameters

### Reference Best Parameters from Previous Group Work

Published in Bennington 2022, which trained XGB to learn the pCO2 residual (the pCO2 change with direct temperature effects removed).

In [None]:
# Load the best-parameter dictionary saved by earlier group work, for reference only.
# NOTE: pickle.load can execute arbitrary code; only open pickles from a trusted source.
# NOTE(review): `best_params` is re-initialised to {} in later cells, so this value is only printed here.
path_bp='/data/artemis/workspace/vbennington/full_sst/pCO2_DIC/models/performance_metrics/xg/xg_best_params_dict.pickle'
with open(path_bp,'rb') as handle:
    best_params = pickle.load(handle)
print(best_params)

Use previous CESM best parameters as a starting point:
- max_depth = 6
- n_estimators = 4000

In [None]:
# Hyperparameter grid: three tree counts x three depths (9 combinations), centred on
# the previous CESM best values.
# Group post-doc advice: deep trees tend to overfit with XGB, so depth stays at 8 or below.
# The combination with the lowest RMSE is the one we want.
xg_param_grid = dict(
    n_estimators=[3000, 4000, 5000],
    max_depth=[5, 6, 7],
)

### Investigate Ideal Parameters for Selected Approach

In [None]:
# Start from an empty record of best parameters (keyed by ensemble name below).
best_params = dict()

# Reduced 2 x 2 grid for a quicker first search.
xg_param_grid = {
    'n_estimators': [3000, 4000],
    'max_depth': [5, 6],
}

# Default-parameter regressor wrapped in a grid search scored by negative MSE.
model = xgb.XGBRegressor(verbosity=0)
grid = GridSearchCV(
    model,
    xg_param_grid,
    scoring='neg_mean_squared_error',
)  #, cv=K_folds, return_train_score=False, refit=False


In [None]:
# Run the grid search on the un-normalized training arrays.
# `X_train` / `y_train` are not defined at this point on a fresh kernel (they are
# only created by the validation-split cell much further down), so the arrays
# built directly from the loaded training split are used instead.
grid.fit(X_train_original, y_train_original)

# Record the winning hyperparameters under the ensemble name.
best_params[ens] = grid.best_params_
print(best_params)

In [None]:
# Full 3 x 3 grid search, parallelised over 30 threads.
best_params = {}

# The keyword is `n_jobs`; the previous spelling `njobs` is not an XGBRegressor
# parameter, so the requested parallelism was never applied.
model = xgb.XGBRegressor(verbosity=0, n_jobs=30)
xg_param_grid = {'n_estimators':[3000, 4000, 5000],
                 'max_depth':[5, 6, 7]}
grid = GridSearchCV(model, xg_param_grid, scoring='neg_mean_squared_error') #, cv=K_folds, return_train_score=False, refit=False

# `X_train` / `y_train` do not exist yet on a fresh kernel; fit on the arrays
# built directly from the loaded training split.
grid.fit(X_train_original, y_train_original)
best_params[ens] = grid.best_params_
print(best_params)

# XGB Model

## Build XGB Model 
(based on best parameters found in previous section)

## Train XGB Model

## Save XGB Model

## Test XGB Model

# For Future Work

## Additional Validation Split 

*Note: Group does not use this method when test years are used (as I did in processed_data_split.ipynb)*

### Group's Version

#### Train/validate/test split proportions

In [None]:
# Split proportions: 20% validation and 20% test; whatever remains is used for training.
# The training set is itself split into a validation part and a smaller training part.
val_prop, test_prop = .2, .2

In [None]:
# Split the training data into train / validation (/ temporary test) subsets.
#
# The split starts from X_train_original / y_train_original rather than from
# X_train / y_train: the latter are *outputs* of this cell, so reading them as
# inputs failed on a fresh kernel and, once defined, shrank the data further on
# every re-run. Starting from the untouched arrays makes the cell idempotent.
N = X_train_original.shape[0]

# Index sets are drawn with the shared seed table so the split is reproducible.
train_val_idx, train_idx, val_idx, test_idx = model_utils.train_val_test_split(N, test_prop, val_prop, random_seeds, seed_loc)

(X_train_val, X_train, X_val, X_test_tmp,
 y_train_val, y_train, y_val, y_test_tmp) = model_utils.apply_splits(
    X_train_original, y_train_original, train_val_idx, train_idx, val_idx, test_idx)

## Starting New Version

In [None]:
# Use the full feature/target DataFrames for a quick stand-alone experiment.
# These are additional names for the same objects as X_df / y_df, not copies.
X_minitest = X_df #.to_numpy()
y_minitest = y_df #.to_numpy()

In [None]:
# Random 85% / 15% train/test split of the full dataset.
# NOTE(review): no random_state is passed, so the split differs on every run; consider
# drawing one from `random_seeds` to stay consistent with the reproducibility setup above.
xtrain_minitest, xtest_minitest, ytrain_minitest, ytest_minitest=train_test_split(X_minitest, y_minitest, test_size=0.15)

In [None]:
xtrain_minitest.max()

In [None]:
xtrain_minitest

In [None]:
# Default-parameter XGB regressor for the mini experiment, with console messages silenced.
XGB_model_minitest = xgb.XGBRegressor(verbosity=0) 

In [None]:
# Fit the mini-experiment model on its randomly drawn training portion.
XGB_model_minitest.fit(xtrain_minitest, ytrain_minitest)

# eXtreme Gradient Boosting 

(to compare performance to other methods from group)

In [None]:
# For next step of testing new hyperparameters, clear best_params
# best_params = {}  # opened above

In [None]:
# A, B and C represent lon and lat (3 components of the n-vector; so that the algorithm doesn't interpret 0 and 360
# degrees to be far apart 
# T0 and T1 represent time

#features_sel = ['SSS','SST','MLD','Chl','XCO2','T0', 'T1','A', 'B', 'C'] 
#target_sel = ['pCO2'] 

## Building and Training the XGB Model

In [None]:
# Number of cores you have access to for model training, group standard.
# Passed as `n_jobs` to the XGBRegressor instances created below.
jobs = 30

### Create two dictionaries

In [None]:
# Two independent nested dictionaries, filled below as performance[ensemble][member] = metrics.
test_performance, unseen_performance = defaultdict(dict), defaultdict(dict)

### XGB-Specific Inputs

In [None]:
# K_folds: number of cross-validation folds the training set is split into.
# With K folds, each candidate is trained on K-1 folds and validated on the remaining
# one, rotating through all K folds; with K=3 this gives 3 scores per candidate.

K_folds = 3       # Split training set into 3 parts
approach = "xg"   # XGB approach

# `first_mem` gates the optional "search only on the first member" cell below.
# It was previously commented out, so that cell raised NameError. It defaults to
# False because the cell after it already runs the grid search unconditionally.
first_mem = False

In [None]:
# Optional hyperparameter search, run only while `first_mem` is truthy:
#   - model: the estimator being tuned (XGB, seeded from the shared seed table)
#   - param_grid: candidate n_estimators (number of trees) and max_depth values
#   - GridSearchCV: evaluates every candidate with K-fold cross validation
# The idea is to search on the first member only and re-use its best parameters for the
# following members (a few members could be searched, but not all of them).
# Cost: 9 combinations (3 n_estimators x 3 max_depth) x 3 folds.
if first_mem:
    model = XGBRegressor(random_state=random_seeds[4,seed_loc], n_jobs=jobs)
    param_grid = xg_param_grid
    grid = GridSearchCV(
        model,
        param_grid,
        scoring='neg_mean_squared_error',
        cv=K_folds,
        return_train_score=False,
        refit=False,
    )
    grid.fit(X_train_val, y_train_val)
    best_params[ens] = grid.best_params_
    print(best_params)
    # Later members skip the search and re-use the parameters found here.
    first_mem = False

In [None]:
# Seeded, K-fold grid search over xg_param_grid, scored by negative MSE.
# refit=False: only the best parameter combination is wanted here; the final
# model is built and trained separately afterwards.
model = XGBRegressor(random_state=random_seeds[4,seed_loc], n_jobs=jobs)
param_grid = xg_param_grid
grid = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=K_folds,
    return_train_score=False,
    refit=False,
)
grid.fit(X_train, y_train)

# Store the winning combination under the ensemble name and show it.
best_params[ens] = grid.best_params_
print(best_params)

### Train the XGB model

In [None]:
# Build the final model from the best grid-search parameters for this ensemble, with its
# own seed (row 5 of the seed table; the grid search above used row 4), and train it on
# the combined train+validation subset.
model = XGBRegressor(random_state=random_seeds[5,seed_loc], **best_params[ens], n_jobs=jobs)
model.fit(X_train_val, y_train_val) 

### Save the XGB model

In [None]:
# Persist the trained model, then log when and for which ensemble/member it was saved.
# The helper module is imported in this notebook as `model_utils`; the bare name `utils`
# used here before was never imported and raised NameError.
# NOTE(review): assumes model_utils exposes save_model(model, path, approach, ens, member) - confirm.
model_utils.save_model(model, recon_model_path, approach, ens, member)
print(datetime.datetime.now())
print(ens)
print(member)

## Test the XGB Model

### Preliminary Analysis on XGB Test Error Metrics

In [None]:
# Calculate test error metrics (MSE, MAE, bias, etc.) and store them in a dictionary.
#
# `X_test` / `y_test` are not defined anywhere in this notebook (the split cell only
# produces X_test_tmp / y_test_tmp), so the held-out test arrays built from the loaded
# test split are used. The helper module is imported as `model_utils`, not `utils`.
# NOTE(review): assumes model_utils exposes evaluate_test(y_true, y_pred) - confirm.

y_pred_test = model.predict(X_test_original)

test_performance[ens][member] = model_utils.evaluate_test(y_test_original, y_pred_test)
print(test_performance[ens][member])

In [None]:
# Redo this analysis on the unseen data
# NOTE(review): `unseen_sel`, `features_sel` and `target_sel` are not defined in any visible
# cell, and `df` is presumably meant to be the full input table rather than a frame built in
# this notebook - define these before running this cell.
y_pred_unseen = model.predict(df.loc[unseen_sel,features_sel].to_numpy())

y_unseen = df.loc[unseen_sel,target_sel].to_numpy().ravel()
# The helper module is imported as `model_utils`; the bare name `utils` was never imported.
unseen_performance[ens][member] = model_utils.evaluate_test(y_unseen, y_pred_unseen)
print(unseen_performance[ens][member])

## Create the reconstruction

In [None]:
# Create the reconstruction and save it
# Jake calls it seen
# This should just be all SOCAT locations for all training years (not test years)
# NOTE(review): `X` is not defined in any visible cell of this notebook - confirm which
# feature array is intended here before running.
y_pred_seen = model.predict(X)

In [None]:
# Full reconstruction: start from an all-NaN column, then fill in predictions for the
# rows selected by `unseen_sel` and by `sel`.
# NOTE(review): `sel` and `unseen_sel` are not defined in any visible cell - confirm.
df['pCO2_DIC_recon'] = np.nan
df.loc[unseen_sel,['pCO2_DIC_recon']] = y_pred_unseen   # Not in a SOCAT location, not even in test year
df.loc[sel,['pCO2_DIC_recon']] = y_pred_seen

In [None]:
# All time/locations not sampled by SOCAT: predictions only on the `unseen_sel` rows.
# The `sel` rows are explicitly set to NaN (the column already starts as all-NaN).
df['pCO2_DIC_nosocat'] = np.nan
df.loc[unseen_sel,['pCO2_DIC_nosocat']] = y_pred_unseen
df.loc[sel,['pCO2_DIC_nosocat']] = np.nan

In [None]:
# Only at time/locations of SOCAT sampling: predictions only on the `sel` rows.
df['pCO2_DIC_socat'] = np.nan
df.loc[unseen_sel,['pCO2_DIC_socat']] = np.nan
df.loc[sel,['pCO2_DIC_socat']] = y_pred_seen

# Copy the existing 'pCO2_pCO2T_diff' column under the name 'pCO2_DIC'.
df['pCO2_DIC'] = df['pCO2_pCO2T_diff']

# Gather the masks, the 'pCO2_DIC' column and the three reconstruction columns into an xarray Dataset.
# NOTE(review): assumes `df` carries 'net_mask', 'combined_mask' and 'pCO2_pCO2T_diff' columns - confirm.
#DS_recon = df[['net_mask','socat_mask','pCO2_DIC','pCO2_DIC_recon','pCO2_DIC_socat','pCO2_DIC_nosocat']].to_xarray()
DS_recon = df[['net_mask','combined_mask','pCO2_DIC','pCO2_DIC_recon','pCO2_DIC_socat','pCO2_DIC_nosocat']].to_xarray()

## Save reconstructions

In [None]:
# Uncomment when actually running            
#pre_saildrone_thea.save_recon(DS_recon, recon_output_dir, approach, ens, member)   

## Save best parameters and performance metrics

In [None]:
# Saving best parameters and performance metrics
# NOTE(review): `other_output_dir` is not defined in any visible cell - set it before running.

# One sub-directory per approach; every file name is prefixed with the approach name.
approach_output_dir = f"{other_output_dir}/{approach}"
param_fname = f"{approach_output_dir}/{approach}_best_params_dict.pickle"
test_perform_fname = f"{approach_output_dir}/{approach}_test_performance_dict.pickle"
unseen_perform_fname = f"{approach_output_dir}/{approach}_unseen_performance_dict.pickle"

# Create the output directory (and any missing parents); no error if it already exists.
Path(approach_output_dir).mkdir(parents=True, exist_ok=True)

# 'wb' = open for Writing in Binary mode; pickle produces bytes, so the file must not be opened in text mode.
with open(param_fname, 'wb') as handle:
    pickle.dump(best_params, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open(test_perform_fname, 'wb') as handle:
    pickle.dump(test_performance, handle)
with open(unseen_perform_fname, 'wb') as handle:
    pickle.dump(unseen_performance, handle)

# Convert performance metrics to dataframes
# The nested {model: {member: metrics}} dicts are flattened to (model, member) keys,
# which become one row each (orient='index').
test_df = pd.DataFrame.from_dict({(i,j): test_performance[i][j]
                                  for i in test_performance.keys()
                                  for j in test_performance[i].keys()},
                                 orient='index')

unseen_df = pd.DataFrame.from_dict({(i,j): unseen_performance[i][j]
                                  for i in unseen_performance.keys()
                                  for j in unseen_performance[i].keys()},
                                 orient='index')

# Name the two index levels that came from the tuple keys.
test_df.index.names = ["model","member"]
unseen_df.index.names = ["model","member"]

# Save the dataframes too
test_df_fname = f"{approach_output_dir}/{approach}_test_performance_df.pickle"
unseen_df_fname = f"{approach_output_dir}/{approach}_unseen_performance_df.pickle"

test_df.to_pickle(test_df_fname)
unseen_df.to_pickle(unseen_df_fname)    

In [None]:
# Just checking what a saved, trained model file looks like.
# The file is a joblib dump (".joblib"), so it is read with joblib.load, which understands
# joblib's array storage format; pd.read_pickle is meant for pandas pickles.
# NOTE: like pickle, joblib.load can execute arbitrary code; only load trusted files.
test_2 = joblib.load("/data/artemis/workspace/theimdal/saildrone/models/trained/xg/CESM/member_016/xg_model_pC02_2D_mon_CESM_016_1x1_198201-201701.joblib")

In [None]:
test_2

In [None]:
#checking out what the input data for the XGB looks like
#this table was generated in script 01
df