# *WORKFLOW MAPPING*

In [None]:
#typical ML imports (from NN example ipynb)
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xarray as xr
from glob import glob

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import *
from tensorflow.keras import Sequential
from utils import * 

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
%load_ext autoreload
%autoreload 2

plt.rcParams['savefig.dpi'] = 400
plt.rcParams['font.size'] = 13
plt.rcParams["legend.frameon"] = False

In [None]:
#additional ML imports (from RF example ipynb)
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import RandomizedSearchCV
from utils import * 

## 1. Data Import 

### 1.a. Import all predictor and "true" data

In [8]:
# pull and save into 1_raw_data

## 2. Data Pre-Processing

### 2.a. Split into training and test datasets

In [None]:
# pull from 1_raw_data and place in 2_proc_data

In [None]:
# to create directories:
cwd = os.getcwd()

train_path = os.path.join(cwd,'Data','train_val')
test_path  = os.path.join(cwd,'Data','test')

make_dir(train_path)
make_dir(test_path)

In [None]:
# Training set
train_files = ["historical", "ssp585", "ssp126", "ssp370","hist-aer","hist-GHG"]
X_train_xr, X_length  = prepare_predictor(train_files,train_path)
y_train_xr, y_length  = prepare_predictand(train_files,train_path)

# Test set
X_test_xr, _ = prepare_predictor('ssp245', data_path=test_path,time_reindex=False)
y_test_xr, _ = prepare_predictand('ssp245',data_path=test_path,time_reindex=False)

### 2.b. Organize all required variables into a dataframe 

In [None]:
X_train_df = pd.DataFrame({"CO2": X_train_xr["CO2"].data,
                           "CH4": X_train_xr["CH4"].data
                          }, index=X_train_xr["CO2"].coords['time'].data)

X_test_df  = pd.DataFrame({"CO2": X_test_xr["CO2"].data,
                           "CH4": X_test_xr["CH4"].data
                          }, index=X_test_xr["CO2"].coords['time'].data)


y_train_df = y_train_xr["tas"].stack(dim=["latitude", "longitude"])
y_train_df = pd.DataFrame(y_train_df.to_pandas())

### NOTE: Additional step for training (and test?) datasets: need to filter out predictor information where don't have SOCAT? or simply leave/will be fine with lat/lon coordinates?

### 2.c. Data Normalization

In [None]:
# Standardization
mean, std = X_train_df.mean(), X_train_df.std()

X_train_df   = (X_train_df - mean)/std
X_test_df    = (X_test_df - mean)/std

X_train = X_train_df.to_numpy()
y_train = y_train_df.to_numpy()
X_test = X_test_df.to_numpy()

print(X_train.shape,y_train.shape,X_test.shape)

In [None]:
# get the NaN values in a mask, will need to mask out all predictors where don't have SOCAT?
mask = ~np.isnan(SOCAT.isel(time=0)) #isel(time=0) won't work here though because observations could be at different times

## 3. Train different machine learning methods

### 3.a. Random Forest (to be used as a baseline)

In [12]:
# name everything with RF
# will be Notebook A in 3_code

#### 3.a.i. Building and Training the RF Model

In [None]:
# try using cross-validation to get the best hyperparameters

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 300, num = 5)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5,55, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [5, 10, 15, 25]

# Minimum number of samples required at each leaf node
min_samples_leaf = [4, 8, 12,16]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
print(random_grid)

In [None]:
reg0 = RandomForestRegressor(random_state=0)
# perform cross validation
rf_random0 = RandomizedSearchCV(estimator = reg0, param_distributions = random_grid, 
                                n_iter = 5, cv = 3, verbose=2, n_jobs = -1)
rf_tas = rf_random0.fit(X_train,y_train)

print("The best hyperparameters: \n",rf_tas.best_params_)

#### 3.a.ii. Testing the RF Model

### 3.b. eXtreme Gradient Boosting (to compare performance to other methods from group)

In [13]:
# name everything with XGB
# will be Notebook B in 3_code

#### 3.b.i. Building and Training the XGB Model

#### 3.b.ii. Testing the XGB Model

### 3.c. Neural Network (to experiment with hyperparameters)

In [14]:
# name everything with NN
# will be Notebook C in 3_code

#### 3.c.i. Building and Training the NN Model

In [9]:
# NOTE: The NN input layer has # NOTE that number of input layer neurons must correspond to number of predictor variables

In [None]:
# set hyperparameters
n_neuron       = 64
activation     = 'relu'
num_epochs     = 50
learning_rate  = 0.001
minibatch_size = 64
model_num      = 1

In [None]:
model = Sequential()

model.add(Dense(n_neuron,  activation=activation,input_shape=(X_train.shape[1],))) #  the 1st hidden layer 
model.add(Dense(n_neuron,  activation=activation)) # the 2nd hidden layer
model.add(Dense(n_neuron,  activation=activation)) # the 3rd hidden layer
model.add(Dense(y_train.shape[1],  activation='linear')) # the output layer


model.compile(loss='mse',optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate))

In [None]:
model.summary()

In [None]:
# after training, save:
model_path = os.path.join(cwd,'saved_model')
make_dir(model_path)

In [None]:
model.save(os.path.join(model_path,'NN_model.h5'))

#### 3.c.ii. Testing the NN Model

In [None]:
# then reload before start working with test data
model = load_model(os.path.join(model_path,'NN_model.h5'))

### 3.d. Other (to explore further if time permits)
#### *If proceed, choose among the following:*
- support vector regression (SVR); 
- long short-term memory (LSTM) network; or 
- an ensemble of RF, NN, and XGB. 

In [15]:
# if do, will be Notebook D in 3_code

## 3. Figures for Results and Discussion

In [16]:
# standardize/compare all test figures

In [2]:
#At end, can create function to_netcdf so can export all data
newvariable.to_netcdf('data\\newvariable.nc')
# more info: https://docs.xarray.dev/en/stable/generated/xarray.Dataset.to_netcdf.html

NameError: name 'newvariable' is not defined