# Dask Grid Search for Neural Network Prediction of Sahelian Summer Rainfall
***

#### Resources:
* [Mardata Course](https://github.com/mardatade/Course-Python-for-Machine-Learning/blob/master/3.%20Neural%20Network.ipynb)
* [Keras for Data Scientists](https://keras.io/getting_started/intro_to_keras_for_engineers/#data-loading-amp-preprocessing)

#### Packages

In [1]:
import numpy as np
import pandas as pd 
import xarray as xr

from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline

import scipy.stats as st

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn import metrics


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import TensorBoard 
# from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

import tensorflow_addons as tfa

from tensorboard.plugins.hparams import api as hp
%load_ext tensorboard

import dask
from dask import delayed
import dask.bag as db


#### Dask Client

In [2]:
from dask.distributed import Client

# Address of the Dask scheduler to attach to. It is hard-coded, so it has to
# match a scheduler that is already running when this cell is executed.
SCHEDULER_ADDRESS = "tcp://127.0.0.1:34479"

client = Client(SCHEDULER_ADDRESS)
client

0,1
Client  Scheduler: tcp://127.0.0.1:34479  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 16  Memory: 135.09 GB


In [3]:
# dask.config.set(scheduler='synchronous')

<br>
<br>

## 1. Data Loading & Preprocessing
***

<br>

### a) Loading & Normalization

**predictor:** contains the data used for the inputs  
**labels:** taken from the Sahel rainfall data; serve as the validation data

In [2]:
# Read the predictor dataset and flatten it into a DataFrame.
predictor = xr.open_dataset('data/da_pred_all.nc').to_dataframe()

# Standardize the predictors with StandardScaler and re-attach the original
# row index and column names to the scaled values.
scaled_values = StandardScaler().fit_transform(predictor)
predictor_unit = pd.DataFrame(
    data=scaled_values,
    columns=predictor.columns,
    index=predictor.index,
)

# Load validation data (summer rainfall over the Sahel, scaled to [cm/month]):
# skip the 8 header rows, take columns 7-9, multiply by 0.01 and average per row.
rainfall_table = np.loadtxt("data/da_o_sahelprecip19012017.txt", skiprows=8)
labels = np.mean(rainfall_table[:, 7:10] * 0.01, axis=1)

predictor_unit.head()

Unnamed: 0_level_0,siod_e,siod_w,sst_med,tsa,tna,sst_mdr,sata_lnh,sata_lsh,sata_onh,sata_osh,slp_darwin,slp_tahiti,amo,nao,pdo,np,nino12,nino3,nino34,nino4
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1901,-1.100027,-1.152764,-0.74553,-0.595366,0.388372,0.608415,-0.123443,-0.732091,-0.497808,-0.737797,0.074807,1.634819,0.923204,0.917456,-0.193321,1.938388,-0.950168,-0.595561,-0.214314,-0.07927
1902,0.088643,0.340415,-1.507314,-0.954566,-0.346586,-0.173588,-1.289978,-0.20181,-1.175314,-0.987096,1.443896,2.682485,-0.620146,-1.17259,0.819716,-0.162154,0.991321,0.969845,1.099218,1.070532
1903,-0.900789,0.669332,-2.243639,-2.186294,-0.10197,0.283583,-1.333183,-1.076056,-1.415719,-1.333946,-0.071881,1.535042,-0.45829,-1.03041,-0.186187,0.530864,-0.371251,0.000784,0.524139,0.842095
1904,-0.949568,-1.056219,-0.079925,-1.975498,-2.214111,-1.894743,-1.135674,-1.133384,-1.863746,-1.778347,-0.903114,1.235708,-1.872482,1.447076,-0.892459,0.756497,-0.307712,-0.234313,-0.475713,-0.741738
1905,-0.03435,-0.632249,-0.718895,-1.684676,-1.334312,-1.014906,-1.314666,-0.595938,-1.284589,-0.954579,0.759351,-2.655622,-0.499163,-1.289888,0.545055,-0.326007,1.22783,1.497381,1.439037,1.032459


<br>

### b) PCA

In [3]:
# Scikit-learn PCA transformation (no component limit is passed to PCA()).
pca = PCA()
principalComponents = pca.fit_transform(predictor_unit)


# Create a pandas DataFrame from the PCs. The column labels PC1..PCn are
# derived from the number of columns actually returned by the transform, so
# the cell keeps working when the number of predictors changes.
col = [f'PC{i}' for i in range(1, principalComponents.shape[1] + 1)]

predictor_pc = pd.DataFrame(
    data=principalComponents,
    columns=col,
    index=predictor.index
)

# Optional sanity checks on the transformed data (standard deviation / mean):
# np.std(predictor_pc)
# np.mean(predictor_pc)

predictor_pc.head()

Unnamed: 0_level_0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1901,-1.568434,-0.814406,-1.565711,-1.237071,1.472947,-0.593357,-1.660153,0.222967,0.241259,0.837099,0.098283,-0.767361,-0.099698,0.875102,-0.391004,0.067938,-0.012741,0.176348,0.188827,0.012466
1902,-0.52167,2.782115,-0.365347,2.152565,2.364542,0.248512,-1.855028,-0.231364,0.177858,0.788062,-0.135515,0.526785,-0.439029,0.236945,0.613192,-0.173206,-0.049516,0.003676,0.020811,-0.0227
1903,-2.357779,1.78352,-1.194862,1.3687,1.273662,-1.280782,-1.766921,0.44111,-0.41269,0.913848,-1.644854,0.649893,-0.240188,0.273618,0.151983,0.152255,-0.063746,0.036216,0.1285,0.007985
1904,-5.168501,0.642722,1.054827,-0.759804,1.351015,1.020594,-0.413493,-0.695127,-0.606953,-0.289135,-0.644362,0.201273,-0.556953,0.270713,0.003502,-0.383464,0.148037,-0.071637,0.241812,0.073306
1905,-1.282539,4.379097,0.640607,0.433165,-0.940971,-1.43371,1.068731,-1.191238,0.581742,-0.393126,-0.344745,-0.274467,0.226232,-0.207231,0.295327,-0.322379,0.324228,-0.052399,0.166036,0.116238


<br>
<br>

## 2. MODEL SETUP AND TUNING
***

<br>

### Set Logs
***

##### Set Parent Directory

In [4]:
parent_dir = 'logs/testrun/'

##### Clear Directory

In [5]:
rm -rf logs/testrun/*

<br>

### Hyperparameter Selection
***

In [6]:
#####################################
##### EXAMPLE SETUP FOR TESTING #####
#####################################


# GRID SEARCH HYPERPARAMETERS
# ---------------------------
# Each HParam lists the discrete values that the grid search iterates over.
HP_INPUT_VAR_NINE = hp.HParam(
    'input_var_nine',
    hp.Discrete(['PC9']),
    display_name='9th Input Variable',
)
HP_OPTIMIZER = hp.HParam(
    'optimizer',
    hp.Discrete(['AdamW']),
    display_name='Optimizer',
)
HP_LEARN_RATE = hp.HParam(
    'learn_rate',
    hp.Discrete([0.09, 0.1]),
    display_name='Learning Rate',
)
HP_WEIGHT_DECAY = hp.HParam(
    'weight_decay',
    hp.Discrete([0.001]),
    display_name='Weight Decay',
)
HP_BATCH_SIZE = hp.HParam(
    'batch_size',
    hp.Discrete([1]),
    display_name='Batch Size',
)
HP_EPOCHS = hp.HParam(
    'n_epochs',
    hp.Discrete([80]),
    display_name='Epochs',
)


# CROSS-VALIDATION PARAMETERS (NOT PART OF THE GRID SEARCH)
# ---------------------------------------------------------
cv_param = {
    'N_FOLDS': 10,     # number of folds -> small for test runs
    'TEST_FRAC': .1,   # fraction that is held out for testing
}


# BAGGING PARAMETER (NOT PART OF THE GRID SEARCH)
# -----------------------------------------------
n_baggs = 5  # number of bags -> small for test runs

# Early-stopping callback watching the training loss. In this notebook it is
# only referenced from a commented-out `callbacks=` argument.
earlystop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

# hist_dir = parent_dir + f"/run-{HPARAM['grid_num']:04d}" + datetime.now().strftime("%Y%m%d-%H%M%S")
# tensorboard_callback = keras.callbacks.TensorBoard(log_dir=hist_dir)


####################
#####FULL SETUP#####
####################


# #GRID SERACH HYPERPARAMETER#
# #---------------------------
# HP_INPUT_VAR_NINE = hp.HParam('input_var_nine', hp.Discrete(['PC9', 'PC10', 'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16']),display_name='9th Input Variable')
# HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['AdamW', 'SGDW']),display_name='Optimizer')
# HP_LEARN_RATE = hp.HParam('learn_rate', hp.Discrete([0.01, 0.1, 0.2]),display_name='Learning Rate')
# HP_WEIGHT_DECAY = hp.HParam('weight_decay', hp.Discrete([0.001, 0.01, 0.1]),display_name='Weight Decay')
# HP_BATCH_SIZE = hp.HParam('batch_size', hp.Discrete([1, 3, 10, 30]),display_name='Batch Size')
# HP_EPOCHS = hp.HParam('n_epochs', hp.Discrete([30, 80, 120]),display_name='Epochs')


# #CROSS VALIDATION PARAMETER (NO PART OF GRID SEARCH)#
# #----------------------------------------------------
# cv_param={
#     'N_FOLDS': 105,      # number of folds -> sample size as in Badr
#     'TEST_FRAC': .1    # factrion that is held out for test
# }


# #BAGGING PARAMETER (NO PART OF GRID SEARCH)#
# #-------------------------------------------
# n_baggs = 10 # number of baggs -> 10 as in Badr

In [26]:
HP_Dropout = hp.HParam('dropout', hp.RealInterval(0.0, 1.0),display_name='Dropout')

In [40]:
for i in (HP_Dropout.domain.min_value , HP_Dropout.domain.max_value):
    print (i)

0.0
1.0


In [36]:
hp.HParam.domain?

[0;31mType:[0m        property
[0;31mString form:[0m <property object at 0x7f8ef5a574f0>
[0;31mDocstring:[0m   <no docstring>


In [24]:
dir(hp)

['Discrete',
 'Domain',
 'HParam',
 'IntInterval',
 'KerasCallback',
 'Metric',
 'RealInterval',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'hparams',
 'hparams_config',
 'hparams_config_pb',
 'hparams_pb']

<br>

### Metric Selection
---

In [7]:
# Scalar tags used when logging the cross-validation summary to TensorBoard.
# "_mu" tags receive the mean and "_sig" tags the standard deviation of the
# per-split scores.

# Scores on the training samples
METRIC_TRAIN_MSE_MU = 'train_mse_mu'
METRIC_TRAIN_MSE_SIG = 'train_mse_sig'
METRIC_TRAIN_CORR_MU = 'train_corr_mu'
METRIC_TRAIN_CORR_SIG = 'train_corr_sig'

# Scores on the held-out test samples
METRIC_TEST_MSE_MU = 'test_mse_mu'
METRIC_TEST_MSE_SIG = 'test_mse_sig'
METRIC_TEST_CORR_MU = 'test_corr_mu'
METRIC_TEST_CORR_SIG = 'test_corr_sig'

<br>

### Log Experiment Configuration to TensorBoard
---

In [8]:
# Register the hyperparameters and the metrics of this experiment in the
# top-level log directory so the TensorBoard HParams view knows about them.
experiment_hparams = [
    HP_INPUT_VAR_NINE,
    HP_OPTIMIZER,
    HP_LEARN_RATE,
    HP_WEIGHT_DECAY,
    HP_BATCH_SIZE,
    HP_EPOCHS,
]

experiment_metrics = [
    hp.Metric(METRIC_TRAIN_MSE_MU, display_name='Training Sample MSE µ'),
    hp.Metric(METRIC_TRAIN_MSE_SIG, display_name='Training Sample  MSE σ'),
    hp.Metric(METRIC_TRAIN_CORR_MU, display_name='Training Sample Correlation µ'),
    hp.Metric(METRIC_TRAIN_CORR_SIG, display_name='Training Sample  Correlation σ'),
    hp.Metric(METRIC_TEST_MSE_MU, display_name='Test Sample MSE µ'),
    hp.Metric(METRIC_TEST_MSE_SIG, display_name='Test Sample  MSE σ'),
    hp.Metric(METRIC_TEST_CORR_MU, display_name='Test Sample Correlation µ'),
    hp.Metric(METRIC_TEST_CORR_SIG, display_name='Test Sample  Correlation σ'),
]

config_writer = tf.summary.create_file_writer(parent_dir)
with config_writer.as_default():
    hp.hparams_config(hparams=experiment_hparams, metrics=experiment_metrics)

<br>

### Build Model Function
---

In [10]:
def BuildModel(HPARAM):
    """Build and compile the feed-forward regression network.

    The network takes 9 input features, passes them through three sigmoid
    layers of 3 units each and ends in a single linear output unit. It is
    compiled with a mean-squared-error loss.

    Parameters
    ----------
    HPARAM : mapping
        Must provide 'optimizer' (name of a class in `tfa.optimizers`),
        'learn_rate' and 'weight_decay'; the latter two are passed to the
        optimizer constructor.

    Returns
    -------
    The compiled `keras.Sequential` model.
    """
    model = keras.Sequential()
    # Only the first layer declares the input shape.
    model.add(layers.Dense(3, activation="sigmoid", name="layer1", input_shape=(9,)))
    model.add(layers.Dense(3, activation="sigmoid", name="layer2"))
    model.add(layers.Dense(3, activation="sigmoid", name="layer3"))
    model.add(layers.Dense(1, activation='linear', name='output'))

    # Resolve the optimizer class by its name in tensorflow_addons.
    optimizer_class = getattr(tfa.optimizers, HPARAM['optimizer'])
    optimizer = optimizer_class(
        learning_rate=HPARAM['learn_rate'],
        weight_decay=HPARAM['weight_decay'],
    )

    model.compile(loss='mean_squared_error', optimizer=optimizer)
    return model

In [11]:
# Smoke test: build the network with example hyperparameters and show its layout.
# `summary()` prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
testmodel = BuildModel({'optimizer': 'AdamW', 'learn_rate': 0.1, 'weight_decay': 0.01})
testmodel.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
layer1 (Dense)               (None, 3)                 30        
_________________________________________________________________
layer2 (Dense)               (None, 3)                 12        
_________________________________________________________________
layer3 (Dense)               (None, 3)                 12        
_________________________________________________________________
output (Dense)               (None, 1)                 4         
Total params: 58
Trainable params: 58
Non-trainable params: 0
_________________________________________________________________
None


<br>

### Single Run Training & Error Calculation Function
---

In [13]:
# NOTE(review): scratch cell. At this point in the notebook `hparams` has not
# been created yet (it is built further down in the "Create HP Bag" cell), so a
# top-to-bottom run raises NameError here. `SPLIT` also does not appear as a
# module-level name in the visible notebook -- TODO confirm whether this cell
# is still needed.
hist_dir = parent_dir + f"/run-{hparams[0]['grid_num']:04d}" + f"/split-{SPLIT['split_num']:02d}"
hist_dir

NameError: name 'hparams' is not defined

In [30]:
def TrainModel(SPLIT, HPARAM, features):
    """Train a freshly built model on one data split and score it.

    Parameters
    ----------
    SPLIT : dict
        Must provide 'train_index' and 'test_index'. Both are used to index
        `features` as well as the module-level `labels` array.
    HPARAM : dict
        Hyperparameters. The dict is forwarded to `BuildModel`; 'batch_size'
        and 'n_epochs' are passed to `model.fit`.
    features : array
        Model inputs, indexed along the first axis by the split indices.

    Returns
    -------
    dict
        'train_mse', 'train_corr', 'test_mse' and 'test_corr': the mean
        squared error and the Pearson correlation between the model
        predictions and the labels on the training and the test samples.
    """
    train_idx = SPLIT['train_index']
    test_idx = SPLIT['test_index']

    model = BuildModel(HPARAM)

    # The returned History object is not needed, so it is not stored.
    model.fit(
        features[train_idx],
        labels[train_idx],
        batch_size=HPARAM['batch_size'],
        epochs=HPARAM['n_epochs'],
        verbose=0,
        # callbacks=[earlystop],  # early stopping is currently disabled
    )

    # Score the training subset first, then the test subset, with the same code.
    # The dict is called `scores` so it does not shadow the imported
    # `sklearn.metrics` module.
    scores = {}
    for subset, idx in (('train', train_idx), ('test', test_idx)):
        predicted = np.squeeze(model.predict(features[idx]))
        observed = labels[idx]
        scores[f'{subset}_mse'] = np.mean((predicted - observed) ** 2)
        scores[f'{subset}_corr'] = st.pearsonr(predicted, observed)[0]

    return scores

<br>

### Cross-Validation and Log Function
---

In [31]:
def TuneModel(HPARAM):
    """Cross-validate one hyperparameter combination and log it to TensorBoard.

    The model is trained once per entry of the module-level `SPLITS` bag via
    `TrainModel`. The mean and standard deviation of each per-split score are
    written as scalars to the run directory
    ``parent_dir + "/run-<grid_num>"`` together with the hyperparameter values.

    Parameters
    ----------
    HPARAM : dict
        Must provide 'input_var_nine', 'optimizer', 'learn_rate',
        'weight_decay', 'batch_size', 'n_epochs' and 'grid_num'.

    Returns
    -------
    dict
        The logged summary values, keyed by the same tags used in TensorBoard
        ('train_mse_mu', 'train_mse_sig', ..., 'test_corr_sig'). Previously the
        function returned nothing, so the collected grid-search results were
        all None.
    """
    run_dir = parent_dir + f"/run-{HPARAM['grid_num']:04d}"

    # Each summary tag is paired with the per-split score it aggregates and
    # the aggregation function. The order matches the order of the log entries.
    aggregations = (
        (METRIC_TRAIN_MSE_MU,   'train_mse',  np.mean),
        (METRIC_TRAIN_MSE_SIG,  'train_mse',  np.std),
        (METRIC_TRAIN_CORR_MU,  'train_corr', np.mean),
        (METRIC_TRAIN_CORR_SIG, 'train_corr', np.std),
        (METRIC_TEST_MSE_MU,    'test_mse',   np.mean),
        (METRIC_TEST_MSE_SIG,   'test_mse',   np.std),
        (METRIC_TEST_CORR_MU,   'test_corr',  np.mean),
        (METRIC_TEST_CORR_SIG,  'test_corr',  np.std),
    )

    summary = {}

    with tf.summary.create_file_writer(run_dir).as_default():
        hp.hparams({
            HP_INPUT_VAR_NINE: HPARAM['input_var_nine'],
            HP_OPTIMIZER: HPARAM['optimizer'],
            HP_LEARN_RATE: HPARAM['learn_rate'],
            HP_WEIGHT_DECAY: HPARAM['weight_decay'],
            HP_BATCH_SIZE: HPARAM['batch_size'],
            HP_EPOCHS: HPARAM['n_epochs']
        })

        # The first eight principal components are always used; the ninth
        # input column is the hyperparameter under test.
        input_columns = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', HPARAM['input_var_nine']]
        features = predictor_pc.loc[:, input_columns].to_numpy()

        # One training run per split. `split_scores` (rather than `metrics`)
        # avoids shadowing the imported `sklearn.metrics` module.
        split_scores = SPLITS.map(lambda SPLIT: TrainModel(SPLIT, HPARAM, features)).compute()

        for tag, score_name, aggregate in aggregations:
            value = float(aggregate([score[score_name] for score in split_scores]))
            tf.summary.scalar(tag, value, step=1)
            summary[tag] = value

    return summary

<br>

###  Perform Grid Search
---

#### Create HP Bag

In [25]:
# Enumerate every combination of the discrete hyperparameter values. The
# generator nests the domains in the same order as before (input variable
# outermost, epochs innermost), so each combination keeps its `grid_num`.
hparams = [
    {
        'input_var_nine': input_var_nine,
        'optimizer': optimizer,
        'learn_rate': learn_rate,
        'weight_decay': weight_decay,
        'batch_size': batch_size,
        'n_epochs': n_epochs,
        'grid_num': grid_index,
    }
    for grid_index, (input_var_nine, optimizer, learn_rate, weight_decay, batch_size, n_epochs) in enumerate(
        (var_nine, opt, rate, decay, batch, epochs)
        for var_nine in HP_INPUT_VAR_NINE.domain.values
        for opt in HP_OPTIMIZER.domain.values
        for rate in HP_LEARN_RATE.domain.values
        for decay in HP_WEIGHT_DECAY.domain.values
        for batch in HP_BATCH_SIZE.domain.values
        for epochs in HP_EPOCHS.domain.values
    )
]

# Total number of grid points (the value the running counter ended on).
grid_num = len(hparams)

HPARAMS = db.from_sequence(hparams)

In [26]:
HPARAMS.take(1)

({'input_var_nine': 'PC9',
  'optimizer': 'AdamW',
  'learn_rate': 0.01,
  'weight_decay': 0.001,
  'batch_size': 1,
  'n_epochs': 80,
  'grid_num': 0},)

#### Create Data Splits Bag (RRHCV)

In [27]:
# Repeated random hold-out splits. A fixed `random_state` makes the splits
# reproducible across kernel restarts; it can be overridden through an
# optional 'RANDOM_STATE' entry in `cv_param` (defaults to 0).
splitter = ShuffleSplit(
    n_splits=cv_param['N_FOLDS'],
    test_size=cv_param['TEST_FRAC'],
    random_state=cv_param.get('RANDOM_STATE', 0),
)

# One record per split: the train/test row positions plus a running number.
splits = [
    {
        'train_index': train,
        'test_index': test,
        'split_num': split_index,
    }
    for split_index, (train, test) in enumerate(splitter.split(predictor_pc))
]

# Total number of splits (the value the running counter used to end on).
split_num = len(splits)

SPLITS = db.from_sequence(splits)

In [28]:
SPLITS.take(2, 2)

({'train_index': array([ 18,  53, 100,  80, 115, 101,  75,  64,  63,  43,  35, 107,  90,
         110,  12, 104,  95,  56,  36,  93,  48,  14,  39,  58,  54, 111,
          77,  44,   5,  46,  82, 108,  29,   4,  41,  72,  20,  89,  25,
         116,   3,  37,  86,  73,  52,  65,  47,  59,  19, 114,  94,   1,
          96,  87,  34,  51,  33,  30,  79,  69,  38,  49,   9,  83,  13,
          22,  67,  32,  11,  61,  78,  57,  27, 113,  24,  50,  15,  92,
          28,  68,  91,  74,  26,  62,  70,   2,  31,  60,  23,   6,  97,
          16, 103,  98,  81, 112,  21,  85,  88,  55,  45,  40,  99,  42,
          66]),
  'test_index': array([  0,  17,  84,   8,  76,  71, 105,  10, 106,   7, 102, 109]),
  'split_num': 0},
 {'train_index': array([  5, 104,  51,  18, 106,  31, 103,  20,  35,  63,  73,  70,  57,
          71,  68, 100,  81,  76, 102,  24,  11,  54,  86,  25,  94,  33,
          98,  59,   9,   1,  16, 110,  74,  95,  41,  38,  78, 114,  10,
         115,   3,  48, 113,  26,  2

#### Run Model

In [32]:
%%time
results = HPARAMS.map(lambda HPARAM: TuneModel(HPARAM)).compute()

CPU times: user 49.7 ms, sys: 4.94 ms, total: 54.6 ms
Wall time: 20.5 s


# results

In [57]:
%pidof tensorboard

UsageError: Line magic function `%pidof` not found.


In [42]:
!kill 3072859

In [41]:
%tensorboard --logdir logs/testrun

Reusing TensorBoard on port 6006 (pid 3072859), started 1:44:58 ago. (Use '!kill 3072859' to kill it.)

<br>

### Close Client after finishing / before using it in another Notebook

In [None]:
client.close()

<br>
<br>

***
***
<br>
<br>

<br>

### Bagging Function (Not used currently)
---

In [22]:
# def Bagging(hparams, features, model, train_index, test_index):
    
    
    
#     # set emty output matrices
#     y_train_bagging = np.zeros((train_index.size, n_baggs))
#     y_test_bagging = np.zeros((test_index.size, n_baggs))    
    
    
#     #Train the model 'n_baggs' times and store model predictions into matrice
#     for n in range(n_baggs):
        
# #         print ('baggin run', n)
# #         print ('PREDICTION ON TEST DATA:', y_test_bagging)
        
#         # Bootstrap sampling from training Data with Size(Training Data)
#         train_index_bootstrap = np.random.choice(train_index, train_index.size)

#         #Train the model 
#         model.fit(
#             features[train_index_bootstrap],
#             labels[train_index_bootstrap],
#             batch_size=hparams[HP_BATCH_SIZE],
#             epochs=hparams[HP_EPOCHS],
#             verbose=0
#         )
        
#         #Run the model for insample data and store in one matrix:
#         y_train_bagging[:, n] = np.squeeze(model.predict(features[train_index]))
        
#         # ... and for out of sample data        
#         y_test_bagging[:, n] = np.squeeze(model.predict(features[test_index]))

#     # return mean of the outputs over baggins (1st dimension)
#     return y_train_bagging.mean(1), y_test_bagging.mean(1)
# #     return y_test_bagging.mean(1)