# Neural Network Prediction of Sahelian Summer Rainfall
***

#### Resources:
* [Mardata Course](https://github.com/mardatade/Course-Python-for-Machine-Learning/blob/master/3.%20Neural%20Network.ipynb)
* [Keras for Data Scientists](https://keras.io/getting_started/intro_to_keras_for_engineers/#data-loading-amp-preprocessing)

In [1]:
import numpy as np
import pandas as pd 
import xarray as xr

import matplotlib.pyplot as plt
%matplotlib inline

import scipy.stats as st

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit, GridSearchCV
from sklearn import metrics


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import regularizers
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

import tensorflow_addons as tfa

from tensorboard.plugins.hparams import api as hp
%load_ext tensorboard


from dask import delayed

<br>
<br>

## 1. Data Loading & Preprocessing
***

<br>

### a) Loading & Normalization

**predictor:** contains the data used for the inputs  
**label:** from Sahelrainfall data serves as validation data

In [2]:
predictor = xr.open_dataset('data/da_pred_all.nc').to_dataframe()

predictor_unit = pd.DataFrame(
    data = StandardScaler().fit_transform(predictor), 
    columns = predictor.columns,
    index =  predictor.index
)


# load validatoin data (Summer Rainfall over Sahel and scale to [cm/month]) 
labels = np.mean(np.loadtxt("data/da_o_sahelprecip19012017.txt", skiprows=8,)[:,7:10] * 0.01,  axis=1)

predictor_unit.head()

Unnamed: 0_level_0,siod_e,siod_w,sst_med,tsa,tna,sst_mdr,sata_lnh,sata_lsh,sata_onh,sata_osh,slp_darwin,slp_tahiti,amo,nao,pdo,np,nino12,nino3,nino34,nino4
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1901,-1.100027,-1.152764,-0.74553,-0.595366,0.388372,0.608415,-0.123443,-0.732091,-0.497808,-0.737797,0.074807,1.634819,0.923204,0.917456,-0.193321,1.938388,-0.950168,-0.595561,-0.214314,-0.07927
1902,0.088643,0.340415,-1.507314,-0.954566,-0.346586,-0.173588,-1.289978,-0.20181,-1.175314,-0.987096,1.443896,2.682485,-0.620146,-1.17259,0.819716,-0.162154,0.991321,0.969845,1.099218,1.070532
1903,-0.900789,0.669332,-2.243639,-2.186294,-0.10197,0.283583,-1.333183,-1.076056,-1.415719,-1.333946,-0.071881,1.535042,-0.45829,-1.03041,-0.186187,0.530864,-0.371251,0.000784,0.524139,0.842095
1904,-0.949568,-1.056219,-0.079925,-1.975498,-2.214111,-1.894743,-1.135674,-1.133384,-1.863746,-1.778347,-0.903114,1.235708,-1.872482,1.447076,-0.892459,0.756497,-0.307712,-0.234313,-0.475713,-0.741738
1905,-0.03435,-0.632249,-0.718895,-1.684676,-1.334312,-1.014906,-1.314666,-0.595938,-1.284589,-0.954579,0.759351,-2.655622,-0.499163,-1.289888,0.545055,-0.326007,1.22783,1.497381,1.439037,1.032459


<br>

### b) PCA

In [3]:
# Scikit PCA transformation
pca = PCA()
principalComponents = pca.fit_transform(predictor_unit)


# Create Create Pandas DF from PCs
col = []
for i in range(1, 21):
    col.append(f'PC{i}')

predictor_pc = pd.DataFrame(
    data = principalComponents,
    columns = col,
    index =  predictor.index
)

# Test for unit-variance and zero mean:
# np.std(pred_pc)
# np.mean(pred_pc)
# pred_pc.head()

predictor_pc.head()

Unnamed: 0_level_0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1901,-1.568434,-0.814406,-1.565711,-1.237071,1.472947,-0.593357,-1.660153,0.222967,0.241259,0.837099,0.098283,-0.767361,-0.099698,0.875102,-0.391004,0.067938,-0.012741,0.176348,0.188827,0.012466
1902,-0.52167,2.782115,-0.365347,2.152565,2.364542,0.248512,-1.855028,-0.231364,0.177858,0.788062,-0.135515,0.526785,-0.439029,0.236945,0.613192,-0.173206,-0.049516,0.003676,0.020811,-0.0227
1903,-2.357779,1.78352,-1.194862,1.3687,1.273662,-1.280782,-1.766921,0.44111,-0.41269,0.913848,-1.644854,0.649893,-0.240188,0.273618,0.151983,0.152255,-0.063746,0.036216,0.1285,0.007985
1904,-5.168501,0.642722,1.054827,-0.759804,1.351015,1.020594,-0.413493,-0.695127,-0.606953,-0.289135,-0.644362,0.201273,-0.556953,0.270713,0.003502,-0.383464,0.148037,-0.071637,0.241812,0.073306
1905,-1.282539,4.379097,0.640607,0.433165,-0.940971,-1.43371,1.068731,-1.191238,0.581742,-0.393126,-0.344745,-0.274467,0.226232,-0.207231,0.295327,-0.322379,0.324228,-0.052399,0.166036,0.116238


<br>
<br>

## 2. MODEL SETUP AND TUNING
***

<br>

### Clear Logs
***

In [4]:
rm -rf logs/*

<br>

### Hyperparameter Selection
***

In [5]:
###################################
#####EXAMPLE SETUP FOR TESTING#####
###################################


#GRID SERACH HYPERPARAMETER#
#---------------------------
HP_INPUT_VAR_NINE = hp.HParam('input_var_nine', hp.Discrete(['PC9', 'PC14']),display_name='9th Input Variable')
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['AdamW', 'SGDW']),display_name='Optimizer')
HP_LEARN_RATE = hp.HParam('learn_rate', hp.Discrete([0.1]),display_name='Learning Rate')
HP_WEIGHT_DECAY = hp.HParam('weight_decay', hp.Discrete([1e-1]),display_name='Weight Decay')
HP_BATCH_SIZE = hp.HParam('batch_size', hp.Discrete([1]),display_name='Batch Size')
HP_EPOCHS = hp.HParam('n_epochs', hp.Discrete([3]),display_name='Epochs')


#CROSS VALIDATION PARAMETER (NO PART OF GRID SEARCH)#
#----------------------------------------------------
cv_param={
    'N_FOLDS': 2,         # number of folds -> small for Test Runs
    'TEST_FRAC': .1    # factrion that is held out for test
}


#BAGGING PARAMETER (NO PART OF GRID SEARCH)#
#-------------------------------------------
n_baggs = 2  # number of baggs -> small for test runs





####################
#####FULL SETUP#####
####################


# #GRID SERACH HYPERPARAMETER#
# #---------------------------
# HP_INPUT_VAR_NINE = hp.HParam('input_var_nine', hp.Discrete(['PC9', 'PC10', 'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16']),display_name='9th Input Variable')
# HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['AdamW', 'SGDW']),display_name='Optimizer')
# HP_LEARN_RATE = hp.HParam('learn_rate', hp.Discrete([0.01, 0.1, 0.2]),display_name='Learning Rate')
# HP_WEIGHT_DECAY = hp.HParam('weight_decay', hp.Discrete([0.001, 0.01, 0.1]),display_name='Weight Decay')
# HP_BATCH_SIZE = hp.HParam('batch_size', hp.Discrete([1, 3, 10, 30]),display_name='Batch Size')
# HP_EPOCHS = hp.HParam('n_epochs', hp.Discrete([30, 80, 120]),display_name='Epochs')


# #CROSS VALIDATION PARAMETER (NO PART OF GRID SEARCH)#
# #----------------------------------------------------
# cv_param={
#     'N_FOLDS': 105,      # number of folds -> sample size as in Badr
#     'TEST_FRAC': .1    # factrion that is held out for test
# }


# #BAGGING PARAMETER (NO PART OF GRID SEARCH)#
# #-------------------------------------------
# n_baggs = 10 # number of baggs -> 10 as in Badr

<br>

### Metric Selection
---

In [6]:
METRIC_TRAIN_MSE_MU= 'train_mse_mu'
METRIC_TRAIN_MSE_SIG= 'train_,mse_sig'
METRIC_TRAIN_CORR_MU= 'train_corr_mu'
METRIC_TRAIN_CORR_SIG= 'train_corr_sig'

METRIC_TEST_MSE_MU= 'test_mse_mu'
METRIC_TEST_MSE_SIG= 'test_mse_sig'
METRIC_TEST_CORR_MU= 'test_corr_mu'
METRIC_TEST_CORR_SIG= 'test_corr_sig'

<br>

### Log Experiment Confiuration to TensorBoard
---

In [7]:
with tf.summary.create_file_writer('logs/hparam_tuning').as_default():
    hp.hparams_config(
        hparams=[HP_INPUT_VAR_NINE, HP_OPTIMIZER, HP_LEARN_RATE, HP_WEIGHT_DECAY, HP_BATCH_SIZE, HP_EPOCHS],
        metrics=[
            hp.Metric(METRIC_TRAIN_MSE_MU, display_name='Training Sample MSE µ'),
            hp.Metric(METRIC_TRAIN_MSE_SIG, display_name='Training Sample  MSE σ'),
            hp.Metric(METRIC_TRAIN_CORR_MU, display_name='Training Sample Correlation µ'),
            hp.Metric(METRIC_TRAIN_CORR_SIG, display_name='Training Sample  Correlation σ'),
            hp.Metric(METRIC_TEST_MSE_MU, display_name='Test Sample MSE µ'),
            hp.Metric(METRIC_TEST_MSE_SIG, display_name='Test Sample  MSE σ'),
            hp.Metric(METRIC_TEST_CORR_MU, display_name='Test Sample Correlation µ'),
            hp.Metric(METRIC_TEST_CORR_SIG, display_name='Test Sample  Correlation σ')
        ],
    )

<br>

### Build Model Function
---

In [8]:
def BuildModel(hparams):      
    
    
    model = keras.Sequential([
            layers.Dense(3, activation="sigmoid", name="layer1", input_shape=(9,)),
            layers.Dense(1, activation='linear', name='output')
        ])
    
    model.compile(
        loss='mean_squared_error',
        optimizer=getattr(tfa.optimizers, hparams[HP_OPTIMIZER])(
            learning_rate=hparams[HP_LEARN_RATE],
            weight_decay=hparams[HP_WEIGHT_DECAY]
        )
    )
    return model

<br>

### Bagging Function
---

In [51]:
@delayed
def Bagging(hparams, features, model, train_index, test_index):
    
    
    
    # set emty output matrices
    y_train_bagging = np.zeros((train_index.size, n_baggs))
    y_test_bagging = np.zeros((test_index.size, n_baggs))    
    
    
    #Train the model 'n_baggs' times and store model predictions into matrice
    for n in range(n_baggs):
        
#         print ('baggin run', n)
#         print ('PREDICTION ON TEST DATA:', y_test_bagging)
        
        # Bootstrap sampling from training Data with Size(Training Data)
        train_index_bootstrap = np.random.choice(train_index, train_index.size)

        #Train the model 
        model.fit(
            features[train_index_bootstrap],
            labels[train_index_bootstrap],
            batch_size=hparams[HP_BATCH_SIZE],
            epochs=hparams[HP_EPOCHS],
            verbose=1
        )
        
        #Run the model for insample data and store in one matrix:
        y_train_bagging[:, n] = np.squeeze(model.predict(features[train_index]))
        
        # ... and for out of sample data        
        y_test_bagging[:, n] = np.squeeze(model.predict(features[test_index]))

    # return mean of the outputs over baggins (1st dimension)
    return y_train_bagging.mean(1), y_test_bagging.mean(1)

In [64]:
hparams = {
                            HP_INPUT_VAR_NINE: 'PC14',
                            HP_OPTIMIZER: 'AdamW',
                            HP_LEARN_RATE: 0.1,
                            HP_WEIGHT_DECAY: 0.1,
                            HP_BATCH_SIZE: 2,
                            HP_EPOCHS: 2,                
                        }

features = predictor_pc.loc[:,['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', hparams[HP_INPUT_VAR_NINE]]].to_numpy()

for train_index, test_index in ShuffleSplit(n_splits=2, test_size=cv_param['TEST_FRAC']).split(features):
    model = BuildModel(hparams)
    
    for n in range(n_baggs):
        
#         print ('baggin run', n)
#         print ('PREDICTION ON TEST DATA:', y_test_bagging)
        
        # Bootstrap sampling from training Data with Size(Training Data)
        train_index_bootstrap = np.random.choice(train_index, train_index.size)

        #Train the model 
        delayed(model.fit)(
            features[train_index_bootstrap],
            labels[train_index_bootstrap],
            batch_size=hparams[HP_BATCH_SIZE],
            epochs=hparams[HP_EPOCHS],
            verbose=1
        )
        print (1)
        print(train_index_bootstrap)
    print (2)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [58]:
result.compute()

Epoch 1/2
Epoch 2/2
Epoch 1/2
Epoch 2/2


(array([1.11739838, 1.23990098, 0.48289761, 1.0686627 , 1.36163718,
        0.89569843, 1.37432522, 1.10477647, 1.17239708, 1.3715536 ,
        1.22822797, 1.01563278, 1.15959752, 1.28290015, 0.62352872,
        1.16564068, 1.18425566, 1.13892633, 0.67885071, 0.49271929,
        1.12655601, 1.20708767, 1.20702472, 1.07973254, 0.66773185,
        1.35773224, 0.61714071, 0.68854991, 0.87080178, 1.42444152,
        0.85274279, 1.10196218, 0.92388058, 0.65844119, 0.90855756,
        1.29325613, 0.8403244 , 0.65514585, 1.03674126, 0.48486894,
        1.43158782, 1.33575529, 1.35264063, 0.74879757, 1.2925784 ,
        1.01509029, 0.9297657 , 0.89216855, 1.1768671 , 1.29426891,
        0.88363096, 1.0449194 , 1.34580421, 0.75758216, 0.81056172,
        0.77526391, 1.03279221, 0.54241636, 1.3684212 , 0.96079496,
        0.86120987, 1.15987235, 0.43783817, 1.20041078, 1.12119615,
        0.56498587, 0.89609101, 1.38589126, 1.06642717, 0.93917245,
        0.89819843, 1.40392476, 0.84337202, 1.10

<br>

### Cross Validation Training & Error Calculation Funktion
---

In [40]:
def TrainModel(hparams, cv_param, predictor_pc, labels):
        
    
    train_mse = np.empty(cv_param['N_FOLDS'])
    train_corr = np.empty(cv_param['N_FOLDS'])
    
    test_mse = np.empty(cv_param['N_FOLDS'])
    test_corr = np.empty(cv_param['N_FOLDS'])
    
    #choose Inputs
    features = predictor_pc.loc[:,['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', hparams[HP_INPUT_VAR_NINE]]].to_numpy()
    
    
    
    
    #Cross Validation#
    ##################
    
    cv_fold = 0
    
    for train_index, test_index in ShuffleSplit(n_splits=cv_param['N_FOLDS'], test_size=cv_param['TEST_FRAC']).split(features):
        
#         print(cv_fold)
#         print("TRAIN:", train_index, "TEST:", test_index)
        
    
        # Build the model according to definition:
        model = BuildModel(hparams)
        
        # Train and predict using Bagging    
        y_train, y_test = Bagging(hparams, features, model, train_index, test_index)
        
        
        #Compute error metrics for in sample data
        train_err=  y_train - labels[train_index]
        train_mse[cv_fold] = np.mean(train_err**2)
        train_corr[cv_fold] = st.pearsonr(y_train, labels[train_index])[0]
        
        # ... and for out of sample data
        test_err=  y_test - labels[test_index]
        test_mse[cv_fold] = np.mean(test_err**2)
        test_corr[cv_fold] = st.pearsonr(y_test, labels[test_index])[0]
        
        
#         print ( "BAGGING OUT Test", y_test)
        cv_fold += 1
    
    #######################################################################################################
    
    
    #Error Moments#
    ###############
    
    eval_metrics = {
        'train_mse_mu': np.mean(train_mse),
        'train_mse_sig': np.std(train_mse),
        'train_corr_mu': np.mean(train_corr),
        'train_corr_sig': np.std(train_corr),            
        'test_mse_mu': np.mean(test_mse),
        'test_mse_sig': np.std(test_mse),
        'test_corr_mu': np.mean(test_corr),
        'test_corr_sig': np.std(test_corr),
    }
    
#     print(eval_metrics)
    
    return eval_metrics


<br>

### Model Run and Log Function
---

In [35]:
hparams = {
                            HP_INPUT_VAR_NINE: 'PC14',
                            HP_OPTIMIZER: 'AdamW',
                            HP_LEARN_RATE: 0.1,
                            HP_WEIGHT_DECAY: 0.1,
                            HP_BATCH_SIZE: 2,
                            HP_EPOCHS: 2,                
                        }

In [48]:
%%time
eval_metrics = TrainModel(hparams, cv_param, predictor_pc, labels)

CPU times: user 1.03 ms, sys: 0 ns, total: 1.03 ms
Wall time: 922 µs


In [50]:
%%time
for n in range(2):
     eval_metrics = TrainModel(hparams, cv_param, predictor_pc, labels)
    
    
# eval_metrics.visualize()

CPU times: user 1.63 ms, sys: 0 ns, total: 1.63 ms
Wall time: 1.59 ms


In [19]:

def RunModel(run_dir, hparams):
    with tf.summary.create_file_writer(run_dir).as_default():
        hp.hparams(hparams)  # record the values used in this trial
        
        eval_metrics = delayed(TrainModel)(hparams, cv_param, predictor_pc, labels)
        eval_metrics.a()
        tf.summary.scalar(METRIC_TRAIN_MSE_MU,   eval_metrics['train_mse_mu'],   step=1)
        tf.summary.scalar(METRIC_TRAIN_MSE_SIG,  eval_metrics['train_mse_sig'],  step=1)
        tf.summary.scalar(METRIC_TRAIN_CORR_MU,  eval_metrics['train_corr_mu'],  step=1)
        tf.summary.scalar(METRIC_TRAIN_CORR_SIG, eval_metrics['train_corr_sig'], step=1)
        tf.summary.scalar(METRIC_TEST_MSE_MU,    eval_metrics['test_mse_mu'],    step=1)
        tf.summary.scalar(METRIC_TEST_MSE_SIG,   eval_metrics['test_mse_sig'],   step=1)
        tf.summary.scalar(METRIC_TEST_CORR_MU,   eval_metrics['test_corr_mu'],   step=1)
        tf.summary.scalar(METRIC_TEST_CORR_SIG,  eval_metrics['test_corr_sig'],  step=1)

<br>

### Grid Search
---

In [20]:
%%time

session_num = 0        

for input_var_nine in HP_INPUT_VAR_NINE.domain.values:
    for optimizer in HP_OPTIMIZER.domain.values:
        for learn_rate in HP_LEARN_RATE.domain.values:
            for weight_decay in HP_WEIGHT_DECAY.domain.values:
                for batch_size in HP_BATCH_SIZE.domain.values:
                    for n_epochs in HP_EPOCHS.domain.values:


                        hparams = {
                            HP_INPUT_VAR_NINE: input_var_nine,
                            HP_OPTIMIZER: optimizer,
                            HP_LEARN_RATE: learn_rate,
                            HP_WEIGHT_DECAY: weight_decay,
                            HP_BATCH_SIZE: batch_size,
                            HP_EPOCHS: n_epochs,                
                        }

                        run_name = f"run-{session_num}"
#                         print(f'--- Starting trial: {run_name}')
#                         print({h.name: hparams[h] for h in hparams})
                        RunModel('logs/hparam_tuning/' + run_name, hparams)
                        session_num += 1    

ValueError: TypeError: Delayed objects of unspecified length have no len()
Traceback (most recent call last):

  File "/opt/conda/lib/python3.8/site-packages/dask/delayed.py", line 568, in __len__
    raise TypeError("Delayed objects of unspecified length have no len()")

TypeError: Delayed objects of unspecified length have no len()



<br>
<br>

***
***
<br>
<br>