<a href="https://colab.research.google.com/github/kconstable/market_predictions/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing for Deep Learning


## Import Libraries 

In [82]:
!pip install -q -U keras-tuner

[?25l[K     |███▍                            | 10 kB 23.5 MB/s eta 0:00:01[K     |██████▊                         | 20 kB 26.4 MB/s eta 0:00:01[K     |██████████                      | 30 kB 29.9 MB/s eta 0:00:01[K     |█████████████▍                  | 40 kB 31.5 MB/s eta 0:00:01[K     |████████████████▊               | 51 kB 33.7 MB/s eta 0:00:01[K     |████████████████████            | 61 kB 36.4 MB/s eta 0:00:01[K     |███████████████████████▍        | 71 kB 37.6 MB/s eta 0:00:01[K     |██████████████████████████▊     | 81 kB 38.7 MB/s eta 0:00:01[K     |██████████████████████████████▏ | 92 kB 41.0 MB/s eta 0:00:01[K     |████████████████████████████████| 97 kB 6.5 MB/s 
[?25h

In [121]:
# deep learning /RNN
import tensorflow as tf
from keras import backend as K
from tensorflow import keras
from keras.models import Sequential,backend
from keras.layers import LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
import keras_tuner as kt
from kerastuner.tuners import Hyperband
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

# stats, data structures and plotting
import random as rn
import math
import numpy as np
import pandas as pd
import timeit
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from statsmodels.tsa.stattools import adfuller
from scipy import stats

# File operations
import os
import pickle
from google.colab import files
from google.colab import drive
drive.mount('/content/drive')


# Set Seeds
seed = 1985
np.random.seed(seed)
rn.seed(seed)
os.environ['PYTHONHASHSEED']=str(seed)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load Data

In [12]:
stock = 'VMW'
df_vmw = pd.read_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_market_data.pickle')
df_vmw_features = pd.read_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_market_data_features.pickle')
print(f"\n{stock}")
print("="*60)
print(f"{stock}-all features: ",df_blx.shape)
print(f"{stock}-optimal features: ",df_blx_features.shape)


crypto='BTC'
df_btc = pd.read_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{crypto}_market_data.pickle')
df_btc_features = pd.read_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{crypto}_market_data_features.pickle')
print(f"\n{crypto}")
print("="*60)
print(f"{crypto}-all features: ",df_btc.shape)
print(f"{crypto}-optimal features: ",df_btc_features.shape)


stock='BLX.TO'
df_blx = pd.read_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_market_data.pickle')
df_blx_features = pd.read_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_market_data_features.pickle')
print(f"\n{stock}")
print("="*60)
print(f"{stock}-all features: ",df_blx.shape)
print(f"{stock}-optimal features: ",df_blx_features.shape)


VMW
VMW-all features:  (2610, 29)
VMW-optimal features:  (2570, 10)

BTC
BTC-all features:  (1000, 28)
BTC-optimal features:  (940, 14)

BLX.TO
BLX.TO-all features:  (2610, 29)
BLX.TO-optimal features:  (2570, 10)


## Prepare Data for Deep Learning


### Prepare the Data 
+ Reverse the date-time index
+ Convert the date-time index to an integer index
+ Convert to numpay arrays

In [13]:
def prepare_data(df,y='close',features=[]):
  """
  Filter, scale and convert dataframe data to numpy arrays

  Inputs: 
    df       => A dataframe of observations with features and y-labels
    y        => The name of the column that is the truth labels
    features => A list of features.  Used to subset columns

  Outputs:
    scaled_y => numpy array of the y-label data
    scaled_x => numpy array of the training features

  """

  # reverse the index such that dates are in chronological order
  df = df.iloc[::-1]

  # Subset features, get the y-label values
  df_y = df[y]
  df_X = df[features]

  # replace the date index with an integer index
  df_X.reset_index(drop=True,inplace=True)

  # convert to numpay arrays
  array_X = np.array(df_X)
  array_y = np.array(df_y).reshape(-1,1)

  # print the output
  print("\nData Preparation")
  print("="*60)
  print(f"=> {len(features)} Features")
  print(f"=> Input Dimensions :{array_X.shape}")
  print(f"=> Output Dimensions:{array_y.shape}")
  print("\n")

  return array_y, array_X


### Split the Data into Train/Test Sets
+ Time series data cannot be split randomly like other observational data because the order is important.  
+ The data is split into train/test based on dates

In [14]:
def split_train_test(X,idx_close,train_pct=0.80):
  """
  """

  # Train
  train_num = math.ceil(X.shape[0]*train_pct)
  train_X = X[0:train_num,:]
  train_str = f'{round(train_X.shape[0]/X.shape[0],3)*100}%'

  # Test
  # test_X = X[train_num-n_steps:,:]  # if you use this, need to add n_steps as a param!
  test_X = X[train_num:,:]
  test_str =f'{round(test_X.shape[0]/X.shape[0],3)*100}%'

  # print the results
  print("\nSplit Data (X)")
  print("="*60)
  print(f'Data  :100.0% {X.shape}')
  print(f'Train :{train_str:>6} {train_X.shape}')
  print(f'Test  :{test_str:>6} {test_X.shape}')
  print('\n')

  # create the line plots
  plot_train_test_data(train_X,test_X,idx_close)

  return train_X,test_X


In [15]:
def plot_train_test_data(train_x,test_x,idx_close):
  """
  """

  ts0 = list(range(0,train_x.shape[0]))
  ts1 = list(range(train_x.shape[0],train_x.shape[0]+test_x.shape[0]))

  fig = go.Figure()
  fig.add_trace(go.Scatter(x=ts0,y=train_x[:,idx_close],name='Train'))
  fig.add_trace(go.Scatter(x=ts1,y=test_x[:,idx_close],name='Test'))
  fig.add_shape(type ='rect',
                  x0=ts1[0],x1=ts1[-1],
                  y0=0,y1=1,
                  line=dict(color='#F6B28D'),
                  fillcolor='#F6B28D',
                  opacity=0.1)
  fig.add_shape(type ='rect',
                  x0=ts0[0],x1=ts0[-1],
                  y0=0,y1=1,
                  line=dict(color='#7BA1AA'),
                  fillcolor='#7BA1AA',
                  opacity=0.1)
  fig.update_layout(title = 'Train/Test Datasets (Scaled)',
                      template="plotly_white",
                      yaxis_title='Closing Price (Scaled)',
                      xaxis_title='time steps',
                      width = 600,
                      height =600)
  fig.update_shapes(dict(xref='x',yref='paper'))
  fig.show()



### Partition the Data

In [16]:
def create_partitions(data,idx_close,n_steps,n_predict,visualize=False):
  """
  This function partitions the train/test data into batches with times-step windows for training

  Each batch consists of n_steps of training data, and n_predict steps of label data
  The function outputs an x array [samples, time steps, features] and a y array [samples, time steps]

  Inputs:
    data      => train or test array
    idx_close => the position of y value in the data
    n_steps   => the number of time steps in each training batch
    n_predict => the number of time steps that will be predicted
    visualize => boolean, will plot a visual of training/prediction windows
  Outputs:
    array(i)  => np.array of batched & partitioned training data with features
    array(p)  => np.array of batched & partitioned y-lables 

  Reference:
    https://www.relataly.com/time-series-forecasting-multi-step-regression-using-neural-networks-with-multiple-outputs-in-python/5800/
  """
  n = data.shape[0]
  window = n_steps + n_predict
  i, p = [],[]

  # print the moving window
  if visualize:
    print("Data Window: I(Input), P(Predict),-(scanned), +(to be scanned)")
    print("="*max(n,100))

  # create the partitions
  for step in range(n_steps, n-n_predict):
    # get the input window + all features

    # train window
    i.append(data[step-n_steps:step,:])

    # get the prediction window + the closing price
    p.append(data[step:step+n_predict,idx_close])

    # print the moving window
    if visualize and step <= 50:
      scanned = n-((step-n_steps)+window)
      print("-"*(step-n_steps),'I'*n_steps,'P'*n_predict,"+"*scanned,sep="")
    
  return np.array(i),np.array(p)

In [17]:
def plot_training_window(x_array,y_array,idx_close,n_steps,n_predict,batch):
  """
  Inputs: 
    x_array/y_array => partitioned data from create_partitions
    idx_close       => the position of the y-label (close price)
    n_steps         => the number of time steps in each training batch
    n_predict       => the number of time steps predicted each training batch
    batch           => the batch number to plot
  Outputs:
    A line chart with the train/predict values
  """
  # convert the arrays to dataframes
  # align the x indexes to compare
  df_y = pd.DataFrame(y_array[batch],index=range(n_steps-1,n_predict+n_steps-1),columns=['y'])
  df_x = pd.DataFrame(x_array[batch+1])[idx_close]
  df_x = pd.DataFrame(df_x)
  df_x.columns = ['x']

  # create the plots
  fig = go.Figure()
  fig.add_trace(go.Scatter(x=df_x.index,y=df_x['x'],name='train window'))
  fig.add_trace(go.Scatter(x=df_y.index,y=df_y['y'],name='predict window'))
  fig.update_layout(template='plotly_white',
                    title='Train/Predict Windows',
                    yaxis_title = 'Closing Price (Scaled)',
                    xaxis_title='Period',
                    width = 600,
                    height = 600)
  fig.show()


### Data Transformations

#### Check for stationarity
+ Augmented Dickey-Fuller (ADF) test for stationarity
+ null: the data is non-stationary
+ alternative: the data is stationary
+ => if the test stat is less than all critical values,  we cannot reject the null, therefore the data is non-stationary

**Reference**

 https://analyzingalpha.com/check-time-series-stationarity-python#augmented-dickey-fuller-adf

In [18]:
def transform_stationary(df,features_to_transform,transform='log'):
  """
  transform: log, boxcox
  """
  # transform each column in the features_to_transform list
  for feature in df.columns:
    if feature in features_to_transform:
      # log transform
      if transform=='log':
        df[feature] = df[feature].apply(np.log)

      # boxcox transform  
      elif transform=='boxcox':
        bc,_ = stats.boxcox(df[feature])
        df[feature] = bc

      else:
        print("Transformation not recognized")

  # check the closing price for stationarity using the augmented dicky fuller test
  t_stat, p_value, _, _, critical_values, _  = adfuller(df.close.values, autolag='AIC')
  print('Augmented Dicky Fuller Test for Stationarity')
  print("="*60)
  print(f'ADF Statistic: {t_stat:.2f}')
  for key, value in critical_values.items():
    print('Critial Values:')
    if t_stat < value:
      print(f'   {key}, {value:.2f} => non-stationary')
    else:
      print(f'   {key}, {value:.2f} => stationary')

### Process the Data

In [79]:
# Training Config

# uncomment the desired data
# df = df_btc.copy()
# df = df_btc_features.copy()
df = df_vmw.copy()
# df = df_vmw_features.copy()
# df = df_blx.copy()
# df = df_blx_features.copy()

# Extract the features needed for training
features = [c for c in df.columns if c not in ['symbol']]
n_steps  = 40   # The number of time steps included in each training batch
n_predict = 5   # The number of time steps into the future the model will predict
idx_close = df.columns.get_loc("close") # index postiion variable (closing price)
transform = 'log'

# difference the price data
features_to_transform = ['open','high','low','close']  # blx_features
transform_stationary(df,features_to_transform,transform)


# Scale the data and convert to numpy arrays
array_y,array_X = prepare_data(df,'close',features)


# scale the input and outputs
scaler_X = MinMaxScaler(feature_range=(0,1))
scaler_y = MinMaxScaler(feature_range=(0,1))
scaled_X = scaler_X.fit_transform(array_X)
scaled_y = scaler_y.fit_transform(array_y)


# split into train, test
train_pct = 0.80
train_x,test_x=split_train_test(scaled_X,idx_close,train_pct)


# Partition the train/test data into time series windows for training
# LSTM input format: [samples, time steps, features]
x_train, y_train = create_partitions(train_x,idx_close,n_steps, n_predict)
x_test,  y_test  = create_partitions(test_x, idx_close,n_steps, n_predict)


# Print the results
print('Train/Test Dimensions')
print('='*60)
print("Train Data Dimensions: ","x",x_train.shape,"y",y_train.shape)
print("Test Data Dimensions : ","x",x_test.shape," y",y_test.shape)

print("\n\nCheck the dimensions of the training windows")
print("="*60)
print('The last value in the training batch should match the first value in the predict batch')
print("Last Training Value:   ",x_train[1][n_steps-1][idx_close])
print("First Prediction Value:",y_train[0][0])

# Make sure the train/predict batch windows are aligned
plot_training_window(x_train,y_train,idx_close,n_steps,n_predict,seed//10)

# save to google drive
np.save('/content/drive/MyDrive/Colab Notebooks/data/x_train',x_train)
np.save('/content/drive/MyDrive/Colab Notebooks/data/y_train',y_train)
np.save('/content/drive/MyDrive/Colab Notebooks/data/x_test',x_test)
np.save('/content/drive/MyDrive/Colab Notebooks/data/y_test',y_test)


Augmented Dicky Fuller Test for Stationarity
ADF Statistic: -1.75
Critial Values:
   1%, -3.43 => stationary
Critial Values:
   5%, -2.86 => stationary
Critial Values:
   10%, -2.57 => stationary

Data Preparation
=> 28 Features
=> Input Dimensions :(2569, 28)
=> Output Dimensions:(2569, 1)



Split Data (X)
Data  :100.0% (2569, 28)
Train : 80.0% (2056, 28)
Test  : 20.0% (513, 28)




Train/Test Dimensions
Train Data Dimensions:  x (2011, 40, 28) y (2011, 5)
Test Data Dimensions :  x (468, 40, 28)  y (468, 5)


Check the dimensions of the training windows
The last value in the training batch should match the first value in the predict batch
Last Training Value:    0.496163264852131
First Prediction Value: 0.496163264852131


# Deep Learning LSTM

### Build the LSTM Model

In [20]:
def create_lstm_model(config,x_train,n_predict):
  """
  """
  # get window size
  n_steps = config['data']['n_steps']
  n_predict = config['data']['n_predict']
  n_features = x_train.shape[2]

  # clear previous models
  backend.clear_session()

  # LSTM Model+ first layer
  model = Sequential(name='LSTM')
  model.add(LSTM(n_steps,return_sequences=True,input_shape=(n_steps,n_features)))

  # add additional layers
  for layer,nodes,ret_seq,drop in config['layers']:
    # add LSTM layers + dropout
    if layer=='lstm':
      model.add(LSTM(nodes,return_sequences =ret_seq))
      if drop is not None:
        model.add(Dropout(drop))

    # Add Dense Layers + dropout
    elif layer =='dense':
      model.add(Dense(nodes))
      if drop is not None:
        model.add(Dropout(drop))

  # add the prediction layer
  model.add(Dense(n_predict))
  
  # compile
  model.compile(optimizer='adam',loss='mse')
  model.summary()

  return model


### Calculate Model Performance

In [21]:
def calculate_performance(x_test,y_test,model,scaler_y):
  """
  """

  # Predict the prices
  y_pred = model.predict(x_test)

  # convert units back to the original scale
  y_pred_unscaled = scaler_y.inverse_transform(y_pred)
  y_test_unscaled = scaler_y.inverse_transform(y_test)

  # # Root mean squarred error,mean abs error, mean abs percent error, median abs percent error
  rmse  = math.sqrt(mean_squared_error(y_test_unscaled, y_pred_unscaled))
  mae   = mean_absolute_error(y_test_unscaled, y_pred_unscaled)
  mape  = np.mean((np.abs(np.subtract(y_test_unscaled, y_pred_unscaled)/ y_test_unscaled))) * 100
  mdape = np.median((np.abs(np.subtract(y_test_unscaled, y_pred_unscaled)/ y_test_unscaled)) ) * 100

  print("\nModel Error")
  print("="*62)
  print(f'{"Mean Absolute Error (MAE)" :-<55} {np.round(mae, 2):>5}')
  print(f'{"Root Mean Squared Error (MSE)" :-<55} {np.round(rmse,2):>5}')
  print(f'{"Mean Absolute Percentage Error (MAPE)" :-<55} {np.round(mape, 2):>5}%')
  print(f'{"Median Absolute Percentage Error (MDAPE)" :-<55} {np.round(mdape, 2):>5}%')

  return y_pred


### Plot Training Metrics

In [22]:

def plot_training_metrics(history):
  """
  """
  # get the number of epochs
  epochs = list(range(1, len(history.history['loss']) + 1))

  # min_train_loss = min(history.history['loss'])
  # min_val_loss = min(history.history['val_loss'])

  # create the line plots
  fig = go.Figure()
  fig.add_trace(go.Scatter(x=epochs,
                           y=history.history['loss'],
                           name = 'train-loss',
                           line=dict(width=3,color='royalblue')))
  fig.add_trace(go.Scatter(x=epochs,
                           y=history.history['val_loss'],
                           name='val-loss',
                           line=dict(width=3,color='crimson')))

  # fig.add_shape(type ='rect',
  #               x0=0,
  #               x1=1,
  #               y0=min(min_train_loss,min_val_loss),
  #               y1=max(min_train_loss,min_val_loss),
  #               line=dict(color='#ccc'),
  #               fillcolor='#ccc',
  #               opacity=0.4)
  fig.update_layout(title = 'Training Metrics',
                    template="plotly_white",
                    width = 700,
                    height= 500,
                    yaxis_title='loss',
                    xaxis_title='epochs')
  # fig.update_shapes(dict(xref='paper',yref='y'))
  fig.show()



## Plot Predicted vs Actual Price

In [40]:
def plot_price_predictions(batch, idx_close, x_test, y_pred_scaled,scaler_y): 
  """
  """

  # unscale the y predictions
  y_pred_unscaled = scaler_y.inverse_transform(y_pred_scaled)

  # unscale the x_test data
  x_test_np = np.array(pd.DataFrame(x_test[batch])[idx_close]).reshape(-1,1)
  x_test_unscaled = scaler_y.inverse_transform(x_test_np)
  x_test_df = pd.DataFrame(x_test_unscaled)

  # set the indexes for plotting
  max_test_idx=x_test_df.shape[0]
  max_pred_idx =y_pred_unscaled[0].shape[0]
  test_idx = list(range(batch,batch + max_test_idx))
  pred_idx = list(range(batch + max_test_idx,batch + max_test_idx + max_pred_idx))


  # combine the actual + predicted prices
  data = pd.DataFrame(list(zip(y_pred_unscaled[batch], x_test_df[0])), columns=['pred', 'actual'])

  # create the plot
  fig = go.Figure()
  fig.add_trace(go.Scatter(x=test_idx, y=x_test_df[0],
                        mode='lines',
                        name='Test Data',
                        fill='tozeroy',
                        line_color='#ccc'))
  fig.add_trace(go.Scatter(x=pred_idx, y=data['actual'],
                        mode='lines+markers', 
                        name='Actual Price',
                        fill='tozeroy',
                        line_color ='#ccc')) 
  fig.add_trace(go.Scatter(x=pred_idx, y=data['pred'],
                        mode='lines+markers',
                        name='Predicted Price',
                        line_color='red'))

  fig.update_layout(template = 'plotly_white',
                      title= 'Actual vs Predicted Price',
                      xaxis_title = 'Batch',
                      yaxis_title = 'Price',
                      width=600,
                      height=400)

  fig.show()
  

In [None]:
def plot_n_price_predictions(idx_close,x_test,y_pred_scaled,scaler_y,n_batches=2):
  """
  """

  # select random batch numbers
  batches=[]
  [batches.append(rn.randint(0,y_pred_scaled.shape[0])) for r in list(range(n_batches))]

  # plot predictions
  for batch in batches:
    plot_price_predictions(batch, idx_close, x_test, y_pred_scaled,scaler_y)


## Train the Model
Training config file.

In [23]:
config ={
    'data': {'n_steps':n_steps,'n_predict':n_predict},
    'layers': [('lstm',256,True,0.3),('lstm',128,False,0.2),('dense',32,None,0.1),('dense',16,None,0.1)],
    'train':{'epochs':250,'batch_size':128,'early_stop':EarlyStopping(monitor='loss',patience=10)}
}
config

{'data': {'n_predict': 5, 'n_steps': 40},
 'layers': [('lstm', 256, True, 0.3),
  ('lstm', 128, False, 0.2),
  ('dense', 32, None, 0.1),
  ('dense', 16, None, 0.1)],
 'train': {'batch_size': 128,
  'early_stop': <keras.callbacks.EarlyStopping at 0x7f2976dea590>,
  'epochs': 250}}

### Bitcoin
+ train with all features
+ train with the optimal features

In [76]:
# BTC - all features
modelname = f'model_{crypto}_{transform}'
model = create_lstm_model(config,x_train,n_predict)

# Train the model
tic = timeit.default_timer()
history = model.fit(x_train,
                    y_train,
                    batch_size=config['train']['batch_size'],
                    epochs=config['train']['epochs'],
                    callbacks = [config['train']['early_stop']],
                    validation_split =0.10, 
                    verbose = 0
                    )

# print the training time
toc =timeit.default_timer()
print('\nTraining Time')
print('='*60)
print(f'Minutes:{round((toc-tic)/60,2)}\n')


# predict prices
# calculate the performance metrics
# plot training metrics
# plot predictions for random batches
y_pred_scaled = calculate_performance(x_test,y_test,model,scaler_y)
plot_training_metrics(history)
plot_n_price_predictions(idx_close,x_test,y_pred_scaled,scaler_y,2)


# save model and training performance
model.save(f'/content/drive/MyDrive/Colab Notebooks/models/{modelname}_all')

df_hist = pd.DataFrame(history.history) 
df_hist.to_csv(f'/content/drive/MyDrive/Colab Notebooks/models/{modelname}_all/train_history.csv')


Model: "LSTM"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 40, 40)            10880     
_________________________________________________________________
lstm_1 (LSTM)                (None, 40, 256)           304128    
_________________________________________________________________
dropout (Dropout)            (None, 40, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                4128      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0      



INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/models/model_BTC_log_all/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/models/model_BTC_log_all/assets


In [78]:
# BTC -optimal features
modelname = f'model_{crypto}_{transform}'
model = create_lstm_model(config,x_train,n_predict)

# train
tic = timeit.default_timer()
history = model.fit(x_train,
                    y_train,
                    batch_size=config['train']['batch_size'],
                    epochs=config['train']['epochs'],
                    callbacks = [config['train']['early_stop']],
                    validation_split =0.10, 
                    verbose = 0
                    )

# print the training time
toc =timeit.default_timer()
print('\nTraining Time')
print('='*60)
print(f'Minutes:{round((toc-tic)/60,2)}\n')

# predict prices
# calculate the performance metrics
# plot training metrics
# plot predictions for random batches
y_pred_scaled = calculate_performance(x_test,y_test,model,scaler_y)
plot_training_metrics(history)
plot_n_price_predictions(idx_close,x_test,y_pred_scaled,scaler_y,2)


# save model and training performance
model.save(f'/content/drive/MyDrive/Colab Notebooks/models/{modelname}_features')

df_hist = pd.DataFrame(history.history) 
df_hist.to_csv(f'/content/drive/MyDrive/Colab Notebooks/models/{modelname}_features/train_history.csv')

Model: "LSTM"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 40, 40)            8800      
_________________________________________________________________
lstm_1 (LSTM)                (None, 40, 256)           304128    
_________________________________________________________________
dropout (Dropout)            (None, 40, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                4128      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0      



INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/models/model_BTC_log_features/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/models/model_BTC_log_features/assets


### VM Ware
+ train with all features
+ train with optimal features

In [80]:
# VW Ware -all features
stock= 'VWM'
modelname = f'model_{stock}_{transform}'
model = create_lstm_model(config,x_train,n_predict)

# train
tic = timeit.default_timer()
history = model.fit(x_train,
                    y_train,
                    batch_size=config['train']['batch_size'],
                    epochs=config['train']['epochs'],
                    callbacks = [config['train']['early_stop']],
                    validation_split =0.10, 
                    verbose = 0
                    )


# print the training time
toc =timeit.default_timer()
print('\nTraining Time')
print('='*60)
print(f'Minutes:{round((toc-tic)/60,2)}\n')

# predict prices
# calculate the performance metrics
# plot training metrics
# plot predictions for random batches
y_pred_scaled = calculate_performance(x_test,y_test,model,scaler_y)
plot_training_metrics(history)
plot_n_price_predictions(idx_close,x_test,y_pred_scaled,scaler_y,2)


# save model and training performance
model.save(f'/content/drive/MyDrive/Colab Notebooks/models/{modelname}_all')

df_hist = pd.DataFrame(history.history) 
df_hist.to_csv(f'/content/drive/MyDrive/Colab Notebooks/models/{modelname}_all/train_history.csv')

Model: "LSTM"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 40, 40)            11040     
_________________________________________________________________
lstm_1 (LSTM)                (None, 40, 256)           304128    
_________________________________________________________________
dropout (Dropout)            (None, 40, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                4128      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0      



INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/models/model_VWM_log_all/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/models/model_VWM_log_all/assets


In [58]:
# VW Ware -optimal features
stock= 'VWM'
modelname = f'model_{stock}_{transform}'
model = create_lstm_model(config,x_train,n_predict)

# train
tic = timeit.default_timer()
history = model.fit(x_train,
                    y_train,
                    batch_size=config['train']['batch_size'],
                    epochs=config['train']['epochs'],
                    callbacks = [config['train']['early_stop']],
                    validation_split =0.10, 
                    verbose = 0
                    )

# print the training time
toc =timeit.default_timer()
print('\nTraining Time')
print('='*60)
print(f'Minutes:{round((toc-tic)/60,2)}\n')

# predict prices
# calculate the performance metrics
# plot training metrics
# plot predictions for random batches
y_pred_scaled = calculate_performance(x_test,y_test,model,scaler_y)
plot_training_metrics(history)
plot_n_price_predictions(idx_close,x_test,y_pred_scaled,scaler_y,2)

# save model and training performance
model.save(f'/content/drive/MyDrive/Colab Notebooks/models/{modelname}_features')

df_hist = pd.DataFrame(history.history) 
df_hist.to_csv(f'/content/drive/MyDrive/Colab Notebooks/models/{modelname}_features/train_history.csv')

Model: "LSTM"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 40, 40)            8160      
_________________________________________________________________
lstm_1 (LSTM)                (None, 40, 256)           304128    
_________________________________________________________________
dropout (Dropout)            (None, 40, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                4128      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0      



INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/models/model_VWM_log_features/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/models/model_VWM_log_features/assets


### BLX.TO

In [50]:
# BLX  -all features
stock= 'BLX.TO'
modelname = f'model_{stock}_{transform}'
model = create_lstm_model(config,x_train,n_predict)

# train
tic = timeit.default_timer()
history = model.fit(x_train,
                    y_train,
                    batch_size=config['train']['batch_size'],
                    epochs=config['train']['epochs'],
                    callbacks = [config['train']['early_stop']],
                    validation_split =0.10, 
                    verbose = 0
                    )

# print the training time
toc =timeit.default_timer()
print('\nTraining Time')
print('='*60)
print(f'Minutes:{round((toc-tic)/60,2)}\n')

# predict prices
# calculate the performance metrics
# plot training metrics
# plot predictions for random batches
y_pred_scaled = calculate_performance(x_test,y_test,model,scaler_y)
plot_training_metrics(history)
plot_n_price_predictions(idx_close,x_test,y_pred_scaled,scaler_y,2)


# save model and training performance
model.save(f'/content/drive/MyDrive/Colab Notebooks/models/{modelname}_all')

df_hist = pd.DataFrame(history.history) 
df_hist.to_csv(f'/content/drive/MyDrive/Colab Notebooks/models/{modelname}_all/train_history.csv')

Model: "LSTM"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 40, 40)            11040     
_________________________________________________________________
lstm_1 (LSTM)                (None, 40, 256)           304128    
_________________________________________________________________
dropout (Dropout)            (None, 40, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                4128      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0      



INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/models/model_BLX.TO_log_all/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/models/model_BLX.TO_log_all/assets


In [52]:
# BLX  -optimal features
stock= 'BLX.TO'
modelname = f'model_{stock}_{transform}'
model = create_lstm_model(config,x_train,n_predict)

# train
tic = timeit.default_timer()
history = model.fit(x_train,
                    y_train,
                    batch_size=config['train']['batch_size'],
                    epochs=config['train']['epochs'],
                    callbacks = [config['train']['early_stop']],
                    validation_split =0.10, 
                    verbose = 0
                    )

# print the training time
toc =timeit.default_timer()
print('\nTraining Time')
print('='*60)
print(f'Minutes:{round((toc-tic)/60,2)}\n')

# predict prices
# calculate the performance metrics
# plot training metrics
# plot predictions for random batches
y_pred_scaled = calculate_performance(x_test,y_test,model,scaler_y)
plot_training_metrics(history)
plot_n_price_predictions(idx_close,x_test,y_pred_scaled,scaler_y,2)


# save model and training performance
model.save(f'/content/drive/MyDrive/Colab Notebooks/models/{modelname}_all')

df_hist = pd.DataFrame(history.history) 
df_hist.to_csv(f'/content/drive/MyDrive/Colab Notebooks/models/{modelname}_all/train_history.csv')

Model: "LSTM"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 40, 40)            8160      
_________________________________________________________________
lstm_1 (LSTM)                (None, 40, 256)           304128    
_________________________________________________________________
dropout (Dropout)            (None, 40, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                4128      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0      



INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/models/model_BLX.TO_log_all/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/models/model_BLX.TO_log_all/assets


## Load Saved Model

In [15]:
# get saved training evaluation
modelname = 'model_BTC_log_features'
model = keras.models.load_model(f'/content/drive/MyDrive/Colab Notebooks/models/{modelname}')
df_hist = pd.read_csv(f'/content/drive/MyDrive/Colab Notebooks/models/{modelname}/train_history.csv')

## Old keep for now

In [42]:

# Plot train+test+predict
# test numbers don't line up 
#=> check create paritions to see why the number of records don't match

y_test_unscaled = scaler_y.inverse_transform(y_test)

# train data
train_unscaled = scaler_X.inverse_transform(train_x)
df_train=pd.DataFrame(train_unscaled[:,idx_close])
df_train.columns=['price']
print(df_train.shape)


# test data
test_unscaled = scaler_X.inverse_transform(test_x)
df_test = pd.DataFrame(test_unscaled[:,idx_close])
df_test.columns = ['price']


# reindex the the test set to start from the train set
idx_last_train = df_train.shape[0]
idx_last_test = df_test.shape[0]
idx_rng = list(range(idx_last_train,idx_last_train + idx_last_test))

df_test['idx'] = idx_rng
df_test.set_index('idx',drop=True,inplace=True)
print(df_test.shape)



# get the predictions
y_pred = model.predict(x_test)

# convert units back to the original scale
y_pred_unscaled = scaler_y.inverse_transform(y_pred)

# reset the index to align with the train/test indexes
# idx will be supplied by the caller function
idx = y_test_unscaled.shape[0]-1
idx_pred_start = idx_last_train  + idx_last_test -(idx_last_test - idx)
idx_rng = list(range(idx_pred_start,idx_pred_start+n_predict))


y_pred =y_pred_unscaled[idx,:]
df_pred = pd.DataFrame(y_pred)
df_pred.columns =['price']
df_pred['idx']=idx_rng
df_pred.set_index('idx',drop=True,inplace=True)
print(df_pred.shape)


fig = go.Figure()
fig.add_trace(go.Scatter(x=df_train.index,y=df_train.price,name='Train'))
fig.add_trace(go.Scatter(x=df_test.index,y=df_test.price,name='Test'))
fig.add_trace(go.Scatter(x=df_pred.index,y=df_pred.price,name='Predict'))
fig.show()  

print(idx,idx_last_train,idx_last_test,idx_pred_start)
df_pred



(2317, 1)
(579, 1)
(5, 1)


533 2317 579 2850


Unnamed: 0_level_0,price
idx,Unnamed: 1_level_1
2850,3.470706
2851,3.44346
2852,3.424234
2853,3.414137
2854,3.422799


# Hyperparameter Tuning

In [139]:
def build_tuner_model(hp):
  """
  """
  # Clear previous session
  backend.clear_session()

  # searchable parameters
  lstm_nodes    = hp.Choice('lstm_units',values=LSTM_UNITS)
  dense_nodes   = hp.Choice('dense_units',values=DENSE)
  dropout_lstm  = hp.Choice('lstm_dropout',values=DROPOUT_LSTM)
  dropout_dense = hp.Choice('dense_dropout',values=DROPOUT_DENSE)
  learn_rate    = hp.Choice('learn_rate',values = LEARN_RATE)


  # LSTM Model+ first layer
  model = Sequential()
  model.add(LSTM(N_STEPS,return_sequences=True,input_shape=(N_STEPS,N_FEATURES)))


  # add layers
  model.add(LSTM(units=lstm_nodes,return_sequences=True))
  model.add(Dropout(dropout_lstm))
  model.add(LSTM(lstm_nodes//2,return_sequences=False))
  model.add(Dropout(max(0.1,dropout_lstm//2)))
  model.add(Dense(units=dense_nodes))
  model.add(Dropout(dropout_dense))
  model.add(Dense(dense_nodes//2))
  model.add(Dropout(max(0.1,dropout_dense//2)))

  # add prediction layer
  model.add(Dense(5))

  # compile
  model.compile(loss='mse',optimizer =tensorflow.keras.optimizers.Adam(lr=learn_rate))
  model.summary()

  return model

In [128]:
# batch_size
# epochs

# Searchable hyperparameters
LSTM_UNITS    = [32,64,128,256,512]
DENSE         = [32,64,128,256,512]
DROPOUT_LSTM  = [0.1,0.20,0.3,0.50]
DROPOUT_DENSE = [0.1,0.20,0.3,0.50]
N_FEATURES    = x_train.shape[2]
N_STEPS       = config['data']['n_steps']
N_PREDICT     = config['data']['n_predict']
LEARN_RATE    = [0.001,0.0001]

# initialize
hp = kt.HyperParameters()
batch_size = hp.Int('batch_size',32,256,step=32)

# Setup the Tuner
tuner = Hyperband(
    build_tuner_model,
    max_epochs = 50,
    objective = 'val_loss',
    directory = 'tune_hyperband',
    project_name = 'prices_tune_hyperband',
    overwrite = True
)

# Implement early stopping
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)


# Perform hyper
tuner.search(x_train,
             y_train,
             batch_size = 128,
             epochs = 50,
             validation_split=0.10,
             callbacks=[stop_early]
)



# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
best_model = tuner.get_best_models(num_models=1)[0]

print("Hyperparmeter Search")
print("="*60)
print(f"First LSTM Node Size:          {best_hps.get('lstm_units')}")
print(f"First LSTM Dropout Rate:       {best_hps.get('lstm_dropout')}")
print(f"Second LSTM Node Size:         {best_hps.get('lstm_units')//2}")
print(f"Second LSTM Dropout Rate:      {max(0.1,best_hps.get('lstm_dropout')//2)}")
print(f"First Dense Node Size:         {best_hps.get('dense_units')}")
print(f"First Dense Node Droput Rate:  {best_hps.get('dense_dropout')}")
print(f"Second Dense Node Size:        {best_hps.get('dense_units')//2}")
print(f"Second Dense Node Dropout Rate:{max(0.1,best_hps.get('dense_dropout')//2)}")
print(f"Learning Rate:                 {best_hps.get('learn_rate')}")


print('\n\n')
# show the optimal hyperparameters
tuner.oracle.get_best_trials(num_trials=1)[0].hyperparameters.values


Trial 21 Complete [00h 00m 22s]
val_loss: 0.004178881645202637

Best val_loss So Far: 0.001735389232635498
Total elapsed time: 00h 06m 50s
INFO:tensorflow:Oracle triggered exit


INFO:tensorflow:Oracle triggered exit


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 40, 40)            11040     
_________________________________________________________________
lstm_1 (LSTM)                (None, 40, 64)            26880     
_________________________________________________________________
dropout (Dropout)            (None, 40, 64)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 32)                1056      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0

























The hyperparameter search is complete. The optimal number of units in the first lstm-connected
layer is 64, with a dropout rate of 0.1. The optimal number of units for the first dense layer is 32 
with a dropout of  0.1.






{'dense_dropout': 0.1,
 'dense_units': 32,
 'lstm_dropout': 0.1,
 'lstm_units': 64,
 'tuner/bracket': 2,
 'tuner/epochs': 10,
 'tuner/initial_epoch': 4,
 'tuner/round': 2,
 'tuner/trial_id': '4df11a93aff7c1472f777c2635b29d7a'}

In [129]:
best_hps.get('batch_size')

KeyError: ignored