<a href="https://colab.research.google.com/github/kconstable/market_predictions/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing for Deep Learning


## Import Libraries 

In [118]:
import pandas as pd
import numpy as np
import math
import pickle

import plotly.graph_objects as go

import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential,backend
from keras.layers import LSTM, Dense, Dropout
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

from statsmodels.tsa.stattools import adfuller
from scipy import stats

from google.colab import files
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).



pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



## Load Data

In [132]:
stock = 'BLX.TO'
df = pd.read_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_market_data_full.pickle')
# df = pd.read_pickle('/content/drive/MyDrive/Colab Notebooks/data/market_data_compact.pickle')

# dff.to_pickle(f'/content/drive/MyDrive/Colab Notebooks/data/{stock}_market_data_{outputsize}.pickle')

print(df.shape)

df.head()

(3576, 28)


Unnamed: 0,open,high,low,close,volume,symbol,GLD,USO,SPY,PBD,XLE,EWC,yield5y,yield10y,yield30y,yield3m,ir,nfp,unemployment,cs,infl,b-upper,b-middle,b-lower,rsi,macd_hist,macd,macd_signal
2021-09-15,39.06,39.54,38.67,39.2,204267.0,BLX.TO,167.83,50.69,447.88,29.56,50.26,37.57,0.79,1.28,1.85,0.04,0.08,146856.0,5.2,81.2,4.7,40.4836,38.515,36.5464,56.2842,0.0231,0.4132,0.3901
2021-09-14,38.83,39.17,38.67,39.13,172000.0,BLX.TO,168.82,49.31,444.17,29.57,48.45,37.07,0.79,1.28,1.85,0.04,0.08,146856.0,5.2,81.2,4.7,40.4685,38.3965,36.3246,55.8474,0.037,0.4214,0.3843
2021-09-13,38.94,39.2,38.4,38.66,108400.0,BLX.TO,167.74,49.48,446.58,29.55,49.16,37.36,0.81,1.33,1.91,0.06,0.08,146856.0,5.2,81.2,4.7,40.4595,38.2671,36.0747,52.842,0.0578,0.4328,0.3751
2021-09-10,39.48,39.62,38.78,38.92,152100.0,BLX.TO,167.18,48.8,445.44,29.39,47.79,37.24,0.82,1.35,1.94,0.05,0.08,146856.0,5.2,81.2,4.7,40.5027,38.1448,35.7868,54.8024,0.1284,0.489,0.3606
2021-09-09,39.21,39.4,38.95,39.29,175400.0,BLX.TO,168.03,47.75,448.98,29.69,47.81,37.5,0.79,1.3,1.9,0.04,0.08,146856.0,5.2,81.2,4.7,40.5167,37.993,35.4692,57.6961,0.1986,0.5271,0.3285


## Prepare Data for Deep Learning


### Prepare the Data 
+ Reverse the date-time index
+ Convert the date-time index to an integer index
+ Convert to numpay arrays

In [3]:
def prepare_data(df,y='close',features=[]):
  """
  Filter, scale and convert dataframe data to numpy arrays

  Inputs: 
    df       => A dataframe of observations with features and y-labels
    y        => The name of the column that is the truth labels
    features => A list of features.  Used to subset columns

  Outputs:
    scaled_y => numpy array of the y-label data
    scaled_x => numpy array of the training features

  """

  # reverse the index such that dates are in chronological order
  df = df.iloc[::-1]

  # Subset features, get the y-label values
  df_y = df[y]
  df_X = df[features]

  # replace the date index with an integer index
  df_X.reset_index(drop=True,inplace=True)

  # convert to numpay arrays
  array_X = np.array(df_X)
  array_y = np.array(df_y).reshape(-1,1)

  # print the output
  print("\nData Preparation")
  print("="*60)
  print(f"=> {len(features)} Features")
  print(f"=> Input Dimensions :{array_X.shape}")
  print(f"=> Output Dimensions:{array_y.shape}")
  print("\n")

  return array_y, array_X


### Split the Data into Train/Test Sets
+ Time series data cannot be split randomly like other observational data because the order is important.  
+ The data is split into train/test based on dates

In [4]:
def split_train_test(X,idx_close,train_pct=0.80):
  """
  """

  # Train
  train_num = math.ceil(X.shape[0]*train_pct)
  train_X = X[0:train_num,:]
  train_str = f'{round(train_X.shape[0]/X.shape[0],3)*100}%'

  # Test
  # test_X = X[train_num-n_steps:,:]  # if you use this, need to add n_steps as a param!
  test_X = X[train_num:,:]
  test_str =f'{round(test_X.shape[0]/X.shape[0],3)*100}%'

  # print the results
  print("\nSplit Data (X)")
  print("="*60)
  print(f'Data  :100.0% {X.shape}')
  print(f'Train :{train_str:>6} {train_X.shape}')
  print(f'Test  :{test_str:>6} {test_X.shape}')
  print('\n')

  # create the line plots
  plot_train_test_data(train_X,test_X,idx_close)

  return train_X,test_X


In [5]:
def plot_train_test_data(train_x,test_x,idx_close):
  """
  """

  ts0 = list(range(0,train_x.shape[0]))
  ts1 = list(range(train_x.shape[0],train_x.shape[0]+test_x.shape[0]))

  fig = go.Figure()
  fig.add_trace(go.Scatter(x=ts0,y=train_x[:,idx_close],name='Train'))
  fig.add_trace(go.Scatter(x=ts1,y=test_x[:,idx_close],name='Test'))
  fig.add_shape(type ='rect',
                  x0=ts1[0],x1=ts1[-1],
                  y0=0,y1=1,
                  line=dict(color='#F6B28D'),
                  fillcolor='#F6B28D',
                  opacity=0.1)
  fig.add_shape(type ='rect',
                  x0=ts0[0],x1=ts0[-1],
                  y0=0,y1=1,
                  line=dict(color='#7BA1AA'),
                  fillcolor='#7BA1AA',
                  opacity=0.1)
  fig.update_layout(title = 'Train/Test Datasets (Scaled)',
                      template="plotly_white",
                      yaxis_title='Closing Price (Scaled)',
                      xaxis_title='time steps',
                      width = 600,
                      height =600)
  fig.update_shapes(dict(xref='x',yref='paper'))
  fig.show()



### Partition the Data

In [6]:
def create_partitions(data,idx_close,n_steps,n_predict,visualize=False):
  """
  This function partitions the train/test data into batches with times-step windows for training

  Each batch consists of n_steps of training data, and n_predict steps of label data
  The function outputs an x array [samples, time steps, features] and a y array [samples, time steps]

  Inputs:
    data      => train or test array
    idx_close => the position of y value in the data
    n_steps   => the number of time steps in each training batch
    n_predict => the number of time steps that will be predicted
    visualize => boolean, will plot a visual of training/prediction windows
  Outputs:
    array(i)  => np.array of batched & partitioned training data with features
    array(p)  => np.array of batched & partitioned y-lables 

  Reference:
    https://www.relataly.com/time-series-forecasting-multi-step-regression-using-neural-networks-with-multiple-outputs-in-python/5800/
  """
  n = data.shape[0]
  window = n_steps + n_predict
  i, p = [],[]

  # print the moving window
  if visualize:
    print("Data Window: I(Input), P(Predict),-(scanned), +(to be scanned)")
    print("="*max(n,100))

  # create the partitions
  for step in range(n_steps, n-n_predict):
    # get the input window + all features

    # train window
    i.append(data[step-n_steps:step,:])

    # get the prediction window + the closing price
    p.append(data[step:step+n_predict,idx_close])

    # print the moving window
    if visualize and step <= 50:
      scanned = n-((step-n_steps)+window)
      print("-"*(step-n_steps),'I'*n_steps,'P'*n_predict,"+"*scanned,sep="")
    
  return np.array(i),np.array(p)

In [131]:
def plot_training_window(x_array,y_array,idx_close,n_steps,n_predict,batch):
  """
  Inputs: 
    x_array/y_array => partitioned data from create_partitions
    idx_close       => the position of the y-label (close price)
    n_steps         => the number of time steps in each training batch
    n_predict       => the number of time steps predicted each training batch
    batch           => the batch number to plot
  Outputs:
    A line chart with the train/predict values
  """
  # convert the arrays to dataframes
  # align the x indexes to compare
  df_y = pd.DataFrame(y_array[batch],index=range(n_steps-1,n_predict+n_steps-1),columns=['y'])
  df_x = pd.DataFrame(x_array[batch+1])[idx_close]
  df_x = pd.DataFrame(df_x)
  df_x.columns = ['x']

  # create the plots
  fig = go.Figure()
  fig.add_trace(go.Scatter(x=df_x.index,y=df_x['x'],name='train window'))
  fig.add_trace(go.Scatter(x=df_y.index,y=df_y['y'],name='predict window'))
  fig.update_layout(template='plotly_white',
                    title='Train/Predict Windows',
                    yaxis_title = 'Closing Price (Scaled)',
                    xaxis_title='Period',
                    width = 600,
                    height = 600)
  fig.show()


### Data Transformations

#### Check for stationarity
+ Augmented Dickey-Fuller (ADF) test for stationarity
+ null: the data is non-stationary
+ alternative: the data is stationary
+ => if the test stat is less than all critical values,  we cannot reject the null, therefore the data is non-stationary

**Reference**

 https://analyzingalpha.com/check-time-series-stationarity-python#augmented-dickey-fuller-adf

In [122]:
def transform_stationary(df,features_to_transform,transform='log'):
  """
  transform: log, boxcox
  """
  # transform each columne in the features_to_transform list
  for feature in df.columns:
    if feature in features_to_transform:
      # log transform
      if transform=='log':
        df[feature] = df[feature].apply(np.log)

      # boxcox transform  
      elif transform=='boxcox':
        bc,_ = stats.boxcox(df[feature])
        df[feature] = bc

      else:
        print("Transformation not recognized")

  # check the closing price for stationarity using the augmented dicky fuller test
  t_stat, p_value, _, _, critical_values, _  = adfuller(df.close.values, autolag='AIC')
  print('Augmented Dicky Fuller Test for Stationarity')
  print("="*60)
  print(f'ADF Statistic: {t_stat:.2f}')
  for key, value in critical_values.items():
    print('Critial Values:')
    if t_stat < value:
      print(f'   {key}, {value:.2f} => non-stationary')
    else:
      print(f'   {key}, {value:.2f} => stationary')

### Process the Data

In [133]:
# Training Config
# set seed
seed = 1985
np.random.seed(seed)

# Extract the features needed for training
features = [c for c in df.columns if c not in ['symbol','b-middle']]
n_steps  = 40   # The number of time steps included in each training batch
n_predict = 5   # The number of time steps into the future the model will predict
idx_close = df.columns.get_loc("close") # index postiion variable (closing price)

# difference the price data
# features_to_transform = ['open','high','low','close','GLD','USO','SPY','VXX','XLE']
features_to_transform = ['open','high','low','close','GLD','USO','SPY','PBD','XLE','EWC']
transform_stationary(df,features_to_transform,'log')


# Scale the data and convert to numpy arrays
array_y,array_X = prepare_data(df,'close',features)


# scale the input and outputs
scaler_X = MinMaxScaler(feature_range=(0,1))
scaler_y = MinMaxScaler(feature_range=(0,1))
scaled_X = scaler_X.fit_transform(array_X)
scaled_y = scaler_y.fit_transform(array_y)


# split into train, test
train_pct = 0.80
train_x,test_x=split_train_test(scaled_X,idx_close,train_pct)


# Partition the train/test data into time series windows for training
# LSTM input format: [samples, time steps, features]
x_train, y_train = create_partitions(train_x,idx_close,n_steps, n_predict)
x_test,  y_test  = create_partitions(test_x, idx_close,n_steps, n_predict)


# Print the results
print('Train/Test Dimensions')
print('='*60)
print("Train Data Dimensions: ","x",x_train.shape,"y",y_train.shape)
print("Test Data Dimensions : ","x",x_test.shape," y",y_test.shape)

print("\n\nCheck the dimensions of the training windows")
print("="*60)
print('The last value in the training batch should match the first value in the predict batch')
print("Last Training Value:   ",x_train[1][n_steps-1][idx_close])
print("First Prediction Value:",y_train[0][0])

# Make sure the train/predict batch windows are aligned
plot_training_window(x_train,y_train,idx_close,n_steps,n_predict,seed//10)

# save to google drive
np.save('/content/drive/MyDrive/Colab Notebooks/data/x_train',x_train)
np.save('/content/drive/MyDrive/Colab Notebooks/data/y_train',y_train)
np.save('/content/drive/MyDrive/Colab Notebooks/data/x_test',x_test)
np.save('/content/drive/MyDrive/Colab Notebooks/data/y_test',y_test)


Augmented Dicky Fuller Test for Stationarity
ADF Statistic: -1.99
Critial Values:
   1%, -3.43 => stationary
Critial Values:
   5%, -2.86 => stationary
Critial Values:
   10%, -2.57 => stationary

Data Preparation
=> 26 Features
=> Input Dimensions :(3576, 26)
=> Output Dimensions:(3576, 1)



Split Data (X)
Data  :100.0% (3576, 26)
Train : 80.0% (2861, 26)
Test  : 20.0% (715, 26)




Train/Test Dimensions
Train Data Dimensions:  x (2816, 40, 26) y (2816, 5)
Test Data Dimensions :  x (670, 40, 26)  y (670, 5)


Check the dimensions of the training windows
The last value in the training batch should match the first value in the predict batch
Last Training Value:    0.4829226567077449
First Prediction Value: 0.4829226567077449


In [None]:
# array_y,array_X = prepare_data(df,'close',features)

# # scale the input and outputs

# scaled_X = scaler_X.fit_transform(array_X)
# scaled_y = scaler_y.fit_transform(array_y)

# print(scaled_X.shape)
# print(scaled_y.shape)


# # split into train, test
# train_pct = 0.80
# train_x,test_x=split_train_test(scaled_X,idx_close,train_pct)

# print(train_x.shape)
# print(test_x.shape)
# print(train_x.shape[0]+test_x.shape[0]-scaled_X.shape[0])

# ts0 = list(range(0,train_x.shape[0]))
# ts1 = list(range(train_x.shape[0],train_x.shape[0]+test_x.shape[0]))

# print(min(ts0),max(ts0))
# print(min(ts1),max(ts1))

# train=pd.DataFrame(train_x[:,idx_close])
# test=pd.DataFrame(test_x[:,idx_close],index=ts1)

# train_test=train.join(test,how='outer',lsuffix='train',rsuffix='test')
# train_test.iloc[3090:3096,]


# # Partition the train/test data into time series windows for training
# # LSTM input format: [samples, time steps, features]
# x_train, y_train = create_partitions(train_x,idx_close,n_steps, n_predict)
# x_test,  y_test  = create_partitions(test_x, idx_close,n_steps, n_predict)


# # Print the results
# print('Train/Test Dimensions')
# print('='*60)
# print("Train Data Dimensions: ","x",x_train.shape,"y",y_train.shape)
# print("Test Data Dimensions : ","x",x_test.shape," y",y_test.shape)

# print("\n\nCheck the dimensions of the training windows")
# print("="*60)
# print('The last value in the training batch should match the first value in the predict batch')
# print("Last Training Value:   ",x_train[1][n_steps-1][idx_close])
# print("First Prediction Value:",y_train[0][0])


In [None]:
# np.load('/content/drive/MyDrive/Colab Notebooks/data/y_test.npy')

# Deep Learning LSTM

### Build the LSTM Model

In [19]:
def create_lstm_model(config,x_train,n_predict):
  """
  """
  # get window size
  n_steps = config['data']['n_steps']
  n_predict = config['data']['n_predict']
  n_features = x_train.shape[2]

  # clear previous models
  backend.clear_session()

  # LSTM Model+ first layer
  model = Sequential(name='LSTM')
  model.add(LSTM(n_steps,return_sequences=True,input_shape=(n_steps,n_features)))

  # add additional layers
  for layer,nodes,ret_seq,drop in config['layers']:
    # add LSTM layers + dropout
    if layer=='lstm':
      model.add(LSTM(nodes,return_sequences =ret_seq))
      if drop is not None:
        model.add(Dropout(drop))

    # Add Dense Layers + dropout
    elif layer =='dense':
      model.add(Dense(nodes))
      if drop is not None:
        model.add(Dropout(drop))

  # add the prediction layer
  model.add(Dense(n_predict))
  
  # compile
  model.compile(optimizer='adam',loss='mse')
  model.summary()

  return model


### Calculate Model Performance

In [12]:
def calculate_performance(x_test,y_test,model,scaler_y):
  """
  """

  # Predict the prices
  y_pred = model.predict(x_test)

  # convert units back to the original scale
  y_pred_unscaled = scaler_y.inverse_transform(y_pred)
  y_test_unscaled = scaler_y.inverse_transform(y_test)

  # # Root mean squarred error,mean abs error, mean abs percent error, median abs percent error
  rmse  = math.sqrt(mean_squared_error(y_test_unscaled, y_pred_unscaled))
  mae   = mean_absolute_error(y_test_unscaled, y_pred_unscaled)
  mape  = np.mean((np.abs(np.subtract(y_test_unscaled, y_pred_unscaled)/ y_test_unscaled))) * 100
  mdape = np.median((np.abs(np.subtract(y_test_unscaled, y_pred_unscaled)/ y_test_unscaled)) ) * 100

  print("\nModel Error")
  print("="*62)
  print(f'{"Mean Absolute Error (MAE)" :-<55} {np.round(mae, 2):>5}')
  print(f'{"Root Mean Squared Error (MSE)" :-<55} {np.round(rmse,2):>5}')
  print(f'{"Mean Absolute Percentage Error (MAPE)" :-<55} {np.round(mape, 2):>5}%')
  print(f'{"Median Absolute Percentage Error (MDAPE)" :-<55} {np.round(mdape, 2):>5}%')

  return y_pred


In [94]:
# calculate_performance(x_test,y_test,model,scaler_y)

y_pred = model.predict(x_test)
y_pred_unscaled=scaler_y.inverse_transform(y_pred)
y_test_unscaled = scaler_y.inverse_transform(y_test)

# np.mean((np.abs(np.subtract(y_test_unscaled, y_pred_unscaled)/ y_test_unscaled))) * 100

# np.subtract(y_test_unscaled,y_pred_unscaled)/y_test_unscaled
tmp=pd.DataFrame(y_test_unscaled)
# tmp[tmp[0]==0]
tmp



Unnamed: 0,0,1,2,3,4
0,-1.54,0.98,-1.38,9.03,-4.89
1,0.98,-1.38,9.03,-4.89,-2.33
2,-1.38,9.03,-4.89,-2.33,-3.26
3,9.03,-4.89,-2.33,-3.26,2.88
4,-4.89,-2.33,-3.26,2.88,7.32
...,...,...,...,...,...
585,3.01,1.69,2.06,0.48,0.90
586,1.69,2.06,0.48,0.90,1.18
587,2.06,0.48,0.90,1.18,5.23
588,0.48,0.90,1.18,5.23,0.30


### Plot Training Metrics

In [13]:

def plot_training_metrics(history):
  """
  """
  # get the number of epochs
  epochs = list(range(1, len(history.history['loss']) + 1))

  min_train_loss = min(history.history['loss'])
  min_val_loss = min(history.history['val_loss'])

  # create the line plots
  fig = go.Figure()
  fig.add_trace(go.Scatter(x=epochs,y=history.history['loss'],
                           name = 'train-loss',
                           line=dict(width=3,color='royalblue')))
  fig.add_trace(go.Scatter(x=epochs,y=history.history['val_loss'],
                           name='val-loss',
                           line=dict(width=3,color='crimson')))

  fig.add_shape(type ='rect',
                x0=0,
                x1=1,
                y0=min(min_train_loss,min_val_loss),
                y1=max(min_train_loss,min_val_loss),
                line=dict(color='#ccc'),
                fillcolor='#ccc',
                opacity=0.4)
  fig.update_layout(title = 'Training Metrics',template="plotly_white",yaxis_title='loss',xaxis_title='epochs')
  fig.update_shapes(dict(xref='paper',yref='y'))
  fig.show()

### Train the Model

In [134]:
config ={
    'data': {'n_steps':n_steps,'n_predict':n_predict},
    'layers': [('lstm',256,True,0.3),('lstm',128,False,0.2),('dense',32,None,0.1),('dense',16,None,0.1)],
    'train':{'epochs':100,'batch_size':128,'early_stop':EarlyStopping(monitor='loss',patience=10)}
}
config

{'data': {'n_predict': 5, 'n_steps': 40},
 'layers': [('lstm', 256, True, 0.3),
  ('lstm', 128, False, 0.2),
  ('dense', 32, None, 0.1),
  ('dense', 16, None, 0.1)],
 'train': {'batch_size': 128,
  'early_stop': <keras.callbacks.EarlyStopping at 0x7ff172828fd0>,
  'epochs': 100}}

In [135]:
np.random.seed(seed)

modelname = 'model_BLX_log'
model = create_lstm_model(config,x_train,n_predict)

history = model.fit(x_train,
                    y_train,
                    batch_size=config['train']['batch_size'],
                    epochs=config['train']['epochs'],
                    callbacks = [config['train']['early_stop']],
                    validation_split =0.10,
                    verbose = 0
                    )


# predict prices and calculate the performance metrics, plot training metrics
y_pred_scaled = calculate_performance(x_test,y_test,model,scaler_y)
plot_training_metrics(history)


# save model and training performance
model.save(f'/content/drive/MyDrive/Colab Notebooks/models/{modelname}')

df_hist = pd.DataFrame(history.history)
df_hist.to_csv(f'/content/drive/MyDrive/Colab Notebooks/models/{modelname}/train_history.csv')


Model: "LSTM"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 40, 40)            10720     
_________________________________________________________________
lstm_1 (LSTM)                (None, 40, 256)           304128    
_________________________________________________________________
dropout (Dropout)            (None, 40, 256)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                4128      
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0      



INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/models/model_BLX_log/assets


INFO:tensorflow:Assets written to: /content/drive/MyDrive/Colab Notebooks/models/model_BLX_log/assets


## Load Saved Model

In [15]:
# get saved training evaluation
modelname = 'model_1'
model = keras.models.load_model(f'/content/drive/MyDrive/Colab Notebooks/models/{modelname}')
df_hist = pd.read_csv(f'/content/drive/MyDrive/Colab Notebooks/models/{modelname}/train_history.csv')

## Plot Predicted vs Actual Price

In [136]:
def plot_price_predictions(batch, idx_close, x_test, y_pred_scaled,scaler_y): 
  """
  """

  # unscale the y predictions
  y_pred_unscaled = scaler_y.inverse_transform(y_pred_scaled)

  # unscale the x_test data
  x_test_np = np.array(pd.DataFrame(x_test[batch])[idx_close]).reshape(-1,1)
  x_test_unscaled = scaler_y.inverse_transform(x_test_np)
  x_test_df = pd.DataFrame(x_test_unscaled)

  # set the indexes for plotting
  max_test_idx=x_test_df.shape[0]
  max_pred_idx =y_pred_unscaled[0].shape[0]
  test_idx = list(range(batch,batch + max_test_idx))
  pred_idx = list(range(batch + max_test_idx,batch + max_test_idx + max_pred_idx))


  # combine the actual + predicted prices
  data = pd.DataFrame(list(zip(y_pred_unscaled[batch], x_test_df[0])), columns=['pred', 'actual'])

  # create the plot
  fig = go.Figure()
  fig.add_trace(go.Scatter(x=test_idx, y=x_test_df[0],
                        mode='lines',
                        name='Test Data',
                        fill='tozeroy',
                        line_color='#ccc'))
  fig.add_trace(go.Scatter(x=pred_idx, y=data['actual'],
                        mode='lines+markers', 
                        name='Actual Price',
                        fill='tozeroy',
                        line_color ='#ccc')) 
  fig.add_trace(go.Scatter(x=pred_idx, y=data['pred'],
                        mode='lines+markers',
                        name='Predicted Price',
                        line_color='red'))

  fig.update_layout(template = 'plotly_white',
                      title= 'Actual vs Predicted Price',
                      xaxis_title = 'Batch',
                      yaxis_title = 'Price',
                      width=600,
                      height=400)

  fig.show()
    

# plot the predicted price vs actual for a specific batch
# plot_price_predictions(0, idx_close, x_test, y_pred_scaled,scaler_y)

for batch in range(100,102):
  plot_price_predictions(batch, idx_close, x_test, y_pred_scaled,scaler_y)
  

In [17]:

# Plot train+test+predict
# test numbers don't line up 
#=> check create paritions to see why the number of records don't match

y_test_unscaled = scaler.inverse_transform(y_test)

# train data
train_unscaled = scaler.inverse_transform(train_x)
df_train=pd.DataFrame(train_unscaled[:,idx_close])
df_train.columns=['price']
print(df_train.shape)


# test data
test_unscaled = scaler.inverse_transform(test_x)
df_test = pd.DataFrame(test_unscaled[:,idx_close])
df_test.columns = ['price']


# reindex the the test set to start from the train set
idx_last_train = df_train.shape[0]
idx_last_test = df_test.shape[0]
idx_rng = list(range(idx_last_train,idx_last_train + idx_last_test))

df_test['idx'] = idx_rng
df_test.set_index('idx',drop=True,inplace=True)
print(df_test.shape)



# get the predictions
y_pred = model.predict(x_test)

# convert units back to the original scale
y_pred_unscaled = scaler.inverse_transform(y_pred)

# reset the index to align with the train/test indexes
# idx will be supplied by the caller function
idx = y_test_unscaled.shape[0]-1
idx_pred_start = idx_last_train  + idx_last_test -(idx_last_test - idx)
idx_rng = list(range(idx_pred_start,idx_pred_start+n_predict))


y_pred =y_pred_unscaled[idx,:]
df_pred = pd.DataFrame(y_pred)
df_pred.columns =['price']
df_pred['idx']=idx_rng
df_pred.set_index('idx',drop=True,inplace=True)
print(df_pred.shape)


fig = go.Figure()
fig.add_trace(go.Scatter(x=df_train.index,y=df_train.price,name='Train'))
fig.add_trace(go.Scatter(x=df_test.index,y=df_test.price,name='Test'))
fig.add_trace(go.Scatter(x=df_pred.index,y=df_pred.price,name='Predict'))
fig.show()  

print(idx,idx_last_train,idx_last_test,idx_pred_start)
df_pred



(2859, 1)
(714, 1)
(1, 1)


672 2859 714 3531


Unnamed: 0_level_0,price
idx,Unnamed: 1_level_1
3531,31.464006
