<a href="https://colab.research.google.com/github/kconstable/market_predictions/blob/main/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preprocessing for Deep Learning
## Import Libraries

In [285]:
import pandas as pd
import numpy as np
import math
from sklearn.preprocessing import MinMaxScaler
from google.colab import files
from google.colab import drive
drive.mount('/content/drive')


import plotly.graph_objects as go

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Load Data

In [287]:
# df = pd.read_pickle('/content/drive/MyDrive/Colab Notebooks/data/market_data_full.pickle')
df = pd.read_pickle('/content/drive/MyDrive/Colab Notebooks/data/market_data_compact.pickle')

In [273]:
def scale_data(df,y='close',features=[]):
  """
  """

  # reverse the index such that dates are in chronological order
  df = df.iloc[::-1]

  # Subset features, get the y values
  df_y = df[y]
  df_X = df[features]

  # save the date index, then replace the date index with an integer index
  idx_date = df_X.index
  df_X.reset_index(drop=True,inplace=True)

  # convert to numpay arrays
  array_X = np.array(df_X)
  array_y = np.array(df_y).reshape(-1,1)

  # scale the input and outputs
  scaler = MinMaxScaler()
  scaled_X = scaler.fit_transform(array_X)
  scaled_y = scaler.fit_transform(array_y)

  # print the output
  print("Data Scaling")
  print("--"*50)
  print(f"=> {len(features)} Features")
  print("=>",features)
  print(f"=> Input Dimensions:{scaled_X.shape}")
  print(f"=> Output Dimensions:{scaled_y.shape}")
  print("\n")

  return scaled_y, scaled_X, idx_date

In [290]:
def create_partitions(data,idx_close,n_steps,n_predict,visualize=False):
  """
  This function partitions the train/test data into batches for training
  Each batch consists of n_steps of training data, and n_predict steps of label data
  The function outputs an x array [samples, time steps, features] and a y array [samples, time steps]

  Inputs:
    data     : train or test array
    idx_close: the position of y value in the data
    n_steps  : the number of time steps in each training batch
    n_predict: the number of time steps that will be predicted
    visualize: boolean, will plot a visual of training/prediction windows

  Reference:
    https://www.relataly.com/time-series-forecasting-multi-step-regression-using-neural-networks-with-multiple-outputs-in-python/5800/
  """
  n = data.shape[0]
  window = n_steps + n_predict
  i, p = [],[]

  # print the moving window
  if visualize:
    print("Data Window: I(Input), P(Predict),-(scanned), +(to be scanned)")
    print("="*n)

  # create the partitions
  for s in range(n_steps, n-n_predict):
    # get the input window + all features

    # print(s-n_steps,s+n_predict)
    i.append(data[s-n_steps:s,:])

    # get the prediction window + the closing price
    # print("X:",s-n_steps,s+n_predict,'\ty:',s,s+n_predict)
    p.append(data[s:s+n_predict,idx_close])

    # print the moving window
    if visualize and s <= 50:
      scanned = n-((s-n_steps)+window)
      print("-"*(s-n_steps),'I'*n_steps,'P'*n_predict,"+"*scanned,sep="")
    
  return np.array(i),np.array(p)

In [275]:
def plot_training_window(x_train,y_train,idx_close,n_steps,n_predict,batch):
  """
  """
  # convert the arrays to dataframes
  # need to align the x indexes to compare
  df_y = pd.DataFrame(y_train[batch],index=range(n_steps-1,n_predict+n_steps-1),columns=['y'])
  df_x = pd.DataFrame(x_train[batch+1])[idx_close]
  df_x = pd.DataFrame(df_x)
  df_x.columns = ['x']

  # create the plots
  fig = go.Figure()
  fig.add_trace(go.Scatter(x=df_x.index,y=df_x['x'],name='train window'))
  fig.add_trace(go.Scatter(x=df_y.index,y=df_y['y'],name='predict window'))
  fig.update_layout(template='plotly_white',title='Train/Predict Windows')
  fig.show()


In [277]:
# Training Config
features = ['open','high','low','close','volume','BAR','OILK','VXZ','b-lower','b-upper','rsi','macd_signal']
n_steps = 40    # The number of time steps included in each training batch
n_predict = 10   # The number of time steps into the future the model will predict

# Scale the data and convert to numpy arrays
scaled_y,scaled_X, idx_date = scale_data(df,'close',features)


# Index of the y variable (closing price)
idx_close = df.columns.get_loc("close")

# split into train, test
train_pct = 0.8
train_num = math.ceil(scaled_X.shape[0]*train_pct)
train = scaled_X[0:train_num,:]
test = scaled_X[train_num - n_steps:,:]


# Generate training data and test data
# LSTM input format: [samples, time steps, features]
x_train, y_train = create_partitions(train,idx_close,n_steps, n_predict)
x_test,  y_test  = create_partitions(test,idx_close,n_steps, n_predict)


# Print the results
print('Train/Test Dimensions')
print(x_train.shape,y_train.shape)
print(x_test.shape,y_test.shape)

print('\nThe last value in the training batch should match the first value in the predict batch')
print(x_train[1][n_steps-1][idx_close])
print(y_train[0])


# Make sure the train/predict batch windows are aligned
plot_training_window(x_train,y_train,idx_close,n_steps,n_predict,0)

Data Scaling
----------------------------------------------------------------------------------------------------
=> 12 Features
=> ['open', 'high', 'low', 'close', 'volume', 'BAR', 'OILK', 'VXZ', 'b-lower', 'b-upper', 'rsi', 'macd_signal']
=> Input Dimensions:(5494, 12)
=> Output Dimensions:(5494, 1)


Train/Test Dimensions
(4346, 40, 12) (4346, 10)
(1088, 40, 12) (1088, 10)

The last value in the training batch should match the first value in the predict batch
0.5647158131682611
[0.56471581 0.61029826 0.60692178 0.59454136 0.57512662 0.55936972
 0.56978053 0.5858188  0.62605515 0.64546989]
