In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from collections import deque
from sklearn.preprocessing import StandardScaler

### Specify some constants

In [2]:
# Forecast horizon: the target is the return between a row and the row
# `predict_len` positions later (built with `shift(-predict_len)` below).
# NOTE(review): the original comment said "days"; the horizon is really counted
# in rows, so it equals days only if the data is daily -- confirm the frequency.
predict_len = 3
# Fraction of the rows (taken from the start of the data) used as the training set
train_pct = 0.8
# Which stock to predict; only its lower-cased first letter is used, to pick
# the matching "<letter>_close" column
pre_stock = "Google"
# Number of consecutive rows in each input sequence fed to the model
window_len = 120

### Load and concatenate the dataset

In [8]:
# Each workbook is read from data/<stock>.xlsx, indexed by its "Dates" column,
# stripped of the unused "Value" / "Number Ticks" columns, and its remaining
# columns are renamed to "<first letter of stock>_<lower-cased column>".
stock_list = ["google", "amazon", "microsoft"]

# Column prefixes are the first letter of each stock name; two stocks sharing a
# first letter would silently produce duplicate column names after the concat.
prefixes = [name[0] for name in stock_list]
if len(set(prefixes)) != len(prefixes):
    raise ValueError("stock names must start with distinct letters: %r" % (stock_list,))

data_list = []
for stock in stock_list:
    prefix = stock[0] + "_"
    df = pd.read_excel("data/%s.xlsx" % stock)
    # Vectorized conversion of the whole column instead of a per-row apply.
    df["Dates"] = pd.to_datetime(df["Dates"])
    df = (
        df.set_index("Dates")
        .drop(["Value", "Number Ticks"], axis=1)
        .rename(columns=lambda col: prefix + col.lower())
    )
    data_list.append(df)

# Inner join on the index: keep only timestamps present for every stock.
data = pd.concat(data_list, join="inner", axis=1)

### Check NaNs

In [9]:
# Count of missing values per column (displayed as the cell output).
data.isna().sum()

g_open      0
g_close     0
g_high      0
g_low       0
g_volume    0
a_open      0
a_close     0
a_high      0
a_low       0
a_volume    0
m_open      0
m_close     0
m_high      0
m_low       0
m_volume    0
dtype: int64

### Create the target: return

In [10]:
# Forward simple return of each close series over `predict_len` rows:
# (close[t + predict_len] - close[t]) / close[t]. The last `predict_len` rows are NaN.
close_prices = data[["g_close", "a_close", "m_close"]]
ret = (close_prices.shift(-predict_len) - close_prices) / close_prices.values

In [11]:
# The target is the forward return of the stock chosen in `pre_stock`,
# looked up through its one-letter column prefix.
data["target"] = ret["%s_close" % pre_stock.lower()[0]]

In [12]:
# (rows, columns) including the new "target" column, before the rows whose
# forward return is NaN are dropped in the next cell.
data.shape

(36007, 16)

In [13]:
# Remove every row containing a NaN (the trailing rows have no forward return).
data = data.dropna(axis=0)

### Split train and test

In [14]:
# Positional split with no shuffling: the first `train_pct` of the rows are the
# training set, the remaining rows are the test set.
train_size = int(train_pct * len(data))
train_data, test_data = data.iloc[:train_size], data.iloc[train_size:]

### Scale the data (both the features and the target are standardized, using a scaler fit on the training set only)

In [15]:
def scale_data(df):
    """Fit a ``StandardScaler`` on ``df`` and return the fitted scaler.

    Despite the name, nothing is transformed here: callers apply the returned
    scaler's ``transform`` themselves.
    """
    return StandardScaler().fit(df)

In [16]:
# Fit on the training rows only, then apply the same transformation to both
# splits so no test-set statistics leak into the scaling.
scaler = scale_data(train_data)
train_arr, test_arr = (scaler.transform(part) for part in (train_data, test_data))

### Create sequence data

In [17]:
def sequential_data(df, window_length=window_len):
    """Turn a 2-D table into overlapping fixed-length sequences and targets.

    The last column of ``df`` is treated as the target and all other columns
    as features. For every row ``t`` with at least ``window_length - 1`` rows
    before it, one sample is produced: the features of rows
    ``t - window_length + 1 .. t`` paired with the target of row ``t``.

    Parameters
    ----------
    df : pandas.DataFrame or 2-D array-like
        Rows in sequence order. Plain NumPy arrays (e.g. the output of a
        scaler's ``transform``) are accepted as well as DataFrames.
    window_length : int
        Number of consecutive rows in each sequence.

    Returns
    -------
    X : numpy.ndarray of shape (n_samples, window_length, n_features)
    y : numpy.ndarray of shape (n_samples,)
        ``n_samples`` is ``max(n_rows - window_length + 1, 0)``; when it is 0
        the returned arrays are empty but still have the shapes above.

    Raises
    ------
    ValueError
        If ``window_length`` is less than 1 or ``df`` is not 2-dimensional.
    """
    if window_length < 1:
        raise ValueError("window_length must be >= 1, got %r" % (window_length,))

    # np.asarray handles both DataFrames and ndarrays (no `.values` required).
    values = np.asarray(df)
    if values.ndim != 2:
        raise ValueError("expected 2-D input, got %d-D" % values.ndim)

    features = values[:, :-1]
    targets = values[:, -1]

    n_windows = values.shape[0] - window_length + 1
    if n_windows <= 0:
        # Not enough rows for a single full window.
        return np.empty((0, window_length, features.shape[1])), np.empty((0,))

    X = np.array([features[start:start + window_length] for start in range(n_windows)])
    # The target of a window is the target of its last row; copy so the result
    # does not alias the input array.
    y = targets[window_length - 1:].copy()
    return X, y

In [19]:
# Build the sequences from the *scaled* arrays; previously the raw, unscaled
# frames were used, so the fitted scaler had no effect on the model inputs.
# The arrays are wrapped back into DataFrames to keep the column/index labels.
train_scaled = pd.DataFrame(train_arr, index=train_data.index, columns=train_data.columns)
test_scaled = pd.DataFrame(test_arr, index=test_data.index, columns=test_data.columns)
train_X, train_y = sequential_data(train_scaled)
test_X, test_y = sequential_data(test_scaled)
# NOTE: the target column is standardized too, so model outputs are in scaled
# units; map them back with `pred * scaler.scale_[-1] + scaler.mean_[-1]`.

### Shuffle the train data

In [29]:
# Shuffle the training samples with a fixed seed so the order is identical on
# every Restart & Run All. A local RandomState is used so the global NumPy
# random state is left untouched.
train_n = train_X.shape[0]
shuffle_rng = np.random.RandomState(42)
rind = shuffle_rng.permutation(train_n)
# Apply the same permutation to inputs and targets to keep them paired.
train_X = train_X[rind]
train_y = train_y[rind]

### Build the network

In [20]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, CuDNNLSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
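## Open questions (TODO: confirm against the Keras documentation):
## - What does it mean to have 128 "cells" in an LSTM layer? (In Keras the first
##   argument is `units`: the dimensionality of the layer's output / hidden state.)
## - What does batch normalization do? (It normalizes the previous layer's
##   activations batch by batch, which usually stabilizes and speeds up training.)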

In [32]:
# Stacked recurrent regressor: three cuDNN-backed LSTM layers of 128 units,
# each followed by dropout and batch normalization, then a small dense head
# with a single linear output. The first two recurrent layers return the full
# sequence so the next recurrent layer receives one; the last does not.
model = Sequential([
    CuDNNLSTM(128, input_shape=train_X.shape[1:], return_sequences=True),
    Dropout(0.2),
    BatchNormalization(),

    CuDNNLSTM(128, return_sequences=True),
    Dropout(0.2),
    BatchNormalization(),

    CuDNNLSTM(128),
    Dropout(0.2),
    BatchNormalization(),

    Dense(32, activation='relu'),
    Dropout(0.2),

    Dense(1),
])

In [33]:
# Adam optimizer with an initial learning rate of 1e-3 and `decay=1e-6`.
opt = tf.keras.optimizers.Adam(decay=1e-6, lr=0.001)
# Regression set-up: mean squared error is both the training loss and the
# reported metric.
model.compile(optimizer=opt, loss='mean_squared_error', metrics=['mean_squared_error'])