In [1]:
import glob
from pathlib import Path

import numpy as np
import pandas as pd

import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Display up to 100 columns and 100 rows before pandas truncates DataFrame output.
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)

from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE, SelectFromModel
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_validate, GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler

import shap

In [2]:
from keras.models import Sequential
from keras import utils
from keras.layers import LSTM, Dense, Dropout
import sys
# Raise NumPy's summarization threshold to the maximum so printed arrays are never
# abbreviated with "..." -- NOTE(review): printing a large array will now dump every element.
np.set_printoptions(threshold=sys.maxsize)

Using TensorFlow backend.


In [3]:
# Read data
# ---- one DataFrame per CSV is kept in `file_names`, keyed by the file's stem with
# ---- "-" replaced by "_", in case we want to take a look at a particular date later on

csv_paths = glob.glob('../data/*.csv')
if not csv_paths:
    # Fail here with an actionable message rather than pd.concat's
    # "No objects to concatenate" further down.
    raise FileNotFoundError("No CSV files found matching '../data/*.csv'")

file_names = {
    # Path.stem drops directory and extension on every OS (glob returns
    # backslash-separated paths on Windows, which a "/" search would miss).
    Path(csv_path).stem.replace("-", "_"): pd.read_csv(csv_path, parse_dates=["DateTime"])
    for csv_path in csv_paths
}

# Concat data
# ---- frames are stacked in sorted-key order so the row order does not depend on
# ---- the (unspecified) order in which glob lists the files

data = pd.concat([file_names[key] for key in sorted(file_names)]).drop(columns=["index"])

In [4]:
# Split sizes: 30% of the rows for test; the remaining 70% is divided 80/20 into
# train and validation.
n_rows = data.shape[0]
valid_len = int(n_rows * 0.7 * 0.2)
test_len = int(n_rows * 0.3)

# Train takes whatever is left after validation and test. This absorbs the rows lost
# to int() truncation for any dataset size, instead of relying on a fixed "+ 1".
train_len = n_rows - valid_len - test_len

# Sanity check: the three parts cover every row exactly once.
train_len + valid_len + test_len == n_rows

True

In [36]:
def make_windows(features, labels, split_start, split_end, seq_len, pred_period=0):
    """Build look-back windows and their targets for one contiguous split.

    For every position ``i`` in ``range(split_start + seq_len, split_end - pred_period)``
    the sample is ``features[i - seq_len:i]`` (the ``seq_len`` rows preceding ``i``)
    and the target is ``labels[i + pred_period]``.

    Parameters
    ----------
    features : np.ndarray
        2-D array, one row per observation.
    labels : np.ndarray
        1-D array of targets aligned row-for-row with ``features``.
    split_start, split_end : int
        Positional bounds of the split (start inclusive, end exclusive). The first
        window begins at ``split_start``, so no window reaches into earlier rows.
    seq_len : int
        Number of consecutive rows in each window.
    pred_period : int, default 0
        Offset from the row right after the window to the row whose label is predicted.

    Returns
    -------
    tuple of np.ndarray
        ``(windows, targets)``; ``windows`` has one ``(seq_len, n_columns)`` slab per sample.
    """
    positions = range(split_start + seq_len, split_end - pred_period)
    windows = [features[i - seq_len:i] for i in positions]
    targets = [labels[i + pred_period] for i in positions]
    return np.array(windows), np.array(targets)


nn_data = data.copy()
# Binary target: 1 where trend is "up", 0 for anything else.
nn_data["trend"] = np.where(nn_data["trend"] == "up", 1, 0)
X_nn, y_nn = nn_data.drop(columns=["DateTime", "trend"]), nn_data["trend"].to_numpy().ravel()

seq_len = 30
pred_period = 0

# Convert the features to an ndarray once; slicing the DataFrame and converting it
# inside the loop repeats that work for every single window.
X_values = X_nn.to_numpy()

# Train windows come from rows [0, train_len); validation windows from
# [train_len, train_len + valid_len), so validation samples never contain training rows.
X_train_nn, y_train_nn = make_windows(X_values, y_nn, 0, train_len, seq_len, pred_period)
X_valid_nn, y_valid_nn = make_windows(
    X_values, y_nn, train_len, train_len + valid_len, seq_len, pred_period
)

In [39]:
# Axes are (number of windows, seq_len rows per window, number of feature columns).
X_train_nn.shape

(48760, 30, 62)

In [41]:
# Stacked-LSTM binary classifier: four LSTM layers of 32 units followed by a single
# sigmoid unit that outputs the probability of the positive ("up") class.
stock_nn = Sequential()

# The first layer fixes the per-sample input shape (seq_len, n_features) and is the
# only layer that applies dropout.
stock_nn.add(LSTM(units=32, return_sequences=True,
                  input_shape=X_train_nn.shape[-2:], dropout=0.2))

# Intermediate layers return the full sequence so the next LSTM receives 3-D input;
# the last LSTM returns only its final state for the Dense head.
stock_nn.add(LSTM(32, return_sequences=True))
stock_nn.add(LSTM(32, return_sequences=True))
stock_nn.add(LSTM(32, return_sequences=False))

stock_nn.add(Dense(1, activation='sigmoid'))

stock_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Pass the held-out validation windows so each epoch also reports val_loss /
# val_accuracy, and keep the History object so the curves can be inspected later.
history = stock_nn.fit(
    X_train_nn, y_train_nn,
    validation_data=(X_valid_nn, y_valid_nn),
    epochs=4, batch_size=200, verbose=1,
)


Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.callbacks.History at 0x2daa046d0>