In [None]:
# this is largely based on the tutorials at: https://machinelearningmastery.com/how-to-develop-lstm-models-for-time-series-forecasting/

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3
import datetime
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from sklearn.metrics import mean_squared_error

In [2]:
# settings
model_settings = {'epochs': 100, 'batch_size': 4, 'train_test_ratio': 0.7, 'hidden_layers': 3, 'units': 100, 'start_date': '2020-01-01', 'n_steps_in': 60, 'n_steps_out': 30, 'symbol': 'CTXR'}
current_date = datetime.datetime.now().strftime("%Y-%m-%d")


In [3]:
# load and shape data
conn = sqlite3.connect('stockPrediction_06072021.db')

symbol_info = conn.execute(f"SELECT sector_id, industry_id FROM stock WHERE stock_symbol = \"{model_settings['symbol']}\";")
symbol_info = symbol_info.fetchall()
sector_id = symbol_info[0][0]
industry_id = symbol_info[0][1]

query = f"SELECT r.stock_symbol, l.price_datetime, l.open_price, l.high_price, l.low_price, l.close_price, l.volume, l.dividends, l.stock_splits FROM eod_price_history l INNER JOIN stock r ON r.stock_id = l.stock_id WHERE r.sector_id = {sector_id} OR r.industry_id = {industry_id};"

symbols = conn.execute('SELECT stock_symbol FROM stock')
symbols = symbols.fetchall()
symbols = [i[0] for i in symbols]
symbols = [i for i in symbols if i not in symbols]

df = pd.read_sql(query, conn, index_col=['stock_symbol', 'price_datetime'])
df = df.reset_index()

df['price_datetime'] = pd.to_datetime(df['price_datetime'], format='%Y-%m-%d')

df = df.set_index(['price_datetime', 'stock_symbol']).unstack(['stock_symbol'])

df = df.loc[model_settings['start_date']:current_date]  # date range from 2019-01-01 to 2021-05-31

close_df = df['close_price'].dropna(thresh=(len(df['close_price'] / 0.2)), axis=1)

close_df = close_df.fillna(method='ffill', axis=1)

# remove outliers
low_outlier = close_df.quantile(.1, axis=1).quantile(.1)
high_outlier = close_df.quantile(.9, axis=1).quantile(.9)
for column in close_df.columns:
    if (close_df[column].median() < low_outlier) or (close_df[column].median() > high_outlier):
        close_df = close_df.drop([column], axis=1)
close_df

stock_symbol,ABEO,ABIO,ABMC,ABMT,ABUS,ACER,ACHFF,ACRX,ACST,ACUR,...,XPHYF,XTLB,XTNT,XXII,YCBD,ZIOP,ZIVO,ZOM,ZSAN,ZYNE
price_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-02,3.210,5.6800,0.0700,0.10000,2.94,3.790,0.7410,2.070,2.3800,0.280,...,0.75,1.3000,1.580,1.140,2.3400,4.6200,0.160,0.3310,1.5200,5.8800
2020-01-03,2.900,5.5600,0.0700,0.10000,3.08,3.580,0.7410,2.040,2.3300,0.280,...,0.75,1.4100,1.530,1.090,2.2700,4.5000,0.160,0.3320,1.5500,5.8100
2020-01-06,2.770,5.6300,0.0700,0.10000,3.18,3.690,0.7410,2.010,2.2100,0.280,...,0.75,1.4200,1.550,1.100,2.2200,4.4900,0.160,0.3130,1.6400,5.7000
2020-01-07,2.570,5.6700,0.0800,0.10000,3.04,3.710,0.7250,2.070,2.3300,0.240,...,0.75,1.4100,1.590,1.090,1.8400,4.5300,0.160,0.3130,1.7100,5.5600
2020-01-08,2.620,6.0700,0.0800,0.10000,3.07,3.690,0.7330,2.030,2.1900,0.240,...,0.75,1.2600,1.580,1.070,1.5900,4.5900,0.160,0.3200,1.6200,5.3100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-06-01,1.710,3.4900,0.0636,0.07920,2.86,2.960,1.1180,1.430,0.5071,0.365,...,1.80,3.5400,1.450,4.600,2.9300,3.0700,4.255,0.8239,0.8025,5.0700
2021-06-02,1.745,3.5690,0.0645,0.11490,3.02,2.910,1.1400,1.440,0.5290,0.365,...,1.82,3.5500,1.488,4.650,3.1550,3.0400,4.145,0.8379,0.8006,5.2800
2021-06-03,1.825,3.7000,0.0631,0.07609,2.90,2.881,1.1400,1.445,0.5373,0.365,...,1.74,3.5242,1.660,4.645,3.1500,3.1400,4.290,1.0050,0.8069,5.1819
2021-06-04,1.705,3.6451,0.0620,0.08000,3.01,2.990,1.1174,1.490,0.5503,0.360,...,1.76,3.3500,1.610,4.695,3.0098,3.0529,4.350,0.9110,0.8187,5.1800


In [16]:
# split sequences function, splits multivariate sequence into samples
def split_sequences(sequences, n_steps_in, n_steps_out):
    X, y = list(), list()
    for i in range(len(sequences)):
        # find end of patterns
        end_ix = i + n_steps_in
        out_end_ix = end_ix + n_steps_out
        # check to see if we are beyond the dataset
        if end_ix > len(sequences):
            break
        # gather input, output parts of the pattern
        # slightly guessing here on how to combine these two examples
        seq_x, seq_y = sequences[i:end_ix, :], sequences[end_ix:out_end_ix, :]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

In [None]:
# ignore
# trying something new
# split sequences function, splits multivariate sequence into samples
def split_sequences(sequences, n_steps_in, n_steps_out):
    X, y = list(), list()
    seqs = list()
    for a in sequences:
        a = a.reshape((len(a), 1))
        seqs.append(a)
    dataset = np.hstack((seqs))
    for i in range(len(dataset)):
        # find end of patterns
        end_ix = i + n_steps_in
        out_end_ix = end_ix + n_steps_out
        # check to see if we are beyond the dataset
        if end_ix > len(dataset):
            break
        # gather input, output parts of the pattern
        # slightly guessing here on how to combine these two examples
        seq_x, seq_y = dataset[i:end_ix, :], dataset[end_ix:out_end_ix, :]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X, dtype=np.float), np.array(y, dtype=np.float)

In [5]:
# split into test/train data sets

# splitting test/training data
data_size = len(close_df)

# using a 90/10 train/test split
training_data = close_df.iloc[:(int(data_size * model_settings['train_test_ratio']))]
test_data = close_df.iloc[(int(data_size * model_settings['train_test_ratio'])):]

In [None]:
# testing data sets to see if they're correct
training_data

In [None]:
test_data

In [None]:
# ignore
columns = [i for i in training_data.columns]
arrays = list()
for i in columns:
    arrays.append(training_data[i].to_numpy())

In [7]:
#df['sum'] = df.sum(axis=1)

# ignore others, working w/ this cell currently

arrays = list()
out_seq = list()
for i in range(len(training_data)):
    arrays.append(training_data.iloc[i].to_numpy(dtype=np.float))
'''for i in range(len(training_data)):
    out_seq.append(training_data.iloc[i].sum())
out_seq = np.array(out_seq, dtype=np.float)'''

'for i in range(len(training_data)):\n    out_seq.append(training_data.iloc[i].sum())\nout_seq = np.array(out_seq, dtype=np.float)'

In [18]:
len(arrays[0])

412

In [8]:
# ignore
# reshape
arrays2 = list()
for i in arrays:
    arrays2.append(i.reshape((len(i), 1)))

In [None]:
# ignore
out_seq = out_seq.reshape((len(out_seq), 1))

In [None]:
# ignore
arrays2.append(out_seq)
dataset = np.hstack((arrays2))

In [9]:
# reshape arrays
# in_seq arrays
for i in range(len(arrays)):
    arrays[i] = arrays[i].reshape((len(arrays[i]), 1))
# out_seq = out_seq.reshape((len(out_seq), 1))

In [10]:
# arrays.append(out_seq)
dataset = np.hstack((arrays))

In [17]:
# may just be able to jump right into using the split_sequence func?
X, y = split_sequences(dataset, model_settings['n_steps_in'], model_settings['n_steps_out'])

  return np.array(X), np.array(y)


In [23]:
for i in range(len(y)):
    print(y[i].shape)

(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)
(30, 251)


In [18]:
n_features = X.shape[2]

In [None]:
X = X.reshape((X.shape[0], X.shape[1], n_features))

In [None]:
# define model
model = Sequential()
model.add(LSTM(100, activation='relu', return_sequences=True, input_shape=(model_settings['n_steps_in'], n_features)))
model.add(LSTM(100, activation='relu'))
model.add(Dense(model_settings['n_steps_out']))
model.compile(optimizer='adam', loss='mse')

In [None]:
# fit model
model.fit(X, y, epochs=400, verbose=1)

In [None]:
for i in range(len(X)):
    print(X[i], y[i])