### Imports

In [22]:
import os, sys
import pandas as pd
import numpy as np
from numpy import set_printoptions
import yfinance as yf
import matplotlib.pyplot as plt
import seaborn as sns
import random

import tensorflow as tf
from tensorflow.keras import Input, Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

from time import time
from tensorflow.python.keras.callbacks import TensorBoard

from indicators import *
from data_loading import *
from util_functions import *


physical_devices = tf.config.list_physical_devices('None')
#tf.config.experimental.set_memory_growth(physical_devices[0], enable=True)

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


### Parameters

In [23]:
trail_size=14
predict_length = 5
time_period = '6y'

### Load data

In [24]:
top100tech = ['AAPL', 'MSFT', 'TSM', 'NVDA', 'INTC', 'ASML', 'ADBE', 'CRM', 'AVGO', 'ORCL', 'CSCO', 'ACN', 'TXN', 'SHOP', 'QCOM', 'SAP', 'SNE', 'AMAT', 'INTU', 'NOW', 'SQ', 'IBM', 'MU', 'AMD',
'UBER', 'FIS', 'LRCX', 'FISV', 'SNOW', 'INFY', 'TEAM', 'ADSK', 'DELL', 'WDAY', 'VMW', 'ADI', 'NXPI', 'KLAC',
'CRWD', 'PLTR', 'DOCU', 'TEL', 'ERIC', 'MCHP', 'CDNS', 'CTSH', 'APH', 'SNPS', 'HPQ',
'PANW', 'OKTA','RNG','STM','MRVL','XLNX','WIT','MSI','SWKS','GLW','DDOG','ANSS','U','ZS','FTNT','STNE','ZBRA','KEYS','FTV','MXIM','COUP','GRMN','SPLK','HUBS','WORK','NET','FLT','CDW','CAJ','AFRM','PAYC','TER',
'VRSN','WDC', 'ZI', 'UMC','ANET','EPAM', 'UI','XM',
'GIB','TRMB', 'TYL','WIX','HPE','LYFT', 'LOGI', 'MPWR']
top10tech = top100tech[:10]

In [25]:
top_energy = ["XOM", "CVX", "RDS-A", "RDS-B", "PTR", "TOT", "BP",
"SNP", "ENB", "COP", "EQNR", "PBR-A", "PBR", "EPD",
"TRP", "E", "EOG", "SLB", "KMI", "PSX", "CNQ",
"MPC", "PXD", "SU", "VLO"]

In [26]:
data = get_data_yf(top_energy, time_period)

[*********************100%***********************]  25 of 25 completed


In [27]:
data.isna().sum().sum()

0

In [28]:
data = process_stock_data_from_yf(data)

In [29]:
has_data_ratio, data = filter_has_all_data(data)
has_data_ratio

100.0

### Compute technical indicators

In [30]:
# compute all our technical markers for each of the stocks. 
dataset = data.groupby('Ticker').apply(lambda x: add_technical_markers(x, trail_size))

### Choose features

In [31]:
features = ['Close', 'High',
 'Low', 'Open', 'Volume',
 'HH', 'LL', 'AV',
 'SMA', 'SD', 'WILLR',
 'ATR', 'DMH', 'DML',
 'EMA', 'WMA', 'BBHIGH',
 'BBLOW', 'PERBHIGH', 'PERBLOW',
 'TRIMA', 'RSI', 'DX',
 'PDI', 'NDI', 'ADX',
 'ROC', 'MACD', 'CCI']
dataset = dataset[['Ticker', 'Date'] + features]
num_features = len(features)

Feature correlations with predict_length

In [32]:
target = dataset['Close'].shift(predict_length)
dataset['Target'] = target
dataset

Unnamed: 0,Ticker,Date,Close,High,Low,Open,Volume,HH,LL,AV,SMA,SD,WILLR,ATR,DMH,DML,EMA,WMA,BBHIGH,BBLOW,PERBHIGH,PERBLOW,TRIMA,RSI,DX,PDI,NDI,ADX,ROC,MACD,CCI,Target
0,BP,2015-04-01,26.598135,26.712607,26.443260,26.517331,4285800.0,26.712607,26.443260,4.285800e+06,26.598135,,-42.499785,0.269347,0.000000,0.000000,26.598135,,,,,,,,,0.000000,0.000000,,0.000000,0.000000,,
1,BP,2015-04-02,26.699135,26.773203,26.524057,26.557725,3501400.0,26.773203,26.443260,3.893600e+06,26.648635,0.071418,-29.728950,0.259246,0.060597,0.080797,26.652242,,26.791470,26.505800,375.137857,371.137857,,,14.286606,23.374149,31.166099,14.286606,0.379725,0.001804,-0.002898,
2,BP,2015-04-06,27.022358,27.157033,26.773209,26.833812,4656200.0,27.157033,26.443260,4.147800e+06,26.773209,0.221600,-35.087637,0.325464,0.383829,0.249152,26.793628,,27.216409,26.330009,122.817753,118.817753,,,21.276660,117.933061,76.552936,17.781633,1.594935,0.010259,38.490085,
3,BP,2015-04-07,27.103163,27.574523,27.089695,27.284973,8078200.0,27.574523,26.443260,5.130400e+06,26.855698,0.244857,-97.222163,0.382139,0.417490,0.316486,26.888323,,27.345412,26.365983,111.679041,107.679041,,,13.761218,109.250888,82.819644,16.441495,1.898734,0.016244,86.474221,
4,BP,2015-04-08,27.453314,28.045881,27.103161,27.103161,17083300.0,28.045881,26.443260,7.520980e+06,26.975221,0.341168,-62.857151,0.494255,0.471358,0.013466,27.035728,,27.657556,26.292886,81.067372,77.067372,,,94.445060,95.367352,2.724471,35.942386,3.215183,0.030225,78.071754,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37795,XOM,2021-03-26,57.709999,57.779999,56.520000,57.299999,34205300.0,62.549999,54.450001,3.429479e+07,58.465000,2.359817,-5.555539,1.742142,1.279999,2.020000,57.326589,59.489524,63.184634,53.745367,26.775230,22.775230,58.541939,55.344932,22.424298,73.472689,115.949225,86.920213,-5.191391,1.152200,-24.766535,56.490002
37796,XOM,2021-03-29,57.400002,57.820000,56.740002,57.340000,21144400.0,62.549999,54.450001,3.326187e+07,58.284286,2.335748,-38.888791,1.682142,0.040001,0.220001,57.336377,59.270096,62.955782,53.612790,26.953158,22.953158,58.674490,54.080566,69.230318,2.377975,13.078633,90.236196,-4.221590,1.077463,-19.109473,55.910000
37797,XOM,2021-03-30,56.689999,57.360001,56.490002,56.860001,20303400.0,62.549999,54.450001,3.245841e+07,57.921429,2.138881,-77.011817,1.605000,-0.459999,-0.250000,57.250194,58.756953,62.199192,53.643666,29.080244,25.080244,58.731378,51.195767,29.577374,-28.660387,-15.576328,92.152281,-8.224060,0.961549,-20.705067,55.220001
37798,XOM,2021-03-31,55.830002,56.630001,55.799999,56.349998,23672900.0,62.369999,54.450001,3.237984e+07,57.530000,1.965455,-96.385238,1.576428,-0.730000,-0.690002,57.060835,58.252953,61.460911,53.599090,31.270572,27.270572,58.714694,47.865218,2.816693,-46.307192,-43.769994,90.049332,-8.938182,0.803821,-29.735836,56.340000


In [33]:
close = dataset['Close']
indicate = dataset.columns[2:-1]
pearson_current_day = []
pearson_target = []
spearman_current_day = []
spearman_target = []
kendall_current_day = []
kendall_target = []

for i in indicate:
    pearson_current_day.append(close.corr(dataset[i]))
    pearson_target.append(target.corr(dataset[i]))
    spearman_current_day.append(close.corr(dataset[i], method='spearman'))
    spearman_target.append(target.corr(dataset[i], method='spearman'))
    kendall_current_day.append(close.corr(dataset[i], method='kendall'))
    kendall_target.append(target.corr(dataset[i], method='kendall'))
    
    
data_average_current_data = []
data_average_target_data = []
for i in range(len(indicate)):
    data_average_current = 0
    data_average_target = 0

    data_average_current += pearson_current_day[i] + spearman_current_day[i] + kendall_current_day[i]
    data_average_current /= 3
    data_average_current_data.append(data_average_current)

    data_average_target += pearson_target[i] + spearman_target[i] + kendall_target[i]
    data_average_target /= 3
    data_average_target_data.append(data_average_target)

indicators = {'Indicator':indicate}
data_pearson = {'Pearson Current Day':pearson_current_day,'Pearson Target':pearson_target}
data_spearman = {'Spearman Current Day':spearman_current_day,'Spearman Target':spearman_target}
data_kendall = {'Kendall Current Day':kendall_current_day,'Kendall Target':kendall_target}
data_average = {'Average Current Day':data_average_current_data,'Average Target':data_average_target_data}
df_indicator = pd.DataFrame(data=indicators)
df_pearson = pd.DataFrame(data=data_pearson)
df_spearman = pd.DataFrame(data=data_spearman)
df_kendall = pd.DataFrame(data=data_kendall)
df_average = pd.DataFrame(data=data_average)




In [34]:
#testing univariate, RFE, PCA and Feature elim

#Note that X and Y are meant to be our variables. Y is the compared variable, X is the rest of the variables lumped together
a = dataset.values[:, 2:]

for i in range (len(a)):
    for j in range(len(a[i])):
        if (np.isnan(a[i][j]) == True) or a[i][j] == 0.0:
            a[i][j] = 0.0


X = a[:, 0:29]
#0 is close, 29 is Target
Y = a[:, 0]
# feature extraction
test = SelectKBest(score_func=f_classif, k=28)
fit = test.fit(X, Y)
# summarize scores
set_printoptions(precision=3)

univariate = fit.scores_
for i in range(29):
    
    print(str(features[i]) + " score is " + str(fit.scores_[i]))


Close score is inf
High score is 10897.382991863653
Low score is 9763.134897031623
Open score is 4408.424328656729
Volume score is 1.5284779621482134
HH score is 559.0216894322533
LL score is 520.9457394536413
AV score is 2.0620468430355237
SMA score is 732.6983599124411
SD score is 7.996319843477611
WILLR score is 0.9945242783467667
ATR score is 22.01013545965104
DMH score is 3.233012119648176
DML score is 3.186438221381777
EMA score is 1108.7390510345426
WMA score is 196.04794738964716
BBHIGH score is 342.38078271539945
BBLOW score is 354.4015675891921
PERBHIGH score is 3.21263243942586
PERBLOW score is 3.2129089623456935
TRIMA score is 180.67345084622704
RSI score is 2.2132371825324246
DX score is 0.088394067334636
PDI score is 1.0663460634393327
NDI score is 1.0673665391790534
ADX score is 1.1487253107517474
ROC score is 1.2153695897938839
MACD score is 6.522815260626807
CCI score is 1.5280913780271457


  f = msb / msw


In [35]:
"""arr = [80.73226313
,82.46981065
,81.74249061
,84.16145739
,1.199889254
,91.03150464
,79.60920258
,2.146481987
,91.16997258
,8.172725817
,0.9650594746
,18.96888314
,2.775156858
,2.328551157
,91.40485523
,100.0849597
,87.94608567
,84.29864294
,2.830418765
,2.833064889
,95.08096898
,1.466452819
,405486775490
,1.004649009
,1.046218788
,1.150761876
,1.001427865
,5.872076648
,1.1129878]
np.sort(arr)"""

"""This was just to sort some of the results in the Feature Test Results excel sheet.

Note that the results of that sheet indicate that there are 12 extremely strong/correlated factors for both the current day and target values. Those being:

- Close
- High
- Low
- Open
- HH
- LL
- SMA
- EMA
- WMA
- BBHIGH
- BBLOW
- TRIMA

- DX (only in Univariate target which could be divisional error)

Proposed Feature list: 
1) features = ['Close', 'High', 'Low', 'Open','HH', 'LL','SMA', 'EMA', 'WMA', 'BBHIGH','BBLOW','TRIMA']

2) features = ['Close', 'High', 'Low', 'Open','HH', 'LL','SMA', 'EMA', 'WMA', 'BBHIGH','BBLOW','TRIMA', 'DX']

3) features = ['Close', 'High', 'Low', 'Open','Volume', 'AV', 'HH', 'LL','SMA', 'EMA', 'WMA', 'BBHIGH','BBLOW','TRIMA', 'DX']

"""

"This was just to sort some of the results in the Feature Test Results excel sheet.\n\nNote that the results of that sheet indicate that there are 12 extremely strong/correlated factors for both the current day and target values. Those being:\n\n- Close\n- High\n- Low\n- Open\n- HH\n- LL\n- SMA\n- EMA\n- WMA\n- BBHIGH\n- BBLOW\n- TRIMA\n\n- DX (only in Univariate target which could be divisional error)\n\nProposed Feature list: \n1) features = ['Close', 'High', 'Low', 'Open','HH', 'LL','SMA', 'EMA', 'WMA', 'BBHIGH','BBLOW','TRIMA']\n\n2) features = ['Close', 'High', 'Low', 'Open','HH', 'LL','SMA', 'EMA', 'WMA', 'BBHIGH','BBLOW','TRIMA', 'DX']\n\n3) features = ['Close', 'High', 'Low', 'Open','Volume', 'AV', 'HH', 'LL','SMA', 'EMA', 'WMA', 'BBHIGH','BBLOW','TRIMA', 'DX']\n\n"

In [36]:
from sklearn.decomposition import PCA
pca = PCA(n_components=27)
fit = pca.fit(X)
# summarize components
print("Explained Variance: %s" % fit.explained_variance_ratio_)
print(fit.components_)

Explained Variance: [9.384e-01 6.155e-02 1.883e-11 1.304e-12 2.085e-21 1.813e-21 6.038e-22
 3.598e-22 1.791e-22 1.683e-22 1.854e-23 1.193e-23 4.495e-24 2.398e-24
 9.934e-25 4.485e-25 1.892e-25 1.374e-25 8.534e-26 5.979e-26 3.547e-26
 2.933e-26 2.588e-26 1.723e-26 8.471e-27 7.126e-27 2.447e-27]
[[-6.295e-14 -6.384e-14 -6.182e-14 -6.283e-14 -5.725e-09 -6.867e-14
  -5.877e-14 -5.586e-09 -6.375e-14 -2.546e-15 -3.087e-14 -2.256e-15
  -2.112e-17  1.374e-17 -6.371e-14 -6.295e-14 -6.876e-14 -5.857e-14
   1.129e-13  1.129e-13 -6.356e-14  2.981e-14  9.971e-01 -9.884e-15
   3.377e-15  7.622e-02  9.534e-15  3.803e-16  1.611e-13]
 [-8.268e-13 -8.382e-13 -8.135e-13 -8.251e-13 -7.637e-08 -8.948e-13
  -7.600e-13 -7.669e-08 -8.282e-13 -3.498e-14 -2.303e-13 -3.008e-14
  -1.131e-15 -1.274e-15 -8.293e-13 -8.161e-13 -8.971e-13 -7.572e-13
   2.867e-12  2.867e-12 -8.223e-13  1.970e-13 -7.622e-02 -4.460e-13
  -5.070e-13  9.971e-01  2.397e-14  4.903e-15  2.881e-13]
 [-8.514e-07 -8.531e-07 -8.497e-07 -8.513e-07

In [37]:
"""from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression


model = LogisticRegression(solver='lbfgs')
rfe = RFE(model, 3)
fit = rfe.fit(X, Y)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)"""

'from sklearn.feature_selection import RFE\nfrom sklearn.linear_model import LogisticRegression\n\n\nmodel = LogisticRegression(solver=\'lbfgs\')\nrfe = RFE(model, 3)\nfit = rfe.fit(X, Y)\nprint("Num Features: %d" % fit.n_features_)\nprint("Selected Features: %s" % fit.support_)\nprint("Feature Ranking: %s" % fit.ranking_)'

In [38]:
"""from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier(n_estimators=10)
model.fit(X, Y)
print(model.feature_importances_)"""

'from sklearn.ensemble import ExtraTreesClassifier\nmodel = ExtraTreesClassifier(n_estimators=10)\nmodel.fit(X, Y)\nprint(model.feature_importances_)'

In [39]:
df_aggregate = df_indicator.join(df_pearson)
df_aggregate = df_aggregate.join(df_spearman)
df_aggregate = df_aggregate.join(df_kendall)
df_aggregate = df_aggregate.join(df_average)
df_aggregate

Unnamed: 0,Indicator,Pearson Current Day,Pearson Target,Spearman Current Day,Spearman Target,Kendall Current Day,Kendall Target,Average Current Day,Average Target
0,Close,1.0,0.993252,1.0,0.994352,1.0,0.954475,1.0,0.980693
1,High,0.999796,0.993589,0.999821,0.994659,0.990739,0.956366,0.996785,0.981538
2,Low,0.999796,0.993261,0.999817,0.994302,0.990709,0.954937,0.996774,0.980833
3,Open,0.999612,0.993686,0.99969,0.994715,0.986209,0.957022,0.99517,0.981808
4,Volume,-0.246616,-0.241689,-0.254666,-0.251544,-0.173742,-0.171519,-0.225008,-0.221584
5,HH,0.995259,0.99372,0.995335,0.994415,0.955394,0.959974,0.981996,0.982703
6,LL,0.996084,0.993391,0.996172,0.994243,0.954681,0.957424,0.982312,0.981686
7,AV,-0.276452,-0.274586,-0.273049,-0.272752,-0.187136,-0.186861,-0.245546,-0.244733
8,SMA,0.997155,0.995686,0.997272,0.996551,0.96097,0.974769,0.985133,0.989002
9,SD,0.565517,0.572644,0.658936,0.659067,0.474275,0.474004,0.566243,0.568571


### Normalization
We want to normalize the prices, and there are two challenges
1. We want to be able to recover performance metrics like RMSE in dollars, so we have to have the saved transforms for closing price to un transform after prediction. 
2. We will want to normalize the data within each year, because the stock will tend to have different averages in different years
3. We will want to normalize each different company differently, because the stocks will be of differing magnitudes

### Train test split

In [40]:
train = dataset.loc[dataset['Date'] < "2018-03-01"]
valid = dataset.loc[(dataset['Date'] > "2018-03-13") & (dataset['Date'] < "2019-03-01")]
test = dataset.loc[(dataset['Date'] > "2019-03-01") & (dataset['Date'] < "2020-03-01")]

#### Per stock normalization
We make sure to fit the transforms on the training, and
apply them on training, validation, and test

In [41]:
scaler_models = norm_per_stock_split(train, valid, test, features, MinMaxScaler)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_in

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_in

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value[:, i].tolist())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_in

#### Prepare training inputs and outputs
Convert the daily data points into a set of data points each with {trail_size} days.   
We keep track of the tickers so we can later inverse the normalization with the right model. 

In [42]:
x_train, y_train, tickers_train = roll_all_stocks(train, trail_size, predict_length)
x_valid, y_valid, tickers_valid = roll_all_stocks(valid, trail_size, predict_length)
x_test, y_test, tickers_test = roll_all_stocks(test, trail_size, predict_length)

#### Confirm shapes

In [43]:
print("x_train: {}".format(x_train.shape))
print("y_train: {}".format(y_train.shape))
print("x_valid: {}".format(x_valid.shape))
print("y_valid {}".format(y_valid.shape))
print("x_test: {}".format(x_test.shape))
print("y_test {}".format(y_test.shape))

x_train: (17550, 14, 30)
y_train: (17550,)
x_valid: (5600, 14, 30)
y_valid (5600,)
x_test: (5825, 14, 30)
y_test (5825,)


In [44]:
print("tickers_train: {}".format(tickers_train.shape))
print("tickers_train: {}".format(tickers_valid.shape))
print("tickers_train: {}".format(tickers_test.shape))

tickers_train: (17550,)
tickers_train: (5600,)
tickers_train: (5825,)


### Model design

In [45]:
model = Sequential()
model.add(LSTM(units=75, input_shape=(trail_size, num_features), return_sequences=True))
model.add(Dropout(0.25))
model.add(LSTM(units=30, return_sequences=True))
model.add(Dropout(0.25))

model.add(LSTM(units=30, return_sequences=True))
model.add(Dropout(0.25))

model.add(Flatten())
# model.add(Dense(50, activation='relu'))
# model.add(Dense(25, activation='relu'))
model.add(Dense(1))

In [46]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 14, 75)            31500     
_________________________________________________________________
dropout (Dropout)            (None, 14, 75)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 14, 30)            12720     
_________________________________________________________________
dropout_1 (Dropout)          (None, 14, 30)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 14, 30)            7320      
_________________________________________________________________
dropout_2 (Dropout)          (None, 14, 30)            0         
_________________________________________________________________
flatten (Flatten)            (None, 420)               0

In [47]:
time_stamp = time()
tensorboard = TensorBoard(log_dir='logs/{}'.format(time_stamp))

In [48]:
model.compile(optimizer='adam', loss='mean_squared_error')

### Train on many stocks

In [49]:
history1 = model.fit(x_train, y_train, epochs=45, validation_data=(x_valid, y_valid), batch_size=64, shuffle=False, callbacks=[tensorboard])

Epoch 1/45


ValueError: in user code:

    C:\Users\polob\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:571 train_function  *
        outputs = self.distribute_strategy.run(
    C:\Users\polob\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\distribute\distribute_lib.py:951 run  **
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\polob\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\distribute\distribute_lib.py:2290 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\polob\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\distribute\distribute_lib.py:2649 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\polob\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:531 train_step  **
        y_pred = self(x, training=True)
    C:\Users\polob\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\base_layer.py:885 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs,
    C:\Users\polob\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\input_spec.py:224 assert_input_compatibility
        raise ValueError('Input ' + str(input_index) +

    ValueError: Input 0 is incompatible with layer sequential: expected shape=(None, None, 29), found shape=[None, 14, 30]


### Prepare data for one stock

In [None]:
target_stock = 'XOM'

In [None]:
single_stock_dataset = dataset[dataset['Ticker']==target_stock].reset_index(drop=True)

In [None]:
sns.lineplot(x=single_stock_dataset['Date'], y=single_stock_dataset['Close'])

### Train test split

In [None]:
train = single_stock_dataset.loc[single_stock_dataset['Date'] < "2018-03-01"]
valid = single_stock_dataset.loc[(single_stock_dataset['Date'] > "2018-03-13") & (single_stock_dataset['Date'] < "2019-03-01")]
test = single_stock_dataset.loc[(single_stock_dataset['Date'] > "2019-03-01") & (single_stock_dataset['Date'] < "2020-03-01")]

In [None]:
single_stock_scaler_models = norm_per_stock_split(train, valid, test, features, MinMaxScaler)

#### Prepare training inputs and outputs
Since we are using just one stock, we don't need to keep track of tickers

In [None]:
x_train, y_train, _ = roll_all_stocks(train, trail_size, predict_length)
x_valid, y_valid, _ = roll_all_stocks(valid, trail_size, predict_length)
x_test, y_test, _ = roll_all_stocks(test, trail_size, predict_length)

#### Confirm shapes

In [None]:
print("x_train: {}".format(x_train.shape))
print("y_train: {}".format(y_train.shape))
print("x_valid: {}".format(x_valid.shape))
print("y_valid {}".format(y_valid.shape))
print("x_test: {}".format(x_test.shape))
print("y_test {}".format(y_test.shape))

### Test model on single stock

In [None]:
print("Training RMSE = {}".format(evaluate_model_rmse(model.predict(x_train), 
                                                      y_train,
                                                     num_features,
                                                     single_stock_scaler_models[target_stock])))

In [None]:
print("Validation RMSE = {}".format(evaluate_model_rmse(model.predict(x_valid), 
                                                      y_valid,
                                                     num_features,
                                                     single_stock_scaler_models[target_stock])))

In [None]:
print("Testing RMSE = {}".format(evaluate_model_rmse(model.predict(x_test), 
                                                     y_test,
                                                    num_features,
                                                    single_stock_scaler_models[target_stock])))

In [None]:
model.layers[7].trainable = False
model.layers[4].trainable = False
model.layers[2].trainable = False
model.layers[0].trainable = False

In [None]:
model.summary()

In [None]:
model.compile(optimizer='adam', loss='mean_squared_error')

### Train model on single stock

In [None]:
tensorboard = TensorBoard(log_dir='logs/{}_2'.format(time_stamp))

In [None]:
history2 = model.fit(x_train, y_train, epochs=45, validation_data=(x_valid, y_valid), batch_size=64, shuffle=False, callbacks=[tensorboard])

### Visualize results

In [None]:
plot_loss(history1)

In [None]:
plot_loss(history2)

In [None]:
print("Training RMSE = {}".format(evaluate_model_rmse(model.predict(x_train), 
                                                      y_train,
                                                     num_features,
                                                     single_stock_scaler_models[target_stock])))

In [None]:
print("Validation RMSE = {}".format(evaluate_model_rmse(model.predict(x_valid), 
                                                      y_valid,
                                                     num_features,
                                                     single_stock_scaler_models[target_stock])))

In [None]:
print("Testing RMSE = {}".format(evaluate_model_rmse(model.predict(x_test), 
                                                     y_test,
                                                    num_features,
                                                    single_stock_scaler_models[target_stock])))

#### Compare to simple baseline

In [None]:
def predict_no_change(x):
    return x[-1][0]

In [None]:
no_change_preds = np.array([predict_no_change(x) for x in x_train])
no_change_preds = np.expand_dims(no_change_preds, axis=1)

In [None]:
evaluate_model_rmse(no_change_preds, y_train, num_features, single_stock_scaler_models['XOM'])

In [None]:
no_change_preds = np.array([predict_no_change(x) for x in x_test])
no_change_preds = np.expand_dims(no_change_preds, axis=1)

In [None]:
evaluate_model_rmse(no_change_preds, y_test, num_features, single_stock_scaler_models['XOM'])

### Plot data against predictions

In [None]:
price_vs_preds = pd.DataFrame([x_train[:, -1, 0], model.predict(x_train)[:, 0]]).T
price_vs_preds.columns = ['Price', 'Prediction']
price_vs_preds.plot(figsize=(12,8), title="Training performance")

In [None]:
price_vs_preds = pd.DataFrame([x_valid[:, -1, 0], model.predict(x_valid)[:, 0]]).T
price_vs_preds.columns = ['Price', 'Prediction']
price_vs_preds.plot(figsize=(12,8), title="Validation performance")

In [None]:
price_vs_preds = pd.DataFrame([x_test[:, -1, 0], model.predict(x_test)[:, 0]]).T
price_vs_preds.columns = ['Price', 'Prediction']
price_vs_preds.plot(figsize=(12,8), title="Testing performance")