In [1]:
import os
import datetime
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import yfinance as yf
import pandas_ta as ta
from sklearn.preprocessing import MinMaxScaler
#globals
TOTAL_FEATURES = 8
BACK_CANDLES = 200
FUTURE_DAYS = 30

def create_stock_dictionary():
    dictionary = {}
    current_directory = os.getcwd()
    folder_name = 'S&P500Daily'
    folder_path = os.path.join(current_directory, folder_name)

    for file_name in os.listdir(folder_path):
        if file_name.endswith('.csv'):
            name = file_name.split("_")[0]
            file_path = os.path.join(folder_path, file_name)
            dictionary[name] = pd.read_csv(file_path)
            
    return dictionary

In [2]:
import random
stock_data = create_stock_dictionary()


In [23]:
#create sorted dictionary based on average 200 day closing price for training
stock_data_avg_price_dict = {}
for key in stock_data.keys():
    stock_data_avg_price_dict[key] = stock_data[key]['Close'].tail(200).mean()
stock_data_avg_price_dict = dict(sorted(stock_data_avg_price_dict.items(), key=lambda item: item[1]))

In [None]:
from sklearn.model_selection import train_test_split
n_days = BACK_CANDLES
m_days = FUTURE_DAYS
stock_data_training = {}
stock_data_testing = {}

#split data for training
for i in stock_data.keys():
    #add more features here
    stock_data[i].rename(columns = {'Datetime':'Date'}, inplace = True)
    stock_data[i]['MACD'] = ta.macd(stock_data[i]['Close'])['MACD_12_26_9']
    stock_data[i]['RSI'] = ta.rsi(stock_data[i]['Close'])
    bbands = ta.bbands(stock_data[i]['Close'])
    stock_data[i]['BBand'] = (bbands['BBU_5_2.0'] - bbands['BBL_5_2.0']) / stock_data[i]['Close']
    
    stock_data[i]["TargetNextClose"] = stock_data[i]["Adj Close"].shift(-1)
    stock_data[i].drop(['Close', 'Open', 'High', 'Low'], axis=1, inplace=True)
    
    #change scaling algorithm
    scaler = MinMaxScaler()
    for k in range(1, stock_data[i].shape[1]):
        column_data = stock_data[i].iloc[:, k].values.reshape(-1, 1)
        stock_data[i].iloc[:, k] = scaler.fit_transform(column_data)
    
    stock_data_training[i], stock_data_testing[i] = train_test_split(stock_data[i], test_size=0.2, shuffle=False)

In [28]:
stock_data['AAPL'].tail(100).iloc[1]

Date               2024-01-05
Adj Close            0.908945
Volume               0.036732
MACD                 0.375569
RSI                   0.11199
BBand                 0.30531
TargetNextClose      0.932502
Name: 2658, dtype: object

In [14]:
#Correlation testing
import copy
stock_data_correlation_test = copy.deepcopy(stock_data['MSFT'])
stock_data_correlation_test.drop(['Date', 'TargetNextClose'], axis=1, inplace=True)
stock_data_correlation_test.corr()

Unnamed: 0,Adj Close,Volume,MACD,RSI,BBand
Adj Close,1.0,-0.171603,0.284278,0.018142,0.093264
Volume,-0.171603,1.0,-0.175754,-0.201517,0.526921
MACD,0.284278,-0.175754,1.0,0.623016,-0.218319
RSI,0.018142,-0.201517,0.623016,1.0,-0.252285
BBand,0.093264,0.526921,-0.218319,-0.252285,1.0


In [15]:
#code that lets you select which stocks you want to train on, so you can use other stocks to train
vals = list(stock_data_avg_price_dict.keys())
training_vals = vals[200:220]
training_df_list = []
for key in stock_data_training.keys():
    for item in training_vals:
        if(key == item):
            training_df_list.append(stock_data_training[key])
            

stacked_stock_data = pd.concat(training_df_list, ignore_index=True, axis=0)
stacked_stock_data = stacked_stock_data.dropna()
stacked_stock_data


Unnamed: 0,Date,Adj Close,Volume,MACD,RSI,BBand,TargetNextClose
25,2013-07-22,0.055400,0.051573,0.879978,0.794074,0.021279,0.060676
26,2013-07-23,0.060676,0.061589,0.879075,0.835237,0.022631,0.052762
27,2013-07-24,0.052762,0.037938,0.869981,0.703673,0.029453,0.053739
28,2013-07-25,0.053739,0.060062,0.861863,0.712775,0.029039,0.055595
29,2013-07-26,0.055595,0.065402,0.855323,0.730448,0.028709,0.059797
...,...,...,...,...,...,...,...
58554,2022-03-09,0.666718,0.126550,0.163496,0.251624,0.313412,0.660566
58555,2022-03-10,0.660566,0.096320,0.160418,0.238488,0.223355,0.637829
58556,2022-03-11,0.637829,0.073158,0.145078,0.192046,0.147098,0.603991
58557,2022-03-14,0.603991,0.217361,0.111973,0.132749,0.249854,0.616229


In [16]:
xs, ys = [], []
#for actuallly training data
for i in range(len(stacked_stock_data) - n_days - m_days + 1): 

    v = stacked_stock_data.iloc[i:(i + n_days), :-1]

    if v['Date'].iloc[-1] <= v['Date'].iloc[0]: 
        continue

    xs.append(v.iloc[:, 1:].to_numpy())
    ys.append(stacked_stock_data.iloc[i + n_days:i + n_days + m_days, len(stacked_stock_data.columns) - 1])
xs = np.array(xs)
ys = np.array(ys)


In [17]:
X_test = []
Y_test = []
test = random.sample(training_vals, 1)[0]
for key, df in stock_data_testing.items():
    if(key == test):
        if len(df) >= n_days + m_days:
            for i in range(len(df) - n_days - m_days + 1):
                X_test.append(df.iloc[i:i+n_days, 1:-1].to_numpy())

            for i in range(len(df) - n_days - m_days + 1):
                Y_test.append(df.iloc[i+n_days:i+n_days+m_days, -1].to_numpy())

X_test = np.array(X_test)
Y_test = np.array(Y_test)

In [None]:
from keras.layers import LSTM
from keras.layers import Dense
import tensorflow as tf
from keras import Sequential
from keras import optimizers
from keras.optimizers import Adam
from keras.models import Model
from keras.layers import Dense, GRU, LeakyReLU
import keras_tuner as kt
import keras
from keras_tuner import HyperParameters, RandomSearch

num_features = 5

In [19]:
n_days

200

In [20]:
def build_model(hp):
    model = Sequential()
    # automatic tuning stuff (that hopefully works)
    num_layers = hp.Int('num_layers', 1, 4)
    for i in range(num_layers):
        model.add(LSTM(units=hp.Int('units_' + str(i), min_value=50, max_value=300, step=20),
                       activation='relu',
                       input_shape=(n_days, num_features) if i == 0 else None,
                       return_sequences=i < num_layers - 1))
    model.add(Dense(m_days, activation='relu'))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

tuner = RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    project_name='lstm_tuning'
)

tuner.search(xs, ys, epochs=10, batch_size=32, validation_split=0.2, callbacks=[keras.callbacks.EarlyStopping(patience=3)])
best_model = tuner.get_best_models(num_models=1)[0]
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

print("Best model architecture:")
best_model.summary()
print("Best hyperparameters:")
for param, val in best_hps.values.items():
    print(f"{param}: {val}")

Trial 1 Complete [00h 01m 47s]
val_loss: nan

Best val_loss So Far: nan
Total elapsed time: 00h 01m 47s

Search: Running Trial #2

Value             |Best Value So Far |Hyperparameter
4                 |1                 |num_layers
110               |90                |units_0

Epoch 1/10
[1m1307/1307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 88ms/step - loss: nan - val_loss: nan
Epoch 2/10
[1m  69/1307[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1:37[0m 79ms/step - loss: nan

KeyboardInterrupt: 

In [None]:
model = Sequential()
num_layers = best_hps["num_layers"]
amount_of_units_0 = 140
amount_of_units_1 = 60
amount_of_units_2 = 80
amount_of_units = 0
for i in range(num_layers):
    model.add(LSTM(units=best_hps["units_" + str(i)],
                    activation='relu',
                    input_shape=(n_days, num_features) if i == 0 else None,
                    return_sequences=i < num_layers - 1))
model.add(Dense(m_days, activation='linear'))
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(xs, ys, epochs=10, batch_size=32, validation_split=0.2, callbacks=[keras.callbacks.EarlyStopping(patience=3)])

In [None]:
y_pred = model.predict(X_test)


In [None]:
plt.figure(figsize=(16,8))
a, b = 0.4,0.55
plt.ylim((a, b))
plt.plot(Y_test[0], color = 'black', label = 'Test')
plt.plot(y_pred[0], color = 'green', label = 'pred')
plt.legend()
plt.show()