In [None]:
!pip install keras-tuner
!pip install yfinance

# LSTM Model: S&P 500 from 2018 to 2022

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller, acf, pacf
import yfinance as yf
import math
from sklearn.metrics import mean_squared_error
import pandas_datareader as pdr
import os
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

# Data Acquisition

In [None]:
# Download daily S&P 500 (^GSPC) prices for the study window.
# auto_adjust=False keeps the separate "Adj Close" column that the return
# calculation reads; recent yfinance releases drop it by default.
df = yf.download('^GSPC', start="2018-01-31", end="2022-12-31", interval='1d', auto_adjust=False)
# Recent yfinance releases return (field, ticker) MultiIndex columns even for a
# single ticker; flatten them so plain names such as "Adj Close" keep working.
if isinstance(df.columns, pd.MultiIndex):
    df.columns = df.columns.get_level_values(0)

In [None]:
# Last few rows of the downloaded frame.
df.tail()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Display the frame itself.
df

# Data Preparation

In [None]:
# Simple daily return: percentage change of the adjusted close.
df["Return"] = df["Adj Close"].pct_change()

In [None]:
# Drop the listed price/volume columns; what remains is carried forward as df1.
df1 = df.drop(columns = ["Open","High","Low","Adj Close", "Volume"])

In [None]:
df1

In [None]:
# Log return derived from the simple return: log(1 + r).
df1["Log_Return"] = np.log(1+df1["Return"])

In [None]:
# Remove rows containing NaN (the first row has no previous close to compare with).
df1 = df1.dropna()

In [None]:
# Series of log returns; this is the series that is scaled and modelled below.
df2 = df1["Log_Return"]

In [None]:
# Line plot of the log-return series (df2) against its index.
plt.figure(figsize=(16,6))
plt.title('Log Return History')
plt.plot(df2)
plt.xlabel('Date', fontsize=18)
# NOTE(review): the label mentions USD, but df2 holds log returns — confirm the intended wording.
plt.ylabel('Return USD ($)', fontsize=18)
plt.show()

In [None]:
# Histogram of the daily log returns with the sample mean marked.
plt.figure(figsize = (15,6))
plt.hist(df2, bins=30, alpha=0.5, color='blue')

# Dashed red vertical line at the mean log return
plt.axvline(df2.mean(), color='red', linestyle='dashed', linewidth=2)

# A histogram's x-axis is the binned variable and its y-axis is the bin count
# (the previous labels "Date" / "Log_Return" described neither axis).
plt.xlabel("Log_Return")
plt.ylabel("Frequency")
plt.title('Histogram of Stock Data')

# Show the plot
plt.show()


# Scaling Data

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df3 = scaler.fit_transform(np.array(df2).reshape(-1, 1))

In [None]:
df3

# Train Test Split

In [None]:
train_size = int(len(df3)*0.8)
test_size = len(df3)-train_size
train_size,test_size

In [None]:
train_data,test_data = df3[0:train_size],df3[train_size:]

In [None]:
train_data.shape,test_data.shape

In [None]:
def creat_dataset(data, time_stamp):
    """Slice a 2-D series into sliding input windows and next-step targets.

    For every start offset ``i`` in ``range(len(data) - time_stamp - 1)`` the
    window is column 0 of rows ``i`` to ``i + time_stamp - 1`` and the target
    is column 0 of row ``i + time_stamp``.

    Args:
        data: 2-D array indexed as ``data[row, 0]``.
        time_stamp: number of consecutive rows in each input window.

    Returns:
        ``(data_x, data_y)``: a list of windows and a list of matching targets.
        Both lists are empty when ``data`` is too short for a single window.
    """
    # The trailing "- 1" means the final row of `data` is never used as a
    # target; the plotting offsets later in the notebook rely on this count.
    n_windows = len(data) - time_stamp - 1
    data_x = [data[start: start + time_stamp, 0] for start in range(n_windows)]
    data_y = [data[start + time_stamp, 0] for start in range(n_windows)]
    return data_x, data_y
                

In [None]:
x_train, y_train = creat_dataset(train_data,50)
x_test, y_test = creat_dataset(test_data,50)

In [None]:
x_train = np.asarray(x_train)
x_test = np.asarray(x_test)
y_train = np.asarray(y_train)
y_test = np.asarray(y_test)
x_train,x_test

# Find the Correlation Between Daily Returns

In [None]:
import seaborn as sns 
from sklearn.preprocessing import Normalizer
# Normalizer rescales each row (one input window) to unit norm; the result is
# wrapped in a DataFrame with one column per position inside the window.
normalizedx_train = Normalizer().fit_transform(x_train)

normalizedx_train = pd.DataFrame(normalizedx_train)
normalizedx_train

# Pearson Correlation

In [None]:
# plot correlation heatmap
# DataFrame.corr() gives pairwise correlations between columns, i.e. between
# window positions across all training windows.
plt.figure(figsize = (40,20))
sns.heatmap(normalizedx_train.corr(), annot = True)

In [None]:
# Add a trailing axis of length 1: (samples, timesteps) -> (samples, timesteps, 1),
# matching the input_shape=(50,1) declared in build_model.
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1],1)
x_test = x_test.reshape(x_test.shape[0], x_test.shape[1],1)

In [None]:
x_train.shape, x_test.shape

In [None]:
# Building Model

In [None]:
import keras_tuner
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import LSTM
# `kerastuner` is the deprecated legacy import name of the keras-tuner package.
# Import from `keras_tuner` (already imported above) so this cell does not
# depend on the legacy alias being shipped by the installed release.
from keras_tuner import RandomSearch
from keras_tuner import HyperParameters
import tensorflow as tf

# Hyperparameter Tuning

In [None]:
# Fixed (not tuned) learning rate; build_model reads this global for its RMSprop optimizer.
my_learning_rate = .001

In [None]:
def build_model(hp):
    """Build and compile a stacked-LSTM regressor for one tuner trial.

    Architecture: an input LSTM over windows of shape (50, 1), then
    ``n_layers`` (1-4) sequence-returning LSTM layers, a final LSTM that
    returns only its last output, dropout, and a single-unit dense output.

    Args:
        hp: keras-tuner hyperparameter container used to sample the layer
            widths (32-512, step 32), the number of hidden LSTM layers and
            the dropout rate (0-0.5, step 0.1).

    Returns:
        A compiled ``Sequential`` model: MSE loss, RMSprop using the
        module-level ``my_learning_rate``, and ``mse`` as tracked metric.
    """
    model = Sequential()
    model.add(LSTM(hp.Int('input_unit',min_value=32,max_value=512,step=32),return_sequences=True, input_shape=(50,1)))
    for i in range(hp.Int('n_layers', 1, 4)):
        model.add(LSTM(hp.Int(f'lstm_{i}_units',min_value=32,max_value=512,step=32),return_sequences=True))
    # Last recurrent layer collapses the sequence to a single vector.
    model.add(LSTM(hp.Int('layer_2_neurons',min_value=32,max_value=512,step=32)))
    model.add(Dropout(hp.Float('Dropout_rate',min_value=0,max_value=0.5,step=0.1)))
    # Linear output. The targets are standardised log returns, which are
    # negative about as often as positive; the previous relu (>= 0) and
    # sigmoid (0..1) choices could never predict a negative value.
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.RMSprop(learning_rate=my_learning_rate),metrics = ['mse'])
    return model

In [None]:
# Random search over the build_model search space.
# Trials are ranked by validation loss (MSE on the validation_data passed to
# search()) rather than by the training-set 'mse' metric, which rewards
# over-fitting. With max_trials=1 only a single configuration is sampled.
tuner_search = RandomSearch(
    build_model,
    objective='val_loss',
    max_trials=1,
    directory='output',
    project_name='R1_Final_FGMS_LSTM_S&P500_New-data_model_5'
)

In [None]:
# Run the search: each trial trains for 50 epochs with batch size 10.
# NOTE(review): the test split is used as validation data here, so it is not a
# fully held-out set for the metrics reported later — confirm this is intended.
tuner_search.search(
        x=x_train,
        y=y_train,
        epochs=50,
        batch_size=10,
        validation_data=(x_test,y_test),
)

In [None]:
# Best-scoring model found by the search.
model_1=tuner_search.get_best_models(num_models=1)[0]

In [None]:
# Fitting Model

In [None]:
model_1.summary()

In [None]:
model_1.fit(x=x_train,
        y=y_train,
        epochs=100,
        batch_size=10,
        validation_data=(x_test,x_test),
)

In [None]:
# Predictions for both splits, still in the standardised units the model was trained on.
train_pred = model_1.predict(x_train)
test_pred = model_1.predict(x_test)

In [None]:
train_pred.shape

In [None]:
# Target and prediction shapes side by side.
y_train.shape, train_pred.shape

# Accuracy of Model

In [None]:
from tensorflow.keras.utils import plot_model

In [None]:
# Draw the layer graph, including tensor shapes and layer names.
plot_model(model_1, show_shapes=True, show_layer_names=True)

In [None]:
rmse = np.sqrt(mean_squared_error(y_true = y_train, y_pred = train_pred))
print("RMSE: {:.2f}".format(rmse))

In [None]:
rmse = np.sqrt(mean_squared_error(y_true = y_test, y_pred = test_pred))
print("RMSE: {:.2f}".format(rmse))

In [None]:
train_pred = scaler.inverse_transform(train_pred)
test_pred = scaler.inverse_transform(test_pred)
train_pred.shape, test_pred.shape

In [None]:
math.sqrt(mean_squared_error(y_train, train_pred))

In [None]:
math.sqrt(mean_squared_error(y_test, test_pred))

In [None]:
df3.shape

In [None]:
import numpy
### Plotting 
# shift train predictions for plotting
look_back = 50
trainPredictPlot = numpy.empty_like(df3)
trainPredictPlot[:, :] = np.nan
trainPredictPlot[look_back:len(train_pred)+look_back, :] = train_pred

In [None]:
# shift test predictions for plotting
testPredictPlot = numpy.empty_like(df3)
testPredictPlot[:, :] = numpy.nan
testPredictPlot[len(train_pred)+(look_back*2)+1:len(df3)-1, :] = test_pred

In [None]:
testPredictPlot

In [None]:
df3 = scaler.inverse_transform(df3)

# Visualization of Results

In [None]:
# plot baseline and predictions
# All three arrays share the full-series x-axis; positions left as NaN in the
# prediction arrays are not drawn.
plt.figure(figsize = (18,10))
plt.plot((df3),label = "Original Dataset ")
plt.plot((trainPredictPlot),label= "Prediction X_train ")
plt.plot((testPredictPlot),label = "Prediction X_test ")
plt.xlabel('Dateset', fontsize=18)
plt.ylabel('Return USD ($)', fontsize=18)
plt.legend()
plt.show()

In [None]:
len(train_data),len(test_data)

In [None]:
time_stamp = 50
var = len(test_data)-time_stamp
x_input = test_data[var:].reshape(1,-1)
x_input.shape

In [None]:
temp_input=list(x_input)
temp_input=temp_input[0].tolist()


In [None]:
# demonstrate prediction for next 1 days
# Iterative forecast: each pass predicts one step and appends it to temp_input.
# The loop bound (i<1) limits this to a single step.
from numpy import array

lst_output=[]
n_steps=50
i=0
while(i<1):
    
    # Taken once temp_input has grown beyond n_steps values: drop the oldest
    # value and predict from the remaining n_steps.
    if(len(temp_input)>n_steps):
        #print(temp_input)
        x_input=np.array(temp_input[1:])
        #print("{} day input {}".format(i,x_input))
        x_input=x_input.reshape(1,-1)
        x_input = x_input.reshape((1, n_steps, 1))
        #print(x_input)
        yhat = model_1.predict(x_input, verbose=0)
        print("{} day output {}".format(i,yhat))
        temp_input.extend(yhat[0].tolist())
        temp_input=temp_input[1:]
        #print(temp_input)
        lst_output.extend(yhat.tolist())
        i=i+1
    # First pass: temp_input holds exactly n_steps values, so the prediction is
    # made from x_input as defined in the earlier cell.
    else:
        x_input = x_input.reshape((1, n_steps,1))
        yhat = model_1.predict(x_input, verbose=0)
        print(yhat[0])
        temp_input.extend(yhat[0].tolist())
        print(len(temp_input))
        lst_output.extend(yhat.tolist())
        i=i+1
    


In [None]:
# Convert the forecast from standardised units back to log returns.
lst_output = scaler.inverse_transform(lst_output)

In [None]:
# x positions for plotting: 1..50 for the last 50 observations, 51 for the forecast.
day_new=np.arange(1,51)
day_pred=np.arange(51,52)

In [None]:
import matplotlib.pyplot as plt

In [None]:
len(df3)

# Plotting Predicted Value

In [None]:
# Last `time_stamp` observations as a line, with the forecast as a red point.
var_1 = len(df3)-time_stamp
plt.figure(figsize = (20,10))
plt.plot(day_new,(df3[var_1:]))
plt.scatter(day_pred,(lst_output), c = "r")
plt.xlabel('Dateset', fontsize=18)
plt.ylabel('Return USD ($)', fontsize=18)

In [None]:
plt.figure(figsize = (20,10))
# Append the forecast to the series and draw the tail as one continuous line.
# NOTE(review): df3 is rebound from an array to a list here, so this cell
# cannot be re-run without first re-running the cells that create df3.
df3 = df3.tolist()
df3.extend((lst_output))
plt.plot((df3[var_1:]))
plt.xlabel('Dateset', fontsize=18)
plt.ylabel('Return USD ($)', fontsize=18)

# Loading New Data for Cross Checking

In [None]:
check_df = yf.download('^GSPC',start="2022-12-01", end="2023-01-04",interval='1d')

In [None]:
check_df["Close"].tail(10)

In [None]:
check_df["Return"] = check_df.Close.pct_change()

In [None]:
check_df1 = check_df["Return"].dropna()
check_df1,df1[1425:]

In [None]:
for_adding_date = yf.download('^GSPC',start="2018-01-31", end="2022-12-31",interval='1d')
for_adding_date.shape
for_adding_date.tail()

In [None]:
df4 = pd.DataFrame(df3)

In [None]:
df4 = df4.set_index(for_adding_date.index)

In [None]:
# Plotting final comparison
plt.figure(figsize = (20,10))
# Panel 1: simple returns computed from the freshly downloaded check_df.
plt.subplot(3,1,1)
plt.plot(check_df1,label = "New Data")
plt.ylabel('Return USD ($)', fontsize=18)
plt.legend()
# Panel 2: tail of the original log-return series (df2).
plt.subplot(3,1,2)
plt.plot(df2[var_1+30:], label = "Original Data", color = "r")
plt.ylabel('Return USD ($)', fontsize=18)
plt.legend()
# Panel 3: tail of df4, the series with the forecast appended.
plt.subplot(3,1,3)
plt.plot(df4[var_1+30:], label = "Predicted Data", c = "g")
plt.xlabel('Dateset', fontsize=18)
plt.ylabel('Return USD ($)', fontsize=18)
plt.legend()