<a href="https://colab.research.google.com/github/leomercanti/SP500_Predictors_LSTM_Linear_Regression/blob/main/SP500_Predictors_LSTM_Linear_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **S&P 500 Prediction Project - Linear Regression and LSTM Models**



### **Import Libraries:**

Import the necessary libraries for data fetching, preprocessing, modeling, and visualization.

In [None]:
# Import necessary libraries
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input

In [None]:
# Install keras-tuner (provides the HyperModel and RandomSearch classes imported in the next cell)
!pip install keras-tuner

In [None]:
from keras_tuner import HyperModel, RandomSearch

### **Step 1: Fetch Historical Data**

Use yfinance to download historical data for the S&P 500 and extract yearly closing prices.

In [None]:
# Pull the complete available price history for the S&P 500 index via yfinance
SP500_SYMBOL = '^GSPC'
sp500 = yf.Ticker(SP500_SYMBOL)
data = sp500.history(period='max')

In [None]:
# Extract yearly closing prices: the last available close within each calendar year.
# pandas 2.2 renamed the year-end frequency alias from 'Y' to 'YE' and deprecated
# the old spelling, so prefer 'YE' and fall back to 'Y' only on older pandas.
# NOTE(review): if the history runs up to today, the final row is a year-to-date
# close rather than a true year-end close — confirm this is acceptable.
close_prices = data['Close']
try:
    yearly_data = close_prices.resample('YE').last().dropna()
except ValueError:  # pandas < 2.2 does not recognise the 'YE' alias
    yearly_data = close_prices.resample('Y').last().dropna()

### **Step 2: Visualize Historical Data**

Plot the yearly closing prices to understand the trend visually.

In [None]:
# Draw the yearly closing series on an explicit Axes object
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(yearly_data, label='S&P 500 Yearly Closing Price', color='blue')
ax.set_title('S&P 500 Yearly Closing Price (Historical Data)')
ax.set_xlabel('Year')
ax.set_ylabel('Closing Price')
ax.legend()
ax.grid(True)
plt.show()

### **Step 3: Prepare Data for Modeling**

Compute year-over-year returns, then scale the closing price and return columns to the [0, 1] range before fitting the models.

In [None]:
# Promote the closing-price series to a DataFrame and add its year-over-year percentage change
yearly_data = pd.DataFrame(yearly_data).assign(
    Returns=lambda frame: frame['Close'].pct_change()
)

In [None]:
# Min-max scale both model inputs; rows with a missing value are dropped first
# (at minimum the first year, whose return is NaN because it has no prior year).
feature_frame = yearly_data[['Close', 'Returns']].dropna()
scaler = MinMaxScaler()
scaler.fit(feature_frame)
scaled_data = scaler.transform(feature_frame)
# NOTE(review): the scaler is fit on every year, including the years later held
# out for testing, so test-set information leaks into the scaling.

In [None]:
# Pair each year's scaled (Close, Returns) row with the following year's scaled Close
X, y = scaled_data[:-1], scaled_data[1:, 0]

### **Step 4: Split Data into Train and Test Sets**

Divide the dataset into training and testing subsets.

In [None]:
# Ordered 80/20 split: shuffle=False keeps the final 20% of samples as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

### **Step 5: Train Linear Regression Model**

Fit a linear regression model on the training data.

In [None]:
# Fit a linear regression mapping a year's scaled (Close, Returns) to the next year's scaled Close
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

### **Step 6: Evaluate Linear Regression Model**

Make predictions using the test set and calculate MSE.

In [None]:
# Score the linear model on the held-out samples (MSE is in scaled units, not price units)
y_pred_lr = lr_model.predict(X_test)
mse_lr = mean_squared_error(y_true=y_test, y_pred=y_pred_lr)
print(f"Linear Regression MSE: {mse_lr}")

### **Step 7: Prepare Data for LSTM**

Reshape the data for the LSTM model, which requires 3D input of shape (samples, timesteps, features); each sample here is a single timestep.

In [None]:
# Insert a length-1 timestep axis: (samples, features) -> (samples, 1, features)
X_train_lstm = np.expand_dims(X_train, axis=1)
X_test_lstm = np.expand_dims(X_test, axis=1)

### **Step 8: Build and Train LSTM Model**

Construct and train the LSTM model.

In [None]:
# Two stacked 50-unit LSTM layers followed by a single-output dense layer
n_timesteps, n_features = X_train_lstm.shape[1:]
model = Sequential([
    Input(shape=(n_timesteps, n_features)),
    LSTM(50, return_sequences=True),  # emit the full sequence for the next LSTM layer
    LSTM(50),
    Dense(1),
])

In [None]:
# Train with the Adam optimizer on mean-squared error for 50 epochs
model.compile(loss='mse', optimizer='adam')
model.fit(x=X_train_lstm, y=y_train, batch_size=16, epochs=50, verbose=1)

### **Step 9: Evaluate LSTM Model**

Calculate MSE for the LSTM predictions.

In [None]:
# Predict the scaled next-year close for each test sample (one output column per sample)
y_pred_lstm = model.predict(X_test_lstm)

In [None]:
# Convert the true test targets back to price units.
# The scaler was fit on two columns (Close, Returns), so pad a dummy Returns
# column and keep only the Close column after inverting.
# The targets are y_test (next year's close). X_test[:, 0] is the *current*
# year's close and would score the predictions against the wrong year.
y_test_lstm = scaler.inverse_transform(np.column_stack((y_test, np.zeros(len(y_test)))))[:, 0]

In [None]:
# Map the LSTM outputs back to price units via a zero-padded two-column array
scaled_close_pred = y_pred_lstm.flatten()
returns_padding = np.zeros(y_pred_lstm.shape[0])
padded_predictions = np.column_stack((scaled_close_pred, returns_padding))
y_pred_lstm_inverse = scaler.inverse_transform(padded_predictions)[:, 0]

In [None]:
# LSTM error measured on the inverse-transformed (price-unit) values
mse_lstm = mean_squared_error(y_true=y_test_lstm, y_pred=y_pred_lstm_inverse)
print(f"LSTM MSE: {mse_lstm}")

### **Step 10: Visualize Predictions**

In [None]:
# Plot in real price units against calendar years. The models operate on
# min-max scaled values, which would not match the 'Closing Price' / 'Year'
# axis labels if plotted directly against the sample index.
# For the default (0, 1) feature range: price = scaled * data_range_ + data_min_.
close_min, close_range = scaler.data_min_[0], scaler.data_range_[0]
# The test targets are the final len(y_test) years of the yearly series.
test_years = yearly_data.index[-len(y_test):]

plt.figure(figsize=(10, 6))
plt.plot(test_years, y_test * close_range + close_min, label='Actual', color='blue')
plt.plot(test_years, y_pred_lr * close_range + close_min, label='LR Predicted', color='red', linestyle='dashed')
plt.plot(test_years, y_pred_lstm.ravel() * close_range + close_min, label='LSTM Predicted', color='orange', linestyle='dotted')
plt.title('Actual vs Predicted S&P 500 Yearly Closing Prices')
plt.xlabel('Year')
plt.ylabel('Closing Price')
plt.legend()
plt.show()

### **Step 11: Hyperparameter Tuning for LSTM**

Use Keras Tuner to optimize hyperparameters for the LSTM model.

In [None]:
class LSTMHyperModel(HyperModel):
    """Two-layer LSTM regressor whose layer width is chosen by the tuner."""

    def build(self, hp):
        """Build and compile the model for one tuner trial.

        Args:
            hp: KerasTuner hyperparameter container for the current trial.

        Returns:
            A compiled ``Sequential`` model (Adam optimizer, MSE loss). The input
            shape is read from the module-level ``X_train_lstm`` array.
        """
        # A repeated hyperparameter name resolves to one shared value, so sample
        # 'units' once and use it for both LSTM layers.
        lstm_units = hp.Int('units', min_value=32, max_value=128, step=32)
        window_shape = (X_train_lstm.shape[1], X_train_lstm.shape[2])

        network = Sequential()
        network.add(Input(shape=window_shape))
        network.add(LSTM(lstm_units, return_sequences=True))
        network.add(LSTM(lstm_units))
        network.add(Dense(1))
        network.compile(optimizer='adam', loss='mse')
        return network

# Random search over the LSTM hyperparameters, ranked by validation loss
tuner = RandomSearch(
    LSTMHyperModel(),
    objective='val_loss',
    max_trials=5,
    executions_per_trial=1,
    directory='lstm_tuning',
    project_name='sp500_lstm_tuning'
)

# Select hyperparameters on a slice of the training data, not on the test set:
# validating on (X_test_lstm, y_test) would leak the test set into model
# selection and make any later test-set score optimistic.
# validation_split holds out the last 20% of the training samples.
tuner.search(X_train_lstm, y_train, epochs=50, batch_size=16, validation_split=0.2)
best_model = tuner.get_best_models(num_models=1)[0]

### **Step 12: Combine Predictions from Different Models**

Average predictions from the Linear Regression and LSTM models.

In [None]:
# Simple ensemble: equal-weight average of the two models' scaled predictions
lstm_pred_1d = y_pred_lstm.ravel()
combined_predictions = 0.5 * (y_pred_lr + lstm_pred_1d)
mse_combined = mean_squared_error(y_true=y_test, y_pred=combined_predictions)
print(f"Combined Model MSE: {mse_combined}")

### **Step 13: Visualize Combined Predictions**

Plot actual vs. predicted values from both models and the combined predictions.

In [None]:
# Plot in real price units against calendar years. The predictions are in
# min-max scaled units, which would not match the 'Closing Price' / 'Year'
# axis labels if plotted directly against the sample index.
# For the default (0, 1) feature range: price = scaled * data_range_ + data_min_.
close_min, close_range = scaler.data_min_[0], scaler.data_range_[0]
# The test targets are the final len(y_test) years of the yearly series.
test_years = yearly_data.index[-len(y_test):]

plt.figure(figsize=(10, 6))
plt.plot(test_years, y_test * close_range + close_min, label='Actual', color='blue')
plt.plot(test_years, y_pred_lr * close_range + close_min, label='LR Predicted', color='red', linestyle='dashed')
plt.plot(test_years, y_pred_lstm.ravel() * close_range + close_min, label='LSTM Predicted', color='orange', linestyle='dotted')
plt.plot(test_years, combined_predictions * close_range + close_min, label='Combined Predicted', color='green', linestyle='dashdot')
plt.title('Actual vs Combined Predicted S&P 500 Yearly Closing Prices')
plt.xlabel('Year')
plt.ylabel('Closing Price')
plt.legend()
plt.show()

### **Step 14: Generate Future Predictions**

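Use the linear regression model recursively to predict the next ten years, feeding each prediction back in as the input for the following year, and print the predicted values.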

In [None]:
future_years = 10  # Number of years to predict

# Start from the last known (scaled) data point. Copy it first: reshape() on a
# row slice returns a view, so updating it in place would silently overwrite
# the last row of scaled_data.
last_data_point = scaled_data[-1].copy().reshape(1, -1)
# Last known close in price units, used to derive the next year's return
previous_close = scaler.inverse_transform(last_data_point)[0, 0]

for i in range(future_years):
    future_pred = lr_model.predict(last_data_point)  # Scaled close for the next year

    # Inverse transform the prediction (the Returns column is a dummy placeholder)
    predicted_value = scaler.inverse_transform(np.array([[future_pred[0], 0]]))[0][0]
    print(f"Predicted closing price for year {i+1}: {predicted_value}")

    # Feed the forecast back in as the next input. Both features are refreshed:
    # the return implied by the forecast replaces the stale last-observed return.
    implied_return = predicted_value / previous_close - 1
    next_features = pd.DataFrame(
        [[predicted_value, implied_return]], columns=['Close', 'Returns']
    )
    last_data_point = scaler.transform(next_features)
    previous_close = predicted_value