## Import Libraries

In [None]:
import yfinance as yf
import pandas as pd
import os
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

## Load and Clean S&P 500 Data

In [None]:
def load_sp500_data():
    """Fetch the complete "^GSPC" price history through yfinance.

    Returns:
        The history DataFrame for the maximum available period, without the
        "Dividends" and "Stock Splits" columns.
    """
    unused_columns = ["Dividends", "Stock Splits"]
    history = yf.Ticker("^GSPC").history(period="max")
    return history.drop(columns=unused_columns)

## Plot Closing Price Data

In [None]:
def plot_data(df, column, title, xlabel, ylabel, x_locator, x_formatter):
    """Show a line chart of one column of ``df`` plotted against its index.

    Args:
        df: DataFrame whose index supplies the x values.
        column: Name of the column in ``df`` to draw on the y axis.
        title: Chart title.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
        x_locator: Major tick locator installed on the x axis.
        x_formatter: Major tick formatter installed on the x axis.
    """
    figure, axes = plt.subplots(figsize=(10, 6))
    axes.plot(df.index, df[column])

    # Title and axis labels in one call.
    axes.set(title=title, xlabel=xlabel, ylabel=ylabel)

    # Caller-supplied tick placement and label format for the x axis.
    x_axis = axes.xaxis
    x_axis.set_major_locator(x_locator)
    x_axis.set_major_formatter(x_formatter)

    # Rotate/align the date tick labels so they do not overlap.
    figure.autofmt_xdate()
    plt.show()

## Create Predictors

In [None]:
def create_predictors(df, horizons):
    """Build rolling close-ratio and trend features for each horizon.

    For every ``horizon`` two columns are added:

    * ``Close_Ratio_{horizon}``: "Close" divided by the mean of "Close" over
      the trailing window of ``horizon`` rows (current row included).
    * ``Trend_{horizon}``: the sum of "Target" over the ``horizon`` rows that
      immediately precede the current row.

    Args:
        df: DataFrame containing at least "Close" and "Target" columns. It is
            left unmodified; the features are added to a copy.
        horizons: Iterable of window sizes, in rows.

    Returns:
        A ``(features, new_predictors)`` tuple. ``features`` is a copy of
        ``df`` with the new columns added and every row containing a NaN in
        any column removed. ``new_predictors`` lists the new column names in
        the order they were created.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    features = df.copy()
    close = features["Close"]
    target = features["Target"]

    new_predictors = []
    for horizon in horizons:
        ratio_column = f"Close_Ratio_{horizon}"
        trend_column = f"Trend_{horizon}"

        # Only "Close" is averaged: rolling over the whole frame does needless
        # work and fails when the frame holds a non-numeric column.
        features[ratio_column] = close / close.rolling(horizon).mean()

        # shift(1) moves the window back one row so it ends on the previous row.
        features[trend_column] = target.rolling(horizon).sum().shift(1)

        new_predictors.extend([ratio_column, trend_column])

    return features.dropna(), new_predictors

## Function to predict with a model

In [None]:
def predict(train, test, predictors, model, threshold=0.6):
    """Fit ``model`` on ``train`` and return thresholded predictions for ``test``.

    Args:
        train: DataFrame holding the ``predictors`` columns and a "Target" column.
        test: DataFrame holding the ``predictors`` columns and a "Target" column.
        predictors: List of feature column names passed to the model.
        model: Object exposing ``fit(X, y)`` and ``predict_proba(X)``.
        threshold: Minimum probability required to predict 1. Defaults to 0.6,
            the value that was previously hard-coded.

    Returns:
        DataFrame indexed like ``test`` with two columns: "Target" (the actual
        labels from ``test``) and "Predictions" (0 or 1).
    """
    model.fit(train[predictors], train["Target"])

    # Column 1 of predict_proba is taken as the probability of the positive
    # class; this assumes the model was fit on labels {0, 1}.
    positive_proba = model.predict_proba(test[predictors])[:, 1]
    labels = (positive_proba >= threshold).astype(int)

    predicted = pd.Series(labels, index=test.index, name="Predictions")
    return pd.concat([test["Target"], predicted], axis=1)

## Function to backtest the model

In [None]:
def backtest(data, model, predictors, start=2500, step=250):
    """Evaluate ``model`` with an expanding-window walk through ``data``.

    For each split point ``i`` in ``range(start, len(data), step)`` the model
    is trained on rows ``[0, i)`` and used to predict rows ``[i, i + step)``
    via ``predict``.

    Args:
        data: DataFrame with the ``predictors`` columns and a "Target" column,
            ordered so that earlier rows come first.
        model: Estimator forwarded to ``predict``.
        predictors: List of feature column names forwarded to ``predict``.
        start: Number of leading rows used for the first training window.
        step: Number of rows in each test window.

    Returns:
        The per-window results of ``predict`` concatenated in order.

    Raises:
        ValueError: If ``step`` is not positive, or ``data`` has no rows
            beyond ``start`` so that no test window can be formed.
    """
    n_rows = data.shape[0]

    # Fail early with a clear message instead of the opaque
    # "No objects to concatenate" that pd.concat raises on an empty list.
    if step <= 0:
        raise ValueError(f"step must be a positive number of rows, got {step}")
    if n_rows <= start:
        raise ValueError(
            f"backtest needs more than start={start} rows to form a test "
            f"window, but data has only {n_rows}"
        )

    all_predictions = []
    for split in range(start, n_rows, step):
        # Copies keep each fold independent of ``data``.
        train = data.iloc[:split].copy()
        test = data.iloc[split:split + step].copy()
        all_predictions.append(predict(train, test, predictors, model))
    return pd.concat(all_predictions)

## Main Execution

In [None]:
# Fetch the S&P 500 price history that the rest of the notebook works on.
sp500 = load_sp500_data()

## Plot Closing Price

In [None]:
# Chart the closing price, with a labelled year tick every five years.
plot_data(
    df=sp500,
    column="Close",
    title="S&P 500 Closing Price",
    xlabel="Date",
    ylabel="Closing Price",
    x_locator=mdates.YearLocator(5),
    x_formatter=mdates.DateFormatter('%Y'),
)

In [None]:
# Keep only rows dated 1990-01-01 or later; .copy() makes the result an
# independent frame rather than a slice of the full history.
sp500 = sp500.loc["1990-01-01":].copy()

## Create Target Column and Restrict Time Range

In [None]:
# "Tomorrow" is the next row's close; "Target" is 1 when that close is higher
# than the current one. The final row has no next close (NaN), so its
# comparison is False and its Target is 0.
sp500 = (
    sp500
    .assign(Tomorrow=lambda frame: frame["Close"].shift(-1))
    .assign(Target=lambda frame: (frame["Tomorrow"] > frame["Close"]).astype(int))
    .loc["1990-01-01":]  # keep rows dated 1990-01-01 or later
    .copy()
)

In [None]:
# Display the prepared frame as the cell output.
sp500

In [None]:
# Baseline model: a random forest trained on the raw price/volume columns.
predictors = ["Close", "Volume", "High", "Low"]

# Hold out the most recent 100 rows; everything before them is training data.
train, test = sp500.iloc[:-100], sp500.iloc[-100:]

# random_state is fixed so repeated runs build the same forest.
model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=100,
    random_state=1,
)
model.fit(train[predictors], train["Target"])

## Add rolling averages and trends as new predictors

In [None]:
# Rolling-window sizes, in rows, for the ratio and trend features.
horizons = [2, 5, 60, 250, 1000]
# Rebind sp500 to the returned feature frame; new_predictors lists the added columns.
sp500, new_predictors = create_predictors(sp500, horizons)

## Update model with more trees and less restrictive splitting

In [None]:
# Replaces the earlier model: 200 trees (up from 100) and a min_samples_split
# of 50 (down from 100); random_state stays fixed at 1.
model = RandomForestClassifier(n_estimators=200, min_samples_split=50, random_state=1)

## Backtest with new predictors

In [None]:
# Backtest using only the engineered columns, with backtest's default start and step.
predictions = backtest(sp500, model, new_predictors)

## Evaluate results

In [None]:
# Share of each actual label among the backtested rows.
target_distribution = predictions["Target"].value_counts() / len(predictions)

# Precision of the predicted labels against the actual ones.
precision = precision_score(predictions["Target"], predictions["Predictions"])

# How many rows were predicted as each label.
prediction_counts = predictions["Predictions"].value_counts()

print(
    f"Prediction counts:\n{prediction_counts}\n",
    f"Precision score: {precision:.4f}",
    f"Target distribution:\n{target_distribution}\n",
    sep="\n",
)

## Plot predictions vs actual

In [None]:
def plot_predictions_vs_actual(predictions):
    """Show every column of ``predictions`` as a line on one shared chart.

    Args:
        predictions: DataFrame to plot against its index; the x axis is
            formatted as dates, with a major tick every tenth month
            labelled as ``YYYY-MM``.
    """
    figure, axes = plt.subplots(figsize=(10, 6))
    predictions.plot(ax=axes)

    # Set the labels after plotting so they replace any that the plot call set.
    axes.set(
        title="Predictions vs Actual",
        xlabel="Date",
        ylabel="Target / Predictions",
    )

    date_axis = axes.xaxis
    date_axis.set_major_locator(mdates.MonthLocator(interval=10))
    date_axis.set_major_formatter(mdates.DateFormatter('%Y-%m'))

    # Rotate/align the date tick labels so they do not overlap.
    figure.autofmt_xdate()
    plt.show()

# Chart the backtest output: actual targets alongside the model's predictions.
plot_predictions_vs_actual(predictions)

In [None]:
# Ideas for improving the model: other exchanges trade overnight, and indices besides
# the S&P 500 are open then; look at their prices and see whether they are correlated
# with the S&P 500 and can help predict it. Add news (e.g. articles about general
# macroeconomic conditions such as interest rates and inflation). Add key components
# such as key stocks and key sectors (e.g. if tech goes down, the S&P 500 may go down
# later). Increase the resolution: hourly, minute-by-minute, or tick data. Maybe try
# sentiment analysis as well.