In [27]:
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
import yfinance as yf
import numpy as np
from datetime import datetime

# Define the ticker symbol
tickerSymbol = 'QQQ'

# Get data on this ticker
tickerData = yf.Ticker(tickerSymbol)

# Get today's date as a string in the format 'YYYY-MM-DD'
today = datetime.now().strftime('%Y-%m-%d')

# Get the historical prices for this ticker
df = tickerData.history(period='1d', start='2010-1-1', end=today)

# Number of past days to use
N = 21

# Define the degree of the polynomial and the number of PCA components
degree = 3
n_components = 20

# Initialize the PolynomialFeatures transformer and PCA transformer
poly = PolynomialFeatures(degree=degree)
pca = PCA(n_components=n_components)

# Add lags for N past days to the data
for i in range(N):
    df['Open_lag_' + str(i+1)] = df['Open'].pct_change().shift(i+1)
    df['High_lag_' + str(i+1)] = df['High'].pct_change().shift(i+1)
    df['Low_lag_' + str(i+1)] = df['Low'].pct_change().shift(i+1)
    df['Close_lag_' + str(i+1)] = df['Close'].pct_change().shift(i+1)
    df['Volume_lag_' + str(i+1)] = df['Volume'].pct_change().shift(i+1)

# Drop the missing values
df = df.dropna()

# Define future X days
X_days = 5

# Initialize the DataFrame to store predictions
predictions = pd.DataFrame(index=range(X_days), columns=['Open', 'High', 'Low', 'Close', 'Volume'])

# Loop for X days
for i in range(X_days):
    # The columns 'Open', 'High', 'Low', 'Close', and 'Volume' are our targets
    y = df[['Open', 'High', 'Low', 'Close', 'Volume']].pct_change()
    
    # Drop the first row from y
    y = y.iloc[1:]

    # The lag columns are our features
    X = df.drop(['Open', 'High', 'Low', 'Close', 'Volume'], axis=1)
    
    # Match the shape of X to y by also dropping the first row from X
    X = X.iloc[1:]

    # Transform the features into polynomial features
    X_poly = poly.fit_transform(X)
    
    # Perform PCA on the polynomial features
    X_pca = pca.fit_transform(X_poly)

    # Create a regression model
    model = LinearRegression()

    # Fit the model to the data
    model.fit(X_pca, y)

    # Use the model to make predictions on the last day + predicted days
    last_day_features = df.tail(N).drop(['Open', 'High', 'Low', 'Close', 'Volume'], axis=1)
    
    # Transform the last day features into polynomial features
    last_day_features_poly = poly.transform(last_day_features)
    
    # Perform PCA on the last day polynomial features
    last_day_features_pca = pca.transform(last_day_features_poly)
    next_day_prediction = model.predict(last_day_features_pca)

    # Store the predictions
    predictions.iloc[i] = next_day_prediction[0]

    # Create a new DataFrame row from the predictions and append to the original data
    new_data = pd.DataFrame([next_day_prediction[0]], index=[df.index[-1] + pd.DateOffset(1)], columns=['Open', 'High', 'Low', 'Close', 'Volume'])
    df = pd.concat([df, new_data])

    # Shift the lag columns for the next iteration
    for i in range(N):
        df['Open_lag_' + str(i+1)] = df['Open'].shift(i+1)
        df['High_lag_' + str(i+1)] = df['High'].shift(i+1)
        df['Low_lag_' + str(i+1)] = df['Low'].shift(i+1)
        df['Close_lag_' + str(i+1)] = df['Close'].shift(i+1)
        df['Volume_lag_' + str(i+1)] = df['Volume'].shift(i+1)

    # Drop the missing values for the next iteration
    df = df.dropna()

print(predictions)


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

       Open      High       Low     Close    Volume
0  0.000857  0.000766  0.000669  0.000654  0.061814
1   0.00034  0.000019   0.00026 -0.000242  0.051685
2  0.000314  0.000002  0.000243 -0.000263  0.051981
3  0.000284 -0.000023  0.000402 -0.000222   0.04964
4 -0.000293 -0.000442  0.000205 -0.000299  0.025728


In [28]:
import plotly.graph_objects as go

# Define the period to plot
plot_period = 3*N

# Calculate the actual prices from the predicted percentage changes
pred_prices_open = df['Open'].iloc[-1] * (1 + predictions['Open']).cumprod()
pred_prices_high = df['High'].iloc[-1] * (1 + predictions['High']).cumprod()
pred_prices_low = df['Low'].iloc[-1] * (1 + predictions['Low']).cumprod()
pred_prices_close = df['Close'].iloc[-1] * (1 + predictions['Close']).cumprod()

# Create a data frame for the predicted data
pred_data = pd.DataFrame({
    'Date': pd.date_range(start=df.index[-1], periods=X_days + 1)[1:],
    'Open': pred_prices_open,
    'High': pred_prices_high,
    'Low': pred_prices_low,
    'Close': pred_prices_close,
})

# Concatenate the actual and predicted data
full_data = pd.concat([
    df[['Open', 'High', 'Low', 'Close']].tail(plot_period),
    pred_data.set_index('Date')
])

# Create a candlestick chart
fig = go.Figure(data=[go.Candlestick(
    x=full_data.index,
    open=full_data['Open'],
    high=full_data['High'],
    low=full_data['Low'],
    close=full_data['Close']
)])

# Set chart title
fig.update_layout(title='Candlestick chart of actual and predicted prices')

# Show chart
fig.show()