In [15]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import yfinance as yf
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score 
import math
from itertools import cycle

import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [16]:
#load in data
ticker = "AAPL"
start_date = "2021-01-01"
end_date = "2023-01-01"

apple_data = pd.DataFrame(yf.download(ticker, start = start_date, end = end_date)).reset_index()
apple_data.tail()

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
498,2022-12-23,130.919998,132.419998,129.639999,131.860001,131.658981,63814900
499,2022-12-27,131.380005,131.410004,128.720001,130.029999,129.831772,69007800
500,2022-12-28,129.669998,131.029999,125.870003,126.040001,125.847855,85438400
501,2022-12-29,127.989998,130.479996,127.730003,129.610001,129.412415,75703700
502,2022-12-30,128.410004,129.949997,127.43,129.929993,129.731918,77034200


In [17]:
#creates the test and train data by looking at the previous 10 trading days of open, high low, close, adj close and volume
#takes an average of the 10 days to get a result - can possibly use other methods other than average

def create_dataset(dataset, time_step=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-time_step-1):
      for j in range(dataset.shape[1]):
        a = np.mean(dataset[i:(i+time_step), j])   ###i=0, 0,1,2,3-----99   100 
        dataX.append(a)
      dataY.append(dataset[i + time_step, 4])
    return np.array(dataX).reshape(-1,6), np.array(dataY)

In [18]:
apple = np.array(apple_data[['Open', 'High', 'Low', 'Adj Close', 'Close', 'Volume']])

#scale the data - I use separate scalers due to the different matrix size and magnitudes 
scalar_close = MinMaxScaler(feature_range = (0,1))
scalar_feat = MinMaxScaler(feature_range = (0,1))
scalar_vol = MinMaxScaler(feature_range = (0,1))

close_scaled = scalar_close.fit_transform(apple[:,4].reshape(-1,1))
feat_scaled = scalar_feat.fit_transform(apple[:,:4].reshape(-1,4))
vol_scaled = scalar_vol.fit_transform(apple[:, -1].reshape(-1,1))

#scale and recombine into matrix
apple = np.hstack((feat_scaled, close_scaled, vol_scaled))


In [19]:
#split data into training and test sets
train_size = int(len(apple) * .65)
test_size = len(apple) - train_size

train_data = apple[:train_size, :]
test_data = apple[train_size:, :]

#look at previous 10 days - can change
time_step = 10 
x_train, y_train = create_dataset(train_data, time_step)
x_test, y_test = create_dataset(test_data, time_step)


In [58]:
#Run Random Forest Model - can change num of estimators(num_trees in forest)
regressor = RandomForestRegressor(n_estimators = 40)
regressor.fit(x_train, y_train)

In [59]:
train_predict=regressor.predict(x_train)
test_predict=regressor.predict(x_test)

train_predict = train_predict.reshape(-1,1)
test_predict = test_predict.reshape(-1,1)

print("Train data prediction:", train_predict.shape)
print("Test data prediction:", test_predict.shape)

Train data prediction: (315, 1)
Test data prediction: (166, 1)


In [60]:
#turn scaled prediction into unscaled predictions
train_predict = scalar_close.inverse_transform(train_predict)
test_predict = scalar_close.inverse_transform(test_predict)

#turn scaled train and test closing prices back into unscaled prices to compare
original_ytrain = scalar_close.inverse_transform(y_train.reshape(-1,1)) 
original_ytest = scalar_close.inverse_transform(y_test.reshape(-1,1)) 



In [61]:
print("Train data RMSE: ", math.sqrt(mean_squared_error(original_ytrain,train_predict)))
print("Train data MSE: ", mean_squared_error(original_ytrain,train_predict))
print("Test data MAE: ", mean_absolute_error(original_ytrain,train_predict))
print("-------------------------------------------------------------------------------------")
print("Test data RMSE: ", math.sqrt(mean_squared_error(original_ytest,test_predict)))
print("Test data MSE: ", mean_squared_error(original_ytest,test_predict))
print("Test data MAE: ", mean_absolute_error(original_ytest,test_predict))
print("-------------------------------------------------------------------------------------")
print("The train model score is", regressor.score(x_train, y_train))
print("The test model score is ", regressor.score(x_test, y_test))



Train data RMSE:  1.486449418381197
Train data MSE:  2.2095318734057985
Test data MAE:  1.0225604865664517
-------------------------------------------------------------------------------------
Test data RMSE:  7.10770100857272
Test data MSE:  50.51941362726566
Test data MAE:  5.753564624326777
-------------------------------------------------------------------------------------
The train model score is 0.9928087056404774
The test model score is  0.5292171976385203


In [66]:
#run tests to see which num_trees is best - all pretty similar 
for i in range(0,200,5):
  regressor = RandomForestRegressor(n_estimators = i)
  regressor.fit(x_train, y_train)
  train_predict=regressor.predict(x_train)
  test_predict=regressor.predict(x_test)
  train_predict = train_predict.reshape(-1,1)
  test_predict = test_predict.reshape(-1,1)
  train_predict = scalar_close.inverse_transform(train_predict)
  test_predict = scalar_close.inverse_transform(test_predict)
  original_ytrain = scalar_close.inverse_transform(y_train.reshape(-1,1)) 
  original_ytest = scalar_close.inverse_transform(y_test.reshape(-1,1)) 
  print("The train model score is", regressor.score(x_train, y_train))
  print("The test model score is ", regressor.score(x_test, y_test))




The train model score is 0.9922941615779903
The test model score is  0.5199890990875964
The train model score is 0.9924917672198045
The test model score is  0.5071336476415074
The train model score is 0.9923964360269921
The test model score is  0.5086121991591852
The train model score is 0.9922784560344509
The test model score is  0.49582957169569986
The train model score is 0.9917394701105895
The test model score is  0.511220092151307
The train model score is 0.9927150452471714
The test model score is  0.4991112735320872
The train model score is 0.9924640587417743
The test model score is  0.5091146541709435
The train model score is 0.9925543571987977
The test model score is  0.499033964982439
The train model score is 0.9924084941243663
The test model score is  0.508094852468465
The train model score is 0.9927238095497934
The test model score is  0.5122382229523161
The train model score is 0.9921334335720446
The test model score is  0.5147451128800762
The train model score is 0.9924124

In [42]:
#get columns for graphing purpocses
closing_prices = scalar_close.fit_transform(np.array(apple_data[['Close']]).reshape(-1,1))
close_stock = apple_data[["Date" , "Close"]] 


look_back=time_step
trainPredictPlot = np.empty_like(closing_prices)
trainPredictPlot[:, :] = np.nan
trainPredictPlot[look_back:len(train_predict)+look_back, :] = train_predict
print("Train predicted data: ", trainPredictPlot.shape)

# shift test predictions for plotting
testPredictPlot = np.empty_like(closing_prices)
testPredictPlot[:, :] = np.nan
testPredictPlot[len(train_predict)+(look_back*2)+1:len(closing_prices)-1, :] = test_predict
print("Test predicted data: ", testPredictPlot.shape)

names = cycle(['Original close price','Train predicted close price','Test predicted close price'])

#plot
plotdf = pd.DataFrame({'date': close_stock['Date'],
                       'original_close': close_stock['Close'],
                      'train_predicted_close': trainPredictPlot.reshape(1,-1)[0].tolist(),
                      'test_predicted_close': testPredictPlot.reshape(1,-1)[0].tolist()})

fig = px.line(plotdf,x=plotdf['date'], y=[plotdf['original_close'],plotdf['train_predicted_close'],
                                          plotdf['test_predicted_close']],
              labels={'value':'Stock price','date': 'Date'})
fig.update_layout(title_text='Comparision between original close price vs predicted close price',
                  plot_bgcolor='white', font_size=15, font_color='black', legend_title_text='Close Price')
fig.for_each_trace(lambda t:  t.update(name = next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

Train predicted data:  (503, 1)
Test predicted data:  (503, 1)


In [31]:
#create X dataset for prediction 
#it takes last two observations for day 1, and slowly decreases
#accuracy will then decrease the further out you go
def create_x_for_prediction(dataset, time_step=1):
    dataX = []
    for i in range(len(dataset)-time_step,len(dataset)):
      for j in range(dataset.shape[1]):
        a = np.mean(dataset[i:, j])   ###i=0, 0,1,2,3-----99   100 
        dataX.append(a)
    return np.array(dataX).reshape(-1,6)

In [43]:
x_pred = create_x_for_prediction(apple, 10)
y_pred= regressor.predict(x_pred)

In [49]:
#Graph the last 10 days and future 10 days predictions
pred_days = 10
time_step = 10
#pred days <= time_step

last_days=np.arange(1,time_step+1)
next_days=np.arange(time_step+1,time_step+pred_days+1)

#make empty matrixs
null_matrix = np.empty((len(last_days)+pred_days+1,1))
null_matrix[:] = np.nan
null_matrix = null_matrix.reshape(1,-1).tolist()[0]

#fill in values into empty matrix spots
last_days_value = null_matrix.copy()
next_days_value = null_matrix.copy()

next_days_value[time_step:] =scalar_close.inverse_transform(np.array(y_pred).reshape(-1,1)).reshape(1,-1).tolist()[0]
#print(next_days_value)

last_days_value[0:time_step+1] = scalar_close.inverse_transform(closing_prices[len(closing_prices)-time_step:]).reshape(1,-1).tolist()[0]
#print(last_days_value)


names = cycle(['Last 10 days close price','Predicted next 10 days close price'])

#plot
new_pred_plot = pd.DataFrame({
    'last_days':last_days_value,
    'next_days':next_days_value
})

fig = px.line(new_pred_plot,x=new_pred_plot.index, y=[new_pred_plot['last_days'],
                                                      new_pred_plot['next_days']],
              labels={'value': 'Stock price','index':'Time'})
              

fig.update_layout(title_text='Compare last 10 days vs next 10 days',
                  plot_bgcolor='white', font_size=15, font_color='black',legend_title_text='Close Price')
fig.for_each_trace(lambda t:  t.update(name = next(names)))


fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()


In [47]:
#Graph the past data + prediction in one figure

whole_stock=closing_prices.tolist()
whole_stock.extend((np.array(y_pred).reshape(-1,1)).tolist())
rfdf=scalar_close.inverse_transform(whole_stock).reshape(1,-1).tolist()[0]

names = cycle(['Close price'])

#plot
fig = px.line(rfdf,labels={'value': 'Stock price','index': 'Timestamp'})
fig.update_layout(title_text='Plotting whole closing stock price with prediction',
                  plot_bgcolor='white', font_size=15, font_color='black',legend_title_text='Stock')
fig.for_each_trace(lambda t:  t.update(name = next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()