Multiple Linear Regression on Stock Price Data (is far from market prediction as its using current day's open, high, low, and volume)

Import libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import yfinance as yf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

Import dataset

In [2]:
stock_data = yf.download("AAPL", start="2020-01-01", end="2024-01-01")
#add moving average
stock_data["MA_5"] = stock_data["Close"].rolling(window=5).mean()
stock_data["MA_20"] = stock_data["Close"].rolling(window=20).mean()
#due to rolling window the first few are NaN
stock_data = stock_data.dropna()
print(stock_data)

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed

Price            Close        High         Low        Open     Volume  \
Ticker            AAPL        AAPL        AAPL        AAPL       AAPL   
Date                                                                    
2020-01-30   78.410370   78.463633   77.170796   77.604165  126743200   
2020-01-31   74.933746   78.122258   74.638379   77.698575  199588400   
2020-02-03   74.727966   75.897328   73.168813   73.672387  173788400   
2020-02-04   77.194992   77.386257   75.931207   76.337940  136616400   
2020-02-05   77.824471   78.625836   77.219209   78.325621  118826800   
...                ...         ...         ...         ...        ...   
2023-12-22  192.444595  194.243791  191.818350  194.015153   37122800   
2023-12-26  191.897858  192.732841  191.679169  192.454513   28919300   
2023-12-27  191.997284  192.345201  189.949580  191.341234   48087700   
2023-12-28  192.424713  193.498269  192.017156  192.981369   34049900   
2023-12-29  191.380951  193.239786  190.585722  192




Define Features and Target

In [3]:
X = stock_data[["Open", "High", "Low", "Volume", "MA_5", "MA_20"]].values
y = stock_data["Close"].values

Split into Training and Testing sets

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

Scale the data (Optional)

In [5]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

Train the model

In [6]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)

Make predictions and Compare

In [7]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision = 2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[146.19 146.41]
 [123.61 125.01]
 [ 67.61  69.25]
 [144.23 143.86]
 [141.39 142.31]
 [161.81 162.49]
 [146.23 146.68]
 [130.9  131.96]
 [192.44 192.27]
 [163.09 162.12]
 [124.43 124.35]
 [124.23 125.12]
 [147.7  149.02]
 [148.04 148.68]
 [155.87 155.16]
 [ 77.9   77.73]
 [181.68 181.58]
 [150.48 150.79]
 [127.43 127.59]
 [143.66 143.8 ]
 [141.   140.89]
 [124.97 125.07]
 [ 58.85  58.59]
 [168.   170.02]
 [ 61.86  58.78]
 [171.51 171.77]
 [143.49 143.14]
 [192.08 191.9 ]
 [150.53 151.07]
 [178.29 176.88]
 [126.68 127.04]
 [115.04 115.33]
 [ 86.6   86.05]
 [175.03 174.94]
 [130.88 130.83]
 [ 77.67  77.68]
 [ 68.13  67.21]
 [150.43 150.82]
 [120.04 118.36]
 [132.35 131.74]
 [160.99 162.12]
 [169.29 170.03]
 [176.32 176.3 ]
 [163.68 163.56]
 [164.65 163.37]
 [ 91.21  90.97]
 [181.24 181.56]
 [120.61 118.62]
 [144.47 144.02]
 [190.76 190.29]
 [175.36 175.86]
 [190.99 190.17]
 [134.85 135.84]
 [173.1  173.22]
 [145.09 145.5 ]
 [170.37 171.77]
 [141.79 142.14]
 [172.21 172.49]
 [132.12 131.9