Importing Python Libraries

In [40]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input

In [60]:
data = pd.read_csv('data/apple_stock_data.csv')

In [61]:
data.head()

Unnamed: 0,Date,Close,Volume,Open,High,Low
0,02/28/2020,$273.36,106721200,$257.26,$278.41,$256.37
1,02/27/2020,$273.52,80151380,$281.1,$286,$272.96
2,02/26/2020,$292.65,49678430,$286.53,$297.88,$286.5
3,02/25/2020,$288.08,57668360,$300.95,$302.53,$286.13
4,02/24/2020,$298.18,55548830,$297.26,$304.18,$289.23


convert date column to a datetime type, setting it as index and focus on close price

In [62]:
data['Date'] = pd.to_datetime(data['Date'], dayfirst=True)
data.set_index('Date', inplace=True)
data = data[['Close']]

  data['Date'] = pd.to_datetime(data['Date'], dayfirst=True)


KeyError: "None of [Index(['Close'], dtype='object')] are in the [columns]"

# LSTM (Long Short Term Memory)

Scaling between 0 and 1 

In [54]:
scaler = MinMaxScaler(feature_range=(0, 1))
data['Close'] = scaler.fit_transform(data[['Close']])

sequence of defined length

In [55]:
def create_sequences(data, seq_length=60):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(data[i+seq_length])
    return np.array(X), np.array(y)

seq_length = 60
X, y = create_sequences(data['Close'].values, seq_length)

split training(80%) and testing(20%)

In [33]:
train_size = int(len(X) * 0.8)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

Building LSTM model

In [12]:
lstm_model = Sequential()
lstm_model.add(Input(shape=(X_train.shape[1], 1)))
lstm_model.add(LSTM(units=50, return_sequences=True))
lstm_model.add(LSTM(units=50))
lstm_model.add(Dense(1))

Compiling with optimizer and loss function and fit into the training data

In [13]:
lstm_model.compile(optimizer='adam', loss='mean_squared_error')
lstm_model.fit(X_train, y_train, epochs=20, batch_size=32)

Epoch 1/20
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 40ms/step - loss: 0.0099
Epoch 2/20
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 36ms/step - loss: 1.1246e-04
Epoch 3/20
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 39ms/step - loss: 9.1907e-05
Epoch 4/20
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 39ms/step - loss: 9.0613e-05
Epoch 5/20
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 38ms/step - loss: 9.2575e-05
Epoch 6/20
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 40ms/step - loss: 8.6486e-05
Epoch 7/20
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 43ms/step - loss: 1.0840e-04
Epoch 8/20
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 42ms/step - loss: 6.5048e-05
Epoch 9/20
[1m277/277[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 42ms/step - loss: 5.4219e-05
Epoch 10/20
[1m277/277[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x20eeae9fcd0>

# Linear regression Model

Generating lagged feature for linear regression

In [14]:
data['Lag_1'] = data['Close'].shift(1)
data['Lag_2'] = data['Close'].shift(2)
data['Lag_3'] = data['Close'].shift(3)
data = data.dropna()

split into training and testing

In [15]:
X_lin = data[['Lag_1', 'Lag_2', 'Lag_3']]
y_lin = data['Close']
X_train_lin, X_test_lin = X_lin[:train_size], X_lin[train_size:]
y_train_lin, y_test_lin = y_lin[:train_size], y_lin[train_size:]

training model

In [16]:
lin_model = LinearRegression()
lin_model.fit(X_train_lin, y_train_lin)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


Prediction using LSTM,
Inverse transform the scaled prediction

In [17]:
X_test_lstm = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
lstm_predictions = lstm_model.predict(X_test_lstm)
lstm_predictions = scaler.inverse_transform(lstm_predictions)

[1m70/70[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step


Prediction using Linear regression, Inverse transform the scaled prediction

In [18]:
lin_predictions = lin_model.predict(X_test_lin)
lin_predictions = scaler.inverse_transform(lin_predictions.reshape(-1, 1))

In [19]:
min_len = min(len(lstm_predictions), len(lin_predictions))
lstm_predictions = lstm_predictions[:min_len]
lin_predictions = lin_predictions[:min_len]

In [20]:
hybrid_predictions = (0.7 * lstm_predictions) + (0.3 * lin_predictions)

# Prediction using Hybrid model

LSTM for predicting for 10 days

In [56]:
lstm_future_predictions = []
last_sequence = X[-1].reshape(1, seq_length, 1)
for _ in range(10):
    lstm_pred = lstm_model.predict(last_sequence)[0, 0]
    lstm_future_predictions.append(lstm_pred)
    lstm_pred_reshaped = np.array([[lstm_pred]]).reshape(1, 1, 1)
    last_sequence = np.append(last_sequence[:, 1:, :], lstm_pred_reshaped, axis=1)
lstm_future_predictions = scaler.inverse_transform(np.array(lstm_future_predictions).reshape(-1, 1))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 649ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 85ms/step


Linear regression for predicting for 10 days

In [57]:
recent_data = scaler.transform(data[['Close']])[-3:].flatten()
lin_future_predictions = []
feature_names = ['Lag_1', 'Lag_2', 'Lag_3']
for _ in range(10):
    input_df = pd.DataFrame([recent_data], columns=feature_names)
    lin_pred = lin_model.predict(input_df)[0]
    lin_future_predictions.append(lin_pred)
    recent_data = np.append(recent_data[1:], lin_pred)
lin_future_predictions = scaler.inverse_transform(
    np.array(lin_future_predictions).reshape(-1, 1)
)

Combination of both models for predicting for 10 days

In [58]:
hybrid_future_predictions = (0.7 * lstm_future_predictions) + (0.3 * lin_future_predictions)

final dataframe

In [59]:
future_dates = pd.date_range(start=data.index[-1] + pd.Timedelta(days=1), periods=10)
predictions_df = pd.DataFrame({
    'Date': future_dates,
    'LSTM Predictions': lstm_future_predictions.flatten(),
    'Linear Regression Predictions': lin_future_predictions.flatten(),
    'Hybrid Model Predictions': hybrid_future_predictions.flatten()
})
print(predictions_df)

        Date  LSTM Predictions  Linear Regression Predictions  \
0 1980-12-13          0.304794                       0.004935   
1 1980-12-14          0.339089                       0.005085   
2 1980-12-15          0.388589                       0.005060   
3 1980-12-16          0.444043                       0.009620   
4 1980-12-17          0.501562                       0.009883   
5 1980-12-18          0.559515                       0.009811   
6 1980-12-19          0.617240                       0.014300   
7 1980-12-20          0.674485                       0.014672   
8 1980-12-21          0.731186                       0.014558   
9 1980-12-22          0.787357                       0.018975   

   Hybrid Model Predictions  
0                  0.214836  
1                  0.238888  
2                  0.273530  
3                  0.313716  
4                  0.354058  
5                  0.394604  
6                  0.436358  
7                  0.476541  
8             