In [1]:
# importing pandas
import pandas as pd

In [2]:
# load the Apple price history from CSV and preview its first five rows
df = pd.read_csv(filepath_or_buffer='./data/apple_stock_data.csv')
df.head(n=5)

Unnamed: 0,Date,Adj Close,Close,High,Low,Open,Volume
0,2023-11-02 00:00:00+00:00,176.665985,177.570007,177.779999,175.460007,175.520004,77334800
1,2023-11-03 00:00:00+00:00,175.750671,176.649994,176.820007,173.350006,174.240005,79763700
2,2023-11-06 00:00:00+00:00,178.31752,179.229996,179.429993,176.210007,176.380005,63841300
3,2023-11-07 00:00:00+00:00,180.894333,181.820007,182.440002,178.970001,179.179993,70530000
4,2023-11-08 00:00:00+00:00,181.958893,182.889999,183.449997,181.589996,182.350006,49340300


In [3]:
# parse the 'Date' strings into pandas datetime values (column position is kept)
df = df.assign(Date=pd.to_datetime(df['Date']))

In [4]:
# confirm the conversion worked by looking up the dtype now stored for 'Date'
df.dtypes['Date']

datetime64[ns, UTC]

In [5]:
# use the 'Date' column as the dataframe index (rebinding df rather than mutating in place)
df = df.set_index('Date')

In [6]:
# checking the dataframe index and its dtype (the 'Date' values set as the index in the previous cell)
df.index

DatetimeIndex(['2023-11-02 00:00:00+00:00', '2023-11-03 00:00:00+00:00',
               '2023-11-06 00:00:00+00:00', '2023-11-07 00:00:00+00:00',
               '2023-11-08 00:00:00+00:00', '2023-11-09 00:00:00+00:00',
               '2023-11-10 00:00:00+00:00', '2023-11-13 00:00:00+00:00',
               '2023-11-14 00:00:00+00:00', '2023-11-15 00:00:00+00:00',
               ...
               '2024-10-21 00:00:00+00:00', '2024-10-22 00:00:00+00:00',
               '2024-10-23 00:00:00+00:00', '2024-10-24 00:00:00+00:00',
               '2024-10-25 00:00:00+00:00', '2024-10-28 00:00:00+00:00',
               '2024-10-29 00:00:00+00:00', '2024-10-30 00:00:00+00:00',
               '2024-10-31 00:00:00+00:00', '2024-11-01 00:00:00+00:00'],
              dtype='datetime64[ns, UTC]', name='Date', length=252, freq=None)

In [7]:
# keep only the 'Close' column; the list selector keeps the result a one-column dataframe
df = df.loc[:, ['Close']]

In [8]:
# checking the new form of the dataframe (only 'Close' should remain, indexed by 'Date')
df.head()

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2023-11-02 00:00:00+00:00,177.570007
2023-11-03 00:00:00+00:00,176.649994
2023-11-06 00:00:00+00:00,179.229996
2023-11-07 00:00:00+00:00,181.820007
2023-11-08 00:00:00+00:00,182.889999


## Choosing the Hybrid Models


We will use LSTM (Long Short-Term Memory) and Linear Regression models for this task. We chose LSTM because it effectively captures sequential dependencies and patterns in time-series data, which makes it suitable for modelling stock price movements influenced by historical trends.

Linear Regression, on the other hand, is a straightforward model that captures simple linear relationships and long-term trends in data. By combining these two models into a hybrid approach, we leverage the LSTM’s ability to model complex time-dependent patterns alongside Linear Regression’s ability to identify and follow broader trends. This combination aims to create a more balanced and accurate prediction system.

### Data normalization


In [9]:
# importing required library
from sklearn.preprocessing import MinMaxScaler

In [10]:
# declaring and initializing the scaler
# feature_range=(0,1) maps the minimum of the fitted data to 0 and the maximum to 1
scaler = MinMaxScaler(feature_range=(0,1))

In [11]:
# applying the scaler to the data: fits the min/max on 'Close' and overwrites the
# column with the scaled values
# NOTE(review): the scaler is fitted on the full series, i.e. before the train/test
# split made later in the notebook, so the test period's min/max leak into the
# scaling -- consider fitting on the training portion only
df['Close'] = scaler.fit_transform(df[['Close']])

In [12]:
# checking the result: 'Close' now holds the scaled values instead of prices
df['Close'].head()

Date
2023-11-02 00:00:00+00:00    0.175853
2023-11-03 00:00:00+00:00    0.162983
2023-11-06 00:00:00+00:00    0.199077
2023-11-07 00:00:00+00:00    0.235311
2023-11-08 00:00:00+00:00    0.250280
Name: Close, dtype: float64

### Preparing data for LSTM

#### Creating sequences of a defined length (e.g., 60 days)


In [13]:
# importing the required library
import numpy as np

In [14]:
def create_sequences(data, seq_length=60):
    """Turn a series into sliding input windows and next-step targets.

    For every position ``i`` the input sample is ``data[i:i + seq_length]`` and
    the target is the element right after that window, ``data[i + seq_length]``.

    Parameters
    ----------
    data : array-like
        Ordered observations (oldest first). Converted with ``np.asarray``, so
        windows are always taken by position.
    seq_length : int, default 60
        Number of consecutive observations in each input window.

    Returns
    -------
    X : np.ndarray
        Windows, shape ``(len(data) - seq_length, seq_length, ...)``.
    y : np.ndarray
        Targets, shape ``(len(data) - seq_length, ...)``.
        When ``data`` has no more than ``seq_length`` elements, both arrays are
        empty but keep these shapes (``X`` is ``(0, seq_length, ...)``), so
        callers can still read ``X.shape[1]``.

    Raises
    ------
    ValueError
        If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be a positive integer")

    data = np.asarray(data)
    n_samples = len(data) - seq_length

    if n_samples <= 0:
        # not enough observations for a single window: return empty arrays whose
        # trailing dimensions still match what a non-empty result would have
        trailing = data.shape[1:]
        return (
            np.empty((0, seq_length) + trailing, dtype=data.dtype),
            np.empty((0,) + trailing, dtype=data.dtype),
        )

    X = np.array([data[start:start + seq_length] for start in range(n_samples)])
    # the targets are simply the series shifted by seq_length; copy so the result
    # does not alias the caller's array
    y = data[seq_length:].copy()

    return X, y

In [15]:
# window length (number of past scaled closes per sample), then the windows and their targets
seq_length = 60
X, y = create_sequences(df['Close'].to_numpy(), seq_length=seq_length)

In [16]:
# inspecting the input windows: one row per sample, each holding seq_length consecutive scaled closes
X

array([[0.1758535 , 0.16298258, 0.19907662, ..., 0.3836038 , 0.37395072,
        0.32232785],
       [0.16298258, 0.19907662, 0.23531069, ..., 0.37395072, 0.32232785,
        0.27140452],
       [0.19907662, 0.23531069, 0.2502798 , ..., 0.32232785, 0.27140452,
        0.30581984],
       ...,
       [0.5907946 , 0.62702868, 0.67585339, ..., 0.92907118, 0.956911  ,
        0.96068834],
       [0.62702868, 0.67585339, 0.71684399, ..., 0.956911  , 0.96068834,
        0.9107444 ],
       [0.67585339, 0.71684399, 0.73489091, ..., 0.96068834, 0.9107444 ,
        0.85212657]])

In [17]:
# inspecting the targets: for each window, the scaled close that immediately follows it
y

array([0.27140452, 0.30581984, 0.29169009, 0.31729147, 0.3399553 ,
       0.3414942 , 0.32624523, 0.33365987, 0.30987682, 0.28035806,
       0.26790704, 0.26385005, 0.24216562, 0.23167317, 0.24230566,
       0.27098484, 0.2451036 , 0.22607729, 0.2466425 , 0.22971459,
       0.22034137, 0.2050924 , 0.14129836, 0.07162836, 0.05763844,
       0.05595971, 0.08016223, 0.10842194, 0.11513705, 0.08575833,
       0.11191942, 0.10660318, 0.12199219, 0.15500843, 0.19124229,
       0.08911577, 0.10184666, 0.08184116, 0.06589266, 0.11625627,
       0.09065467, 0.07036932, 0.05372127, 0.06505308, 0.05344163,
       0.0640739 , 0.04826521, 0.06533294, 0.03889198, 0.14045878,
       0.16158371, 0.10758258, 0.06127595, 0.04196978, 0.02853936,
       0.        , 0.01175149, 0.02658078, 0.05623957, 0.06841074,
       0.06015673, 0.11891439, 0.07456634, 0.06015673, 0.11233911,
       0.25713495, 0.23377179, 0.24342466, 0.2481814 , 0.273783  ,
       0.25251824, 0.29770565, 0.31379398, 0.34583104, 0.34750

In [18]:
# splitting the sequences into training and test sets (80-20); the order is kept,
# so the test set is the most recent 20% of the windows
train_size = int(0.8 * len(X))
X_train, X_test = np.split(X, [train_size])
y_train, y_test = np.split(y, [train_size])

#### Building a sequential LSTM model with layers to capture the temporal dependencies in the data

In [19]:
# importing required library
from tensorflow.keras import Sequential 
from tensorflow.keras.layers import LSTM, Dense

2025-01-16 15:14:39.096419: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-16 15:14:39.105099: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-16 15:14:39.195225: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-16 15:14:39.284342: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737036879.375072   24767 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737036879.40

In [20]:
# building the stacked LSTM regressor
# the input shape is declared with an explicit Input object instead of passing
# `input_shape` to the first layer, which recent Keras versions warn about
from tensorflow.keras import Input

lstm_model = Sequential([
    Input(shape=(X_train.shape[1], 1)),      # (time steps per window, 1 feature per step)
    LSTM(units=50, return_sequences=True),   # passes the whole sequence on to the next LSTM
    LSTM(units=50),                          # keeps only its final output
    Dense(1),                                # single regression output: the next scaled close
])

2025-01-16 15:14:52.483896: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
  super().__init__(**kwargs)


In [21]:
# compiling the model: Adam optimizer, mean squared error loss
lstm_model.compile(optimizer='adam', loss='mean_squared_error')

# the model is declared with input shape (time steps, 1), so the 2-D training
# windows get an explicit trailing feature axis -- the same reshape the test
# windows receive before predict() -- instead of relying on implicit rank handling
X_train_lstm = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))

# training the model
lstm_model.fit(X_train_lstm, y_train, epochs=20, batch_size=32)

Epoch 1/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step - loss: 0.2027
Epoch 2/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - loss: 0.0250
Epoch 3/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - loss: 0.0432
Epoch 4/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - loss: 0.0173
Epoch 5/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - loss: 0.0189
Epoch 6/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - loss: 0.0141
Epoch 7/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - loss: 0.0120
Epoch 8/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - loss: 0.0106
Epoch 9/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - loss: 0.0115
Epoch 10/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - loss: 0.0102
Epoch 11/20
[1m5/5

<keras.src.callbacks.history.History at 0x79d98045c860>

In [22]:
# preparing the second model (Linear Regression)
# its predictors are the previous three values of 'Close'; the first three rows have
# no complete history, so they are dropped
df = df.assign(
    Lag_1=df['Close'].shift(1),
    Lag_2=df['Close'].shift(2),
    Lag_3=df['Close'].shift(3),
).dropna()

In [23]:
# checking the result
df.head()

Unnamed: 0_level_0,Close,Lag_1,Lag_2,Lag_3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-11-07 00:00:00+00:00,0.235311,0.199077,0.162983,0.175853
2023-11-08 00:00:00+00:00,0.25028,0.235311,0.199077,0.162983
2023-11-09 00:00:00+00:00,0.243565,0.25028,0.235311,0.199077
2023-11-10 00:00:00+00:00,0.299384,0.243565,0.25028,0.235311
2023-11-13 00:00:00+00:00,0.277001,0.299384,0.243565,0.25028


In [24]:
# splitting the data accordingly for training and testing
X_lin = df[['Lag_1', 'Lag_2', 'Lag_3']]
y_lin = df['Close']

# train_size was derived from the number of LSTM windows, which differs from the
# number of rows here, so slicing with it would yield a test period that does not
# match the LSTM one. Both test sets are the tail of the same series, so holding
# out the last len(y_test) rows makes the linear-regression test set cover exactly
# the dates of the LSTM test targets.
lin_train_size = len(X_lin) - len(y_test)
X_train_lin, X_test_lin = X_lin.iloc[:lin_train_size], X_lin.iloc[lin_train_size:]
y_train_lin, y_test_lin = y_lin.iloc[:lin_train_size], y_lin.iloc[lin_train_size:]

In [25]:
# training the linear regression model
from sklearn.linear_model import LinearRegression

# ordinary least squares on the three lag columns, fitted on the training rows only
lin_model = LinearRegression()
lin_model.fit(X=X_train_lin, y=y_train_lin)

### Making predictions with the LSTM on the test set and inverse-transforming the scaled predictions

In [28]:
# add a trailing feature axis so the test windows are (samples, time steps, 1)
X_test_lstm = np.expand_dims(X_test, axis=-1)

# run the trained LSTM on the held-out windows
lstm_predictions = lstm_model.predict(X_test_lstm)

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step


In [29]:
# checking the predictions (still in scaled units at this point, not prices)
lstm_predictions

array([[0.8426436 ],
       [0.833055  ],
       [0.8254324 ],
       [0.8195515 ],
       [0.8149774 ],
       [0.80677265],
       [0.79717994],
       [0.79017997],
       [0.7915109 ],
       [0.7975903 ],
       [0.8052486 ],
       [0.8142967 ],
       [0.8229548 ],
       [0.83177465],
       [0.84035647],
       [0.8522751 ],
       [0.86033744],
       [0.86605227],
       [0.8689727 ],
       [0.8707561 ],
       [0.86768967],
       [0.8648825 ],
       [0.8651271 ],
       [0.86688286],
       [0.8684448 ],
       [0.8727275 ],
       [0.88039255],
       [0.8882136 ],
       [0.8960609 ],
       [0.9056287 ],
       [0.9168039 ],
       [0.9278796 ],
       [0.9343987 ],
       [0.9375583 ],
       [0.93892163],
       [0.9404759 ],
       [0.9421871 ],
       [0.9411827 ],
       [0.935375  ]], dtype=float32)

In [30]:

# inverse transforming the scaled predictions
# the scaled values are recomputed from the model rather than read from
# lstm_predictions: overwriting a variable with its own inverse transform would
# apply the transform a second time whenever this cell is re-run
lstm_predictions = scaler.inverse_transform(lstm_model.predict(X_test_lstm, verbose=0))

In [31]:
# checking the predictions after inverse transforming (values are back in the original 'Close' units)
lstm_predictions

array([[225.23216],
       [224.54677],
       [224.00189],
       [223.58154],
       [223.25458],
       [222.6681 ],
       [221.98242],
       [221.48206],
       [221.5772 ],
       [222.01175],
       [222.55916],
       [223.20593],
       [223.8248 ],
       [224.45526],
       [225.06866],
       [225.92062],
       [226.49692],
       [226.90541],
       [227.11417],
       [227.24165],
       [227.02245],
       [226.8218 ],
       [226.83928],
       [226.96478],
       [227.07643],
       [227.38255],
       [227.93045],
       [228.4895 ],
       [229.05043],
       [229.73433],
       [230.53314],
       [231.32483],
       [231.79082],
       [232.01666],
       [232.11412],
       [232.22522],
       [232.34753],
       [232.27573],
       [231.8606 ]], dtype=float32)