Machine Learning Model: Convolutional NN - LSTM model
The code is a CNN-LSTM model for stock price prediction that takes as input the cleaned stock data.
Some conversions are made to the date, as well as the addition of new variables before the model is created.
Name: Sean Brady
Created: Dec 3, 2023

In [1]:
#must use anaconda enviornment to import tensorflow modules
#at least on my computer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, LSTM, Flatten
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np
import pandas as pd

In [2]:
# create data frame
df = pd.read_csv('clean_data.csv')
print(df.head(10))

         date   open    high     low  close    volume   7_day_ma  15_day_ma  \
0  2013-05-07  42.18  42.410  41.900  42.40   3524022  41.662857  41.933333   
1  2013-05-08  42.40  42.950  42.300  42.94   2119765  41.874286  41.983333   
2  2013-05-09  42.97  43.195  42.630  43.16   3159293  42.120000  42.072000   
3  2013-05-10  43.12  43.850  43.040  43.63   4662252  42.451429  42.180667   
4  2013-05-13  43.43  43.560  42.720  43.04   4260335  42.674286  42.260000   
5  2013-05-14  42.98  44.060  42.882  43.97   6075845  43.020000  42.350667   
6  2013-05-15  44.90  46.490  44.890  45.68  10289000  43.545714  42.539333   
7  2013-05-16  45.43  45.840  44.970  44.99   4890962  43.915714  42.690000   
8  2013-05-17  45.02  45.830  44.990  45.56   3247851  44.290000  42.974000   
9  2013-05-20  45.48  47.450  45.390  46.34   5698804  44.744286  43.299333   

   30_day_ma  daily_returns  ...  daily_returns_lag_5  daily_returns_lag_7  \
0  42.018333       0.009524  ...            -0.00048

In [3]:
# transform date to datetime, get year,month,day from date column
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

print(df.head(10))

        date   open    high     low  close    volume   7_day_ma  15_day_ma  \
0 2013-05-07  42.18  42.410  41.900  42.40   3524022  41.662857  41.933333   
1 2013-05-08  42.40  42.950  42.300  42.94   2119765  41.874286  41.983333   
2 2013-05-09  42.97  43.195  42.630  43.16   3159293  42.120000  42.072000   
3 2013-05-10  43.12  43.850  43.040  43.63   4662252  42.451429  42.180667   
4 2013-05-13  43.43  43.560  42.720  43.04   4260335  42.674286  42.260000   
5 2013-05-14  42.98  44.060  42.882  43.97   6075845  43.020000  42.350667   
6 2013-05-15  44.90  46.490  44.890  45.68  10289000  43.545714  42.539333   
7 2013-05-16  45.43  45.840  44.970  44.99   4890962  43.915714  42.690000   
8 2013-05-17  45.02  45.830  44.990  45.56   3247851  44.290000  42.974000   
9 2013-05-20  45.48  47.450  45.390  46.34   5698804  44.744286  43.299333   

   30_day_ma  daily_returns  ...  daily_returns_lag_30  \
0  42.018333       0.009524  ...             -0.002909   
1  42.055667       0.0127

In [4]:
#define features, target vars
#clist of lagged columns
lag_columns = ['open', 'high', 'low', 'volume', '7_day_ma', '15_day_ma', '30_day_ma', 'daily_returns', 'daily_volatility']
#a list of columns to exclude. This should be all the columns with data from a day d for which we are trying 
#to make predictions for (because we will not have access to this data in practice)
exclude = [col for col in lag_columns if any(f'{col}_lag_' in c for c in df.columns)]
#a list of the features to include. This is all columns that are not the date, target, or included
#in our list of columns to exclude
include = [col for col in df.columns if col not in ['date', 'close'] + exclude]
print(include)
print('\n\n')
print(exclude)

['open_lag_1', 'open_lag_3', 'open_lag_5', 'open_lag_7', 'open_lag_15', 'open_lag_30', 'high_lag_1', 'high_lag_3', 'high_lag_5', 'high_lag_7', 'high_lag_15', 'high_lag_30', 'low_lag_1', 'low_lag_3', 'low_lag_5', 'low_lag_7', 'low_lag_15', 'low_lag_30', 'volume_lag_1', 'volume_lag_3', 'volume_lag_5', 'volume_lag_7', 'volume_lag_15', 'volume_lag_30', '7_day_ma_lag_1', '7_day_ma_lag_3', '7_day_ma_lag_5', '7_day_ma_lag_7', '7_day_ma_lag_15', '7_day_ma_lag_30', '15_day_ma_lag_1', '15_day_ma_lag_3', '15_day_ma_lag_5', '15_day_ma_lag_7', '15_day_ma_lag_15', '15_day_ma_lag_30', '30_day_ma_lag_1', '30_day_ma_lag_3', '30_day_ma_lag_5', '30_day_ma_lag_7', '30_day_ma_lag_15', '30_day_ma_lag_30', 'daily_returns_lag_1', 'daily_returns_lag_3', 'daily_returns_lag_5', 'daily_returns_lag_7', 'daily_returns_lag_15', 'daily_returns_lag_30', 'daily_volatility_lag_1', 'daily_volatility_lag_3', 'daily_volatility_lag_5', 'daily_volatility_lag_7', 'daily_volatility_lag_15', 'daily_volatility_lag_30', 'year', '

In [5]:
#Split data into features and our target variable
features = df[include]
target = df['close']

In [6]:
#scale the data
scaler=MinMaxScaler()
scaled_feat = scaler.fit_transform(features)

#reshape for CONV1D
X = scaled_feat.reshape((features.shape[0],features.shape[1],1))

y = df['close']


In [7]:
#split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
#initialize sequential model
#this will be used to create a linear stack of layers
model = Sequential() 

#add convolutional layer w/ 64 output filters
#kernel size 3
#using rectified linear unit for activation function
#input_shape is shape of data given number of features with one data point per step
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X.shape[1],1)))

#add LSTM layer w/ 50 neurons
model.add(LSTM(units=50, return_sequences=True))

#add layer to flatten input to 1-d array
model.add(Flatten())

#add connected NN layer
#w/ 1 output neuron
model.add(Dense(units=1))

#configure model for training
#use Adam for optimizer
#Use MSE for loss func
model.compile(optimizer='adam', loss='mean_squared_error')

2023-12-03 14:00:00.841576: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa3acfd0a30>

In [10]:
# Evaluate the model
test_loss = model.evaluate(X_test, y_test)

# For a more detailed evaluation, you can make predictions and compare them to the actual values
predictions = model.predict(X_test)

# Calculate metrics like MSE or MAE
from sklearn.metrics import mean_squared_error, mean_absolute_error

mse = mean_squared_error(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")

Mean Squared Error: 11.532030729977805
Mean Absolute Error: 1.9028174238558095
