In [2]:
from tensorflow import keras
from keras.layers import LSTM, Dense
from keras.models import Sequential
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import talib
import numpy as np
import pandas as pd
import glob
import os
import joblib

In [None]:

# DATA LOADING AND FEATURE ENGINEERING (evaluation run on a subset of files)

# Maximum number of CSV files to load for this evaluation run
n_files = 10

# Number of rows ahead of each sample that the models are asked to predict
HORIZON = 20


def build_supervised_frame(prices_df, horizon):
    """Add indicator features and a forward-looking target to ONE price history.

    The indicators and the target are computed on a single file's rows so
    that rolling windows and shifts never mix prices from two different
    files (assumes each CSV holds one instrument's history - TODO confirm).

    Parameters
    ----------
    prices_df : pandas.DataFrame
        Raw rows of one CSV; must contain an 'Adjusted Close' column.
    horizon : int
        How many rows ahead the 'Target' price is taken from.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the indicator columns and a 'Target' column
        added, restricted to rows where every column is present. May be
        empty when the file is too short.
    """
    frame = prices_df.copy()
    # talib only accepts float64 input
    close = frame['Adjusted Close'].astype('float64')

    frame['LogReturns'] = np.log(close / close.shift(1))
    frame['TEMA'] = talib.TEMA(close, timeperiod=20)
    frame['SMA_10'] = talib.SMA(close, timeperiod=10)
    frame['EMA_20'] = talib.EMA(close, timeperiod=20)
    frame['RSI'] = talib.RSI(close, timeperiod=14)

    macd, macd_signal, macd_histogram = talib.MACD(close)
    frame['MACD'] = macd
    frame['MACD_Signal'] = macd_signal
    frame['MACD_Histogram'] = macd_histogram

    return (
        frame
        # log() of a zero price yields -inf/inf, which dropna() would keep
        .replace([np.inf, -np.inf], np.nan)
        # The LEADING rows are NaN until each indicator has enough history
        .dropna()
        # Target = price `horizon` rows later within the same file
        .assign(Target=lambda f: f['Adjusted Close'].shift(-horizon))
        # The trailing `horizon` rows have no future price to predict
        .dropna(subset=['Target'])
    )


# Without recursive=True each '**' behaves like '*', so this matches CSVs
# exactly two directory levels below stock_market_data/.
# Sorting makes the [:n_files] selection deterministic.
file_list = sorted(glob.glob('stock_archive/stock_market_data/**/**/*.csv'))

frames = []
raw_row_count = 0
for path in file_list[:n_files]:
    raw = pd.read_csv(path, on_bad_lines='skip')
    raw_row_count += len(raw)

    # Skip files with no usable prices
    if 'Adjusted Close' not in raw.columns or raw['Adjusted Close'].notna().sum() == 0:
        continue

    prepared = build_supervised_frame(raw, HORIZON)
    if not prepared.empty:
        frames.append(prepared)

if not frames:
    raise ValueError(
        "No usable rows found under 'stock_archive/stock_market_data/' - "
        "check the data path and the CSV contents."
    )

# Print the size of the raw (pre-feature-engineering) dataset
print(f"Size of dataset: {raw_row_count}\n")

samples = pd.concat(frames, ignore_index=True)

# `data` keeps the feature frame without the target column
data = samples.drop(columns=['Target'])

# Features (X): every column except the date; target (y): the adjusted
# close HORIZON rows ahead
X = data.drop(['Date'], axis=1)
y = samples['Target']

# Fixed seed so the reported metrics are reproducible across runs.
# NOTE(review): a shuffled split places neighbouring days of the same series
# in both train and test; with a 20-row-ahead target this likely inflates
# the scores. Consider a chronological or per-file hold-out.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# MODEL TRAINING AND EVALUATION


# RANDOM FOREST

# random_state makes the fitted forest (and its metrics) reproducible;
# n_jobs=-1 builds the trees on all available cores without changing results.
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)

# Train on the training split and predict the held-out split
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)

# Evaluate the model.
# RMSE is derived from the MSE directly: the `squared=False` argument of
# mean_squared_error is deprecated in recent scikit-learn releases and
# removed in newer ones.
rf_mse = mean_squared_error(y_test, rf_predictions)
rf_rmse = np.sqrt(rf_mse)
rf_mae = mean_absolute_error(y_test, rf_predictions)
rf_r2 = r2_score(y_test, rf_predictions)

print("Random Forest Metrics:")
print("MSE:", rf_mse)
print("RMSE:", rf_rmse)
print("MAE:", rf_mae)
print("R-squared:", rf_r2)


# GRADIENT BOOSTING

# random_state makes the fitted ensemble (and its metrics) reproducible
gb_model = GradientBoostingRegressor(random_state=42)

# Train on the training split and predict the held-out split
gb_model.fit(X_train, y_train)
gb_predictions = gb_model.predict(X_test)

# Evaluate the model.
# RMSE is derived from the MSE directly: the `squared=False` argument of
# mean_squared_error is deprecated in recent scikit-learn releases and
# removed in newer ones.
gb_mse = mean_squared_error(y_test, gb_predictions)
gb_rmse = np.sqrt(gb_mse)
gb_mae = mean_absolute_error(y_test, gb_predictions)
gb_r2 = r2_score(y_test, gb_predictions)

print("\nGradient Boosting Metrics:")
print("MSE:", gb_mse)
print("RMSE:", gb_rmse)
print("MAE:", gb_mae)
print("R-squared:", gb_r2)

# LINEAR REGRESSION

# Ordinary least squares baseline
linear_model = LinearRegression()

# Train on the training split and predict the held-out split
linear_model.fit(X_train, y_train)
lr_predictions = linear_model.predict(X_test)

# Evaluate the model.
# RMSE is derived from the MSE directly: the `squared=False` argument of
# mean_squared_error is deprecated in recent scikit-learn releases and
# removed in newer ones.
lr_mse = mean_squared_error(y_test, lr_predictions)
lr_rmse = np.sqrt(lr_mse)
lr_mae = mean_absolute_error(y_test, lr_predictions)
lr_r2 = r2_score(y_test, lr_predictions)

print("\nLinear Regression Metrics:")
print("MSE:", lr_mse)
print("RMSE:", lr_rmse)
print("MAE:", lr_mae)
print("R-squared:", lr_r2)

# LONG SHORT TERM MEMORY (LSTM)

# Keras LSTM layers expect 3-D input (samples, time steps, features).
# Each sample is presented as a sequence of length 1, so every row of the
# tabular feature matrix becomes a single time step.
n_features = X_train.shape[1]
X_train_lstm = X_train.values.reshape((X_train.shape[0], 1, n_features))
X_test_lstm = X_test.values.reshape((X_test.shape[0], 1, n_features))

# One 64-unit LSTM layer followed by a single linear output (regression)
lstm_model = Sequential()
lstm_model.add(LSTM(64, input_shape=(1, n_features)))
lstm_model.add(Dense(1))

# Mean squared error loss with the Adam optimizer
lstm_model.compile(loss='mse', optimizer='adam')

# Train the LSTM model.
# `use_multiprocessing` is intentionally not passed: it only ever applied to
# generator/Sequence inputs (these are in-memory arrays) and newer Keras
# versions reject the argument outright.
lstm_model.fit(X_train_lstm, y_train, epochs=10, batch_size=32, verbose=0)

# Predict the held-out split; the model returns shape (samples, 1), so
# flatten to a 1-D array for the metric functions
lstm_predictions = lstm_model.predict(X_test_lstm, verbose=0).flatten()

# Evaluate the model.
# RMSE is derived from the MSE directly: the `squared=False` argument of
# mean_squared_error is deprecated in recent scikit-learn releases and
# removed in newer ones.
lstm_mse = mean_squared_error(y_test, lstm_predictions)
lstm_rmse = np.sqrt(lstm_mse)
lstm_mae = mean_absolute_error(y_test, lstm_predictions)
lstm_r2 = r2_score(y_test, lstm_predictions)

print("\nLong Short Term Memory (LSTM) Metrics:")
print("MSE:", lstm_mse)
print("RMSE:", lstm_rmse)
print("MAE:", lstm_mae)
print("R-squared:", lstm_r2)

In [17]:

# DATA LOADING AND FEATURE ENGINEERING (final training run on ALL files)

# Number of rows ahead of each sample that the model is asked to predict
HORIZON = 20


def build_supervised_frame(prices_df, horizon):
    """Add indicator features and a forward-looking target to ONE price history.

    The indicators and the target are computed on a single file's rows so
    that rolling windows and shifts never mix prices from two different
    files (assumes each CSV holds one instrument's history - TODO confirm).

    Parameters
    ----------
    prices_df : pandas.DataFrame
        Raw rows of one CSV; must contain an 'Adjusted Close' column.
    horizon : int
        How many rows ahead the 'Target' price is taken from.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the indicator columns and a 'Target' column
        added, restricted to rows where every column is present. May be
        empty when the file is too short.
    """
    frame = prices_df.copy()
    # talib only accepts float64 input
    close = frame['Adjusted Close'].astype('float64')

    frame['LogReturns'] = np.log(close / close.shift(1))
    frame['TEMA'] = talib.TEMA(close, timeperiod=20)
    frame['SMA_10'] = talib.SMA(close, timeperiod=10)
    frame['EMA_20'] = talib.EMA(close, timeperiod=20)
    frame['RSI'] = talib.RSI(close, timeperiod=14)

    macd, macd_signal, macd_histogram = talib.MACD(close)
    frame['MACD'] = macd
    frame['MACD_Signal'] = macd_signal
    frame['MACD_Histogram'] = macd_histogram

    return (
        frame
        # log() of a zero price yields -inf/inf, which dropna() would keep
        .replace([np.inf, -np.inf], np.nan)
        # The LEADING rows are NaN until each indicator has enough history
        .dropna()
        # Target = price `horizon` rows later within the same file
        .assign(Target=lambda f: f['Adjusted Close'].shift(-horizon))
        # The trailing `horizon` rows have no future price to predict
        .dropna(subset=['Target'])
    )


# Without recursive=True each '**' behaves like '*', so this matches CSVs
# exactly two directory levels below stock_market_data/.
file_list = sorted(glob.glob('stock_archive/stock_market_data/**/**/*.csv'))

# Every matched file is used here (no cap on the number of files)
frames = []
raw_row_count = 0
for path in file_list:
    raw = pd.read_csv(path, on_bad_lines='skip')
    raw_row_count += len(raw)

    # Skip files with no usable prices
    if 'Adjusted Close' not in raw.columns or raw['Adjusted Close'].notna().sum() == 0:
        continue

    prepared = build_supervised_frame(raw, HORIZON)
    if not prepared.empty:
        frames.append(prepared)

if not frames:
    raise ValueError(
        "No usable rows found under 'stock_archive/stock_market_data/' - "
        "check the data path and the CSV contents."
    )

# Print the size of the raw (pre-feature-engineering) dataset
print(f"Size of dataset: {raw_row_count}\n")

samples = pd.concat(frames, ignore_index=True)

# `data` keeps the feature frame without the target column
data = samples.drop(columns=['Target'])

# Features (X): every column except the date; target (y): the adjusted
# close HORIZON rows ahead
X = data.drop(['Date'], axis=1)
y = samples['Target']


# MODEL TRAINING


# RANDOM FOREST

# Fit on the complete dataset (no hold-out) to produce the exported model.
# random_state makes the exported model reproducible; n_jobs=-1 builds the
# trees on all available cores without changing the result.
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_model.fit(X, y)

# exist_ok=True creates the directory only when missing, without a separate
# existence check
os.makedirs('models', exist_ok=True)

# Export the fitted model to a file
joblib.dump(rf_model, 'models/random_forest_model.joblib')


Size of dataset: 24614702

