In [None]:
import zipfile
import os

# Path to the ZIP file
zip_file_path = '/content/TrainingData.zip'

# Destination folder where files will be extracted
extract_to_folder = '/content/'

# Ensure the destination folder exists
os.makedirs(extract_to_folder, exist_ok=True)

# Open the ZIP file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Extract all the contents to the destination folder
    zip_ref.extractall(extract_to_folder)

print(f"Files extracted to '{extract_to_folder}'")

Files extracted to '/content/'


In [25]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Import libraries

In [26]:
# Import necessary libraries
import os

# Import necessary libraries for data processing
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np

# Import necessary libraries for model
from keras.callbacks import Callback
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense
from keras.metrics import Precision, Recall, F1Score, AUC

# Import necessary libraries for evaluation
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt


## Directories

In [27]:
base_dir = '/content/drive/MyDrive/TrainingData'
Save_model_dir = 'LSTM_model'

# Data Preparation

### Load and merge data

In [30]:
# List of stocks and splits
periods = ['1']
stocks = ['A', 'B', 'C', 'D', 'E']
splits = ['0', '1', '2', '3', '4','5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15']

# Load the data from a CSV file
market_data_list = []
trade_data_list = []

# loop over the stocks and load the data for each period
for period in periods:

  # check for nested period dir
  period_path = os.path.join(base_dir, f'Period{period}')
  nested_period_path = os.path.join(period_path, f'Period{period}')

  # Handle nested data
  if os.path.exists(nested_period_path):
      period_dir_to_use = nested_period_path
  else:
      period_dir_to_use = period_path


  for stock in stocks:

      trade_data_dir = f'{period_dir_to_use}/{stock}/trade_data__{stock}.csv'
      if os.path.exists(trade_data_dir):
          trade_data = pd.read_csv(trade_data_dir)
          trade_data['stock'] = stock
          trade_data_list.append(trade_data)
      else:
          print(f"File {trade_data_dir} does not exist")
          continue

      for split in splits:
          market_data_dir = f'{period_dir_to_use}/{stock}/market_data_{stock}_{split}.csv'

          if os.path.exists(market_data_dir):
              market_data = pd.read_csv(market_data_dir)

              # Add stock identifier
              market_data['stock'] = stock
              market_data_list.append(market_data)
          else:
              print(f"File {market_data_dir} does not exist")



File /content/drive/MyDrive/TrainingData/Period1/A/market_data_A_2.csv does not exist
File /content/drive/MyDrive/TrainingData/Period1/A/market_data_A_3.csv does not exist
File /content/drive/MyDrive/TrainingData/Period1/A/market_data_A_4.csv does not exist
File /content/drive/MyDrive/TrainingData/Period1/A/market_data_A_5.csv does not exist
File /content/drive/MyDrive/TrainingData/Period1/A/market_data_A_6.csv does not exist
File /content/drive/MyDrive/TrainingData/Period1/A/market_data_A_7.csv does not exist
File /content/drive/MyDrive/TrainingData/Period1/A/market_data_A_8.csv does not exist
File /content/drive/MyDrive/TrainingData/Period1/A/market_data_A_9.csv does not exist
File /content/drive/MyDrive/TrainingData/Period1/A/market_data_A_10.csv does not exist
File /content/drive/MyDrive/TrainingData/Period1/A/market_data_A_11.csv does not exist
File /content/drive/MyDrive/TrainingData/Period1/A/market_data_A_12.csv does not exist
File /content/drive/MyDrive/TrainingData/Period1/A/

In [31]:
print("Columns in market_data:", market_data.columns)
print("Columns in trade_data:", trade_data.columns)

# Convert timestamps to datetime (if they aren't already)
market_data['timestamp'] = pd.to_datetime(market_data['timestamp'])
trade_data['timestamp'] = pd.to_datetime(trade_data['timestamp'])

market_data = market_data.sort_values(by='timestamp')
trade_data = trade_data.sort_values(by='timestamp')

Columns in market_data: Index(['bidVolume', 'bidPrice', 'askVolume', 'askPrice', 'timestamp', 'stock'], dtype='object')
Columns in trade_data: Index(['price', 'volume', 'timestamp', 'stock'], dtype='object')


  market_data['timestamp'] = pd.to_datetime(market_data['timestamp'])
  trade_data['timestamp'] = pd.to_datetime(trade_data['timestamp'])


In [32]:
merged_data = pd.merge_asof(
    market_data.sort_values(by=["stock", "timestamp"]),
    trade_data[['stock','timestamp','price','volume']].sort_values(by=["stock", "timestamp"]),
    on='timestamp',
    by='stock',            # match on the same stock symbol
    direction='nearest',
    tolerance=pd.Timedelta(seconds=1)
)

### Feature Engineering

In [33]:
# define significant change as a percentage change of more than 1%
threshold = 0.01

# Calculate the spread between the market and trade data
merged_data['price_change'] = merged_data['price'].pct_change()

# calculate and label significant changes
merged_data['significant_change'] = (merged_data['price_change'].abs() > threshold).astype(int)

  merged_data['price_change'] = merged_data['price'].pct_change()


In [34]:
# Add a moving average of the spread
merged_data['spread'] = merged_data['askPrice'] - merged_data['bidPrice']
merged_data['bidPrice_ma'] = merged_data.groupby('stock')['bidPrice'].rolling(window=5).mean().reset_index(0, drop=True)
merged_data['askPrice_ma'] = merged_data.groupby('stock')['askPrice'].rolling(window=5).mean().reset_index(0, drop=True)

# Fill any remaining NANS by backfilling
merged_data.fillna(method='bfill', inplace=True)
merged_data.fillna(method='ffill', inplace=True)

  merged_data.fillna(method='bfill', inplace=True)
  merged_data.fillna(method='ffill', inplace=True)


### Define the feature set and target set

In [35]:
# Define the feature and it's fitted scaller
features = [
    'bidVolume',
    'bidPrice',
    'askVolume',
    'askPrice',
    'spread',
    'bidPrice_ma',
    'askPrice_ma'
    ]

scaler = MinMaxScaler()
merged_data[features] = scaler.fit_transform(merged_data[features])

### Preparing input

In [36]:
# define the timesteps
timesteps = 10

# Prepare input (x) and output (y)
x = []
y = []

for stock in stocks:
    stock_data = merged_data[merged_data['stock'] == stock]
    stocks_features = stock_data[features].values
    stocks_target = stock_data[['significant_change']].values

    for i in range(len(stock_data) - timesteps):
        x.append(stocks_features[i:i + timesteps])
        y.append(stocks_target[i + timesteps])

x = np.array(x)
y = np.array(y)

### Split data

In [42]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Implementation of Model

### Define Model class

In [43]:
# reshape the input data
input_shape = (x_train.shape[1], x_train.shape[2])

# define the LSTM model
model = Sequential()
model.add(LSTM(100, input_shape=input_shape, return_sequences=False))
model.add(Dense(1, activation='sigmoid'))

  super().__init__(**kwargs)


### Compile model

In [44]:
# compile the model for training
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

### Train and validate model

In [45]:

# # train the model
# model.fit(x_train, y_train, epochs=20, batch_size=32, validation_data=(x_test, y_test))


# Custom callback to save and print metrics at each epoch
class EpochLogger(Callback):
    def __init__(self):
        super().__init__()
        self.epochs = []

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        # Print the metrics for the current epoch
        print(f"Epoch {epoch + 1}: {logs}")

        # Save metrics in a list for later use
        self.epochs.append({"epoch": epoch + 1, **logs})

# Instantiate the custom callback
epoch_logger = EpochLogger()

# Train the model and use the custom callback
history = model.fit(
    x_train,
    y_train,
    epochs=20,
    batch_size=32,
    validation_data=(x_test, y_test),
    callbacks=[epoch_logger]
)

# Save the epoch logs to a CSV file
epoch_logs_df = pd.DataFrame(epoch_logger.epochs)
epoch_logs_df.to_csv('epoch_logs.csv', index=False)

# Print the saved dataframe
print(epoch_logs_df)

Epoch 1/20
[1m185/189[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.9231 - loss: 0.1541Epoch 1: {'accuracy': 0.9848987460136414, 'loss': 0.040466438978910446, 'val_accuracy': 1.0, 'val_loss': 3.9413389458786696e-05}
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 16ms/step - accuracy: 0.9247 - loss: 0.1511 - val_accuracy: 1.0000 - val_loss: 3.9413e-05
Epoch 2/20
[1m182/189[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - accuracy: 1.0000 - loss: 3.2760e-05Epoch 2: {'accuracy': 1.0, 'loss': 2.739078990998678e-05, 'val_accuracy': 1.0, 'val_loss': 1.9205119315302e-05}
[1m189/189[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 1.0000 - loss: 3.2538e-05 - val_accuracy: 1.0000 - val_loss: 1.9205e-05
Epoch 3/20
[1m184/189[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - accuracy: 1.0000 - loss: 1.6871e-05Epoch 3: {'accuracy': 1.0, 'loss': 1.4972746612329502e-05, 'val_accuracy': 1.0

### Test model

In [46]:
# Predict on the scaled test data
y_pred_scaled = model.predict(x_test)

# Apply inverse transformation to scale back to the original range and inverse transform y_test
y_pred_binary = (y_pred_scaled > 0.5).astype(int)

print("Predicted significant changes: (Binary)", y_pred_binary.ravel()[:10])
print("Actual significant changes: (Binary)", y_test.ravel()[:10])

[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
Predicted significant changes: (Binary) [0 0 0 0 0 0 0 0 0 0]
Actual significant changes: (Binary) [0 0 0 0 0 0 0 0 0 0]


## Other resources code

### Load model

In [None]:
loaded_model = load_model(Save_model_dir)

### Save model

In [48]:
# save in tensoflow model
Save_model_dir = 'LSTM_model.keras' # Add the .keras extension to the filename
model.save(Save_model_dir)