### Import libraries

In [None]:
# Import necessary libraries
import os

# Import necessary libraries for data processing
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import numpy as np

# Import necessary libraries for model
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense
from keras.metrics import Precision, Recall, F1Score, AUC

# Import necessary libraries for evaluation
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt


# Data Preparation

### Load and merge data

In [None]:
# Load the market data, which is split across two CSV files, and stack the
# two parts into a single frame with a fresh, non-duplicated index.
market_data_1 = pd.read_csv('../../backend/data/TrainingData/Period1/A/market_data_A_0.csv')
market_data_2 = pd.read_csv('../../backend/data/TrainingData/Period1/A/market_data_A_1.csv')
market_data = pd.concat([market_data_1, market_data_2], ignore_index=True)

# Load the trade data from a CSV file
trade_data = pd.read_csv('../../backend/data/TrainingData/Period1/A/trade_data__A.csv')

# Attach to every market row the trade row whose timestamp is closest.
# `timestamp` is the as-of key, so it has to be passed as `on`; `by` is meant
# for columns that must match exactly and, used on its own, leaves
# merge_asof without any join key.
# NOTE(review): merge_asof needs a numeric or datetime key - confirm that
# `timestamp` is not read from the CSVs as plain strings.
merged_data = pd.merge_asof(
    market_data.sort_values('timestamp'),
    trade_data.sort_values('timestamp'),
    on='timestamp',
    direction='nearest',
)

### Feature Engineering

In [None]:
# Bid-ask spread of every row
merged_data['spread'] = merged_data['askPrice'] - merged_data['bidPrice']

# 5-row moving averages of the bid price and of the ask price
merged_data['bidPrice_ma'] = merged_data['bidPrice'].rolling(window=5).mean()
merged_data['askPrice_ma'] = merged_data['askPrice'].rolling(window=5).mean()

# A rolling mean is NaN until its window is full, so the first rows of the two
# moving-average columns are missing; back-fill them from the next valid row.
# The fill is applied to every column of the frame, not only the new ones.
# `DataFrame.bfill` replaces the deprecated `fillna(method='bfill')` spelling.
merged_data.bfill(inplace=True)

### Normalise Features

In [None]:
# Normalize the feature columns to a range of 0 to 1
scaler = MinMaxScaler()
features = ['bidVolume', 'bidPrice', 'askVolume', 'askPrice', 'spread', 'bidPrice_ma', 'askPrice_ma']

# `fit` alone only records each column's min/max and leaves the frame
# untouched; `fit_transform` also rescales the values, which are written back
# so that everything built from `merged_data[features]` afterwards is scaled.
# NOTE(review): the scaler sees every row, including rows that later end up
# in the test split - consider fitting on the training portion only.
merged_data[features] = scaler.fit_transform(merged_data[features])

### Define input and output

In [None]:
# Number of consecutive rows that make up one input window
timesteps = 10

# Extract the columns as NumPy arrays once. Selecting `merged_data[features]`
# inside the loop would rebuild the whole sub-frame for every window.
feature_matrix = merged_data[features].to_numpy()
bid_prices = merged_data['bidPrice'].to_numpy()

# One sample per position that still has a following row to predict
# (zero samples when the frame has `timesteps` rows or fewer).
n_samples = max(len(merged_data) - timesteps, 0)

# Prepare input (x) and output (y):
#   x[i] holds rows i .. i + timesteps - 1 of the feature columns,
#   y[i] is the bidPrice of the row immediately after that window.
x = np.array([feature_matrix[start:start + timesteps] for start in range(n_samples)])
y = np.array(bid_prices[timesteps:])

### Split data

In [None]:
# Hold out the last 20% of the windows as the test set.
# The samples are overlapping sliding windows over a time series, so a
# shuffled split would place near-identical neighbouring windows on both
# sides and make the test score look better than it is. `shuffle=False`
# keeps chronological order: train on the earlier part, test on the later.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=False)

# Implementation of Model

### Define model

In [None]:
# Shape of one sample as seen by the network: (timesteps, number of features)
input_shape = (x_train.shape[1], x_train.shape[2])

# define the LSTM model
model = Sequential()
# `return_sequences` is left at its default (False) so the layer emits only
# its final output: one vector per window, hence one prediction per sample,
# which matches `y` holding a single target value per window.
model.add(LSTM(100, input_shape=input_shape))
# Activation identifiers are lower-case; 'Linear' is not a recognised name.
model.add(Dense(1, activation='linear')) # TODO:Adjust the activation function to sigmoid if needed

### Compile model

In [None]:
# Configure the model for training: Adam optimizer, mean-squared-error loss,
# and mean absolute error reported as an additional metric.
# TODO:Adjust the loss function to binary_crossentropy if needed
model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mae'],
)

### Train and validate model

In [None]:

# Fit for 20 epochs in mini-batches of 32. The test split is passed as
# validation data, so it is also what the per-epoch validation figures
# are computed on.
model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    batch_size=32,
    epochs=20,
)

### Test model

In [None]:
# Predict on the test data
y_pred = model.predict(x_test)

# Map predictions and targets back to price units.
# The scaler was fitted on all columns in `features`, so
# `scaler.inverse_transform` expects that many columns and rejects a
# single-column array. Instead, undo the scaling with the fitted parameters
# of the 'bidPrice' column only: MinMaxScaler computes
# X_scaled = X * scale_ + min_, so X = (X_scaled - min_) / scale_.
# NOTE(review): this is only meaningful if x/y were built from data that was
# actually transformed with `scaler` - confirm the normalisation cell applies
# the transform and does not merely fit.
bid_col = features.index('bidPrice')
bid_offset = scaler.min_[bid_col]
bid_scale = scaler.scale_[bid_col]
y_pred_original = (y_pred.reshape(-1, 1) - bid_offset) / bid_scale
y_test_original = (y_test.reshape(-1, 1) - bid_offset) / bid_scale

### Visualize results

In [None]:
# Make predictions on the test data
y_pred = model.predict(x_test)

# The model is compiled as a regressor (linear output, MSE loss), so both
# `y_test` and `y_pred` are continuous values. `confusion_matrix` rejects
# continuous targets, and thresholding a price at 0.5 carries no meaning.
# Turn both into direction labels instead:
#   1 -> value is above the last bid price inside the input window ("up")
#   0 -> otherwise
# NOTE(review): if the model is switched to a sigmoid/binary setup (see the
# TODOs on the model and compile cells), replace this with a plain
# `y_pred > 0.5` threshold against binary targets.
bid_col = features.index('bidPrice')
last_bid = x_test[:, -1, bid_col]

# Reduce to one prediction per sample; if the network emits a value per
# timestep, keep the one for the final timestep.
y_pred_flat = y_pred.reshape(len(x_test), -1)[:, -1]

y_true_binary = (y_test > last_bid).astype(int)
y_pred_binary = (y_pred_flat > last_bid).astype(int)

# generate confusion matrix; `labels` pins a 2x2 matrix even when one of the
# two classes never occurs, so it always matches the two display labels
cm = confusion_matrix(y_true_binary, y_pred_binary, labels=[0, 1])

# plot confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Class 0', 'Class 1'])
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

## Other resources code

### Load model

In [None]:
# Restore a model from the path 'LSTM_model'. The path must already exist,
# e.g. written beforehand by `model.save('LSTM_model')`.
loaded_model = load_model(filepath='LSTM_model')

### Save model

In [None]:
# Persist the trained model under the path 'LSTM_model'.
# NOTE(review): the path has no file extension; newer Keras releases appear
# to require a '.keras' or '.h5' suffix - confirm against the installed
# version before relying on this cell.
model.save(filepath='LSTM_model')