In [34]:
import numpy as np
import pandas as pd
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, LSTM, Dense
from tensorflow.keras.callbacks import LearningRateScheduler

import numpy as np
import pandas as pd
from tensorflow.keras.utils import Sequence

import numpy as np
import pandas as pd
from tensorflow.keras.utils import Sequence

class TimeSeriesGenerator(Sequence):
    def __init__(self, df, sequence_length, prediction_length, batch_size):
        self.df = df
        self.sequence_length = sequence_length
        self.prediction_length = prediction_length
        self.batch_size = batch_size
        # Use only business days for generating batches
        self.dates = df.resample('B').agg({'open': 'first'}).index
        self.min_values = self.df.min()
        self.max_values = self.df.max()

    def __len__(self):
        return (len(self.dates) - 1) // self.batch_size

    def __getitem__(self, idx):
        batch_start = idx * self.batch_size
        batch_end = batch_start + self.batch_size
        X_batch, y_batch_high, y_batch_low = [], [], []
        for day in self.dates[batch_start:batch_end]:
            morning_start = day + pd.Timedelta(hours=7, minutes=30)
            morning_end = day + pd.Timedelta(hours=9, minutes=30)
            prediction_end = morning_end + pd.Timedelta(minutes=self.prediction_length)
            if morning_start in self.df.index and morning_end in self.df.index and prediction_end in self.df.index:
                input_sequence = self.df[['open', 'high', 'low', 'close']].loc[morning_start:morning_end].values[-self.sequence_length:]
                target_range = self.df[['high', 'low']].loc[morning_end:prediction_end]
                if input_sequence.shape[0] == self.sequence_length and not target_range.empty:
                    input_sequence = (input_sequence - self.min_values.values) / (self.max_values.values - self.min_values.values)
                    X_batch.append(input_sequence)
                    y_batch_high.append(target_range['high'].max())
                    y_batch_low.append(target_range['low'].min())
        X_batch = np.array(X_batch).astype(np.float32)
        y_batch_high = np.array(y_batch_high).astype(np.float32)
        y_batch_low = np.array(y_batch_low).astype(np.float32)
        y_batch = np.column_stack((y_batch_high, y_batch_low))
        
        # Check for "nan" and extreme values
        if np.isnan(X_batch).any():
            raise ValueError("There are 'nan' values in the input sequences (X_batch).")
        if np.isnan(y_batch).any():
            raise ValueError("There are 'nan' values in the target outputs (y_batch).")
        threshold = 1e6
        # Ensure that the array is not empty before calling np.max and np.abs
        if len(X_batch) > 0 and np.max(np.abs(X_batch)) > threshold:
            raise ValueError(f"There are extreme values in the input sequences (X_batch) exceeding the threshold {threshold}.")
        if len(y_batch) > 0 and np.max(np.abs(y_batch)) > threshold:
            raise ValueError(f"There are extreme values in the target outputs (y_batch) exceeding the threshold {threshold}.")
        
        return X_batch, y_batch


In [33]:
# Load the SPX stock data for the years 2020 and 2021
df_2020 = pd.read_excel('SPX_2020.xlsx', names=['date', 'open', 'high', 'low', 'close', 'volume']).drop(columns='volume')
df_2021 = pd.read_excel('SPX_2021.xlsx', names=['date', 'open', 'high', 'low', 'close', 'volume']).drop(columns='volume')

# Concatenate the data for both years, reset the index, and set the 'date' column as the index
df = pd.concat([df_2020, df_2021]).reset_index(drop=True).set_index('date')

# Remove any duplicate entries in the index
df = df.loc[~df.index.duplicated(keep='first')]

# Convert the DataFrame to the 1-minute frequency while keeping only business days
df = df.resample('1T').pad().asfreq(freq='B')


  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  df = df.resample('1T').pad().asfreq(freq='B')


In [35]:
# Create an instance of the updated data generator using the concatenated SPX stock data
data_gen = TimeSeriesGenerator(df, sequence_length=1440, prediction_length=120, batch_size=32)

# Retrieve the sequence length and number of features from the data generator
sequence_length = data_gen.sequence_length
num_features = data_gen.df.shape[1]

print("Building CNN-LSTM model...")

# Define the CNN-LSTM model architecture
model = Sequential()
model.add(Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(sequence_length, num_features)))
model.add(LSTM(units=50, return_sequences=False))
model.add(Dense(2, activation='linear'))  # Output: Predicting 'high' and 'low' points

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Print the summary of the model architecture
model.summary()

print("Training the model...")

# Train the model using the data generator
history = model.fit(data_gen, epochs=10, verbose=1)

print("Model training complete.")

# Note: The model is trained on batches of data generated by the data generator.
# The generator provides sequences of historical data and corresponding targets for the specific prediction window.


Building CNN-LSTM model...
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_4 (Conv1D)           (None, 1438, 32)          416       
                                                                 
 lstm_4 (LSTM)               (None, 50)                16600     
                                                                 
 dense_4 (Dense)             (None, 2)                 102       
                                                                 
Total params: 17,118
Trainable params: 17,118
Non-trainable params: 0
_________________________________________________________________
Training the model...
Epoch 1/10


ValueError: in user code:

    File "/Users/kush/Documents/DS/env/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "/Users/kush/Documents/DS/env/lib/python3.10/site-packages/keras/engine/training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/Users/kush/Documents/DS/env/lib/python3.10/site-packages/keras/engine/training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "/Users/kush/Documents/DS/env/lib/python3.10/site-packages/keras/engine/training.py", line 1023, in train_step
        y_pred = self(x, training=True)
    File "/Users/kush/Documents/DS/env/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/Users/kush/Documents/DS/env/lib/python3.10/site-packages/keras/engine/input_spec.py", line 250, in assert_input_compatibility
        raise ValueError(

    ValueError: Exception encountered when calling layer 'sequential_4' (type Sequential).
    
    Input 0 of layer "conv1d_4" is incompatible with the layer: expected min_ndim=3, found ndim=1. Full shape received: (None,)
    
    Call arguments received by layer 'sequential_4' (type Sequential):
      • inputs=tf.Tensor(shape=(None,), dtype=float32)
      • training=True
      • mask=None


In [30]:
df[15980:]

Unnamed: 0_level_0,open,high,low,close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-21 07:26:00,3314.655,3314.655,3314.352,3314.640
2020-01-21 07:27:00,3314.640,3315.200,3314.640,3315.200
2020-01-21 07:28:00,3315.200,3315.200,3314.900,3314.900
2020-01-21 07:31:00,3314.900,3315.149,3314.900,3315.149
2020-01-21 07:32:00,3315.149,3315.637,3315.143,3315.355
...,...,...,...,...
2021-12-31 16:09:00,4770.399,4771.148,4770.145,4770.848
2021-12-31 16:10:00,4771.136,4771.154,4770.333,4770.836
2021-12-31 16:11:00,4770.651,4771.139,4770.136,4771.133
2021-12-31 16:12:00,4771.648,4771.654,4770.654,4770.839
