In [1]:
import pandas as pd
df = pd.read_parquet('TRAIN_Reco_2021_2022_2023.parquet.gzip').reset_index()

In [2]:
df['ExecutionTime'] = pd.to_datetime(df['ExecutionTime'])
numerical_columns = ['high', 'low', 'close', 'volume']
df[numerical_columns] = df[numerical_columns].astype('float16')
df.dtypes

ExecutionTime    datetime64[ns, Europe/Berlin]
ID                                      object
high                                   float16
low                                    float16
close                                  float16
volume                                 float16
dtype: object

In [16]:
train_start_date = '2023-04-01'
train_end_date = '2023-09-30'

val_start_date = '2023-10-01'
val_end_date = '2023-12-31'  # Adjust if you have data beyond 2023

# Step 4: Split the data into training and validation sets
train_df = df[(df['ExecutionTime'] >= train_start_date) & (df['ExecutionTime'] <= train_end_date)]
val_df = df[(df['ExecutionTime'] >= val_start_date) & (df['ExecutionTime'] <= val_end_date)]

In [17]:
train_df.head()

Unnamed: 0,ExecutionTime,ID,high,low,close,volume
78149,2023-04-01 00:00:00+02:00,Fri00Q1,0.0,0.0,0.0,0.0
78150,2023-04-01 00:15:00+02:00,Fri00Q1,0.0,0.0,0.0,0.0
78151,2023-04-01 00:30:00+02:00,Fri00Q1,0.0,0.0,0.0,0.0
78152,2023-04-01 00:45:00+02:00,Fri00Q1,0.0,0.0,0.0,0.0
78153,2023-04-01 01:00:00+02:00,Fri00Q1,0.0,0.0,0.0,0.0


In [18]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Assume you have already scaled your data
scaler = MinMaxScaler()

# Fit and transform your training data
train_df_scaled = scaler.fit_transform(train_df[['high', 'low', 'close', 'volume']])

# Transform test data
test_df_scaled = scaler.transform(val_df[['high', 'low', 'close', 'volume']])

# Convert scaled data back to DataFrame (keeping the same column names)
train_df_scaled = pd.DataFrame(train_df_scaled, columns=['high', 'low', 'close', 'volume'], index=train_df.index)
val_df_scaled = pd.DataFrame(test_df_scaled, columns=['high', 'low', 'close', 'volume'], index=val_df.index)

# Retain 'ID' and 'ExecutionTime' in the final DataFrame
train_df_scaled[['ID', 'ExecutionTime']] = train_df[['ID', 'ExecutionTime']]
val_df_scaled[['ID', 'ExecutionTime']] = val_df[['ID', 'ExecutionTime']]

In [19]:
train_df_scaled.head()

Unnamed: 0,high,low,close,volume,ID,ExecutionTime
78149,0.188232,0.567383,0.269043,0.0,Fri00Q1,2023-04-01 00:00:00+02:00
78150,0.188232,0.567383,0.269043,0.0,Fri00Q1,2023-04-01 00:15:00+02:00
78151,0.188232,0.567383,0.269043,0.0,Fri00Q1,2023-04-01 00:30:00+02:00
78152,0.188232,0.567383,0.269043,0.0,Fri00Q1,2023-04-01 00:45:00+02:00
78153,0.188232,0.567383,0.269043,0.0,Fri00Q1,2023-04-01 01:00:00+02:00


In [20]:
# Remove the timezone information
train_df_scaled['ExecutionTime'] = pd.to_datetime(train_df_scaled['ExecutionTime']).dt.tz_localize(None)
val_df_scaled['ExecutionTime'] = pd.to_datetime(val_df_scaled['ExecutionTime']).dt.tz_localize(None)

In [21]:
import numpy as np
# Step 5: Prepare the data with 10 lags
def create_sequences(data, n_steps):
    X, y = [], []
    for i in range(len(data) - n_steps):
        X.append(data[i:i + n_steps])
        y.append(data[i + n_steps])
    return np.array(X), np.array(y)

# Using ['close', 'high', 'low', 'volume'] as target columns
n_steps = 10
X_train, y_train = create_sequences(train_df_scaled[['close', 'high', 'low', 'volume']].values, n_steps)
X_test, y_test = create_sequences(val_df_scaled[['close', 'high', 'low', 'volume']].values, n_steps)


In [22]:
# Reshape the data for LSTM (samples, timesteps, features)
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 4))  # 4 features (close, high, low, volume)
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 4))


In [12]:
# ! .venv\Scripts\pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.17.0-cp311-cp311-win_amd64.whl.metadata (3.2 kB)
Collecting tensorflow-intel==2.17.0 (from tensorflow)
  Downloading tensorflow_intel-2.17.0-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting h5py>=3.10.0 (from tensorflow-

In [24]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Step 6: Define the LSTM model
model = Sequential()

# Add LSTM layers
model.add(LSTM(50, return_sequences=True, input_shape=(n_steps, 4)))  # 4 features in input
model.add(LSTM(50, return_sequences=False))
model.add(Dense(25))
model.add(Dense(40))  # 4 output columns (close, high, low, volume)


In [25]:
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Step 7: Train the model
model.fit(X_train, y_train, epochs=5, batch_size=32)


Epoch 1/5
[1m  2632/366933[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m41:55[0m 7ms/step - loss: 0.0016

KeyboardInterrupt: 

In [None]:

# Step 8: Make predictions
predictions = model.predict(X_test)

# Step 9: Inverse transform predictions to original scale
predictions_rescaled = scaler.inverse_transform(predictions)

# Step 10: Print or plot predictions
print(predictions_rescaled)