In [1]:
import pandas as pd
import keras
from kerastuner import RandomSearch
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.optimizers import Adam

  from kerastuner import RandomSearch


### Preparing train and val dataset for LSTM

In [2]:
# Paths to the train and validation datasets
train_data_path = 'train_data.csv'  
val_data_path = 'val_data.csv'  
test_data_path = 'test_data.csv'

In [3]:
# Load the datasets
train_data = pd.read_csv(train_data_path)
val_data = pd.read_csv(val_data_path)
test_data = pd.read_csv(test_data_path)
print(train_data.shape)
print(val_data.shape)

(2791, 1981)
(931, 1981)


In [4]:
# Splitting the datasets into features (X) and target (y)
X_train = train_data.drop('Output', axis=1)
y_train = train_data['Output']
X_val = val_data.drop('Output', axis=1)
y_val = val_data['Output']
X_test = test_data.drop('Output', axis=1)
y_test = test_data['Output']

In [5]:
# Check the shapes of the datasets
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)

X_train shape: (2791, 1980)
y_train shape: (2791,)
X_val shape: (931, 1980)
y_val shape: (931,)
X_test shape: (931, 1980)
y_test shape: (931,)


In [6]:
print (y_val)

0      0.080714
1      0.687701
2      0.019128
3      0.191419
4      0.391193
         ...   
926    0.470977
927    0.183726
928    0.029032
929    0.089227
930    0.197472
Name: Output, Length: 931, dtype: float64


### Building lstm model

In [7]:
import numpy as np

In [8]:
# Assuming an equal number of features per day
num_days = 60
total_features = 1980
features_per_day = int(total_features / num_days)
features_per_day

33

In [9]:
# Reshape the data to (samples, time steps, features)
X_train_reshaped = X_train.values.reshape(-1, num_days, features_per_day)
X_val_reshaped = X_val.values.reshape(-1, num_days, features_per_day)
X_test_reshaped = X_test.values.reshape(-1, num_days, features_per_day)

In [10]:
print(X_train_reshaped)

[[[8.77508802e-01 8.75963586e-01 8.74107413e-01 ... 0.00000000e+00
   0.00000000e+00 0.00000000e+00]
  [8.83411306e-01 8.85684264e-01 8.85258435e-01 ... 0.00000000e+00
   0.00000000e+00 0.00000000e+00]
  [8.47678380e-01 8.68316151e-01 8.60032794e-01 ... 0.00000000e+00
   0.00000000e+00 0.00000000e+00]
  ...
  [6.42186773e-01 6.40862657e-01 6.39483947e-01 ... 0.00000000e+00
   0.00000000e+00 0.00000000e+00]
  [6.67746895e-01 6.62961532e-01 6.49134702e-01 ... 0.00000000e+00
   0.00000000e+00 0.00000000e+00]
  [6.87238444e-01 7.00086699e-01 6.76880022e-01 ... 0.00000000e+00
   0.00000000e+00 0.00000000e+00]]

 [[3.84374500e-02 3.66728240e-02 3.22461207e-02 ... 0.00000000e+00
   0.00000000e+00 0.00000000e+00]
  [3.94101290e-02 3.78413780e-02 3.96176240e-02 ... 0.00000000e+00
   0.00000000e+00 0.00000000e+00]
  [3.85370899e-02 3.68141810e-02 3.72662445e-02 ... 0.00000000e+00
   0.00000000e+00 0.00000000e+00]
  ...
  [3.46843300e-03 1.99313900e-03 7.83793000e-04 ... 0.00000000e+00
   1.00000

In [11]:
# LSTM model structure
def build_lstm_model(hp):
    model = Sequential()
    model.add(LSTM(
        units=hp.Int('units', min_value=32, max_value=512, step=32),
        input_shape=(num_days, features_per_day),
        return_sequences=True))
    model.add(Dropout(hp.Float('dropout', min_value=0, max_value=0.5, step=0.1)))
    model.add(LSTM(units=hp.Int('units', min_value=32, max_value=512, step=32)))
    model.add(Dense(1))
    model.compile(
        optimizer=Adam(hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])),
        loss='mean_squared_error')
    return model

In [12]:
tuner = RandomSearch(
    build_lstm_model,
    objective='val_loss',
    max_trials=5,  
    executions_per_trial=2,
    directory='lstm_tuning',
    project_name='stock_prediction')

tuner.search(X_train_reshaped, y_train, epochs=10, validation_data=(X_val_reshaped, y_val))

Reloading Tuner from lstm_tuning\stock_prediction\tuner0.json


In [13]:
# Get the best model
best_model = tuner.get_best_models(num_models=1)[0]

In [14]:
# Combine the train and validation sets for final training
X_combined = np.concatenate((X_train_reshaped, X_val_reshaped), axis=0)
y_combined = np.concatenate((y_train, y_val), axis=0)

In [15]:
# Train the best model on the combined dataset
best_model.fit(X_combined, y_combined, epochs=50, batch_size=32, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x17600094cd0>

In [19]:
# After training, save the best model to an HDF5 file
best_model_path = 'LSTM_Model/best_lstm_model.h5'  # Replace with your desired path
best_model.save(best_model_path)

print(f"The best model is saved to {best_model_path}")

  saving_api.save_model(


The best model is saved to LSTM_Model/best_lstm_model.h5


### Best model loss and prediction for test set

In [16]:
# Evaluate the model on the test set
test_loss = best_model.evaluate(X_test_reshaped, y_test, verbose=0)

In [17]:
# Calculate predictions to evaluate other metrics such as R^2 or MAE
y_pred = best_model.predict(X_test_reshaped)



### Getting model and result info

In [18]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# Assuming y_test and y_pred are already defined
# You can replace them with your actual test and prediction data

# Calculate R^2 score
r2 = r2_score(y_test, y_pred)

# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)

# Calculate RMSE
rmse = np.sqrt(mse)

# Calculate MAPE
# Assuming y_pred is the array you provided
y_pred = y_pred.flatten()

# Assuming test_loss is defined
print(f"Test Loss: {test_loss}")
print(f"R^2 Score: {r2}")
print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")

# Now y_pred is one-dimensional, and you can proceed with calculations like MAPE
# Ensure there are no zero elements in y_test to avoid division by zero in MAPE calculation
if np.any(y_test == 0):
    print("Error: y_test contains zero values, which will lead to division by zero in MAPE calculation.")
else:
    mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100
    print(f"Mean Absolute Percentage Error: {mape}%")

Test Loss: 0.000555491482373327
R^2 Score: 0.9923823291490315
Mean Absolute Error: 0.011540712830362215
Mean Squared Error: 0.000555491405254933


In [20]:
from keras.models import load_model

# Load the best model
best_model = load_model(best_model_path)

# Get the configuration of the model
config = best_model.get_config()

# Extract the units from the LSTM layers
lstm1_units = config['layers'][1]['config']['units']
lstm2_units = config['layers'][3]['config']['units']

# Extract the dropout rate from the Dropout layer
dropout_rate = config['layers'][2]['config']['rate']

# Learning rate is part of the optimizer's configuration
learning_rate = best_model.optimizer.learning_rate.numpy()

# Output the extracted values
print(f"LSTM Layer 1 Units: {lstm1_units}")
print(f"LSTM Layer 2 Units: {lstm2_units}")
print(f"Dropout Rate: {dropout_rate}")
print(f"Learning Rate: {learning_rate}")

LSTM Layer 1 Units: 256
LSTM Layer 2 Units: 256
Dropout Rate: 0.0
Learning Rate: 0.0010000000474974513
