In [347]:
import os
import sys
import random
import numpy as np
import optuna
import joblib
from importlib import reload  # Import the reload function
from sklearn.model_selection import train_test_split

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
import utils.helpers as hp
import utils.preprocess as pp
import utils.modeling as md  

reload(hp)
reload(pp)
reload(md)
%load_ext autoreload
%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [348]:
timeframe = 5
years = 3
samples = (60 / timeframe) * 24 * 365 * years


shift = 20
epics = 50
samples / shift / epics


315.36

In [368]:

timeframe_filenames = {
    "1m": "../data/raw/ETHUSDT_1m_v600k.csv",
    "3m": "../data/DOGEUSDT_3m_v0.csv",
    "5m": "../data/DOGEUSDT_5m_v0.csv",
    "15m": "../data/DOGEUSDT_15m_v0.csv",
    "30m": "../data/DOGEUSDT_30m_v0.csv"
}

def load_model_test_split(filenames, params, model_data_ratio=0.8):
    # Load data
    df = hp.load_data(filenames[params['timeframe']])
    # Create indicators
    indicated_df = pp.create_indicators(df, params['num_indicators'])
    
    # Split data
    model_df = indicated_df[:int(len(indicated_df)*model_data_ratio)]
    test_df = indicated_df[int(len(indicated_df)*model_data_ratio):]
    
    return model_df, test_df

def test_data(df, params):

    # Testing data
    test_df = pp.normalize_X(df, scaler_name=params['scaler_type'])

    # Testing data special treatment
    win_norm_X_test, y_test = pp.create_targets(test_df, params['window_size'], params['look_ahead_size'], params['look_ahead_size'])
    unused, scaler_y = pp.normalize_y(y_test, return_scaler=True)
    
    joblib.dump(win_norm_X_test, '../data/processed/X_test.pkl')
    joblib.dump(y_test, '../data/processed/y_test.pkl')
    joblib.dump(scaler_y, '../data/processed/scaler_y.pkl')
    print('Test data dumped')
    return win_norm_X_test, y_test

def custom_model(df, params, tuning=False):

    # Load data and create indicators
    model_df = pp.normalize_X(df, scaler_name=params['scaler_type'])
        
    # Train
    model = md.train(model_df, params, epochs=50, train_size=3000, val_size=500, step_size=200, tuning=     tuning) 
    
    return model

In [370]:
params = {'lstm_units_1': 64, 'lstm_units_2': 128, 'dropout': 0.2, 'learning_rate': 0.005326718162056909, 'batch_size': 32, 'dense_units': 16, 'sequence_length': 10, 'gradient_clipping': 0.5, 'optimizer': 'adam', 'activation': 'tanh', 'num_indicators': 16, 'scaler_type': 'MinMaxScaler', 'timeframe': '1m', 'window_size': 80, 'look_ahead_size': 20}

model_df, test_df = load_model_test_split(timeframe_filenames, params)
model = custom_model(model_df, params)

Dataset loaded from ../data/raw/ETHUSDT_1m_v600k.csv.


KeyboardInterrupt: 

In [364]:
hp.save_model(model, '../models/model_1m_eth_v0.keras')

Model saved to ../models/model_1m_eth_v0.keras


In [371]:
# Objective function for optimization
def objective(trial):

    # MODEL CREATION
    params = {
        # LSTM Units
        "lstm_units_1": trial.suggest_categorical("lstm_units_1", [64, 128, 256]),
        "lstm_units_2": trial.suggest_categorical("lstm_units_2", [32, 64, 128]),

        # Dropout rate for regularization
        "dropout": trial.suggest_categorical("dropout", [0.1, 0.2, 0.3, 0.5]),

        # Learning rate for the optimizer (log scale for better range)
        "learning_rate": trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True),

        # Batch size (common choices)
        "batch_size": trial.suggest_categorical("batch_size", [16, 32, 64, 128]),

        # Dense layer units (for fully connected layers after LSTM)
        "dense_units": trial.suggest_categorical("dense_units", [16, 32, 64, 128]),

        # Sequence length (historical data considered for prediction)
        "sequence_length": trial.suggest_categorical("sequence_length", [10, 20, 50, 100]),

        # Gradient clipping for preventing exploding gradients
        "gradient_clipping": trial.suggest_categorical("gradient_clipping", [0.5, 1.0, 2.0]),

        # Optimizer choice (Adam or RMSprop)
        "optimizer": trial.suggest_categorical("optimizer", ['adam', 'rmsprop']),

        # Activation function for layers
        "activation": trial.suggest_categorical("activation", ['relu', 'tanh']),

        "num_indicators" : trial.suggest_int('num_indicators', 5, 46),

        "scaler_type" : trial.suggest_categorical('scaler', ['MinMaxScaler', 'StandardScaler', 'RobustScaler']),
 
        'timeframe': '1m', 
        'window_size': 80, 
        'look_ahead_size': 20
    }

    # DATA CREATION AND PROCESSING
    timeframe_filenames = {
    "1m": "../data/raw/ETHUSDT_1m_v600k.csv",
    "3m": "../data/raw/DOGEUSDT_3m_v0.csv",
    "5m": "../data/raw/DOGEUSDT_5m_v0.csv",
    "15m": "../data/raw/DOGEUSDT_15m_v0.csv",
    "30m": "../data/raw/DOGEUSDT_30m_v0.csv"
}

    print(params)
    model = custom_model(model_df, params, tuning=True)

    # Check if the trial is pruned after training
    if trial.should_prune():
        print("Trial pruned after training.")
        raise optuna.exceptions.TrialPruned()
    
    X_test, y_test = test_data(test_df, params)
    
    test_loss, test_mae = model.evaluate(X_test, y_test, verbose=1)
    
    return test_mae 
    

In [372]:

storage_path = "sqlite:///../optuna/study.db"
pruner = optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=10)

optuna.delete_study(study_name="the_study", storage=storage_path)
study = optuna.create_study(storage=storage_path, study_name="the_study", direction='minimize', pruner=pruner)

#study = optuna.load_study(storage=storage_path, study_name="the_study")

model_df, test_df = load_model_test_split(timeframe_filenames, params)

study.optimize(objective, n_trials=100)  # Run 100 trials

# Get the best parameters
print("Best parameters:", study.best_params)


[I 2025-01-06 00:12:13,056] A new study created in RDB with name: the_study


Dataset loaded from ../data/raw/ETHUSDT_1m_v600k.csv.
{'lstm_units_1': 128, 'lstm_units_2': 32, 'dropout': 0.2, 'learning_rate': 1.0849769053082761e-05, 'batch_size': 128, 'dense_units': 128, 'sequence_length': 20, 'gradient_clipping': 1.0, 'optimizer': 'adam', 'activation': 'relu', 'num_indicators': 15, 'scaler_type': 'RobustScaler', 'timeframe': '1m', 'window_size': 80, 'look_ahead_size': 20}


[W 2025-01-06 00:12:15,794] Trial 0 failed with parameters: {'lstm_units_1': 128, 'lstm_units_2': 32, 'dropout': 0.2, 'learning_rate': 1.0849769053082761e-05, 'batch_size': 128, 'dense_units': 128, 'sequence_length': 20, 'gradient_clipping': 1.0, 'optimizer': 'adam', 'activation': 'relu', 'num_indicators': 15, 'scaler': 'RobustScaler'} because of the following error: ValueError("Exception encountered when calling LSTMCell.call().\n\n\x1b[1mDimensions must be equal, but are 16 and 15 for '{{node sequential_21_1/lstm_42_1/lstm_cell_1/MatMul}} = MatMul[T=DT_FLOAT, grad_a=false, grad_b=false, transpose_a=false, transpose_b=false](sequential_21_1/lstm_42_1/strided_slice_2, sequential_21_1/lstm_42_1/lstm_cell_1/Cast/ReadVariableOp)' with input shapes: [?,16], [15,512].\x1b[0m\n\nArguments received by LSTMCell.call():\n  • inputs=tf.Tensor(shape=(None, 16), dtype=float32)\n  • states=('tf.Tensor(shape=(None, 128), dtype=float32)', 'tf.Tensor(shape=(None, 128), dtype=float32)')\n  • training=T

ValueError: Exception encountered when calling LSTMCell.call().

[1mDimensions must be equal, but are 16 and 15 for '{{node sequential_21_1/lstm_42_1/lstm_cell_1/MatMul}} = MatMul[T=DT_FLOAT, grad_a=false, grad_b=false, transpose_a=false, transpose_b=false](sequential_21_1/lstm_42_1/strided_slice_2, sequential_21_1/lstm_42_1/lstm_cell_1/Cast/ReadVariableOp)' with input shapes: [?,16], [15,512].[0m

Arguments received by LSTMCell.call():
  • inputs=tf.Tensor(shape=(None, 16), dtype=float32)
  • states=('tf.Tensor(shape=(None, 128), dtype=float32)', 'tf.Tensor(shape=(None, 128), dtype=float32)')
  • training=True