# Transformer model tuning

## Index
* [Data Preparation](#Title)
* [Second Bullet Header](#second-bullet)

In [1]:
import os
from typing import *

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_addons as tfa
from keras import backend as K
from keras.callbacks import Callback
from keras.layers import (
    BatchNormalization,
    Concatenate,
    Conv1D,
    ConvLSTM1D,
    Dense,
    Dropout,
    Flatten,
    Input,
    Layer,
    LayerNormalization,
    MaxPooling1D,
    MultiHeadAttention,
)
from keras.optimizers import Adam, RMSprop
from keras.utils import Sequence
from keras_tuner.engine.hyperparameters import HyperParameters
from keras_tuner.tuners import Hyperband, RandomSearch
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Model

from metrics import *

%matplotlib inline

plt.style.use("seaborn-v0_8-darkgrid")

PATH = "./data"
SYMBOLS = ["ADA", "BNB", "BTC", "EOS", "ETH", "LTC", "TRX", "VET", "XRP"]


def to_csvf(x):
    return x + "USDT.csv"

2023-08-11 19:18:36.685545: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-08-11 19:18:36.712116: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/add

In [2]:
def read_file(symbol: str, tf: str, timestamp_unit: str = "ms") -> pd.DataFrame:
    """
    Reads a CSV file, assigns column names, converts the 'date' column to datetime,
    and sets it as the DataFrame's index.

    Parameters
    ----------
    symbol : str
        The symbol.
    tf : str
        The tf.
    timestamp_unit : str, default 'ms'
        The unit of the timestamp in the 'date' column. By default, it's 'ms' (milliseconds).

    Returns
    -------
    pd.DataFrame
        The DataFrame with the 'date' column converted to datetime and set as the index.
    """

    df = pd.read_csv(to_csvf(os.path.join(PATH, tf, symbol)), header=None).iloc[:, 0:6]
    df.columns = ["date", "open", "high", "low", "close", "volume"]
    df["date"] = pd.to_datetime(df["date"], unit=timestamp_unit)
    df.set_index("date", inplace=True)

    return df

In [3]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

PATH = "./data"
SYMBOLS = ["ADA", "BNB", "BTC", "EOS", "ETH", "LTC", "TRX", "VET", "XRP"]


def read_file(symbol: str, tf: str, timestamp_unit: str = "ms") -> pd.DataFrame:
    """
    Reads a CSV file, assigns column names, converts the 'date' column to datetime,
    and sets it as the DataFrame's index.

    Parameters
    ----------
    symbol : str
        The symbol.
    tf : str
        The tf.
    timestamp_unit : str, default 'ms'
        The unit of the timestamp in the 'date' column. By default, it's 'ms' (milliseconds).

    Returns
    -------
    pd.DataFrame
        The DataFrame with the 'date' column converted to datetime and set as the index.
    """

    df = pd.read_csv(to_csvf(os.path.join(PATH, tf, symbol)), header=None).iloc[:, 0:6]
    df.columns = ["date", "open", "high", "low", "close", "volume"]
    df["date"] = pd.to_datetime(df["date"], unit=timestamp_unit)
    df.set_index("date", inplace=True)

    return df


def create_input_tensor(data, lookback=250 * 24):
    inputs = []
    for i in range(0, len(data) - lookback, 24):
        inputs.append(data.iloc[i : i + lookback].values)
        # print(data.iloc[i+lookback])

    return np.array(inputs)


def create_target_tensor(data_dict):
    # Step 1: Create a dataframe with closing prices for each currency
    close_prices_df = pd.DataFrame(
        {symbol: df["close"] for symbol, df in data_dict.items()}
    ).dropna()
    # Step 2: Repeat each value three times
    # print(close_prices_df)
    target_array = np.array(
        close_prices_df.apply(
            lambda x: np.array([item for item in x for _ in range(3)]), axis=1
        ).values
    )

    return np.vstack(target_array)


def prepare_data(PATH):
    # 1. Read BTC hourly data
    btc_data = read_file("BTC", "1h")
    # Create a new index to fill missing values
    full_index = pd.date_range(btc_data.index.min(), btc_data.index.max(), freq="H")
    df_full = pd.DataFrame(index=full_index)
    # Create the new dataframe forwarding missing values
    btc_data = df_full.merge(
        btc_data, left_index=True, right_index=True, how="left"
    ).fillna(method="ffill")
    # 2. Read other currencies' daily data
    daily_data = {}
    for symbol in SYMBOLS:
        if symbol != "BTC":
            daily_data[symbol] = read_file(f"{symbol}", "1d")

    # 3. Find overlapping date range
    min_date = btc_data.index.min()
    max_date = btc_data.index.max()
    for df in daily_data.values():
        min_date = max(min_date, df.index.min())
        max_date = min(max_date, df.index.max())

    # 4. Prune each dataset to the overlapping range
    btc_data = btc_data.loc[
        min_date - pd.Timedelta(days=250, hours=1) : max_date - pd.Timedelta(hours=1)
    ]
    for symbol in daily_data:
        daily_data[symbol] = daily_data[symbol].loc[min_date:max_date]
    # 5. Scale the BTC data and each feature separately
    scalers_btc = {}
    for col in btc_data.columns:
        scaler = MinMaxScaler()
        btc_data[col] = scaler.fit_transform(btc_data[col].values.reshape(-1, 1))
        scalers_btc[col] = scaler

    # Create input tensor from scaled BTC hourly data

    input_tensor = create_input_tensor(btc_data)
    # 6. Scale target data (Close Price) for each currency
    scalers_targets = {}
    scaled_targets = {}
    for symbol, df in daily_data.items():
        scaler = MinMaxScaler()
        scaled_data = scaler.fit_transform(df["close"].values.reshape(-1, 1))
        scaled_targets[symbol] = pd.DataFrame(
            scaled_data, columns=["close"], index=df.index
        )  # Save the scaled data as dataframe
        scalers_targets[symbol] = scaler
    # Create target tensor using scaled data
    target_tensors = create_target_tensor(scaled_targets)

    return input_tensor, target_tensors, scalers_btc, scalers_targets


# Use the function
input_data, target_data, btc_scalers, target_scalers = prepare_data(PATH)

In [4]:
# Finding the index for an 80-20 split
index_80_percent = int(0.8 * len(input_data))

hyperparam_input = input_data[index_80_percent:]
hyperparam_target = target_data[index_80_percent:]

# 2. Splitting the 20% further into training and validation

# Finding the index for an 80-20 split within the hyperparameter data
index_hyperparam_80_percent = int(0.8 * len(hyperparam_input))

# Splitting the data
train_input = hyperparam_input[:index_hyperparam_80_percent]
train_target = hyperparam_target[:index_hyperparam_80_percent]

valid_input = hyperparam_input[index_hyperparam_80_percent:]
valid_target = hyperparam_target[index_hyperparam_80_percent:]

train_input.shape, valid_input.shape, train_target.shape, valid_target.shape

((294, 6000, 5), (74, 6000, 5), (294, 24), (74, 24))

In [5]:
class Time2Vec(Layer):
    def __init__(self, output_dim=None, **kwargs):
        self.output_dim = output_dim
        super(Time2Vec, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(
            name="W",
            shape=(input_shape[-1], self.output_dim),
            initializer="uniform",
            trainable=True,
        )
        self.P = self.add_weight(
            name="P",
            shape=(input_shape[1], self.output_dim),
            initializer="uniform",
            trainable=True,
        )
        self.w = self.add_weight(
            name="w", shape=(input_shape[1], 1), initializer="uniform", trainable=True
        )
        self.p = self.add_weight(
            name="p", shape=(input_shape[1], 1), initializer="uniform", trainable=True
        )
        super(Time2Vec, self).build(input_shape)

    def call(self, x):
        original = self.w * x + self.p
        sin_trans = K.sin(K.dot(x, self.W) + self.P)

        return K.concatenate([sin_trans, original], -1)

In [6]:
# No necesariamente time to vec, otros metodos, coseno dia de la semana, del mes ...
lookback = 6000

tf.keras.backend.clear_session()
def build_model(hp):
    input_shape = (lookback, 5)  # for example

    input_layer = Input(shape=input_shape)
    x = Time2Vec(lookback)(input_layer)

    decoder_layer_type = hp.Choice(
        "decoder_layer_type", values=["conv", "dense"], default="conv"
    )
    num_transformer_layers = hp.Int(
        "num_transformer_layers", min_value=1, max_value=2, step=1
    )
    num_attention_heads = hp.Int(
        "num_attention_heads", min_value=1, max_value=4, step=1
    )
    dropout_rate = hp.Float("dropout_rate", min_value=0.0, max_value=0.3, step=0.15)

    for _ in range(num_transformer_layers, 0, -1):
        mha = MultiHeadAttention(num_heads=num_attention_heads, key_dim=lookback)(x, x)
        x = Dropout(dropout_rate)(x)
        x = LayerNormalization()(x)

    encoder_output = x
    # Define output branches
    outputs = []
    losses = {}
    for i in range(8):
        if decoder_layer_type == "conv":
            kernel_size = hp.Int("kernel_size", min_value=16, max_value=64, step=16)
            conv_layers = hp.Int("conv_layers", min_value=1, max_value=2, step=1)
            for j in range(conv_layers, 0, -1):
                x = Conv1D(32 * (2**j), kernel_size)(encoder_output)
            x = MaxPooling1D(2)(x)
            x = Flatten()(x)
        # elif decoder_layer_type == "conv_lstm":
        #     kernel_size = hp.Int("kernel_size", min_value=2, max_value=128, step=16)
        #     conv_layers = hp.Int("conv_lstm_layers", min_value=1, max_value=3, step=1)
        #     for j in range(conv_layers, 0, -1):
        #         x = ConvLSTM1D(32 * (2**j), kernel_size)(encoder_output)
        #     x = MaxPooling1D(2)(x)
        else:
            dense_units = hp.Int("dense_units", min_value=16, max_value=64, step=32)
            dense_layers = hp.Int("dense_layers", min_value=1, max_value=2, step=1)

            for j in range(dense_layers, 0, -1):
                x = Dense(dense_units * (2**j), activation="relu")(encoder_output)
            x = BatchNormalization()(x)
        output_1 = Dense(1, name=f"output_{i}_1_prediction")(x)
        output_2 = Dense(1, name=f"output_{i}_2_quantile_05")(x)
        output_3 = Dense(1, name=f"output_{i}_3_quantile_95")(x)
        losses[f"output_{i}_1_prediction"] = "mse"
        losses[f"output_{i}_2_quantile_05"] = tfa.losses.PinballLoss(tau=0.05)
        losses[f"output_{i}_3_quantile_95"] = tfa.losses.PinballLoss(tau=0.95)
        outputs.extend([output_1, output_2, output_3])

    model = Model(inputs=input_layer, outputs=outputs)

    # Define losses
    optimizer = hp.Choice("optimizer", values=["Adam", "RMSprop"])
    opt = Adam if optimizer == "Adam" else RMSprop
    model.compile(
        loss=losses,
        optimizer=opt(
            learning_rate=hp.Float(
                "learning_rate", min_value=1e-5, max_value=1e-2, sampling="LOG"
            )
        ),
    )
    return model


class PrintHyperparameters(Callback):
    def on_trial_begin(self, trial):
        print(trial.hyperparameters.values)


def create_dataset(input_data, target_data, batch_size, window_size):
    # Create datasets
    input_dataset = tf.data.Dataset.from_tensor_slices(input_data)
    target_dataset = tf.data.Dataset.from_tensor_slices(target_data)

    # Windowing the data. This will create windows of `window_size` for inputs and targets.
    input_dataset = input_dataset.window(window_size, shift=1, drop_remainder=True)
    input_dataset = input_dataset.flat_map(lambda x: x.batch(window_size))

    target_dataset = target_dataset.window(window_size, shift=1, drop_remainder=True)
    target_dataset = target_dataset.flat_map(lambda y: y.batch(window_size))

    # Zip the datasets together
    dataset = tf.data.Dataset.zip((input_dataset, target_dataset))

    # Batching the data
    dataset = dataset.batch(batch_size).prefetch(1)

    return dataset


batch_size = 4
window_size = 10

train_dataset = create_dataset(train_input, train_target, batch_size, window_size)
val_dataset = create_dataset(valid_input, valid_target, batch_size, window_size)
train_dataset = train_dataset.map(lambda x, y: (tf.reshape(x, (-1, 6000, 5)), y))
val_dataset = val_dataset.map(lambda x, y: (tf.reshape(x, (-1, 6000, 5)), y))

# Tuner
tuner = RandomSearch(
    build_model,
    objective="val_loss",
    directory="random_search",
    project_name="TimeSeries",
    executions_per_trial=2,
)


# Pass the callback to the search method
tuner.search(
    train_dataset,
    validation_data=val_dataset,
    epochs=20,
    callbacks=[PrintHyperparameters()],
)
#
# )

2023-08-11 19:18:38.048570: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-11 19:18:38.064070: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-11 19:18:38.064188: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

INFO:tensorflow:Reloading Tuner from random_search/TimeSeries/tuner0.json


s-bus-pci#L344-L355
2023-08-11 19:18:38.065308: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-11 19:18:38.065357: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-08-11 19:18:38.120209: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentat


Search: Running Trial #3

Value             |Best Value So Far |Hyperparameter
conv              |conv              |decoder_layer_type
2                 |2                 |num_transformer_layers
1                 |1                 |num_attention_heads
0.15              |0.15              |dropout_rate
64                |64                |kernel_size
2                 |2                 |conv_layers
Adam              |Adam              |optimizer
1.0468e-05        |1.0468e-05        |learning_rate

Epoch 1/20


2023-08-11 19:18:38.895432: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype double and shape [294,6000,5]
	 [[{{node Placeholder/_0}}]]
2023-08-11 19:18:38.895590: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_8' with dtype double and shape [294,24]
	 [[{{node Placeholder/_8}}]]
2023-08-11 19:18:41.012571: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-08-11 19:18:51.045514: W tensorflow/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory tryi

RuntimeError: Number of consecutive failures excceeded the limit of 3.
Traceback (most recent call last):
  File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras_tuner/engine/base_tuner.py", line 270, in _try_run_and_update_trial
    self._run_and_update_trial(trial, *fit_args, **fit_kwargs)
  File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras_tuner/engine/base_tuner.py", line 235, in _run_and_update_trial
    results = self.run_trial(trial, *fit_args, **fit_kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras_tuner/engine/tuner.py", line 287, in run_trial
    obj_value = self._build_and_fit_model(trial, *args, **copied_kwargs)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras_tuner/engine/tuner.py", line 214, in _build_and_fit_model
    results = self.hypermodel.fit(hp, model, *args, **kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras_tuner/engine/hypermodel.py", line 144, in fit
    return model.fit(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras/utils/traceback_utils.py", line 70, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/tensorflow/python/eager/execute.py", line 58, in quick_execute
    except TypeError as e:
tensorflow.python.framework.errors_impl.ResourceExhaustedError: Graph execution error:

Detected at node 'model/time2_vec/concat' defined at (most recent call last):
    File "<frozen runpy>", line 198, in _run_module_as_main
    File "<frozen runpy>", line 88, in _run_code
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/traitlets/config/application.py", line 992, in launch_instance
      app.start()
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 711, in start
      self.io_loop.start()
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/tornado/platform/asyncio.py", line 215, in start
      self.asyncio_loop.run_forever()
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/asyncio/base_events.py", line 607, in run_forever
      self._run_once()
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/asyncio/base_events.py", line 1922, in _run_once
      handle._run()
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/ipykernel/kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/ipykernel/ipkernel.py", line 411, in do_execute
      res = shell.run_cell(
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/ipykernel/zmqshell.py", line 531, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3006, in run_cell
      result = self._run_cell(
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3061, in _run_cell
      result = runner(coro)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3266, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3445, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/IPython/core/interactiveshell.py", line 3505, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_28177/2042983326.py", line 121, in <module>
      tuner.search(
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras_tuner/engine/base_tuner.py", line 230, in search
      self._try_run_and_update_trial(trial, *fit_args, **fit_kwargs)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras_tuner/engine/base_tuner.py", line 270, in _try_run_and_update_trial
      self._run_and_update_trial(trial, *fit_args, **fit_kwargs)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras_tuner/engine/base_tuner.py", line 235, in _run_and_update_trial
      results = self.run_trial(trial, *fit_args, **fit_kwargs)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras_tuner/engine/tuner.py", line 287, in run_trial
      obj_value = self._build_and_fit_model(trial, *args, **copied_kwargs)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras_tuner/engine/tuner.py", line 214, in _build_and_fit_model
      results = self.hypermodel.fit(hp, model, *args, **kwargs)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras_tuner/engine/hypermodel.py", line 144, in fit
      return model.fit(*args, **kwargs)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras/engine/training.py", line 1685, in fit
      tmp_logs = self.train_function(iterator)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras/engine/training.py", line 1284, in train_function
      return step_function(self, iterator)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras/engine/training.py", line 1268, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras/engine/training.py", line 1249, in run_step
      outputs = model.train_step(data)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras/engine/training.py", line 1050, in train_step
      y_pred = self(x, training=True)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras/engine/training.py", line 558, in __call__
      return super().__call__(*args, **kwargs)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras/engine/base_layer.py", line 1145, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras/engine/functional.py", line 512, in call
      return self._run_internal_graph(inputs, training=training, mask=mask)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras/engine/functional.py", line 669, in _run_internal_graph
      outputs = node.layer(*args, **kwargs)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras/engine/base_layer.py", line 1145, in __call__
      outputs = call_fn(inputs, *args, **kwargs)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras/utils/traceback_utils.py", line 96, in error_handler
      return fn(*args, **kwargs)
    File "/tmp/ipykernel_28177/2654059636.py", line 31, in call
      return K.concatenate([sin_trans, original], -1)
    File "/home/apollo/anaconda3/envs/tfm/lib/python3.11/site-packages/keras/backend.py", line 3581, in concatenate
      return tf.concat([to_dense(x) for x in tensors], axis)
Node: 'model/time2_vec/concat'
OOM when allocating tensor with shape[40,6000,6005] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[{{node model/time2_vec/concat}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
 [Op:__inference_train_function_9129]
