In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

file_path = '/content/drive/MyDrive/df_2.parquet'

try:
  df = pd.read_parquet(file_path)
  print(df.head()) # Display first few rows of the dataframe
except FileNotFoundError:
  print(f"Error: File not found at {file_path}")
except Exception as e:
  print(f"An error occurred: {e}")

   stock_id  date_id  seconds_in_bucket  imbalance_size  \
0         0        0                300             0.0   
1         0        0                310             0.0   
2         0        0                320             0.0   
3         0        0                330             0.0   
4         0        0                340             0.0   

   imbalance_buy_sell_flag  reference_price  far_price  near_price  bid_price  \
0                        0         1.000241   1.000241    1.000241   1.000026   
1                        0         0.999919   0.999919    0.999919   0.999812   
2                        0         0.999919   0.999919    0.999919   0.999705   
3                        0         0.999812   0.999812    0.999812   0.999705   
4                        0         0.999491   0.999491    0.999491   0.999169   

   ask_price  ...  auction_signal_strength_mean_0_300  \
0   1.000241  ...                            0.000033   
1   0.999919  ...                           

In [4]:
# Special value for masking
SPECIAL_VALUE = -999.0
df['far_price'] = df.groupby('stock_id')['far_price'].ffill()
df['far_price'] = df['far_price'].fillna(1)

In [5]:
# prompt: Find columns with nans and number

# Find columns with NaNs and their counts
nan_counts = df.isna().sum()
nan_cols = nan_counts[nan_counts > 0]

print("Columns with NaN values:\n", nan_cols)

# Number of columns with NaNs
num_nan_cols = len(nan_cols)
print("\nNumber of columns with NaN values:", num_nan_cols)


Columns with NaN values:
 Series([], dtype: int64)

Number of columns with NaN values: 0


In [6]:
# Split into train and test sets
train_df = df[df['date_id'] <= 400].copy()
test_df = df[df['date_id'] > 400].copy()

# Print summary
print(f"Train days: {train_df['date_id'].min()} to {train_df['date_id'].max()} -> {train_df.shape}")
print(f"Test days: {test_df['date_id'].min()} to {test_df['date_id'].max()} -> {test_df.shape}")

Train days: 0 to 400 -> (1397376, 140)
Test days: 401 to 480 -> (282240, 140)


In [7]:
train_df.columns

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'far_price', 'near_price',
       'bid_price', 'ask_price',
       ...
       'auction_signal_strength_mean_0_300',
       'auction_signal_strength_std_0_300',
       'stock_vs_index_wap_ratio_min_0_300',
       'stock_vs_index_wap_ratio_max_0_300',
       'stock_vs_index_wap_ratio_mean_0_300',
       'stock_vs_index_wap_ratio_std_0_300', 'spread_min_0_300',
       'spread_max_0_300', 'spread_mean_0_300', 'spread_std_0_300'],
      dtype='object', length=140)

In [8]:
from sklearn.preprocessing import StandardScaler

# ----- 1. Separate features & target -----
target_col = 'target'  # Replace with your actual target
exclude_cols = ['date_id', 'time_id', 'target', 'seconds_in_bucket']  # Add others if needed

# All columns except target & known categorical/time identifiers
feature_cols = [col for col in train_df.columns if col not in exclude_cols]

# ----- 2. Identify numeric & categorical -----
categorical_cols = ['stock_id']
do_not_scale = ['imbalance_buy_sell_flag']
numeric_cols = [col for col in feature_cols if col not in categorical_cols]
scale_cols = [col for col in numeric_cols if col not in do_not_scale]

# ----- 3. Scale numeric features -----
scaler = StandardScaler()
train_df[scale_cols] = scaler.fit_transform(train_df[scale_cols])
test_df[scale_cols] = scaler.transform(test_df[scale_cols])

# for col in scale_cols:
#     valid_train = train_df[col] != SPECIAL_VALUE
#     valid_test = test_df[col] != SPECIAL_VALUE

#     scaled_train = scaler.fit_transform(train_df.loc[valid_train, [col]]).astype(np.float32).flatten()
#     scaled_test = scaler.transform(test_df.loc[valid_test, [col]]).astype(np.float32).flatten()

#     # Ensure the column is float first to prevent assignment issues
#     train_df[col] = train_df[col].astype(np.float32)
#     test_df[col] = test_df[col].astype(np.float32)

#     train_df.loc[valid_train, col] = scaled_train
#     test_df.loc[valid_test, col] = scaled_test


In [9]:
train_df = train_df.copy()
test_df = test_df.copy()

In [10]:
import numpy as np

def reshape_to_tensor(df, feature_cols, target_col='target', timesteps_per_day=18):
    """
    Converts a long-form DataFrame into (X, y) tensors for GRU sequence-to-sequence learning.

    Args:
        df: pandas DataFrame with 'stock_id', 'date_id', 'seconds_in_bucket', features, and target
        feature_cols: list of column names to be used as input features
        target_col: name of the column to use as target
        timesteps_per_day: number of time intervals per day (default: 30)

    Returns:
        X: np.array of shape (N_samples, 30, num_features)
        y: np.array of shape (N_samples, 30)
        keys: list of tuples (stock_id, date_id) representing each sample
    """
    grouped = df.groupby(['stock_id', 'date_id'])
    X_list, y_list, keys = [], [], []

    for (stock_id, date_id), group in grouped:
        group_sorted = group.sort_values('seconds_in_bucket')
        if len(group_sorted) != timesteps_per_day:
            # print(stock_id)
            continue  # skip incomplete days

        X_seq = group_sorted[feature_cols].values  # shape: (30, F)
        y_seq = group_sorted[target_col].values    # shape: (30,)

        X_list.append(X_seq)
        y_list.append(y_seq)
        keys.append((stock_id, date_id))

    X = np.stack(X_list)  # shape: (N, 30, F)
    y = np.stack(y_list)  # shape: (N, 30)
    return X, y, keys


In [11]:
train_X, train_y, train_keys = reshape_to_tensor(train_df, feature_cols)
test_X, test_y, test_keys = reshape_to_tensor(test_df, feature_cols)

In [12]:
train_X.shape
# test_X.shape

(77632, 18, 136)

In [13]:
train_y.shape
test_y.shape

(15680, 18)

In [14]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import TimeDistributed, Masking
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.models import Model

In [15]:
# --- Time-Based Cross-Validation Split Function ---
def time_based_cv(df, group_col='date_id', n_splits=5):
    unique_dates = sorted(df[group_col].unique())
    split_size = len(unique_dates) // (n_splits + 1)
    for i in range(1, n_splits + 1):
        train_days = unique_dates[: i * split_size]
        test_days = unique_dates[i * split_size : (i + 1) * split_size]
        train_idx = df[df[group_col].isin(train_days)].index
        test_idx = df[df[group_col].isin(test_days)].index
        yield train_idx, test_idx

def build_gru_model(n_layers=1, input_shape=(18, 136), dropout=0.2):
    model = Sequential()
    model.add(Input(shape=input_shape))
    for i in range(n_layers):
        return_seq = True  # we want return_sequences=True in all layers
        model.add(GRU(64, return_sequences=return_seq))
        model.add(Dropout(dropout))
    model.add(TimeDistributed(Dense(1)))
    model.compile(optimizer='adam', loss='mae')
    return model

In [16]:
# --- Hyperparameter Search ---
def run_time_cv_gru(train_X, train_y, df_train_keys, layer_choices=[1, 2, 3, 4]):
    results = {}
    for n_layers in layer_choices:
        print(f"\nTraining model with {n_layers} GRU layer(s)...")
        fold = 0
        fold_mae = []
        for train_idx, val_idx in time_based_cv(df_train_keys, group_col='date_id', n_splits=5):
            fold += 1
            print(f"\nFold {fold}")
            X_train, X_val = train_X[train_idx], train_X[val_idx]
            y_train, y_val = train_y[train_idx], train_y[val_idx]

            model = build_gru_model(n_layers=n_layers, input_shape=(train_X.shape[1], train_X.shape[2]))
            es = EarlyStopping(patience=10, restore_best_weights=True)

            model.fit(X_train, y_train,
                      validation_data=(X_val, y_val),
                      epochs=100,
                      batch_size=32,
                      callbacks=[es],
                      verbose=1)

            preds = model.predict(X_val)
            mae = mean_absolute_error(y_val.flatten(), preds.flatten())
            print(f"Fold {fold} MAE: {mae:.5f}")
            fold_mae.append(mae)

        results[n_layers] = np.mean(fold_mae)
        print(f"Average MAE for {n_layers} layer(s): {results[n_layers]:.5f}")
    return results


In [17]:
train_keys_df = pd.DataFrame(train_keys, columns=["stock_id", "date_id"])

In [18]:
results = run_time_cv_gru(train_X, train_y, train_keys_df)


Training model with 1 GRU layer(s)...

Fold 1
Epoch 1/100
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - loss: 4.8191 - val_loss: 6.2736
Epoch 2/100
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 4.7085 - val_loss: 6.2499
Epoch 3/100
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 4.6750 - val_loss: 6.2839
Epoch 4/100
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 4.6578 - val_loss: 6.2398
Epoch 5/100
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 4.6222 - val_loss: 6.2443
Epoch 6/100
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 4.6199 - val_loss: 6.2384
Epoch 7/100
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 4.6558 - val_loss: 6.2490
Epoch 8/100
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 4.6562 - va

In [19]:
best_layers = min(results, key=results.get)
print(f"Best # of layers: {best_layers}, MAE: {results[best_layers]:.5f}")

# Rebuild and retrain on full data
final_model = build_gru_model(n_layers=best_layers, input_shape=(train_X.shape[1], train_X.shape[2]))
es = EarlyStopping(patience=10, restore_best_weights=True)

final_model.fit(train_X, train_y,
                validation_split=0.1,  # optional small holdout for early stopping
                epochs=100,
                batch_size=32,
                callbacks=[es],
                verbose=1)

Best # of layers: 2, MAE: 5.90020
Epoch 1/100
[1m2184/2184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 8ms/step - loss: 5.8224 - val_loss: 5.2397
Epoch 2/100
[1m2184/2184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 7ms/step - loss: 5.7841 - val_loss: 5.2372
Epoch 3/100
[1m2184/2184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 7ms/step - loss: 5.7485 - val_loss: 5.2254
Epoch 4/100
[1m2184/2184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 7ms/step - loss: 5.7182 - val_loss: 5.2341
Epoch 5/100
[1m2184/2184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 7ms/step - loss: 5.7128 - val_loss: 5.2244
Epoch 6/100
[1m2184/2184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 7ms/step - loss: 5.6956 - val_loss: 5.2158
Epoch 7/100
[1m2184/2184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 7ms/step - loss: 5.6904 - val_loss: 5.2243
Epoch 8/100
[1m2184/2184[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 7ms/step - loss: 

<keras.src.callbacks.history.History at 0x7fd6c5191d90>

In [20]:
final_model.save('/content/drive/MyDrive/best_gru_model2.h5')



In [22]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import TimeDistributed
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.losses import MeanAbsoluteError

final_model = tf.keras.models.load_model(
    '/content/drive/MyDrive/best_gru_model2.h5',
    custom_objects={'mae': MeanAbsoluteError()}
)


# Make predictions on holdout test set
test_preds = final_model.predict(test_X)

test_mae = mean_absolute_error(test_y.flatten(), test_preds.flatten())
print(f"Test MAE on holdout: {test_mae:.5f}")




[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Test MAE on holdout: 5.21382
