## Installing Essential Dependencies

In [1]:
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install tensorflow




## Loading Data

In [2]:
import pandas as pd
import numpy as np
# Read the training CSV and trim leading/trailing whitespace from every header
# name so later column lookups do not depend on stray spaces in the file.
df = pd.read_csv('BADSS_training_data_filled.csv').rename(columns=str.strip)
df

Unnamed: 0,Date,Symbol,Maturity,Strike,Bid Price,Bid Size,Ask Price,Ask Size,Undl Price,date_id,...,exposure_22,exposure_23,exposure_24,exposure_25,exposure_26,exposure_27,exposure_28,exposure_29,exposure_30,exposure_31
0,2024-04-11,SPY,2024-04-12,518.0,1.74,13,1.76,592,518.00,1,...,0.00000,,,,,,,,,
1,2024-04-11,SPY,2024-04-12,519.0,1.24,54,1.25,1619,518.00,1,...,0.00000,,,,,,,,,
2,2024-04-11,SPY,2024-04-12,520.0,0.84,94,0.85,2646,518.00,1,...,0.00000,,,,,,,,,
3,2024-04-11,SPY,2024-04-12,521.0,0.54,479,0.55,3311,518.00,1,...,0.00000,,,,,,,,,
4,2024-04-11,SPY,2024-04-12,522.0,0.33,1207,0.34,3614,518.00,1,...,0.00000,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28571,2024-05-10,QQQ,2024-05-21,440.0,0.00,0,0.00,0,442.06,22,...,18.12446,0.0,0.0,0.0,0.0,0.0,0.0,,,
28572,2024-05-10,QQQ,2024-05-21,441.0,0.00,0,0.00,0,442.06,22,...,18.12446,0.0,0.0,0.0,0.0,0.0,0.0,,,
28573,2024-05-10,QQQ,2024-05-22,440.0,0.00,0,0.00,0,442.06,22,...,18.12446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
28574,2024-05-10,QQQ,2024-05-22,441.0,0.00,0,0.00,0,442.06,22,...,18.12446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,


In [3]:
# Parse both date columns so they compare and sort chronologically.
df["Date"] = pd.to_datetime(df["Date"])
df["Maturity"] = pd.to_datetime(df["Maturity"])

# Pool the values of "Date" and "Maturity", order them ascending and keep each
# distinct date once.
unique_dates = pd.concat([df["Date"], df["Maturity"]]).sort_values().unique()

# Assign every distinct date a sequential id, starting at 1 for the earliest.
date_mapping = {date: date_id for date_id, date in enumerate(unique_dates, start=1)}

# Replace/add the id columns using the shared mapping, so "date_id" and
# "maturity_id" live on the same axis.
df["date_id"] = df["Date"].map(date_mapping)
df["maturity_id"] = df["Maturity"].map(date_mapping)

# For each (maturity_id, Strike, Symbol) combination, collect the set of
# date_id values on which that combination appears in the data.
maturity_strike_symbol_dict = (
    df.groupby(["maturity_id", "Strike", "Symbol"])["date_id"].apply(set).to_dict()
)

# Peek at ten entries of the dictionary for a sanity check.
list(maturity_strike_symbol_dict.items())[990:1000]

[((8, 508.0, 'SPY'), {5, 6, 7}),
 ((8, 509.0, 'SPY'), {3, 4, 5, 6, 7}),
 ((8, 510.0, 'SPY'), {3, 4, 5, 6, 7}),
 ((8, 511.0, 'SPY'), {2, 3, 4, 5, 6, 7}),
 ((8, 512.0, 'SPY'), {2, 3, 4, 5, 6, 7}),
 ((8, 513.0, 'SPY'), {2, 3, 4, 5, 6, 7}),
 ((8, 514.0, 'SPY'), {2, 3, 4, 5, 6, 7}),
 ((8, 515.0, 'SPY'), {2, 3, 4, 5, 6, 7}),
 ((8, 516.0, 'SPY'), {2, 3, 4, 5, 6, 7}),
 ((8, 517.0, 'SPY'), {2, 3, 4, 5, 6, 7})]

In [4]:
# For every row, divide the exposure column that matches the row's own date_id
# by (-PnL of that same date_id + 0.0001). Despite its name, the
# "exposure_1_ratio" column receives the ratio for every date_id, not only day 1.
for iterdate in range(1, 23):  # Loop over date_id values from 1 to 22
    current_exposure_col = f"exposure_{iterdate}"  # Exposure column for this date_id
    current_PnL_col_ = f"PnL_{iterdate}"  # PnL column for this date_id

    # Both columns are required. Skip the day when either one is absent instead
    # of failing with a KeyError on the PnL lookup.
    if current_exposure_col not in df.columns or current_PnL_col_ not in df.columns:
        continue

    # Rows whose own date_id is the one being processed.
    day_rows = df["date_id"] == iterdate

    # Negated PnL plus a small offset. The offset moves the zero of the
    # denominator from PnL == 0 to PnL == 0.0001, so any remaining exact zero is
    # replaced with NaN to keep the division from producing +/-inf.
    denominator = -df.loc[day_rows, current_PnL_col_] + 0.0001
    denominator = denominator.replace(0, np.nan)

    df.loc[day_rows, "exposure_1_ratio"] = df.loc[day_rows, current_exposure_col] / denominator

# Display a slice of the DataFrame for inspection
df[1000:1010]


Unnamed: 0,Date,Symbol,Maturity,Strike,Bid Price,Bid Size,Ask Price,Ask Size,Undl Price,date_id,...,exposure_24,exposure_25,exposure_26,exposure_27,exposure_28,exposure_29,exposure_30,exposure_31,maturity_id,exposure_1_ratio
1000,2024-04-15,SPY,2024-04-29,506.0,6.23,1011,6.3,1197,504.45,3,...,,,,,,,,,13,13.186584
1001,2024-04-15,SPY,2024-04-29,507.0,5.7,621,5.77,1248,504.45,3,...,,,,,,,,,13,12.582242
1002,2024-04-15,SPY,2024-04-29,508.0,5.19,868,5.26,1271,504.45,3,...,,,,,,,,,13,12.064889
1003,2024-04-15,SPY,2024-04-29,509.0,4.71,1114,4.78,1293,504.45,3,...,,,,,,,,,13,11.378884
1004,2024-04-15,SPY,2024-04-29,510.0,4.25,1232,4.31,1320,504.45,3,...,,,,,,,,,13,11.014251
1005,2024-04-15,SPY,2024-04-29,511.0,3.82,1221,3.89,1352,504.45,3,...,,,,,,,,,13,10.217236
1006,2024-04-15,SPY,2024-04-29,512.0,3.42,1211,3.48,1384,504.45,3,...,,,,,,,,,13,9.72119
1007,2024-04-15,SPY,2024-04-29,513.0,3.04,1129,3.1,1465,504.45,3,...,,,,,,,,,13,9.017258
1008,2024-04-15,SPY,2024-04-29,514.0,2.7,1048,2.76,1547,504.45,3,...,,,,,,,,,13,8.090856
1009,2024-04-15,SPY,2024-04-29,515.0,2.37,1079,2.43,1519,504.45,3,...,,,,,,,,,13,7.274242


## Define the labeling function

In [5]:
def custom_sort_key(row):
    """
    Turn the exposure/PnL ratio of a row into a sort key (used as the training label).

    With d = int(row["date_id"]), the ratio is exposure_d / PnL_d. When PnL_d is
    exactly 0 the divisor is 1e-3 instead. A missing or NaN exposure_d / PnL_d
    yields a NaN ratio (adding the offset to a NaN PnL leaves it NaN).

    Returned key:
      - ratio > 0: 1 - 1/(1 + ratio), which lies in (0, 1) and grows with the ratio;
      - ratio < 0: 1 + 1/(1 - ratio), which lies in (1, 2);
      - otherwise (ratio == 0 or ratio is NaN): 3.

    Sorted ascending, rows with positive ratios therefore come first, then rows
    with negative ratios, and rows with a zero or undefined ratio come last.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) supporting row["date_id"]
            and row.get(name, default).

    Returns:
        A float in (0, 2), or the integer 3.
    """
    day = int(row["date_id"])

    # Look up the exposure and PnL columns that belong to this row's own date_id.
    exposure_val = row.get(f"exposure_{day}", np.nan)
    pnl_val = row.get(f"PnL_{day}", np.nan)

    # Offset the divisor only for a zero or NaN PnL; this prevents a division by
    # zero, while a NaN PnL still propagates as NaN.
    needs_offset = pnl_val == 0 or np.isnan(pnl_val)
    divisor = (pnl_val + 1e-3) if needs_offset else pnl_val
    ratio = exposure_val / divisor

    if ratio > 0:
        return 1 - 1 / (1 + ratio)
    if ratio < 0:
        return 1 + 1 / (1 - ratio)
    # Reached for ratio == 0 and for NaN (every comparison with NaN is False).
    return 3


## Training Model

In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay

# ---------------------------
# 1. Data Preprocessing
# ---------------------------
# Generate the regression target: one sort key per row, computed by
# custom_sort_key from the row's own exposure_d / PnL_d columns.
df["label"] = df.apply(custom_sort_key, axis=1)

# Encode the 'Symbol' column as integer codes so it can be fed to the network.
le = LabelEncoder()
df["Symbol_enc"] = le.fit_transform(df["Symbol"])

# Model inputs. The raw 'Symbol' strings are represented by 'Symbol_enc'.
features = ['date_id', 'maturity_id', 'Symbol_enc', 'Strike', 'Bid Price', 'Bid Size', 'Ask Price', 'Ask Size']

# Construct the feature matrix and target vector
X = df[features].values
y = df["label"].values

# Standardize features (zero mean, unit variance per column).
# NOTE(review): the scaler is fit on every row before the K-fold split below,
# so validation rows contribute to the scaling statistics — confirm this is
# acceptable for how the validation loss is interpreted.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ---------------------------
# 2. Define Model Building Function
# ---------------------------
def build_model(input_dim):
    """Create and compile the feed-forward regression network.

    Architecture: Dense(128, relu) -> Dense(64, relu) -> Dense(32, relu) -> Dense(1).
    The model is compiled with mean-squared-error loss and an Adam optimizer
    driven by a staircase exponential-decay schedule (initial rate 0.01,
    decay rate 0.9, decay_steps 1000).

    Args:
        input_dim: Number of input features per sample.

    Returns:
        The compiled Sequential model.
    """
    # Learning-rate schedule and optimizer.
    optimizer = Adam(
        learning_rate=ExponentialDecay(
            initial_learning_rate=0.01,
            decay_steps=1000,
            decay_rate=0.9,
            staircase=True,
        )
    )

    # Three hidden ReLU layers followed by a single linear output for regression.
    model = Sequential([
        Dense(128, activation='relu', input_dim=input_dim),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1),
    ])
    model.compile(optimizer=optimizer, loss='mse')
    return model

# ---------------------------
# 3. 5-Fold Cross-Validation Training
# ---------------------------
# Training hyper-parameters.
num_epochs = 256
batch_size = 120
num_folds = 5
# Shuffled K-fold split with a fixed seed so the fold membership is repeatable.
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Track the fold whose model ends training with the lowest validation loss.
best_val_loss = np.inf
best_model = None

# A fresh model is trained per fold; folds are numbered from 1 for the log output.
for fold, (train_index, val_index) in enumerate(kf.split(X_scaled), start=1):
    print(f"Training fold {fold} ...")
    X_train, X_val = X_scaled[train_index], X_scaled[val_index]
    y_train, y_val = y[train_index], y[val_index]

    model = build_model(input_dim=X_scaled.shape[1])
    history = model.fit(X_train, y_train,
                        epochs=num_epochs,
                        batch_size=batch_size,
                        validation_data=(X_val, y_val),
                        verbose=1)  # Set verbose to 1 to see the training progress

    # Retrieve the validation loss from the last epoch for this fold
    # (not the minimum over epochs).
    val_loss = history.history['val_loss'][-1]
    print(f"Fold {fold} validation loss: {val_loss:.6f}")
    # A NaN loss never compares as smaller, so such a fold is never selected.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = model

# ---------------------------
# 4. Save the Model with the Best Validation Performance
# ---------------------------
# best_model is still None when no fold produced a validation loss below
# infinity (for example every fold ended with a NaN loss). Fail with a clear
# message instead of an AttributeError on None.
if best_model is None:
    raise RuntimeError("No fold produced a finite validation loss; there is no model to save.")

# Persist the selected model in HDF5 format.
best_model.save("trained_model.h5")
print(f"\nBest validation loss: {best_val_loss:.6f}. Model saved as 'trained_model.h5'.")


Training fold 1 ...
Epoch 1/256
Epoch 2/256
Epoch 3/256
Epoch 4/256
Epoch 5/256
Epoch 6/256
Epoch 7/256
Epoch 8/256
Epoch 9/256
Epoch 10/256
Epoch 11/256
Epoch 12/256
Epoch 13/256
Epoch 14/256
Epoch 15/256
Epoch 16/256
Epoch 17/256
Epoch 18/256
Epoch 19/256
Epoch 20/256
Epoch 21/256
Epoch 22/256
Epoch 23/256
Epoch 24/256
Epoch 25/256
Epoch 26/256
Epoch 27/256
Epoch 28/256
Epoch 29/256
Epoch 30/256
Epoch 31/256
Epoch 32/256
Epoch 33/256
Epoch 34/256
Epoch 35/256
Epoch 36/256
Epoch 37/256
Epoch 38/256
Epoch 39/256
Epoch 40/256
Epoch 41/256
Epoch 42/256
Epoch 43/256
Epoch 44/256
Epoch 45/256
Epoch 46/256
Epoch 47/256
Epoch 48/256
Epoch 49/256
Epoch 50/256
Epoch 51/256
Epoch 52/256
Epoch 53/256
Epoch 54/256
Epoch 55/256
Epoch 56/256
Epoch 57/256
Epoch 58/256
Epoch 59/256
Epoch 60/256
Epoch 61/256
Epoch 62/256
Epoch 63/256
Epoch 64/256
Epoch 65/256
Epoch 66/256
Epoch 67/256
Epoch 68/256
Epoch 69/256
Epoch 70/256
Epoch 71/256
Epoch 72/256
Epoch 73/256
Epoch 74/256
Epoch 75/256
Epoch 76/256
E

  saving_api.save_model(
