In [14]:
import pandas as pd

# Load a sample of the dataset (first 10 features + 'resp')
df = pd.read_parquet(
    "../data/jane_street_train.parquet",
    columns=["resp"] + [f"feature_{i}" for i in range(10)]  # start small
)

# For safety: work with just the first 100k rows for now
df = df.iloc[:100].copy()

print(f"Loaded shape: {df.shape}")
df.head()

Loaded shape: (100, 11)


Unnamed: 0,resp,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9
0,0.00627,1,-1.872746,-2.191242,-0.474163,-0.323046,0.014688,-0.002484,,,-0.989982
1,-0.009792,-1,-1.349537,-1.704709,0.068058,0.028432,0.193794,0.138212,,,-0.151877
2,0.02397,-1,0.81278,-0.256156,0.806463,0.400221,-0.614188,-0.3548,,,5.448261
3,-0.0032,-1,1.174378,0.34464,0.066872,0.009357,-1.006373,-0.676458,,,4.508206
4,-0.002604,1,-3.172026,-3.093182,-0.161518,-0.128149,-0.195006,-0.14378,,,2.683018


In [15]:
df.info()
df.describe()
df.isna().sum().head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   resp       100 non-null    float64
 1   feature_0  100 non-null    int64  
 2   feature_1  100 non-null    float64
 3   feature_2  100 non-null    float64
 4   feature_3  100 non-null    float64
 5   feature_4  100 non-null    float64
 6   feature_5  100 non-null    float64
 7   feature_6  100 non-null    float64
 8   feature_7  0 non-null      float64
 9   feature_8  0 non-null      float64
 10  feature_9  100 non-null    float64
dtypes: float64(10), int64(1)
memory usage: 8.7 KB


resp         0
feature_0    0
feature_1    0
feature_2    0
feature_3    0
dtype: int64

In [16]:
import numpy as np

def frac_diff_ffd(series, d=0.5, thresh=1e-3):
    """
    Apply fixed-width fractional differentiation (FFD) to a time series.
    Parameters:
        - series: pd.Series
        - d: float, differentiation order
        - thresh: float, minimum weight threshold to truncate window
    Returns:
        - np.array with FFD values (NaNs at the beginning)
    """
    # Compute weights
    w = [1.0]
    k = 1
    while True:
        w_ = -w[-1] * (d - k + 1) / k
        if abs(w_) < thresh:
            break
        w.append(w_)
        k += 1
    w = np.array(w[::-1])
    
    # Apply weights
    width = len(w)
    output = np.full(series.shape, np.nan)
    for i in range(width - 1, len(series)):
        output[i] = np.dot(w, series.values[i - width + 1:i + 1])
    return output

In [17]:
# Choose your features and FFD parameters
feature_cols = [col for col in df.columns if col.startswith("feature_")]
d = 0.5     # order of differentiation
tau = 1e-3  # truncation threshold

# Apply FFD to each feature
ffd_transformed = {}
for col in feature_cols:
    ffd_transformed[col] = frac_diff_ffd(df[col], d=d, thresh=tau)


print(ffd_df.head())
# Create new DataFrame with FFD features
ffd_df = pd.DataFrame(ffd_transformed, index=df.index)
ffd_df["resp"] = df["resp"]

# Drop rows with NaNs (caused by initial lag)
ffd_df.dropna(inplace=True)

print(f"FFD applied. Final shape: {ffd_df.shape}")
ffd_df.head()


Empty DataFrame
Columns: [feature_0, feature_1, feature_2, feature_3, feature_4, feature_5, feature_6, feature_7, feature_8, feature_9, resp]
Index: []
FFD applied. Final shape: (0, 11)


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,resp


In [18]:
# Compute rolling volatility of resp
vol_window = 100  # lookback size
vol = ffd_df["resp"].rolling(window=vol_window, min_periods=10).std()

# Scale resp
ffd_df["resp_vol_scaled"] = ffd_df["resp"] / vol

# Drop rows with NaNs from rolling window
ffd_df.dropna(subset=["resp_vol_scaled"], inplace=True)

ffd_df[["resp", "resp_vol_scaled"]].head()


Unnamed: 0,resp,resp_vol_scaled


In [19]:
# Use inverse volatility as sample weight (proxy for uniqueness)
ffd_df["weight"] = 1.0 / vol
ffd_df["weight"] = ffd_df["weight"].clip(upper=10)  # cap extreme weights

# Final cleaned data
ffd_df.dropna(inplace=True)
print("Final data shape:", ffd_df.shape)
ffd_df[["resp_vol_scaled", "weight"]].head()


Final data shape: (0, 13)


Unnamed: 0,resp_vol_scaled,weight


In [20]:
import random

n_steps = 60
n_samples = 20_000  # Even smaller sample for now
feature_cols = [col for col in ffd_df.columns if col.startswith("feature_")]

valid_indices = range(n_steps, len(ffd_df))
sampled_indices = random.sample(valid_indices, min(n_samples, len(valid_indices)))

# Keep as lists for now (don't stack yet)
X, y, w = [], [], []

for i in sampled_indices:
    X.append(ffd_df[feature_cols].iloc[i - n_steps:i].values.astype(np.float32))
    y.append(float(ffd_df["resp_vol_scaled"].iloc[i]))
    w.append(float(ffd_df["weight"].iloc[i]))

print(f"✅ Collected {len(X)} window samples safely (not stacked)")

✅ Collected 0 window samples safely (not stacked)


In [21]:
import torch
from torch.utils.data import Dataset, DataLoader

class JaneStreetDataset(Dataset):
    def __init__(self, X, y, w):
        self.X = X
        self.y = y
        self.w = w

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        x_tensor = torch.tensor(self.X[idx], dtype=torch.float32)
        y_tensor = torch.tensor(self.y[idx], dtype=torch.float32)
        w_tensor = torch.tensor(self.w[idx], dtype=torch.float32)
        return x_tensor, y_tensor, w_tensor


In [25]:
batch_size = 64  # safe for Kaggle memory

dataset = JaneStreetDataset(X, y, w)
X
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Quick test
for xb, yb, wb in dataloader:
    print("Batch shapes:", xb.shape, yb.shape, wb.shape)
    break

ValueError: num_samples should be a positive integer value, but got num_samples=0

In [23]:
import torch.nn as nn

class TransformerRegressor(nn.Module):
    def __init__(self, input_dim, n_steps, d_model=64, nhead=4, num_layers=2, dropout=0.1):
        super().__init__()
        self.embedding = nn.Linear(input_dim, d_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=128,
            dropout=dropout,
            batch_first=True  # Important: set True so input shape is (B, T, D)
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.head = nn.Linear(d_model, 1)  # Regression output

    def forward(self, x):
        x = self.embedding(x)               # (B, T, d_model)
        x = self.transformer(x)             # (B, T, d_model)
        x = x.mean(dim=1)                   # global average pooling across time
        out = self.head(x).squeeze(-1)      # final output (B,)
        return out


In [24]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

n_features = len(X[0][0])  # inferred from a single window
model = TransformerRegressor(input_dim=n_features, n_steps=n_steps).to(device)
print("Model ready on", device)


IndexError: list index out of range

In [None]:
import torch.optim as optim

def weighted_mse_loss(preds, targets, weights):
    return torch.mean(weights * (preds - targets) ** 2)

optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [None]:
from tqdm import tqdm

def train(model, dataloader, optimizer, device, epochs=5):
    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for xb, yb, wb in tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
            xb, yb, wb = xb.to(device), yb.to(device), wb.to(device)

            preds = model(xb)
            loss = weighted_mse_loss(preds, yb, wb)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * len(xb)

        avg_loss = total_loss / len(dataloader.dataset)
        print(f"Epoch {epoch+1} - Avg Loss: {avg_loss:.6f}")


In [None]:
train(model, dataloader, optimizer, device, epochs=5)


Epoch 1/5: 100%|██████████| 313/313 [00:21<00:00, 14.72it/s]


Epoch 1 - Avg Loss: 10.433811


Epoch 2/5: 100%|██████████| 313/313 [00:21<00:00, 14.43it/s]


Epoch 2 - Avg Loss: 10.355791


Epoch 3/5: 100%|██████████| 313/313 [00:21<00:00, 14.51it/s]


Epoch 3 - Avg Loss: 10.327574


Epoch 4/5: 100%|██████████| 313/313 [00:20<00:00, 14.97it/s]


Epoch 4 - Avg Loss: 10.316434


Epoch 5/5: 100%|██████████| 313/313 [00:21<00:00, 14.61it/s]

Epoch 5 - Avg Loss: 10.302765



