In [1]:
# In your notebook/*.ipynb
# Go up one level to the root directory
%cd ..
# Optional: change back to the notebook directory
# %cd notebook

/root/vscode/portfolio/mitsui_2025


In [2]:
# `src` is a package (it contains an __init__.py), so import from a specific
# module inside it with: from src.<module> import <name>
import logging 
import torch 
from src.dataprep import dataprep, WindowDataset
from src.configs import TrainConfig


# Build the preparer against the relative 'data' directory and run its full
# preparation pass; the prepared arrays are then read back as attributes.
data_preparer = dataprep(data_path='data')
data_preparer.one_shot_prep()  # Loads prices, targets, imputes NaNs, adds staleness/missing masks
X_raw = data_preparer.X_full_np  # (N_dates, n_features); the logged run reports (1917, 1671)
Y_raw = data_preparer.Y_full_np  # (N_dates, n_targets); the logged run reports (1917, 424)

2025-10-02 01:35:25,861 - INFO - Loading train.csv from data/train.csv
2025-10-02 01:35:25,943 - INFO - Loading train_labels.csv from data/train_labels.csv
2025-10-02 01:35:26,049 - INFO - Loading target_pairs.csv from data/target_pairs.csv
2025-10-02 01:35:26,193 - INFO - X_full NaNs after imputation: 0
2025-10-02 01:35:26,193 - INFO - Y_full NaNs before imputation: 0
2025-10-02 01:35:26,196 - INFO - Y_full NaNs after imputation: 0
2025-10-02 01:35:26,196 - INFO - X_full feature count: 1671
2025-10-02 01:35:26,204 - INFO - Data loaded: X_full_np shape=(1917, 1671), Y_full_np shape=(1917, 424)


In [3]:
# Define TrainConfig with conservative settings
cfg = TrainConfig(
    input_len=32,      # Smaller context length to reduce memory usage
    output_len=1,      # One-step-ahead prediction
    batch_size=16,     # Smaller batch size for stability
    input_size=X_raw.shape[1],  # raw feature count (1671 in the logged run)
    output_size=Y_raw.shape[1],  # target count (424 in the logged run)
    nhead=4,           # Fewer attention heads
    dim_feedforward=256,  # Smaller feedforward dim
    num_layers=1,      # Fewer layers
    dropout=0.2        # Higher dropout for regularization
)
# NOTE(review): the transform below logs X_transformed with 4 more columns than
# X_raw (1675 vs 1671), while cfg.input_size is taken from X_raw. Confirm that
# whatever consumes cfg.input_size accounts for the added time features.

# Run the transform once to validate the configuration. A failure is logged and
# then re-raised: swallowing it would leave X_transformed / Y_transformed
# undefined (or stale from an earlier run) for the cells that follow.
try:
    X_transformed, Y_transformed = data_preparer.GluonTS_transform(cfg=cfg, model_type=['TSTP'])
    logging.info(f"X_transformed shape: {X_transformed.shape}, Y_transformed shape: {Y_transformed.shape}")
except Exception as e:
    logging.error(f"Error in GluonTS_transform: {str(e)}")
    raise

2025-10-02 01:35:26,227 - INFO - Manual time features added: X_transformed shape (1917, 1675), Y_transformed shape (1917, 424)
2025-10-02 01:35:26,227 - INFO - X_transformed shape: (1917, 1675), Y_transformed shape: (1917, 424)


In [4]:


# Optional: Add frequency-domain features (FFT)
# With use_fft=False the logged shape is unchanged: (1917, 1671).
# NOTE(review): the returned X_raw is not passed to GluonTS_transform below, and
# cfg.input_size was read from X_raw.shape[1] before this call. Unless
# preprocess_for_frequency also updates data_preparer's own arrays, enabling
# use_fft would not reach X_transformed — confirm before turning it on.
# NOTE(review): X_raw is overwritten from itself, so with use_fft=True a re-run
# of this cell would presumably process already-processed features — verify.
X_raw = data_preparer.preprocess_for_frequency(X_raw, use_fft=False)  # Set use_fft=True if desired
logging.info(f"X_raw shape after preprocess_for_frequency: {X_raw.shape}")

# Run the preparer's GluonTS transform for the 'TSTP' model type.
# Any failure is logged and then propagated so the cell stops here.
try:
    X_transformed, Y_transformed = data_preparer.GluonTS_transform(
        model_type=['TSTP'],
        cfg=cfg,
    )
    logging.info(
        f"X_transformed shape: {X_transformed.shape}, Y_transformed shape: {Y_transformed.shape}"
    )
except Exception as err:
    logging.error(f"GluonTS transformation failed: {str(err)}")
    raise

# Cut the transformed arrays into (input_len, output_len) windows and pull the
# windowed arrays out of the dataset. Failures are logged and re-raised.
try:
    dataset = WindowDataset(
        X_transformed,
        Y_transformed,
        input_len=cfg.input_len,
        output_len=cfg.output_len,
    )
    Xw = dataset.x
    Yw = dataset.y
    logging.info(f"Windowed shapes: Xw={Xw.shape}, Yw={Yw.shape}")
except Exception as err:
    logging.error(f"WindowDataset creation failed: {str(err)}")
    raise

# Create TensorDataset and split
from torch.utils.data import TensorDataset, DataLoader, Subset, random_split
dataset = TensorDataset(
    torch.tensor(Xw, dtype=torch.float32),
    torch.tensor(Yw, dtype=torch.float32)
)
N = len(dataset)
n_val = max(1, int(0.15 * N))   # 15% validation, at least one window
n_test = max(1, int(0.1 * N))   # 10% test, at least one window
n_train = N - n_val - n_test
if n_train < 1:
    # With too few windows the forced minimum of one val and one test window
    # leaves nothing to train on; fail with an explicit message.
    split_error = f"not enough windows to split: N={N}, val={n_val}, test={n_test}"
    logging.error(f"Dataset split failed: {split_error}")
    raise ValueError(split_error)
logging.info(f"Dataset split: train={n_train}, val={n_val}, test={n_test}")

# Chronological split: train on the earliest windows, validate on the next
# block, test on the most recent block. Consecutive sliding windows share almost
# all of their input rows, so a random split would place near-copies of the
# validation/test windows in the training set and inflate the reported scores.
# NOTE(review): assumes the windows are stored in date order (window i starts at
# row i) — confirm against WindowDataset.
train_dataset = Subset(dataset, range(0, n_train))
val_dataset = Subset(dataset, range(n_train, n_train + n_val))
test_dataset = Subset(dataset, range(n_train + n_val, N))

# Wrap each split in a DataLoader using the configured batch size.
# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(
    dataset=train_dataset,
    shuffle=True,
    batch_size=cfg.batch_size,
)
val_loader = DataLoader(
    dataset=val_dataset,
    shuffle=False,
    batch_size=cfg.batch_size,
)
test_loader = DataLoader(
    dataset=test_dataset,
    shuffle=False,
    batch_size=cfg.batch_size,
)
logging.info("DataLoaders created successfully")

2025-10-02 01:35:26,237 - INFO - X_raw shape after preprocess_for_frequency: (1917, 1671)
2025-10-02 01:35:26,243 - INFO - Manual time features added: X_transformed shape (1917, 1675), Y_transformed shape (1917, 424)
2025-10-02 01:35:26,244 - INFO - X_transformed shape: (1917, 1675), Y_transformed shape: (1917, 424)
2025-10-02 01:35:26,482 - INFO - WindowDataset created: Xw shape=(1885, 32, 1675), Yw shape=(1885, 1, 424), invalid_windows: 0
2025-10-02 01:35:26,483 - INFO - Windowed shapes: Xw=(1885, 32, 1675), Yw=(1885, 1, 424)
2025-10-02 01:35:26,618 - INFO - Dataset split: train=1415, val=282, test=188
2025-10-02 01:35:26,626 - INFO - DataLoaders created successfully
