In [None]:
# Import libraries
import os
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
# Suppress every warning category for the rest of the session
warnings.filterwarnings('ignore')

# Raise pandas' display limits so wide/long frames are not truncated
for option_name, option_value in (
    ('display.max_columns', 200),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

In [None]:
# Project layout, resolved relative to the parent of the current working directory
PROJECT_ROOT = Path.cwd().parent
DATA_RAW = PROJECT_ROOT.joinpath('data', 'raw')
DATA_PROCESSED = PROJECT_ROOT.joinpath('data', 'processed')

# Raw input files
TRAIN_FEATURES_PATH = DATA_RAW.joinpath('train.csv')
TRAIN_LABELS_PATH = DATA_RAW.joinpath('train_labels.csv')
TEST_PATH = DATA_RAW.joinpath('test.csv')
TARGET_PAIRS_PATH = DATA_RAW.joinpath('target_pairs.csv')

# Make sure the output directory is there before anything is written to it
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

print(f"Project root: {PROJECT_ROOT}")
print(f"Raw data: {DATA_RAW}")
print(f"Processed data: {DATA_PROCESSED}")

In [None]:
# Load data
print("Loading data...")
features = pd.read_csv(TRAIN_FEATURES_PATH, low_memory=False)
labels = pd.read_csv(TRAIN_LABELS_PATH, low_memory=False)
test = pd.read_csv(TEST_PATH, low_memory=False)
target_pairs = pd.read_csv(TARGET_PAIRS_PATH)

print(f"Features shape: {features.shape}")
print(f"Labels shape: {labels.shape}")
print(f"Test shape: {test.shape}")
print(f"Target pairs shape: {target_pairs.shape}")

# Check date ranges
print("\nDate ranges:")
print(f"Features: {features['date_id'].min()} -> {features['date_id'].max()}")
print(f"Labels: {labels['date_id'].min()} -> {labels['date_id'].max()}")
print(f"Test: {test['date_id'].min()} -> {test['date_id'].max()}")

# Create proper time series splits with gaps
# Training: First 1600 days
train_size = 1600

# Gap: Next 100 days (to prevent data leakage)
gap_size = 100

# Validation: Next 90 days
val_size = 90

# Gap: Next 20 days (to prevent data leakage)
gap2_size = 20

# Test: Last 90 days (as defined by competition)
test_size = 90

print(f"\nTime series splits with gaps:")
print(f"Training samples: {train_size} (date_id 0 -> {train_size-1})")
print(f"Gap 1: {gap_size} days (date_id {train_size} -> {train_size + gap_size - 1})")
print(f"Validation samples: {val_size} (date_id {train_size + gap_size} -> {train_size + gap_size + val_size - 1})")
print(f"Gap 2: {gap2_size} days (date_id {train_size + gap_size + val_size} -> {train_size + gap_size + val_size + gap2_size - 1})")
print(f"Test samples: {test_size} (date_id {test['date_id'].min()} -> {test['date_id'].max()})")


def _rows_for_dates(frame, first_date, last_date):
    """Return a date-ordered copy of the rows with first_date <= date_id <= last_date.

    Selecting on the date_id values (instead of row positions) keeps the split
    identical to the ranges reported above even when the file is not sorted or
    has missing days, and keeps features and labels aligned on the same dates.
    """
    in_window = frame['date_id'].between(first_date, last_date)
    # mergesort is stable, so rows that are already ordered keep their order
    return frame.loc[in_window].sort_values('date_id', kind='mergesort').copy()


# Split the data on date_id, matching the windows printed above
val_start = train_size + gap_size
val_stop = val_start + val_size - 1  # inclusive

train_features = _rows_for_dates(features, 0, train_size - 1)
train_labels = _rows_for_dates(labels, 0, train_size - 1)
val_features = _rows_for_dates(features, val_start, val_stop)
val_labels = _rows_for_dates(labels, val_start, val_stop)

# Features and labels are consumed row-by-row later on, so flag any split in
# which the two files do not cover exactly the same dates.
for split_name, split_features, split_labels in (
    ("Train", train_features, train_labels),
    ("Validation", val_features, val_labels),
):
    same_dates = np.array_equal(
        split_features['date_id'].to_numpy(), split_labels['date_id'].to_numpy()
    )
    if not same_dates:
        print(f"Warning: {split_name} features and labels do not cover the same date_ids")

print(f"\nSplit data shapes:")
print(f"Train features: {train_features.shape}")
print(f"Train labels: {train_labels.shape}")
print(f"Validation features: {val_features.shape}")
print(f"Validation labels: {val_labels.shape}")
print(f"Test features: {test.shape}")

In [None]:
# Everything except the date key is a model input (features) or output (targets)
feature_cols = [name for name in train_features.columns if name != 'date_id']
target_cols = [name for name in train_labels.columns if name != 'date_id']

print(f"Number of features: {len(feature_cols)}")
print(f"Number of targets: {len(target_cols)}")

# Bucket the feature columns by the market marker contained in their name
group_markers = {
    'JPX_Futures': 'JPX_',
    'LME_Metals': 'LME_',
    'US_Stocks': 'US_Stock_',
    'FX_Pairs': 'FX_',
}
feature_groups = {
    group: [name for name in feature_cols if marker in name]
    for group, marker in group_markers.items()
}

print("\nFeature group sizes:")
for group_name, group_cols in feature_groups.items():
    print(f"{group_name}: {len(group_cols)} features")

In [None]:
# Create log returns for all price features
print("Creating log returns for training data...")

# Identify price columns (exclude date_id and the *_missing indicator columns)
price_cols = [col for col in feature_cols if not col.endswith('_missing') and col != 'date_id']

print(f"Creating log returns for {len(price_cols)} price columns")

# Create log returns for TRAINING data only: log(value_t / value_{t-1})
train_prices = train_features[price_cols]
log_returns = np.log(train_prices / train_prices.shift(1))

# A zero or non-positive value on either day yields +/-inf (or NaN). Store those
# as NaN so they are counted as missing below and cannot leak infinities into
# the saved feature matrix.
log_returns = log_returns.replace([np.inf, -np.inf], np.nan)

# Rename columns to indicate they're returns
log_returns.columns = [f"{col}_log_return" for col in price_cols]

# Check missing values in log returns
missing_log_returns = log_returns.isna().sum().sort_values(ascending=False)
print("\nMissing values in log returns (top 20):")
print(missing_log_returns.head(20))

# Show missing value statistics (guarding against an empty frame)
total_missing = log_returns.isna().sum().sum()
total_cells = log_returns.size
missing_share = (total_missing / total_cells * 100) if total_cells else 0.0
print(f"\nTotal missing values: {total_missing}")
print(f"Missing value percentage: {missing_share:.2f}%")

In [None]:
# Technical indicator functions
def calculate_rsi(prices, window=14):
    """Calculate RSI (Relative Strength Index) using simple rolling averages.

    Args:
        prices: pandas Series of prices, in chronological order.
        window: number of consecutive price changes averaged per value.

    Returns:
        Series aligned with ``prices`` holding ``100 - 100 / (1 + RS)``, where RS
        is the rolling mean gain divided by the rolling mean loss. A value is NaN
        until ``window`` real price changes are available (i.e. for the first
        ``window`` rows), whenever a missing price falls inside the window, and
        when the window contains no price movement at all (0 / 0). A window with
        gains but no losses yields 100.
    """
    delta = prices.diff()
    # clip() keeps NaN as NaN, so the undefined first difference (and any gap in
    # the prices) is not silently counted as a "no change" observation.
    gain = delta.clip(lower=0).rolling(window=window).mean()
    loss = (-delta).clip(lower=0).rolling(window=window).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

def calculate_bollinger_bands(prices, window=20, num_std=2):
    """Calculate Bollinger Bands.

    Args:
        prices: pandas Series of prices.
        window: length of the rolling window.
        num_std: number of rolling standard deviations between the middle
            band and each outer band.

    Returns:
        Tuple ``(upper_band, middle_band, lower_band)``; the middle band is the
        rolling mean and the outer bands sit ``num_std`` rolling standard
        deviations above and below it.
    """
    roller = prices.rolling(window=window)
    middle = roller.mean()
    half_width = roller.std() * num_std
    return middle + half_width, middle, middle - half_width

def calculate_macd(prices, fast=12, slow=26, signal=9):
    """Calculate MACD (Moving Average Convergence Divergence).

    Args:
        prices: pandas Series of prices.
        fast: span of the fast exponentially weighted mean.
        slow: span of the slow exponentially weighted mean.
        signal: span of the exponentially weighted mean applied to the MACD line.

    Returns:
        Tuple ``(macd_line, signal_line, histogram)`` where the MACD line is the
        fast mean minus the slow mean and the histogram is the MACD line minus
        the signal line.
    """
    macd_line = prices.ewm(span=fast).mean() - prices.ewm(span=slow).mean()
    signal_line = macd_line.ewm(span=signal).mean()
    return macd_line, signal_line, macd_line - signal_line

In [None]:
# Build RSI, Bollinger Band and MACD columns for a handful of key instruments
print("Creating technical indicators for training data...")

technical_features = pd.DataFrame(index=train_features.index)

# Instruments that get the full indicator set
key_instruments = [
    'LME_AH_Close',
    'JPX_Gold_Standard_Futures_Close',
    'US_Stock_VT_adj_close'
]

for instrument in key_instruments:
    if instrument not in train_features.columns:
        print(f"Warning: {instrument} not found in features")
        continue

    close = train_features[instrument]  # training rows only

    upper, middle, lower = calculate_bollinger_bands(close, 20)
    macd, macd_signal, macd_hist = calculate_macd(close)

    # Insertion order below fixes the column order of the output frame
    indicator_columns = {
        'rsi_14': calculate_rsi(close, 14),
        'bb_upper': upper,
        'bb_middle': middle,
        'bb_lower': lower,
        # Where the price sits between the lower (0) and upper (1) band
        'bb_position': (close - lower) / (upper - lower),
        'macd': macd,
        'macd_signal': macd_signal,
        'macd_histogram': macd_hist,
    }
    for suffix, values in indicator_columns.items():
        technical_features[f'{instrument}_{suffix}'] = values

    print(f"Created indicators for {instrument}")

print(f"\nCreated {len(technical_features.columns)} technical indicators")

In [None]:
# Add past price levels of the key instruments as features
print("Creating lagged features for training data...")

lagged_features = pd.DataFrame(index=train_features.index)  # same rows as the training split

# How many rows back each lag column looks
lag_periods = [1, 5, 10, 20]

for instrument in key_instruments:
    if instrument not in train_features.columns:
        print(f"Warning: {instrument} not found in features")
        continue

    close = train_features[instrument]  # training rows only
    for lag in lag_periods:
        lagged_features[f'{instrument}_lag_{lag}'] = close.shift(lag)

    print(f"Created lags for {instrument}")

print(f"\nCreated {len(lagged_features.columns)} lagged features")

In [None]:
# Rolling mean / std / min / max of the key instruments
print("Creating rolling statistics for training data...")

rolling_features = pd.DataFrame(index=train_features.index)  # same rows as the training split

# Window lengths, in rows
rolling_windows = [5, 10, 20]

for instrument in key_instruments:
    if instrument not in train_features.columns:
        print(f"Warning: {instrument} not found in features")
        continue

    close = train_features[instrument]  # training rows only
    for window in rolling_windows:
        # One rolling object per window, reused for all four statistics
        roller = close.rolling(window)
        for stat in ('mean', 'std', 'min', 'max'):
            rolling_features[f'{instrument}_rolling_{stat}_{window}'] = getattr(roller, stat)()

    print(f"Created rolling stats for {instrument}")

print(f"\nCreated {len(rolling_features.columns)} rolling features")

In [None]:
# Create cross-asset features
print("Creating cross-asset features for training data...")

cross_asset_features = pd.DataFrame(index=train_features.index)  # Use training data index

# Gold-Platinum relationship
if 'JPX_Gold_Standard_Futures_Close' in train_features.columns and 'JPX_Platinum_Standard_Futures_Close' in train_features.columns:
    gold = train_features['JPX_Gold_Standard_Futures_Close']  # Use training data
    platinum = train_features['JPX_Platinum_Standard_Futures_Close']  # Use training data

    cross_asset_features['gold_platinum_spread'] = gold - platinum
    cross_asset_features['gold_platinum_ratio'] = gold / platinum
    cross_asset_features['gold_platinum_spread_rolling_mean_20'] = cross_asset_features['gold_platinum_spread'].rolling(20).mean()

    print("Created Gold-Platinum features")

# Metals index (simple average of LME metals)
lme_metals = [col for col in train_features.columns if 'LME_' in col and 'Close' in col]  # Use training data
if lme_metals:
    # Carry the last known close forward over gaps. DataFrame.ffill() replaces
    # fillna(method='ffill'), whose `method` argument is deprecated since pandas 2.1.
    metals_data = train_features[lme_metals].ffill()  # Use training data
    # Row-wise mean / standard deviation across the selected metal columns
    cross_asset_features['lme_metals_index'] = metals_data.mean(axis=1)
    cross_asset_features['lme_metals_volatility'] = metals_data.std(axis=1)

    print(f"Created LME metals index from {len(lme_metals)} instruments")

print(f"\nCreated {len(cross_asset_features.columns)} cross-asset features")

In [None]:
# Assemble the final training matrix
print("Combining all training features...")

# Raw training columns first, then each engineered block in a fixed order.
# All blocks share the training index, so a single column-wise concat lines
# them up row for row and returns a new frame.
all_features = pd.concat(
    [
        train_features,
        log_returns,
        technical_features,
        lagged_features,
        rolling_features,
        cross_asset_features,
    ],
    axis=1,
)

# Report how many columns each block contributed
print(f"Added {len(log_returns.columns)} log return features")
print(f"Added {len(technical_features.columns)} technical indicator features")
print(f"Added {len(lagged_features.columns)} lagged features")
print(f"Added {len(rolling_features.columns)} rolling features")
print(f"Added {len(cross_asset_features.columns)} cross-asset features")

print(f"\nFinal training feature matrix shape: {all_features.shape}")
print(f"Total features: {len(all_features.columns)}")

In [None]:
# Quick health check of the engineered training matrix
print("Analyzing feature quality...")

# Percentage of missing entries per column
missing_pct = all_features.isna().sum() / len(all_features) * 100
print(f"\nFeatures with >50% missing values: {(missing_pct > 50).sum()}")
print(f"Features with >80% missing values: {(missing_pct > 80).sum()}")

# Columns holding at most one distinct non-missing value carry no signal
constant_features = [
    name for name in all_features.columns
    if all_features[name].nunique() <= 1
]

print(f"\nConstant features: {len(constant_features)}")
if constant_features:
    print("Constant features:", constant_features[:10])

# Correlations on a reproducible random sample of at most 100 columns
print("\nAnalyzing feature correlations...")
sample_features = all_features.sample(n=min(100, len(all_features.columns)), axis=1, random_state=42)
correlation_matrix = sample_features.corr()

# Keep each strongly correlated pair once: only positions above the diagonal
row_idx, col_idx = np.where(np.abs(correlation_matrix) > 0.95)
high_corr_pairs = [
    (correlation_matrix.index[r], correlation_matrix.columns[c])
    for r, c in zip(row_idx, col_idx)
    if r < c
]

print(f"Highly correlated feature pairs (>0.95): {len(high_corr_pairs)}")
if high_corr_pairs:
    print("Sample high correlation pairs:", high_corr_pairs[:5])

In [None]:
# Apply feature engineering to validation data
print("Applying feature engineering to validation data...")

# Lags, rolling windows and EWMs all look backwards. Computing them on the
# validation slice alone restarts every indicator cold and leaves the first
# rows of each lag/rolling column NaN (up to 20 rows for the longest ones), so
# seed the calculations with the rows of `features` that precede the validation
# window. Only strictly earlier date_ids are used as warm-up, so nothing from
# the validation period or later is pulled backwards.
val_warmup_rows = 100  # longer than the longest look-back used below (20-row windows, span-26 EWM)
val_history = (
    features.loc[features['date_id'] < val_features['date_id'].min()]
    .sort_values('date_id', kind='mergesort')
    .tail(val_warmup_rows)
)
# Warm-up rows first, validation rows last; positions are used to trim later
val_context = pd.concat([val_history, val_features], ignore_index=True)

# Create log returns for validation data. +/-inf (zero or non-positive inputs)
# is stored as NaN so it is treated as missing rather than saved as infinity.
val_context_prices = val_context[price_cols]
val_log_returns = np.log(val_context_prices / val_context_prices.shift(1))
val_log_returns = val_log_returns.replace([np.inf, -np.inf], np.nan)
val_log_returns.columns = [f"{col}_log_return" for col in price_cols]

# Create technical indicators for validation data
val_technical_features = pd.DataFrame(index=val_context.index)
for instrument in key_instruments:
    if instrument in val_context.columns:
        prices = val_context[instrument]

        # RSI
        val_technical_features[f'{instrument}_rsi_14'] = calculate_rsi(prices, 14)

        # Bollinger Bands
        bb_upper, bb_middle, bb_lower = calculate_bollinger_bands(prices, 20)
        val_technical_features[f'{instrument}_bb_upper'] = bb_upper
        val_technical_features[f'{instrument}_bb_middle'] = bb_middle
        val_technical_features[f'{instrument}_bb_lower'] = bb_lower
        val_technical_features[f'{instrument}_bb_position'] = (prices - bb_lower) / (bb_upper - bb_lower)

        # MACD
        macd_line, signal_line, histogram = calculate_macd(prices)
        val_technical_features[f'{instrument}_macd'] = macd_line
        val_technical_features[f'{instrument}_macd_signal'] = signal_line
        val_technical_features[f'{instrument}_macd_histogram'] = histogram

# Create lagged features for validation data
val_lagged_features = pd.DataFrame(index=val_context.index)
for instrument in key_instruments:
    if instrument in val_context.columns:
        prices = val_context[instrument]
        for lag in lag_periods:
            val_lagged_features[f'{instrument}_lag_{lag}'] = prices.shift(lag)

# Create rolling features for validation data
val_rolling_features = pd.DataFrame(index=val_context.index)
for instrument in key_instruments:
    if instrument in val_context.columns:
        prices = val_context[instrument]
        for window in rolling_windows:
            roller = prices.rolling(window)
            val_rolling_features[f'{instrument}_rolling_mean_{window}'] = roller.mean()
            val_rolling_features[f'{instrument}_rolling_std_{window}'] = roller.std()
            val_rolling_features[f'{instrument}_rolling_min_{window}'] = roller.min()
            val_rolling_features[f'{instrument}_rolling_max_{window}'] = roller.max()

# Create cross-asset features for validation data
val_cross_asset_features = pd.DataFrame(index=val_context.index)
if 'JPX_Gold_Standard_Futures_Close' in val_context.columns and 'JPX_Platinum_Standard_Futures_Close' in val_context.columns:
    gold = val_context['JPX_Gold_Standard_Futures_Close']
    platinum = val_context['JPX_Platinum_Standard_Futures_Close']

    val_cross_asset_features['gold_platinum_spread'] = gold - platinum
    val_cross_asset_features['gold_platinum_ratio'] = gold / platinum
    val_cross_asset_features['gold_platinum_spread_rolling_mean_20'] = val_cross_asset_features['gold_platinum_spread'].rolling(20).mean()

lme_metals = [col for col in val_context.columns if 'LME_' in col and 'Close' in col]
if lme_metals:
    # DataFrame.ffill() replaces fillna(method='ffill'), deprecated since pandas 2.1
    metals_data = val_context[lme_metals].ffill()
    val_cross_asset_features['lme_metals_index'] = metals_data.mean(axis=1)
    val_cross_asset_features['lme_metals_volatility'] = metals_data.std(axis=1)

# Drop the warm-up rows again and restore the validation index on every block
(val_log_returns, val_technical_features, val_lagged_features,
 val_rolling_features, val_cross_asset_features) = (
    block.iloc[len(val_history):].set_axis(val_features.index, axis=0)
    for block in (val_log_returns, val_technical_features, val_lagged_features,
                  val_rolling_features, val_cross_asset_features)
)

# Combine all validation features (raw columns first, same block order as training)
val_all_features = pd.concat(
    [
        val_features,
        val_log_returns,
        val_technical_features,
        val_lagged_features,
        val_rolling_features,
        val_cross_asset_features,
    ],
    axis=1,
)

print(f"Validation features shape: {val_all_features.shape}")
# Compare the column names and their order, not just how many there are
print(f"Validation features match training: {list(val_all_features.columns) == list(all_features.columns)}")

In [None]:
# Apply feature engineering to test data
print("Applying feature engineering to test data...")

# Lags, rolling windows and EWMs all look backwards. Computing them on the
# test rows alone restarts every indicator cold and leaves the first rows of
# each lag/rolling column NaN, so seed the calculations with rows of `features`
# whose date_id is strictly earlier than the first test date. If no such rows
# exist the history is empty and the result is the same as computing on `test` alone.
test_warmup_rows = 100  # longer than the longest look-back used below (20-row windows, span-26 EWM)
test_shared_cols = [col for col in test.columns if col in features.columns]
test_history = (
    features.loc[features['date_id'] < test['date_id'].min(), test_shared_cols]
    .sort_values('date_id', kind='mergesort')
    .tail(test_warmup_rows)
)
# Warm-up rows first, test rows last. The index is rebuilt because the two
# files are read independently, so their row labels can collide.
test_context = pd.concat([test_history, test[test_shared_cols]], ignore_index=True)

# Create log returns for test data. +/-inf (zero or non-positive inputs) is
# stored as NaN so it is treated as missing rather than saved as infinity.
test_context_prices = test_context[price_cols]
test_log_returns = np.log(test_context_prices / test_context_prices.shift(1))
test_log_returns = test_log_returns.replace([np.inf, -np.inf], np.nan)
test_log_returns.columns = [f"{col}_log_return" for col in price_cols]

# Create technical indicators for test data
test_technical_features = pd.DataFrame(index=test_context.index)
for instrument in key_instruments:
    if instrument in test_context.columns:
        prices = test_context[instrument]

        # RSI
        test_technical_features[f'{instrument}_rsi_14'] = calculate_rsi(prices, 14)

        # Bollinger Bands
        bb_upper, bb_middle, bb_lower = calculate_bollinger_bands(prices, 20)
        test_technical_features[f'{instrument}_bb_upper'] = bb_upper
        test_technical_features[f'{instrument}_bb_middle'] = bb_middle
        test_technical_features[f'{instrument}_bb_lower'] = bb_lower
        test_technical_features[f'{instrument}_bb_position'] = (prices - bb_lower) / (bb_upper - bb_lower)

        # MACD
        macd_line, signal_line, histogram = calculate_macd(prices)
        test_technical_features[f'{instrument}_macd'] = macd_line
        test_technical_features[f'{instrument}_macd_signal'] = signal_line
        test_technical_features[f'{instrument}_macd_histogram'] = histogram

# Create lagged features for test data
test_lagged_features = pd.DataFrame(index=test_context.index)
for instrument in key_instruments:
    if instrument in test_context.columns:
        prices = test_context[instrument]
        for lag in lag_periods:
            test_lagged_features[f'{instrument}_lag_{lag}'] = prices.shift(lag)

# Create rolling features for test data
test_rolling_features = pd.DataFrame(index=test_context.index)
for instrument in key_instruments:
    if instrument in test_context.columns:
        prices = test_context[instrument]
        for window in rolling_windows:
            roller = prices.rolling(window)
            test_rolling_features[f'{instrument}_rolling_mean_{window}'] = roller.mean()
            test_rolling_features[f'{instrument}_rolling_std_{window}'] = roller.std()
            test_rolling_features[f'{instrument}_rolling_min_{window}'] = roller.min()
            test_rolling_features[f'{instrument}_rolling_max_{window}'] = roller.max()

# Create cross-asset features for test data
test_cross_asset_features = pd.DataFrame(index=test_context.index)
if 'JPX_Gold_Standard_Futures_Close' in test_context.columns and 'JPX_Platinum_Standard_Futures_Close' in test_context.columns:
    gold = test_context['JPX_Gold_Standard_Futures_Close']
    platinum = test_context['JPX_Platinum_Standard_Futures_Close']

    test_cross_asset_features['gold_platinum_spread'] = gold - platinum
    test_cross_asset_features['gold_platinum_ratio'] = gold / platinum
    test_cross_asset_features['gold_platinum_spread_rolling_mean_20'] = test_cross_asset_features['gold_platinum_spread'].rolling(20).mean()

lme_metals = [col for col in test_context.columns if 'LME_' in col and 'Close' in col]
if lme_metals:
    # DataFrame.ffill() replaces fillna(method='ffill'), deprecated since pandas 2.1
    metals_data = test_context[lme_metals].ffill()
    test_cross_asset_features['lme_metals_index'] = metals_data.mean(axis=1)
    test_cross_asset_features['lme_metals_volatility'] = metals_data.std(axis=1)

# Drop the warm-up rows again and restore the test index on every block
(test_log_returns, test_technical_features, test_lagged_features,
 test_rolling_features, test_cross_asset_features) = (
    block.iloc[len(test_history):].set_axis(test.index, axis=0)
    for block in (test_log_returns, test_technical_features, test_lagged_features,
                  test_rolling_features, test_cross_asset_features)
)

# Combine all test features (every raw test column first, same block order as training)
test_all_features = pd.concat(
    [
        test,
        test_log_returns,
        test_technical_features,
        test_lagged_features,
        test_rolling_features,
        test_cross_asset_features,
    ],
    axis=1,
)

print(f"Test features shape: {test_all_features.shape}")
# Compare the column names and their order, not just how many there are
print(f"Test features match training: {list(test_all_features.columns) == list(all_features.columns)}")

In [None]:
# Write every split to the processed-data directory as CSV (row index omitted)
print("Saving processed data...")

# Output locations
train_features_output_path = DATA_PROCESSED.joinpath('train_features_engineered.csv')
train_labels_output_path = DATA_PROCESSED.joinpath('train_labels.csv')
val_features_output_path = DATA_PROCESSED.joinpath('val_features_engineered.csv')
val_labels_output_path = DATA_PROCESSED.joinpath('val_labels.csv')
test_output_path = DATA_PROCESSED.joinpath('test_features_engineered.csv')

# Training split: engineered features plus untouched labels
for frame, destination in (
    (all_features, train_features_output_path),
    (train_labels, train_labels_output_path),
):
    frame.to_csv(destination, index=False)
print(f"Saved training features (engineered): {train_features_output_path}")
print(f"Saved training labels: {train_labels_output_path}")

# Validation split: engineered features plus untouched labels
for frame, destination in (
    (val_all_features, val_features_output_path),
    (val_labels, val_labels_output_path),
):
    frame.to_csv(destination, index=False)
print(f"Saved validation features (engineered): {val_features_output_path}")
print(f"Saved validation labels: {val_labels_output_path}")

# Test split: engineered features only
test_all_features.to_csv(test_output_path, index=False)
print(f"Saved test features (engineered): {test_output_path}")

# Closing summary of sizes and the configured date windows
print("\nFeature engineering complete!")
print(f"Final feature count: {len(all_features.columns)}")
print(f"Training samples: {len(all_features)}")
print(f"Validation samples: {len(val_all_features)}")
print(f"Test samples: {len(test_all_features)}")
print(f"\nTime series splits:")
print(f"Training: date_id 0 -> {train_size-1}")
print(f"Gap 1: date_id {train_size} -> {train_size + gap_size - 1}")
print(f"Validation: date_id {train_size + gap_size} -> {train_size + gap_size + val_size - 1}")
print(f"Gap 2: date_id {train_size + gap_size + val_size} -> {train_size + gap_size + val_size + gap2_size - 1}")
print(f"Test: date_id {test['date_id'].min()} -> {test['date_id'].max()}")