In [20]:
# Import libraries
import os
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
import time
from datetime import datetime
warnings.filterwarnings('ignore')

# Machine learning libraries
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

# Import competition metrics
import sys
sys.path.append(str(Path.cwd().parent))
from src.metrics import calculate_competition_score, interpret_competition_score

# Set display options
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 100)

In [21]:
# Setup paths
PROJECT_ROOT = Path.cwd().parent
DATA_PROCESSED = PROJECT_ROOT / 'data' / 'processed'
MODELS_DIR = PROJECT_ROOT / 'models' / 'baseline'
RESULTS_DIR = PROJECT_ROOT / 'results'
PREDICTIONS_DIR = PROJECT_ROOT / 'models' / 'predictions'

# Create directories if they don't exist
MODELS_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
PREDICTIONS_DIR.mkdir(parents=True, exist_ok=True)

print(f"Project root: {PROJECT_ROOT}")
print(f"Processed data: {DATA_PROCESSED}")
print(f"Models directory: {MODELS_DIR}")
print(f"Results directory: {RESULTS_DIR}")
print(f"Predictions directory: {PREDICTIONS_DIR}")

Project root: /Users/fw1230/Documents/Projects/mitsui-commodity-prediction
Processed data: /Users/fw1230/Documents/Projects/mitsui-commodity-prediction/data/processed
Models directory: /Users/fw1230/Documents/Projects/mitsui-commodity-prediction/models/baseline
Results directory: /Users/fw1230/Documents/Projects/mitsui-commodity-prediction/results
Predictions directory: /Users/fw1230/Documents/Projects/mitsui-commodity-prediction/models/predictions


In [22]:
# Load processed data from notebook 02
print("Loading processed data...")

# Check if processed data exists with new structure
train_features_path = DATA_PROCESSED / 'train_features_engineered.csv'
train_labels_path = DATA_PROCESSED / 'train_labels.csv'
val_features_path = DATA_PROCESSED / 'val_features_engineered.csv'
val_labels_path = DATA_PROCESSED / 'val_labels.csv'
test_features_path = DATA_PROCESSED / 'test_features_engineered.csv'

if train_features_path.exists() and val_features_path.exists():
    # Load processed data with train/val split
    train_features_engineered = pd.read_csv(train_features_path)
    train_labels = pd.read_csv(train_labels_path)
    val_features_engineered = pd.read_csv(val_features_path)
    val_labels = pd.read_csv(val_labels_path)
    test_features_engineered = pd.read_csv(test_features_path)
    
    print("Loaded processed data with train/validation split")
    print(f"Training features: {train_features_engineered.shape}")
    print(f"Training labels: {train_labels.shape}")
    print(f"Validation features: {val_features_engineered.shape}")
    print(f"Validation labels: {val_labels.shape}")
    print(f"Test features: {test_features_engineered.shape}")
    
    # Extract log returns for training data (baseline uses only log returns)
    log_return_cols = [col for col in train_features_engineered.columns if 'log_return' in col]
    print(f"Found {len(log_return_cols)} log return columns")
    
    if len(log_return_cols) > 0:
        train_log_returns = train_features_engineered[['date_id'] + log_return_cols]
        val_log_returns = val_features_engineered[['date_id'] + log_return_cols]
        print(f"Successfully extracted log returns")
    else:
        print("No log return columns found in processed data!")
        raise ValueError("No log returns found in processed data")
    
else:
    print("Processed data not found. Please run notebook 02 first.")
    raise FileNotFoundError("Processed data not found")

print(f"\nData summary:")
print(f"Training log returns: {train_log_returns.shape}")
print(f"Validation log returns: {val_log_returns.shape}")
print(f"Training labels: {train_labels.shape}")
print(f"Validation labels: {val_labels.shape}")

# Verify we have log returns
print(f"\nLog return columns: {len([col for col in train_log_returns.columns if 'log_return' in col])}")
print(f"Sample log return columns: {[col for col in train_log_returns.columns if 'log_return' in col][:5]}")

Loading processed data...
Loaded processed data with train/validation split
Training features: (1600, 1192)
Training labels: (1600, 425)
Validation features: (90, 1192)
Validation labels: (90, 425)
Test features: (90, 1193)
Found 557 log return columns
Successfully extracted log returns

Data summary:
Training log returns: (1600, 558)
Validation log returns: (90, 558)
Training labels: (1600, 425)
Validation labels: (90, 425)

Log return columns: 557
Sample log return columns: ['LME_AH_Close_log_return', 'LME_CA_Close_log_return', 'LME_PB_Close_log_return', 'LME_ZS_Close_log_return', 'JPX_Gold_Mini_Futures_Open_log_return']


In [23]:
# Prepare data for modeling
print("Preparing data for modeling...")

# Get target columns
target_cols = [c for c in train_labels.columns if c != 'date_id']
print(f"Number of targets: {len(target_cols)}")

# Prepare feature matrix for training (log returns only)
feature_cols = [col for col in train_log_returns.columns if col != 'date_id']
X_train = train_log_returns[feature_cols]
y_train = train_labels[target_cols]

# Prepare feature matrix for validation (log returns only)
X_val = val_log_returns[feature_cols]
y_val = val_labels[target_cols]

print(f"Training feature matrix shape: {X_train.shape}")
print(f"Training target matrix shape: {y_train.shape}")
print(f"Validation feature matrix shape: {X_val.shape}")
print(f"Validation target matrix shape: {y_val.shape}")
print(f"Feature-to-sample ratio (training): {X_train.shape[1] / X_train.shape[0]:.3f}")

# Check missing values
train_missing_pct = X_train.isna().sum().sum() / (X_train.shape[0] * X_train.shape[1]) * 100
val_missing_pct = X_val.isna().sum().sum() / (X_val.shape[0] * X_val.shape[1]) * 100
print(f"Missing values in training features: {train_missing_pct:.2f}%")
print(f"Missing values in validation features: {val_missing_pct:.2f}%")

# Check target missing values
train_target_missing_pct = y_train.isna().sum().sum() / (y_train.shape[0] * y_train.shape[1]) * 100
val_target_missing_pct = y_val.isna().sum().sum() / (y_val.shape[0] * y_val.shape[1]) * 100
print(f"Missing values in training targets: {train_target_missing_pct:.2f}%")
print(f"Missing values in validation targets: {val_target_missing_pct:.2f}%")

Preparing data for modeling...
Number of targets: 424
Training feature matrix shape: (1600, 557)
Training target matrix shape: (1600, 424)
Validation feature matrix shape: (90, 557)
Validation target matrix shape: (90, 424)
Feature-to-sample ratio (training): 0.348
Missing values in training features: 7.22%
Missing values in validation features: 4.69%
Missing values in training targets: 10.51%
Missing values in validation targets: 7.25%


In [25]:
# Baseline model training (start with first 10 targets)
print("Training baseline models...")

# Start with first 10 targets for quick testing
test_targets = target_cols[:10]
print(f"Training models for targets: {test_targets}")

# Store results
models = {}
scores = {}
training_times = {}
feature_importance = {}

# Note: Using single validation set approach (no cross-validation on training data)
# This is more appropriate for time series data with proper train/validation splits

for i, target in enumerate(test_targets):
    print(f"\nTraining model {i+1}/{len(test_targets)} for {target}...")
    
    # Get target data (remove missing values)
    train_target_data = y_train[target].dropna()
    train_feature_data = X_train.loc[train_target_data.index]
    
    print(f"  Training samples: {len(train_target_data)}")
    print(f"  Features: {train_feature_data.shape[1]}")
    
    # Train model
    start_time = time.time()
    
    model = xgb.XGBRegressor(
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        random_state=42,
        n_jobs=-1,
        verbosity=0
    )
    
    # Train model on all training data (no cross-validation)
    model.fit(train_feature_data, train_target_data)
    
    training_time = time.time() - start_time
    
    # Store results
    models[target] = model
    scores[target] = 0.0  # We'll calculate this on validation set
    training_times[target] = training_time
    
    # Feature importance
    importance = model.feature_importances_
    feature_importance[target] = pd.DataFrame({
        'feature': train_feature_data.columns,
        'importance': importance
    }).sort_values('importance', ascending=False)
    
    print(f"  Training time: {training_time:.2f}s")
    print(f"  Top feature: {feature_importance[target].iloc[0]['feature']} ({feature_importance[target].iloc[0]['importance']:.4f})")

Training baseline models...
Training models for targets: ['target_0', 'target_1', 'target_2', 'target_3', 'target_4', 'target_5', 'target_6', 'target_7', 'target_8', 'target_9']

Training model 1/10 for target_0...
  Training samples: 1494
  Features: 557
  Training time: 4.77s
  Top feature: US_Stock_MPC_adj_low_log_return (0.0300)

Training model 2/10 for target_1...
  Training samples: 1456
  Features: 557
  Training time: 4.56s
  Top feature: US_Stock_MPC_adj_low_log_return (0.0254)

Training model 3/10 for target_2...
  Training samples: 1529
  Features: 557
  Training time: 4.78s
  Top feature: US_Stock_VWO_adj_low_log_return (0.0116)

Training model 4/10 for target_3...
  Training samples: 1529
  Features: 557
  Training time: 4.76s
  Top feature: US_Stock_BCS_adj_low_log_return (0.0118)

Training model 5/10 for target_4...
  Training samples: 1363
  Features: 557
  Training time: 4.66s
  Top feature: US_Stock_XOM_adj_low_log_return (0.0130)

Training model 6/10 for target_5...


In [31]:
# Performance summary
print("Baseline Model Performance Summary")
print("=" * 50)

performance_df = pd.DataFrame({
    'target': list(models.keys()),
    'training_time': list(training_times.values())
})

print(performance_df)
print(f"\nAverage training time: {performance_df['training_time'].mean():.2f}s")
print(f"Total training time: {performance_df['training_time'].sum():.2f}s")
print(f"Models trained: {len(models)}")

# Note: Performance metrics will be calculated on validation set
print("\nNote: Model performance will be evaluated on validation set using competition metric")

Baseline Model Performance Summary
     target  training_time
0  target_0       4.774156
1  target_1       4.563421
2  target_2       4.781083
3  target_3       4.759797
4  target_4       4.663609
5  target_5       4.554591
6  target_6       4.700098
7  target_7       4.647001
8  target_8       4.378689
9  target_9       4.844430

Average training time: 4.67s
Total training time: 46.67s
Models trained: 10

Note: Model performance will be evaluated on validation set using competition metric


In [32]:
# Feature importance analysis
print("Feature Importance Analysis")
print("=" * 50)

# Combine all feature importance
all_importance = pd.concat(feature_importance.values(), keys=feature_importance.keys())
all_importance = all_importance.reset_index()

# Check the actual column names
print("Actual column names:", all_importance.columns.tolist())

# Rename columns based on what we actually have
if len(all_importance.columns) == 4:
    # If we have 4 columns, it's likely: level_0, level_1, feature, importance
    all_importance.columns = ['target', 'index', 'feature', 'importance']
    # Drop the index column if it's not needed
    all_importance = all_importance.drop('index', axis=1)
elif len(all_importance.columns) == 3:
    # If we have 3 columns, it's likely: level_0, feature, importance
    all_importance.columns = ['target', 'feature', 'importance']
else:
    # Let's see what we actually have
    print("Unexpected number of columns. Actual columns:")
    print(all_importance.columns.tolist())
    print("\nFirst few rows:")
    print(all_importance.head())

# Overall feature importance (average across targets)
overall_importance = all_importance.groupby('feature')['importance'].mean().sort_values(ascending=False)

print("\nTop 20 most important features overall:")
print(overall_importance.head(20))

# Feature importance by target
print("\nTop 5 features for each target:")
for target in test_targets:
    print(f"\n{target}:")
    top_features = feature_importance[target].head(5)
    for _, row in top_features.iterrows():
        print(f"  {row['feature']}: {row['importance']:.4f}")

Feature Importance Analysis
Actual column names: ['level_0', 'level_1', 'feature', 'importance']

Top 20 most important features overall:
feature
US_Stock_MPC_adj_low_log_return       0.007539
US_Stock_NUGT_adj_close_log_return    0.005575
US_Stock_VWO_adj_low_log_return       0.005164
US_Stock_EWJ_adj_close_log_return     0.005075
US_Stock_SPYV_adj_close_log_return    0.004545
US_Stock_GDX_adj_close_log_return     0.004341
US_Stock_MPC_adj_high_log_return      0.004272
US_Stock_TRGP_adj_high_log_return     0.004153
US_Stock_IGSB_adj_open_log_return     0.003968
US_Stock_EWZ_adj_low_log_return       0.003938
US_Stock_HES_adj_low_log_return       0.003574
US_Stock_BNDX_adj_high_log_return     0.003428
US_Stock_ACWI_adj_close_log_return    0.003415
US_Stock_EWT_adj_close_log_return     0.003378
US_Stock_JNK_adj_low_log_return       0.003353
US_Stock_IEF_adj_close_log_return     0.003262
US_Stock_EEM_adj_open_log_return      0.003158
FX_AUDUSD_log_return                  0.003117
US_Stock

In [34]:
# Save models and results
print("Saving models and results...")

# Save trained models
for target in models.keys():
    model_path = MODELS_DIR / f'{target}_model.joblib'
    joblib.dump(models[target], model_path)
    print(f"Saved model: {model_path}")

# Save training summary (no performance metrics since we're not using CV)
training_summary = pd.DataFrame({
    'target': list(models.keys()),
    'training_time': list(training_times.values())
})
training_summary_path = RESULTS_DIR / 'baseline_training_summary.csv'
training_summary.to_csv(training_summary_path, index=False)
print(f"Saved training summary: {training_summary_path}")

# Save feature importance
importance_path = RESULTS_DIR / 'feature_importance_baseline.csv'
all_importance.to_csv(importance_path, index=False)
print(f"Saved feature importance: {importance_path}")

# Save overall importance summary
overall_importance_path = RESULTS_DIR / 'feature_importance_summary.csv'
overall_importance.to_frame().to_csv(overall_importance_path)
print(f"Saved overall importance: {overall_importance_path}")

print(f"\nModels and results saved successfully!")
print(f"Note: Performance evaluation will be done on validation set")

Saving models and results...
Saved model: /Users/fw1230/Documents/Projects/mitsui-commodity-prediction/models/baseline/target_0_model.joblib
Saved model: /Users/fw1230/Documents/Projects/mitsui-commodity-prediction/models/baseline/target_1_model.joblib
Saved model: /Users/fw1230/Documents/Projects/mitsui-commodity-prediction/models/baseline/target_2_model.joblib
Saved model: /Users/fw1230/Documents/Projects/mitsui-commodity-prediction/models/baseline/target_3_model.joblib
Saved model: /Users/fw1230/Documents/Projects/mitsui-commodity-prediction/models/baseline/target_4_model.joblib
Saved model: /Users/fw1230/Documents/Projects/mitsui-commodity-prediction/models/baseline/target_5_model.joblib
Saved model: /Users/fw1230/Documents/Projects/mitsui-commodity-prediction/models/baseline/target_6_model.joblib
Saved model: /Users/fw1230/Documents/Projects/mitsui-commodity-prediction/models/baseline/target_7_model.joblib
Saved model: /Users/fw1230/Documents/Projects/mitsui-commodity-prediction/m

In [29]:
# Generate predictions on validation set with competition metric
print("Generating predictions on validation set...")

# Use the already loaded validation data with log returns only
val_feature_cols = [col for col in val_log_returns.columns if col != 'date_id']
X_val = val_log_returns[val_feature_cols]
y_val = val_labels[target_cols]

print(f"Validation log returns shape: {val_log_returns.shape}")
print(f"Validation feature matrix shape: {X_val.shape}")
print(f"Validation target matrix shape: {y_val.shape}")

# Generate predictions on validation set
val_predictions = {}
val_metrics = {}

for target in models.keys():
    print(f"Generating predictions for {target}...")
    
    # Handle missing values
    val_features_clean = X_val.fillna(0)
    
    # Ensure we have the same features as training
    missing_features = set(feature_cols) - set(val_features_clean.columns)
    if missing_features:
        print(f"  Warning: Missing {len(missing_features)} features, adding zeros")
        for feature in missing_features:
            val_features_clean[feature] = 0
    
    # Ensure correct column order
    val_features_clean = val_features_clean[feature_cols]
    
    # Make predictions
    val_predictions[target] = models[target].predict(val_features_clean)
    
    # Calculate traditional metrics
    true_vals = y_val[target].dropna()
    pred_vals = val_predictions[target][:len(true_vals)]
    
    if len(true_vals) > 0:
        rmse = np.sqrt(mean_squared_error(true_vals, pred_vals))
        mae = mean_absolute_error(true_vals, pred_vals)
        r2 = r2_score(true_vals, pred_vals)
        
        val_metrics[target] = {'rmse': rmse, 'mae': mae, 'r2': r2}
        print(f"  RMSE: {rmse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}")
    else:
        print(f"  No valid target values for {target}")

# Create predictions DataFrame for competition metric
val_predictions_df = pd.DataFrame(val_predictions)
val_predictions_df['date_id'] = val_log_returns['date_id']
val_predictions_df = val_predictions_df[['date_id'] + list(val_predictions.keys())]

# Calculate competition score
print(f"\nCalculating competition score...")
competition_score = calculate_competition_score(y_val, val_predictions_df[list(models.keys())])
print(f"Competition Score: {competition_score:.4f}")
print(f"Score Interpretation: {interpret_competition_score(competition_score)}")

# Save validation predictions
val_predictions_path = RESULTS_DIR / 'baseline' / 'predictions.csv'
val_predictions_path.parent.mkdir(parents=True, exist_ok=True)
val_predictions_df.to_csv(val_predictions_path, index=False)
print(f"Saved validation predictions: {val_predictions_path}")

# Save validation metrics
val_metrics_df = pd.DataFrame(val_metrics).T
val_metrics_df.index.name = 'target'
val_metrics_df.reset_index(inplace=True)
val_metrics_path = RESULTS_DIR / 'baseline' / 'validation_metrics.csv'
val_metrics_df.to_csv(val_metrics_path, index=False)
print(f"Saved validation metrics: {val_metrics_path}")

# Save competition score
competition_score_path = RESULTS_DIR / 'baseline' / 'competition_score.txt'
with open(competition_score_path, 'w') as f:
    f.write(f"Competition Score: {competition_score:.4f}\n")
    f.write(f"Interpretation: {interpret_competition_score(competition_score)}")
print(f"Saved competition score: {competition_score_path}")

print(f"\nValidation predictions shape: {val_predictions_df.shape}")
print(f"Validation metrics calculated for {len(val_metrics)} targets")
print(f"Competition Score: {competition_score:.4f}")

Generating predictions on validation set...
Validation log returns shape: (90, 558)
Validation feature matrix shape: (90, 557)
Validation target matrix shape: (90, 424)
Generating predictions for target_0...
  RMSE: 0.0089, MAE: 0.0068, R²: -0.1727
Generating predictions for target_1...
  RMSE: 0.0132, MAE: 0.0107, R²: -0.1707
Generating predictions for target_2...
  RMSE: 0.0137, MAE: 0.0108, R²: -0.1189
Generating predictions for target_3...
  RMSE: 0.0153, MAE: 0.0120, R²: -0.0811
Generating predictions for target_4...
  RMSE: 0.0191, MAE: 0.0145, R²: -0.0704
Generating predictions for target_5...
  RMSE: 0.0245, MAE: 0.0191, R²: -0.1826
Generating predictions for target_6...
  RMSE: 0.0166, MAE: 0.0130, R²: -0.1059
Generating predictions for target_7...
  RMSE: 0.0169, MAE: 0.0127, R²: -0.0082
Generating predictions for target_8...
  RMSE: 0.0162, MAE: 0.0126, R²: -0.3046
Generating predictions for target_9...
  RMSE: 0.0154, MAE: 0.0122, R²: -0.0449

Calculating competition score.

In [35]:
# Summary and next steps
print("Baseline Model Summary")
print("=" * 50)
print(f"Models trained: {len(models)}")
print(f"Average training time: {performance_df['training_time'].mean():.2f}s")
print(f"Total training time: {performance_df['training_time'].sum():.2f}s")

print("\nTop 10 most important features:")
for i, (feature, importance) in enumerate(overall_importance.head(10).items()):
    print(f"{i+1:2d}. {feature}: {importance:.4f}")

# Load and display competition score if available
competition_score_path = RESULTS_DIR / 'baseline' / 'competition_score.txt'
if competition_score_path.exists():
    with open(competition_score_path, 'r') as f:
        score_content = f.read()
        print(f"\nCompetition Score Results:")
        print(score_content)
else:
    print(f"\nNote: Run validation prediction cell to get competition score")

print("\nNext Steps:")
print("1. Analyze competition score and feature importance patterns")
print("2. Train models for all 425 targets")
print("3. Try using all engineered features (technical indicators, lagged features, etc.)")
print("4. Experiment with different algorithms (Random Forest, Neural Networks)")
print("5. Try factor models (PCA) for dimensionality reduction")
print("6. Ensemble different approaches")
print("7. Feature selection to reduce noise")
print("8. Hyperparameter tuning for better rank correlation")

print(f"\nResults saved to: {RESULTS_DIR}")
print(f"Models saved to: {MODELS_DIR}")
print(f"Predictions saved to: {PREDICTIONS_DIR}")

print(f"\nKey Insights:")
print(f"• Using log returns only as baseline (557 features)")
print(f"• Competition metric: Rank Correlation Sharpe Ratio")
print(f"• Time series split with gaps to prevent data leakage")
print(f"• Single validation approach (no cross-validation)")

Baseline Model Summary
Models trained: 10
Average training time: 4.67s
Total training time: 46.67s

Top 10 most important features:
 1. US_Stock_MPC_adj_low_log_return: 0.0075
 2. US_Stock_NUGT_adj_close_log_return: 0.0056
 3. US_Stock_VWO_adj_low_log_return: 0.0052
 4. US_Stock_EWJ_adj_close_log_return: 0.0051
 5. US_Stock_SPYV_adj_close_log_return: 0.0045
 6. US_Stock_GDX_adj_close_log_return: 0.0043
 7. US_Stock_MPC_adj_high_log_return: 0.0043
 8. US_Stock_TRGP_adj_high_log_return: 0.0042
 9. US_Stock_IGSB_adj_open_log_return: 0.0040
10. US_Stock_EWZ_adj_low_log_return: 0.0039

Competition Score Results:
Competition Score: -0.2180
Interpretation: VERY POOR (negative or zero correlation)

Next Steps:
1. Analyze competition score and feature importance patterns
2. Train models for all 425 targets
3. Try using all engineered features (technical indicators, lagged features, etc.)
4. Experiment with different algorithms (Random Forest, Neural Networks)
5. Try factor models (PCA) for dime