# Ensemble Blending Experiments

This notebook experiments with different blending strategies to improve predictions beyond individual models.

**Current Best Models:**
- LightGBM: 1892 RMSE (3rd place! 🎉)
- CatBoost: ~2023 RMSE (update after running with new features; most manual configs later in this notebook use 1982)
- XGBoost: ~2476 RMSE (update after running with new features)

**Goal:** Blend models to achieve < 1850 RMSE


## 1. Setup and Load Predictions


In [None]:
# Setup cell: make the parent directory importable, reload the project modules,
# then bind the names the rest of the notebook uses. Statement order matters here.
import sys
import os
# "../" is resolved against the current working directory. Presumably the
# `ensemble` package lives there -- TODO confirm.
# Note: re-running this cell appends the same path to sys.path again.
sys.path.append(os.path.abspath("../"))

# Reload modules to pick up latest changes
# (a module has to be imported before importlib.reload() can be called on it)
import importlib
import ensemble.blending
import ensemble.config
importlib.reload(ensemble.blending)
importlib.reload(ensemble.config)

import pandas as pd
import numpy as np
# Bound after the reloads above, so these names come from the reloaded modules.
from ensemble.blending import WeightedBlender
from ensemble.config import MODEL_CONFIGS, BEST_MODELS

print("âœ… Setup complete! Modules reloaded.")


In [None]:
# Keep only the MODEL_CONFIGS entries whose names are listed in BEST_MODELS.
best_configs = dict((name, MODEL_CONFIGS[name]) for name in BEST_MODELS)

# Shared blender used by the weighting experiments below; it is asked to load
# predictions from the ../results folder.
blender = WeightedBlender(best_configs)
blender.load_predictions(results_dir="../results")


## 2. Experiment with Different Blending Methods


In [None]:
# Method 1: the blender's 'inverse_rmse' weighting scheme (recommended)
blender.compute_weights(method='inverse_rmse')
blended_inv_rmse = blender.blend()

# Header plus the two summary lines, one item per output line.
print(
    "\nðŸ“Š Inverse RMSE Weighting:",
    f"   Predictions range: {blended_inv_rmse.min():.2f} - {blended_inv_rmse.max():.2f}",
    f"   Mean prediction: {blended_inv_rmse.mean():.2f}",
    sep="\n",
)


In [None]:
# Method 2: the blender's 'equal' weighting scheme
blender.compute_weights(method='equal')
blended_equal = blender.blend()

# Header plus the two summary lines, one item per output line.
print(
    "\nðŸ“Š Equal Weighting:",
    f"   Predictions range: {blended_equal.min():.2f} - {blended_equal.max():.2f}",
    f"   Mean prediction: {blended_equal.mean():.2f}",
    sep="\n",
)


In [None]:
# Method 3: Manual weighting (experiment with different ratios)
# Example: give more weight to LightGBM since it has the lowest RMSE.
#
# The CatBoost 'rmse' is 1982 here to agree with every other manual config in
# this notebook (the weight sweeps and the saved 90% / 95% blends).
# NOTE(review): confirm 1982 is the current CatBoost score. With
# method='manual' the blend presumably depends only on 'weight', not 'rmse'.
manual_configs = {
    'lightgbm': {'path': MODEL_CONFIGS['lightgbm']['path'], 'rmse': 1892, 'weight': 0.7},
    'catboost': {'path': MODEL_CONFIGS['catboost']['path'], 'rmse': 1982, 'weight': 0.3}
}

# A separate blender instance so the shared `blender` keeps its own configs.
blender_manual = WeightedBlender(manual_configs)
blender_manual.load_predictions(results_dir="../results")
blender_manual.compute_weights(method='manual')
blended_manual = blender_manual.blend()

print("\nðŸ“Š Manual Weighting (70% LightGBM, 30% CatBoost):")
print(f"   Predictions range: {blended_manual.min():.2f} - {blended_manual.max():.2f}")
print(f"   Mean prediction: {blended_manual.mean():.2f}")


## 2b. More Aggressive Weighting (Favor LightGBM)
Since the models' predictions are highly correlated (0.997), blending adds little diversity, so most of the weight should go to the best model.


In [None]:
# Method 4: the blender's 'inverse_rmse_squared' scheme (more aggressive, favors best model)
blender.compute_weights(method='inverse_rmse_squared')
blended_inv_rmse_sq = blender.blend()

# Header plus the two summary lines, one item per output line.
print(
    "\nðŸ“Š Inverse RMSE Squared Weighting (More Aggressive):",
    f"   Predictions range: {blended_inv_rmse_sq.min():.2f} - {blended_inv_rmse_sq.max():.2f}",
    f"   Mean prediction: {blended_inv_rmse_sq.mean():.2f}",
    sep="\n",
)


In [None]:
# Method 5: sweep the LightGBM share of a two-model manual blend
# (80%, 85%, 90%, 95%); CatBoost receives the remainder.
print("Testing different LightGBM weights:\n")
results = {}

for lgbm_weight in (0.80, 0.85, 0.90, 0.95):
    manual_configs = dict(
        lightgbm=dict(path=MODEL_CONFIGS['lightgbm']['path'], rmse=1892, weight=lgbm_weight),
        catboost=dict(path=MODEL_CONFIGS['catboost']['path'], rmse=1982, weight=1.0 - lgbm_weight),
    )

    # A fresh blender per ratio, configured with the manual weights above.
    blender_test = WeightedBlender(manual_configs)
    blender_test.load_predictions(results_dir="../results")
    blender_test.compute_weights(method='manual')
    blended_test = blender_test.blend()

    # Keyed by the float weight itself; the comparison cell reads
    # results[0.90] and results[0.95].
    results[lgbm_weight] = blended_test
    print(
        f"  {lgbm_weight*100:.0f}% LightGBM: mean={blended_test.mean():.2f}, "
        f"range=[{blended_test.min():.2f}, {blended_test.max():.2f}]"
    )


In [None]:
# Method 6: the blender's 'best_only' scheme, kept as a comparison baseline
blender.compute_weights(method='best_only')
blended_best_only = blender.blend()

# Header, the two summary lines, then the reminder note (preceded by a blank line).
print(
    "\nðŸ“Š Best Only (LightGBM):",
    f"   Predictions range: {blended_best_only.min():.2f} - {blended_best_only.max():.2f}",
    f"   Mean prediction: {blended_best_only.mean():.2f}",
    "\n   Note: This should match LightGBM predictions exactly",
    sep="\n",
)


## 3. Compare Predictions


In [None]:
# Put the individual model predictions and the blends side by side, one column each.
comparison = pd.DataFrame(dict(
    lightgbm=blender.predictions['lightgbm'],
    catboost=blender.predictions['catboost'],
    blended_inv_rmse=blended_inv_rmse,
    blended_inv_rmse_sq=blended_inv_rmse_sq,
    blended_manual_70=blended_manual,
    blended_90_lgbm=results[0.90],
    blended_95_lgbm=results[0.95],
    blended_best_only=blended_best_only,
))

# Summary statistics for a subset of the columns only.
print("ðŸ“Š Prediction Statistics (selected methods):")
print(
    comparison.loc[
        :,
        ['lightgbm', 'catboost', 'blended_inv_rmse', 'blended_inv_rmse_sq',
         'blended_90_lgbm', 'blended_95_lgbm'],
    ].describe()
)

# Correlation of every column against the LightGBM column, most similar first.
print("\nðŸ“Š Correlation with LightGBM (higher = more similar):")
corr_with_lgbm = comparison.corr().loc[:, 'lightgbm'].sort_values(ascending=False)
print(corr_with_lgbm.round(decimals=4))


## 4. Save Best Blended Predictions

**Recommendation:** Since models are highly correlated, try:
1. **90-95% LightGBM** blend (may slightly improve)
2. **Best only (LightGBM)** if blending doesn't help

The 90% LightGBM blend might capture small complementary signals from CatBoost.


In [None]:
# Save the 90% LightGBM / 10% CatBoost manual blend
# (recommended - may capture small complementary signals).
manual_configs_90 = dict(
    lightgbm=dict(path=MODEL_CONFIGS['lightgbm']['path'], rmse=1892, weight=0.90),
    catboost=dict(path=MODEL_CONFIGS['catboost']['path'], rmse=1982, weight=0.10),
)

blender_90 = WeightedBlender(manual_configs_90)
blender_90.load_predictions(results_dir="../results")
blender_90.compute_weights(method='manual')
# The value returned by save_blended is kept so its first rows can be shown below.
submission_90 = blender_90.save_blended(output_path="../results/ensemble_blended_90_lgbm.csv")

print(
    "\nâœ… 90% LightGBM blend saved!",
    "   First few predictions:",
    submission_90.head(10),
    sep="\n",
)


## 4b. Fine-tune Around 90% (Optimal Range)

Since 90% performed best, let's test values around it to find the exact sweet spot.


In [None]:
# Fine-grained sweep of the LightGBM share in 1% steps: 87% through 92%.
print("Fine-tuning blend ratio around 90%:\n")
fine_tune_results = {}

for lgbm_weight in (0.87, 0.88, 0.89, 0.90, 0.91, 0.92):
    manual_configs = dict(
        lightgbm=dict(path=MODEL_CONFIGS['lightgbm']['path'], rmse=1892, weight=lgbm_weight),
        catboost=dict(path=MODEL_CONFIGS['catboost']['path'], rmse=1982, weight=1.0 - lgbm_weight),
    )

    # A fresh blender per ratio, configured with the manual weights above.
    blender_fine = WeightedBlender(manual_configs)
    blender_fine.load_predictions(results_dir="../results")
    blender_fine.compute_weights(method='manual')
    blended_fine = blender_fine.blend()

    # Keyed by the float weight, one entry per ratio tried.
    fine_tune_results[lgbm_weight] = blended_fine
    print(
        f"  {lgbm_weight*100:.0f}% LightGBM: mean={blended_fine.mean():.2f}, "
        f"std={blended_fine.std():.2f}, range=[{blended_fine.min():.2f}, {blended_fine.max():.2f}]"
    )

# The score in the second line is a hard-coded note, not computed in this notebook.
print(
    "\nðŸ’¡ Submit these to Kaggle to find the exact optimal ratio!",
    "   Current best: 90% LightGBM = 1889.03 RMSE",
    sep="\n",
)


In [None]:
# Save a few promising candidates for submission, one CSV per LightGBM share.
for lgbm_weight in [0.88, 0.89, 0.91, 0.92]:
    manual_configs = {
        'lightgbm': {'path': MODEL_CONFIGS['lightgbm']['path'], 'rmse': 1892, 'weight': lgbm_weight},
        'catboost': {'path': MODEL_CONFIGS['catboost']['path'], 'rmse': 1982, 'weight': 1.0 - lgbm_weight}
    }

    blender_candidate = WeightedBlender(manual_configs)
    blender_candidate.load_predictions(results_dir="../results")
    blender_candidate.compute_weights(method='manual')

    # round() rather than int(): int() truncates, so a product that lands just
    # below a whole number (e.g. 0.57 * 100 == 56.99999999999999) would be
    # saved under the wrong percentage. The four weights above produce the
    # same file names either way.
    lgbm_percent = round(lgbm_weight * 100)
    blender_candidate.save_blended(
        output_path=f"../results/ensemble_blended_{lgbm_percent}_lgbm.csv"
    )

print("\nâœ… Fine-tuned blends saved! Test on Kaggle to find the optimal ratio.")


In [None]:
# Optional extra candidate: the 95% LightGBM / 5% CatBoost manual blend.
manual_configs_95 = dict(
    lightgbm=dict(path=MODEL_CONFIGS['lightgbm']['path'], rmse=1892, weight=0.95),
    catboost=dict(path=MODEL_CONFIGS['catboost']['path'], rmse=1982, weight=0.05),
)

blender_95 = WeightedBlender(manual_configs_95)
blender_95.load_predictions(results_dir="../results")
blender_95.compute_weights(method='manual')
submission_95 = blender_95.save_blended(output_path="../results/ensemble_blended_95_lgbm.csv")

print("\nâœ… 95% LightGBM blend saved!")


## 5. Compare Predictions (Baseline Blends)


In [None]:
# Individual models next to the inverse-RMSE, equal and manual blends.
# This rebinds `comparison`, replacing the frame built in the earlier comparison cell.
comparison = pd.DataFrame(dict(
    lightgbm=blender.predictions['lightgbm'],
    catboost=blender.predictions['catboost'],
    blended_inv_rmse=blended_inv_rmse,
    blended_equal=blended_equal,
    blended_manual=blended_manual,
))

print("ðŸ“Š Prediction Statistics:", comparison.describe(), sep="\n")

print("\nðŸ“Š Correlation Matrix:", comparison.corr().round(3), sep="\n")


## 6. Save Inverse-RMSE Blended Predictions


In [None]:
# Save the inverse-RMSE weighted blend.
# The weights are recomputed first because the shared `blender` was last set
# to method='best_only' earlier in the notebook.
blender.compute_weights(method='inverse_rmse')
submission = blender.save_blended(output_path="../results/ensemble_blended_inv_rmse.csv")

print(
    f"\nâœ… Submission ready! Shape: {submission.shape}",
    "   First few predictions:",
    submission.head(10),
    sep="\n",
)
