In [141]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from pathlib import Path


# Build robust path to data folder (notebooks and data are siblings)
DATA_DIR = Path.cwd().parent / 'data'
SUB_DIR = Path.cwd().parent / 'submissions'
train_path = DATA_DIR / 'train.csv'
test_path = DATA_DIR / 'test.csv'


# Load the datasets
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)  # This is for final predictions (no 'W' column)

# Display basic information about the datasets
print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"'W' column in train dataset: {'W' in train_df.columns}")
print(f"'W' column in test dataset: {'W' in test_df.columns}")

Training set shape: (1812, 51)
Test set shape: (453, 45)
'W' column in train dataset: True
'W' column in test dataset: False


In [142]:
# Create derived features for both train and test sets

# R_per_game: Runs per game
# RA_per_game: Runs allowed per game
train_df['R_per_game'] = train_df['R'] / train_df['G']
train_df['RA_per_game'] = train_df['RA'] / train_df['G']
test_df['R_per_game'] = test_df['R'] / test_df['G']
test_df['RA_per_game'] = test_df['RA'] / test_df['G']

print(f"\nCreated derived features: R_per_game, RA_per_game")
print(f"Train - R_per_game range: {train_df['R_per_game'].min():.3f} to {train_df['R_per_game'].max():.3f}")
print(f"Train - RA_per_game range: {train_df['RA_per_game'].min():.3f} to {train_df['RA_per_game'].max():.3f}")
print(f"Test - R_per_game range: {test_df['R_per_game'].min():.3f} to {test_df['R_per_game'].max():.3f}")
print(f"Test - RA_per_game range: {test_df['RA_per_game'].min():.3f} to {test_df['RA_per_game'].max():.3f}")

# Expected Wins of Season = G × (R²) / (R² + RA²)
train_df['Expected_Wins'] = train_df['G'] * (train_df['R_per_game'] ** 2) / ((train_df['R_per_game'] ** 2) + (train_df['RA_per_game'] ** 2))
test_df['Expected_Wins'] = test_df['G'] * (test_df['R_per_game'] ** 2) / ((test_df['R_per_game'] ** 2) + (test_df['RA_per_game'] ** 2))
# train_df['Expected_Wins'] = train_df['G'] * (train_df['R'] ** 2) / ((train_df['R'] ** 2) + (train_df['RA'] ** 2))
# test_df['Expected_Wins'] = test_df['G'] * (test_df['R'] ** 2) / ((test_df['R'] ** 2) + (test_df['RA'] ** 2))
print(f"\nCreated derived feature: Expected_Wins")   
print(f"Train - Expected_Wins range: {train_df['Expected_Wins'].min():.3f} to {train_df['Expected_Wins'].max():.3f}")
print(f"Test - Expected_Wins range: {test_df['Expected_Wins'].min():.3f} to {test_df['Expected_Wins'].max():.3f}")

# Times getting on base
train_df['Times_On_Base'] = train_df['H'] + train_df['BB']
test_df['Times_On_Base'] = test_df['H'] + test_df['BB']

print(f"\nCreated derived feature: Times_On_Base")
print(f"Train - Times_On_Base range: {train_df['Times_On_Base'].min():.3f} to {train_df['Times_On_Base'].max():.3f}")
print(f"Test - Times_On_Base range: {test_df['Times_On_Base'].min():.3f} to {test_df['Times_On_Base'].max():.3f}")

# BB Rate (Walk Percentage) - BB / AB + BB
train_df['BB_Rate'] = train_df['BB'] / (train_df['AB'] + train_df['BB'])
test_df['BB_Rate'] = test_df['BB'] / (test_df['AB'] + test_df['BB'])

print(f"\nCreated derived feature: BB_Rate")
print(f"Train - BB_Rate range: {train_df['BB_Rate'].min():.3f} to {train_df['BB_Rate'].max():.3f}") 
print(f"Test - BB_Rate range: {test_df['BB_Rate'].min():.3f} to {test_df['BB_Rate'].max():.3f}")

# Home Run Rate - HR / AB
train_df['HR_Rate'] = train_df['HR'] / train_df['AB']
test_df['HR_Rate'] = test_df['HR'] / test_df['AB']

print(f"\nCreated derived feature: HR_Rate")
print(f"Train - HR_Rate range: {train_df['HR_Rate'].min():.3f} to {train_df['HR_Rate'].max():.3f}")
print(f"Test - HR_Rate range: {test_df['HR_Rate'].min():.3f} to {test_df['HR_Rate'].max():.3f}")

# On-Base Percentage (OBP) - (H + BB) / (AB + BB)
train_df['OBP'] = (train_df['H'] + train_df['BB']) / (train_df['AB'] + train_df['BB'])
test_df['OBP'] = (test_df['H'] + test_df['BB']) / (test_df['AB'] + test_df['BB'])

print(f"\nCreated derived feature: OBP")
print(f"Train - OBP range: {train_df['OBP'].min():.3f} to {train_df['OBP'].max():.3f}") 
print(f"Test - OBP range: {test_df['OBP'].min():.3f} to {test_df['OBP'].max():.3f}")

# Slugging Percentage (SLG)
# Singles = H - (2B + 3B + HR)
# Total Bases = Singles + (2 * 2B) + (3 * 3B) + (4 * HR)
# SLG = Total Bases / AB
Singles_train = train_df['H'] - (train_df['2B'] + train_df['3B'] + train_df['HR'])
Total_Bases_train = Singles_train + (2 * train_df['2B']) + (3 * train_df['3B']) + (4 * train_df['HR'])
train_df['SLG'] = Total_Bases_train / train_df['AB']  

Singles_test = test_df['H'] - (test_df['2B'] + test_df['3B'] + test_df['HR'])
Total_Bases_test = Singles_test + (2 * test_df['2B']) + (3 * test_df['3B']) + (4 * test_df['HR'])
test_df['SLG'] = Total_Bases_test / test_df['AB']

print(f"\nCreated derived feature: SLG")
print(f"Train - SLG range: {train_df['SLG'].min():.3f} to {train_df['SLG'].max():.3f}") 
print(f"Test - SLG range: {test_df['SLG'].min():.3f} to {test_df['SLG'].max():.3f}")    

# Combined On-Base Plus Slugging (OPS) - OBP + SLG
train_df['OPS'] = train_df['OBP'] + train_df['SLG']
test_df['OPS'] = test_df['OBP'] + test_df['SLG']

print(f"\nCreated derived feature: OPS")
print(f"Train - OPS range: {train_df['OPS'].min():.3f} to {train_df['OPS'].max():.3f}") 
print(f"Test - OPS range: {test_df['OPS'].min():.3f} to {test_df['OPS'].max():.3f}")

# Time on Base Allowed - HA + BBA
train_df['Times_On_Base_Allowed'] = train_df['HA'] + train_df['BBA']
test_df['Times_On_Base_Allowed'] = test_df['HA'] + test_df['BBA']

print(f"\nCreated derived feature: Times_On_Base_Allowed")
print(f"Train - Times_On_Base_Allowed range: {train_df['Times_On_Base_Allowed'].min():.3f} to {train_df['Times_On_Base_Allowed'].max():.3f}")
print(f"Test - Times_On_Base_Allowed range: {test_df['Times_On_Base_Allowed'].min():.3f} to {test_df['Times_On_Base_Allowed'].max():.3f}")

# WHIP (Walks plus Hits per Inning Pitched)
# Inings Pitched = IPouts / 3
# Times_On_Base_Per_Inning = Times_On_Base_Allowed / Inings_Pitched
train_df['Innings_Pitched'] = train_df['IPouts'] / 3
train_df['WHIP'] = train_df['Times_On_Base_Allowed'] / train_df['Innings_Pitched']
test_df['Innings_Pitched'] = test_df['IPouts'] / 3
test_df['WHIP'] = test_df['Times_On_Base_Allowed'] / test_df['Innings_Pitched']

print(f"\nCreated derived feature: WHIP")
print(f"Train - WHIP range: {train_df['WHIP'].min():.3f} to {train_df['WHIP'].max():.3f}")
print(f"Test - WHIP range: {test_df['WHIP'].min():.3f} to {test_df['WHIP'].max():.3f}")

# K/9 (Strikeouts per 9 Innings) - SOA / Innings_Pitched * 9
train_df['K_per_9'] = (train_df['SOA'] / train_df['Innings_Pitched']) * 9
test_df['K_per_9'] = (test_df['SOA'] / test_df['Innings_Pitched']) * 9  

print(f"\nCreated derived feature: K_per_9")
print(f"Train - K_per_9 range: {train_df['K_per_9'].min():.3f} to {train_df['K_per_9'].max():.3f}")
print(f"Test - K_per_9 range: {test_df['K_per_9'].min():.3f} to {test_df['K_per_9'].max():.3f}")

# HR/9 (Home Runs Allowed per 9 Innings) - HRA / Innings_Pitched * 9
train_df['HR_per_9'] = (train_df['HRA'] / train_df['Innings_Pitched']) * 9
test_df['HR_per_9'] = (test_df['HRA'] / test_df['Innings_Pitched']) * 9

print(f"\nCreated derived feature: HR_per_9")
print(f"Train - HR_per_9 range: {train_df['HR_per_9'].min():.3f} to {train_df['HR_per_9'].max():.3f}")
print(f"Test - HR_per_9 range: {test_df['HR_per_9'].min():.3f} to {test_df['HR_per_9'].max():.3f}")

# Run Environment Idex (REI) - (R + RA) / G / mlb_rpg
train_df['REI'] = (train_df['R'] + train_df['RA']) / train_df['G'] / train_df['mlb_rpg']
test_df['REI'] = (test_df['R'] + test_df['RA']) / test_df['G'] / test_df['mlb_rpg']
print(f"\nCreated derived feature: REI")
print(f"Train - REI range: {train_df['REI'].min():.3f} to {train_df['REI'].max():.3f}")
print(f"Test - REI range: {test_df['REI'].min():.3f} to {test_df['REI'].max():.3f}")    

# Power Environement Index (PEI) -  (HR + HRA) / G / (mlb_rpg * avg_hr_rate)
avg_hr_rate = train_df['HR_Rate'].mean()
train_df['PEI'] = (train_df['HR'] + train_df['HRA']) / train_df['G'] / (train_df['mlb_rpg'] * avg_hr_rate)
test_df['PEI'] = (test_df['HR'] + test_df['HRA']) / test_df['G'] / (test_df['mlb_rpg'] * avg_hr_rate)
print(f"\nCreated derived feature: PEI")
print(f"Train - PEI range: {train_df['PEI'].min():.3f} to {train_df['PEI'].max():.3f}")
print(f"Test - PEI range: {test_df['PEI'].min():.3f} to {test_df['PEI'].max():.3f}") 

# Era adjusted OBP, SLG, OPS, WHIP, K_per_9, HR_per_9, BB_Rate, HR_Rate
# Historical average runs per game (RPG) for MLB
historical_avg_rpg_train = train_df['mlb_rpg'].mean()
historical_avg_rpg_test = test_df['mlb_rpg'].mean()
# historical_avg_rpg_train = 4.4
# historical_avg_rpg_test = 4.4

# Era adjusted OBP
train_df['Era_Adjusted_OBP'] = train_df['OBP'] * (historical_avg_rpg_train / train_df['mlb_rpg'])
test_df['Era_Adjusted_OBP'] = test_df['OBP'] * (historical_avg_rpg_test / test_df['mlb_rpg'])
print(f"\nCreated derived feature: Era_Adjusted_OBP")
print(f"Train - Era_Adjusted_OBP range: {train_df['Era_Adjusted_OBP'].min():.3f} to {train_df['Era_Adjusted_OBP'].max():.3f}") 
print(f"Test - Era_Adjusted_OBP range: {test_df['Era_Adjusted_OBP'].min():.3f} to {test_df['Era_Adjusted_OBP'].max():.3f}") 

# Era adjusted SLG
train_df['Era_Adjusted_SLG'] = train_df['SLG'] * (historical_avg_rpg_train / train_df['mlb_rpg'])
test_df['Era_Adjusted_SLG'] = test_df['SLG'] * (historical_avg_rpg_test / test_df['mlb_rpg'])
print(f"\nCreated derived feature: Era_Adjusted_SLG")
print(f"Train - Era_Adjusted_SLG range: {train_df['Era_Adjusted_SLG'].min():.3f} to {train_df['Era_Adjusted_SLG'].max():.3f}") 
print(f"Test - Era_Adjusted_SLG range: {test_df['Era_Adjusted_SLG'].min():.3f} to {test_df['Era_Adjusted_SLG'].max():.3f}") 

# Era adjusted OPS
train_df['Era_Adjusted_OPS'] = train_df['OPS'] * (historical_avg_rpg_train / train_df['mlb_rpg'])
test_df['Era_Adjusted_OPS'] = test_df['OPS'] * (historical_avg_rpg_test / test_df['mlb_rpg'])
print(f"\nCreated derived feature: Era_Adjusted_OPS")
print(f"Train - Era_Adjusted_OPS range: {train_df['Era_Adjusted_OPS'].min():.3f} to {train_df['Era_Adjusted_OPS'].max():.3f}") 
print(f"Test - Era_Adjusted_OPS range: {test_df['Era_Adjusted_OPS'].min():.3f} to {test_df['Era_Adjusted_OPS'].max():.3f}")

# Era adjusted WHIP
train_df['Era_Adjusted_WHIP'] = train_df['WHIP'] * (historical_avg_rpg_train / train_df['mlb_rpg'])
test_df['Era_Adjusted_WHIP'] = test_df['WHIP'] * (historical_avg_rpg_test / test_df['mlb_rpg'])
print(f"\nCreated derived feature: Era_Adjusted_WHIP")
print(f"Train - Era_Adjusted_WHIP range: {train_df['Era_Adjusted_WHIP'].min():.3f} to {train_df['Era_Adjusted_WHIP'].max():.3f}")
print(f"Test - Era_Adjusted_WHIP range: {test_df['Era_Adjusted_WHIP'].min():.3f} to {test_df['Era_Adjusted_WHIP'].max():.3f}")

# Era adjusted K_per_9
train_df['Era_Adjusted_K_per_9'] = train_df['K_per_9'] * (historical_avg_rpg_train / train_df['mlb_rpg'])
test_df['Era_Adjusted_K_per_9'] = test_df['K_per_9'] * (historical_avg_rpg_test / test_df['mlb_rpg'])
print(f"\nCreated derived feature: Era_Adjusted_K_per_9")
print(f"Train - Era_Adjusted_K_per_9 range: {train_df['Era_Adjusted_K_per_9'].min():.3f} to {train_df['Era_Adjusted_K_per_9'].max():.3f}")
print(f"Test - Era_Adjusted_K_per_9 range: {test_df['Era_Adjusted_K_per_9'].min():.3f} to {test_df['Era_Adjusted_K_per_9'].max():.3f}") 

# Era adjusted HR_per_9
train_df['Era_Adjusted_HR_per_9'] = train_df['HR_per_9'] * (historical_avg_rpg_train / train_df['mlb_rpg'])
test_df['Era_Adjusted_HR_per_9'] = test_df['HR_per_9'] * (historical_avg_rpg_test / test_df['mlb_rpg'])
print(f"\nCreated derived feature: Era_Adjusted_HR_per_9")
print(f"Train - Era_Adjusted_HR_per_9 range: {train_df['Era_Adjusted_HR_per_9'].min():.3f} to {train_df['Era_Adjusted_HR_per_9'].max():.3f}")
print(f"Test - Era_Adjusted_HR_per_9 range: {test_df['Era_Adjusted_HR_per_9'].min():.3f} to {test_df['Era_Adjusted_HR_per_9'].max():.3f}")

# Era adjusted BB_Rate
train_df['Era_Adjusted_BB_Rate'] = train_df['BB_Rate'] * (historical_avg_rpg_train / train_df['mlb_rpg'])
test_df['Era_Adjusted_BB_Rate'] = test_df['BB_Rate'] * (historical_avg_rpg_test / test_df['mlb_rpg'])
print(f"\nCreated derived feature: Era_Adjusted_BB_Rate")
print(f"Train - Era_Adjusted_BB_Rate range: {train_df['Era_Adjusted_BB_Rate'].min():.3f} to {train_df['Era_Adjusted_BB_Rate'].max():.3f}")
print(f"Test - Era_Adjusted_BB_Rate range: {test_df['Era_Adjusted_BB_Rate'].min():.3f} to {test_df['Era_Adjusted_BB_Rate'].max():.3f}") 

# Era adjusted HR_Rate
train_df['Era_Adjusted_HR_Rate'] = train_df['HR_Rate'] * (historical_avg_rpg_train / train_df['mlb_rpg'])
test_df['Era_Adjusted_HR_Rate'] = test_df['HR_Rate'] * (historical_avg_rpg_test / test_df['mlb_rpg'])
print(f"\nCreated derived feature: Era_Adjusted_HR_Rate")
print(f"Train - Era_Adjusted_HR_Rate range: {train_df['Era_Adjusted_HR_Rate'].min():.3f} to {train_df['Era_Adjusted_HR_Rate'].max():.3f}")
print(f"Test - Era_Adjusted_HR_Rate range: {test_df['Era_Adjusted_HR_Rate'].min():.3f} to {test_df['Era_Adjusted_HR_Rate'].max():.3f}")



Created derived features: R_per_game, RA_per_game
Train - R_per_game range: 2.409 to 6.884
Train - RA_per_game range: 2.458 to 7.686
Test - R_per_game range: 2.783 to 6.896
Test - RA_per_game range: 2.867 to 6.865

Created derived feature: Expected_Wins
Train - Expected_Wins range: 35.860 to 119.963
Test - Expected_Wins range: 40.352 to 107.111

Created derived feature: Times_On_Base
Train - Times_On_Base range: 1367.000 to 2415.000
Test - Times_On_Base range: 1453.000 to 2327.000

Created derived feature: BB_Rate
Train - BB_Rate range: 0.051 to 0.136
Test - BB_Rate range: 0.052 to 0.123

Created derived feature: HR_Rate
Train - HR_Rate range: 0.001 to 0.047
Test - HR_Rate range: 0.001 to 0.045

Created derived feature: OBP
Train - OBP range: 0.262 to 0.382
Test - OBP range: 0.267 to 0.382

Created derived feature: SLG
Train - SLG range: 0.274 to 0.491
Test - SLG range: 0.261 to 0.488

Created derived feature: OPS
Train - OPS range: 0.539 to 0.870
Test - OPS range: 0.530 to 0.870

Cre

In [143]:
default_features = [
    # Basic Statistics
    'G', 'R', 'AB', 'H', '2B', '3B', 'HR', 'BB', 'SO', 'SB', 'CS', 'HBP', 'SF',
    'RA', 'ER', 'ERA', 'CG', 'SHO', 'SV', 'IPouts', 'HA', 'HRA', 'BBA', 'SOA',
    'E', 'DP', 'FP', 'attendance', 'BPF', 'PPF',
    
    # Derived Features
    'Expected_Wins', 'Times_On_Base', 'Times_On_Base_Allowed', 'mlb_rpg',

    'Era_Adjusted_OBP', 'Era_Adjusted_SLG', 'Era_Adjusted_OPS', 'Era_Adjusted_WHIP',
    'Era_Adjusted_K_per_9', 'Era_Adjusted_HR_per_9', 'Era_Adjusted_BB_Rate', 'Era_Adjusted_HR_Rate',
    
    'OBP', 'SLG', 'OPS', 'WHIP', 'K_per_9', 'HR_per_9', 'BB_Rate', 'HR_Rate', 
    
    'PEI', 'REI',
    
    # Era Indicators
    'era_1', 'era_2', 'era_3', 'era_4', 'era_5', 'era_6', 'era_7', 'era_8',
    
    # Decade Indicators
    'decade_1910', 'decade_1920', 'decade_1930', 'decade_1940', 'decade_1950',
    'decade_1960', 'decade_1970', 'decade_1980', 'decade_1990', 'decade_2000', 'decade_2010'
 ]

# Filter features that exist in both training data AND test data
available_features = [col for col in default_features 
                     if col in train_df.columns and col in test_df.columns]
print(f"Number of available default features: {len(available_features)}")

# Print available features in a column
print("Available features:")
for feature in available_features:
    print(feature)


Number of available default features: 65
Available features:
G
R
AB
H
2B
3B
HR
BB
SO
SB
RA
ER
ERA
CG
SHO
SV
IPouts
HA
HRA
BBA
SOA
E
DP
FP
Expected_Wins
Times_On_Base
Times_On_Base_Allowed
mlb_rpg
Era_Adjusted_OBP
Era_Adjusted_SLG
Era_Adjusted_OPS
Era_Adjusted_WHIP
Era_Adjusted_K_per_9
Era_Adjusted_HR_per_9
Era_Adjusted_BB_Rate
Era_Adjusted_HR_Rate
OBP
SLG
OPS
WHIP
K_per_9
HR_per_9
BB_Rate
HR_Rate
PEI
REI
era_1
era_2
era_3
era_4
era_5
era_6
era_7
era_8
decade_1910
decade_1920
decade_1930
decade_1940
decade_1950
decade_1960
decade_1970
decade_1980
decade_1990
decade_2000
decade_2010


In [144]:
# Prepare training data (split the train.csv for model evaluation)
X_full = train_df[available_features]
y_full = train_df['W']

# Split training data into train/validation sets for model evaluation
X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full, test_size=0.2, random_state=42
)

# Prepare final test data for predictions (this has no target variable)
X_test_final = test_df[available_features]

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")
print(f"Final test set shape: {X_test_final.shape}")

Training set shape: (1449, 65)
Validation set shape: (363, 65)
Final test set shape: (453, 65)


In [145]:
# Remove highly correlated features
import pandas as pd
import numpy as np

def remove_correlated_features(X_train, X_test, threshold=0.95, verbose=True):
    """
    Remove highly correlated features from training and test sets.
    
    Parameters:
    - X_train: Training feature DataFrame
    - X_test: Test feature DataFrame  
    - threshold: Correlation threshold (default 0.95)
    - verbose: Print information about removed features
    
    Returns:
    - X_train_filtered, X_test_filtered: DataFrames with correlated features removed
    """
    
    # Calculate correlation matrix
    corr_matrix = X_train.corr().abs()
    
    # Find pairs of highly correlated features
    upper_tri = corr_matrix.where(
        np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    )
    
    # Find features to remove (those with correlation > threshold)
    features_to_remove = [column for column in upper_tri.columns if any(upper_tri[column] > threshold)]
    
    if verbose:
        print(f"🔍 CORRELATION ANALYSIS")
        print(f"{'='*50}")
        print(f"Correlation threshold: {threshold}")
        print(f"Original features: {X_train.shape[1]}")
        print(f"Features to remove: {len(features_to_remove)}")
        
        if features_to_remove:
            print(f"\nHighly correlated features to remove:")
            for feature in features_to_remove:
                # Find what it's correlated with
                high_corr = upper_tri[feature].dropna()
                high_corr = high_corr[high_corr > threshold]
                if len(high_corr) > 0:
                    corr_with = high_corr.index[0]
                    corr_value = high_corr.iloc[0]
                    print(f"  • {feature} (corr={corr_value:.3f} with {corr_with})")
        else:
            print(f"\n✅ No highly correlated features found above threshold {threshold}")
    
    # Remove highly correlated features from both datasets
    X_train_filtered = X_train.drop(columns=features_to_remove)
    X_test_filtered = X_test.drop(columns=features_to_remove)
    
    if verbose:
        print(f"\nFeatures after removal: {X_train_filtered.shape[1]}")
        print(f"Features removed: {len(features_to_remove)}")
        if len(features_to_remove) > 0:
            improvement = len(features_to_remove) / X_train.shape[1] * 100
            print(f"Dimensionality reduction: {improvement:.1f}%")
    
    return X_train_filtered, X_test_filtered

# Apply correlation removal to our datasets
# Store original datasets for backup
X_full_original = X_full.copy()
X_test_final_original = X_test_final.copy()

# Remove correlated features
X_full_filtered, X_test_final_filtered = remove_correlated_features(
    X_full, X_test_final, 
    threshold=0.95, 
    verbose=True
)

# Update the main datasets (so later cells use the filtered versions)
X_full = X_full_filtered
X_test_final = X_test_final_filtered

# Update available_features list to match the filtered features
available_features_filtered = list(X_full.columns)

print(f"\n📊 UPDATED DATASET INFO")
print(f"{'='*50}")
print(f"X_full shape: {X_full.shape}")
print(f"X_test_final shape: {X_test_final.shape}")
print(f"Available features updated: {len(available_features_filtered)}")

# Verify both datasets have the same features
assert list(X_full.columns) == list(X_test_final.columns), "Feature mismatch between train and test!"
print(f"✅ Feature alignment verified between train and test sets")

# Update available_features for downstream compatibility
available_features = available_features_filtered

print(f"\n🔄 Variables updated for downstream compatibility:")
print(f"  • X_full: {X_full.shape}")
print(f"  • X_test_final: {X_test_final.shape}")  
print(f"  • available_features: {len(available_features)} features")
print(f"\n💡 To disable correlation removal, simply comment out this entire cell")

🔍 CORRELATION ANALYSIS
Correlation threshold: 0.95
Original features: 65
Features to remove: 13

Highly correlated features to remove:
  • ERA (corr=0.959 with RA)
  • FP (corr=0.996 with E)
  • Era_Adjusted_K_per_9 (corr=0.953 with SOA)
  • Era_Adjusted_HR_per_9 (corr=0.981 with HRA)
  • Era_Adjusted_HR_Rate (corr=0.979 with HR)
  • OPS (corr=0.969 with SLG)
  • K_per_9 (corr=0.999 with SOA)
  • HR_per_9 (corr=0.999 with HRA)
  • BB_Rate (corr=0.982 with BB)
  • HR_Rate (corr=0.999 with HR)
  • PEI (corr=0.959 with Era_Adjusted_HR_Rate)
  • decade_1910 (corr=1.000 with era_1)
  • decade_2010 (corr=1.000 with era_8)

Features after removal: 52
Features removed: 13
Dimensionality reduction: 20.0%

📊 UPDATED DATASET INFO
X_full shape: (1812, 52)
X_test_final shape: (453, 52)
Available features updated: 52
✅ Feature alignment verified between train and test sets

🔄 Variables updated for downstream compatibility:
  • X_full: (1812, 52)
  • X_test_final: (453, 52)
  • available_features: 52

In [146]:
# Import boosting libraries
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_validate
import time
import warnings

# Silence XGBoost FutureWarnings about deprecated pandas functions
warnings.filterwarnings("ignore", category=FutureWarning, module="xgboost")

print("BOOSTING MODELS COMPARISON")
print("="*50)

# Prepare data
X = X_full
y = y_full

print(f"\nDataset shape: {X.shape}")
print(f"Features being used: {list(X.columns)}")

# Define models with GPU acceleration where available
models = {
    'XGBoost': XGBRegressor(
        n_estimators=100,  # 🔧 Reduced from 200
        max_depth=4,       # 🔧 Reduced from 6
        learning_rate=0.05, # 🔧 Reduced from 0.1
        subsample=0.7,     # 🔧 Reduced from 0.8
        colsample_bytree=0.7, # 🔧 Reduced from 0.8
        reg_alpha=1.0,     # 🔧 Added L1 regularization
        reg_lambda=1.0,    # 🔧 Added L2 regularization
        min_child_weight=3, # 🔧 Added minimum samples per leaf
        random_state=42,
        verbosity=0,
        tree_method="hist",
        device="cuda"  # 🚀 GPU acceleration
    ),
    
    'LightGBM': LGBMRegressor(
        n_estimators=80,     # 🔧 Reduced from 100 to 80
        max_depth=3,         # 🔧 Reduced from 4 to 3 (shallower trees)
        learning_rate=0.04,  # 🔧 Reduced from 0.05 to 0.04
        subsample=0.6,       # 🔧 Reduced from 0.7 to 0.6 (more aggressive sampling)
        colsample_bytree=0.6, # 🔧 Reduced from 0.7 to 0.6 (fewer features per tree)
        reg_alpha=2.0,       # 🔧 Increased from 1.0 to 2.0 (stronger L1)
        reg_lambda=2.0,      # 🔧 Increased from 1.0 to 2.0 (stronger L2)
        min_child_samples=15, # 🔧 Increased from 10 to 15 (larger min samples)
        min_child_weight=0.01, # 🔧 Added minimum child weight
        bagging_freq=1,      # 🔧 Added bagging frequency
        feature_fraction=0.6, # 🔧 Added feature fraction (same as colsample_bytree)
        random_state=42,
        verbose=-1,
        device='cuda'  # 🚀 GPU acceleration
    ),
    
    'CatBoost': CatBoostRegressor(
        iterations=100,    # 🔧 Reduced from 200
        depth=4,          # 🔧 Reduced from 6
        learning_rate=0.05, # 🔧 Reduced from 0.1
        l2_leaf_reg=3,    # 🔧 Added L2 regularization
        bagging_temperature=0.2, # 🔧 Added bagging regularization
        random_strength=0.2,     # 🔧 Added random strength
        random_seed=42,
        verbose=False,
        task_type="GPU"  # 🚀 GPU acceleration
    )
}

# Cross-validation
cv_results = {}

for name, model in models.items():
    print(f"\nTesting {name}...")
    start_time = time.time()
    
    try:
        # Get both R² and MAE
        cv_scores = cross_validate(
            model, X, y, 
            cv=5, 
            scoring=['r2', 'neg_mean_absolute_error'],
            return_train_score=True,
            verbose=0
        )
        
        end_time = time.time()
        
        cv_results[name] = {
            'test_r2': cv_scores['test_r2'].mean(),
            'test_r2_std': cv_scores['test_r2'].std(),
            'test_mae': -cv_scores['test_neg_mean_absolute_error'].mean(),
            'test_mae_std': cv_scores['test_neg_mean_absolute_error'].std(),
            'train_r2': cv_scores['train_r2'].mean(),
            'overfitting': cv_scores['train_r2'].mean() - cv_scores['test_r2'].mean(),
            'time': end_time - start_time,
            'gpu_status': '✅ GPU'
        }
        
    except Exception as e:
        print(f"   ⚠️ GPU failed, falling back to CPU: {str(e)}")
        # Fallback to CPU version
        if name == 'XGBoost':
            model.set_params(device="cpu")
        elif name == 'LightGBM':
            model.set_params(device="cpu")
        elif name == 'CatBoost':
            model.set_params(task_type="CPU")
            
        cv_scores = cross_validate(
            model, X, y, 
            cv=5, 
            scoring=['r2', 'neg_mean_absolute_error'],
            return_train_score=True,
            verbose=0
        )
        
        end_time = time.time()
        
        cv_results[name] = {
            'test_r2': cv_scores['test_r2'].mean(),
            'test_r2_std': cv_scores['test_r2'].std(),
            'test_mae': -cv_scores['test_neg_mean_absolute_error'].mean(),
            'test_mae_std': cv_scores['test_neg_mean_absolute_error'].std(),
            'train_r2': cv_scores['train_r2'].mean(),
            'overfitting': cv_scores['train_r2'].mean() - cv_scores['test_r2'].mean(),
            'time': end_time - start_time,
            'gpu_status': '⚠️ CPU fallback'
        }

print("\n" + "="*90)
print("RESULTS SUMMARY")
print("="*90)
print(f"{'Model':<22} {'Test R²':<10} {'Test MAE':<11} {'Overfitting':<13} {'Time (s)':<10}")
print("-" * 90)

for name, result in sorted(cv_results.items(), key=lambda x: x[1]['test_r2'], reverse=True):
    overfit_warning = "⚠️" if result['overfitting'] > 0.05 else "✓"
    print(f"{name:<22} {result['test_r2']:.4f}    {result['test_mae']:.2f}       "
          f"{result['overfitting']:>6.4f} {overfit_warning:<5} {result['time']:>6.1f}")

# Feature importance for best model
best_model_name = max(cv_results.keys(), key=lambda x: cv_results[x]['test_r2'])
best_model = models[best_model_name]

print(f"\n🏆 Best Model: {best_model_name}")
print(f"Training {best_model_name} on full dataset for feature importance...")

best_model.fit(X, y)

# Get feature importance
if hasattr(best_model, 'feature_importances_'):
    importance_df = pd.DataFrame({
        'feature': X.columns,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print(f"\nTop 15 Features from {best_model_name}:")
    print("-" * 40)
    for i, row in importance_df.head(15).iterrows():
        print(f"{row['feature']:>20}: {row['importance']:.4f}")


BOOSTING MODELS COMPARISON

Dataset shape: (1812, 52)
Features being used: ['G', 'R', 'AB', 'H', '2B', '3B', 'HR', 'BB', 'SO', 'SB', 'RA', 'ER', 'CG', 'SHO', 'SV', 'IPouts', 'HA', 'HRA', 'BBA', 'SOA', 'E', 'DP', 'Expected_Wins', 'Times_On_Base', 'Times_On_Base_Allowed', 'mlb_rpg', 'Era_Adjusted_OBP', 'Era_Adjusted_SLG', 'Era_Adjusted_OPS', 'Era_Adjusted_WHIP', 'Era_Adjusted_BB_Rate', 'OBP', 'SLG', 'WHIP', 'REI', 'era_1', 'era_2', 'era_3', 'era_4', 'era_5', 'era_6', 'era_7', 'era_8', 'decade_1920', 'decade_1930', 'decade_1940', 'decade_1950', 'decade_1960', 'decade_1970', 'decade_1980', 'decade_1990', 'decade_2000']

Testing XGBoost...

Testing LightGBM...

Testing CatBoost...

RESULTS SUMMARY
Model                  Test R²    Test MAE    Overfitting   Time (s)  
------------------------------------------------------------------------------------------
CatBoost               0.9120    3.07       0.0153 ✓        2.4
XGBoost                0.9081    3.16       0.0346 ✓        0.5
LightGBM

In [147]:
print("LINEAR MODELS COMPARISON")
print("="*70)

from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, HuberRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate, KFold
import time

# Prepare data
X_linear = X_full
y = y_full

print(f"Dataset shape: {X_linear.shape}")
print(f"Using {len(available_features)} engineered features\n")

# Define linear models with pipelines (include scaling)
models_linear = {
    'Linear Regression': Pipeline([
        ('scaler', StandardScaler()),
        ('model', LinearRegression())
    ]),
    
    'Ridge (alpha=0.1)': Pipeline([
        ('scaler', StandardScaler()),
        ('model', Ridge(alpha=0.1))
    ]),
    
    'Ridge (alpha=1.0)': Pipeline([
        ('scaler', StandardScaler()),
        ('model', Ridge(alpha=1.0))
    ]),
    
    'Ridge (alpha=10)': Pipeline([
        ('scaler', StandardScaler()),
        ('model', Ridge(alpha=10.0))
    ]),
    
    'Lasso (alpha=0.01)': Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso(alpha=0.01, max_iter=5000))
    ]),
    
    'Lasso (alpha=0.1)': Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso(alpha=0.1, max_iter=5000))
    ]),
    
    'Elastic Net': Pipeline([
        ('scaler', StandardScaler()),
        ('model', ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=5000))
    ]),
    
    'Huber Regressor': Pipeline([
        ('scaler', StandardScaler()),
        ('model', HuberRegressor(
            epsilon=1.35, 
            max_iter=2000,  # 🔧 Increased from 1000 to 2000
            alpha=0.0001,   # 🔧 Added regularization
            tol=1e-05       # 🔧 Adjusted tolerance
        ))
    ]),
    
    'Polynomial Ridge': Pipeline([
        ('poly', PolynomialFeatures(degree=2, include_bias=False)),
        ('scaler', StandardScaler()),
        ('model', Ridge(alpha=10.0))
    ])
}

# Cross-validation setup
cv = KFold(n_splits=5, shuffle=True, random_state=42)
results_linear = {}

# Test all models
for name, model in models_linear.items():
    print(f"Testing {name}...")
    start_time = time.time()
    
    try:
        cv_scores = cross_validate(
            model, X_linear, y,
            cv=cv,
            scoring=['r2', 'neg_mean_absolute_error'],
            return_train_score=True,
            error_score='raise'
        )
        
        end_time = time.time()
        
        results_linear[name] = {
            'test_r2': cv_scores['test_r2'].mean(),
            'test_r2_std': cv_scores['test_r2'].std(),
            'test_mae': -cv_scores['test_neg_mean_absolute_error'].mean(),
            'test_mae_std': cv_scores['test_neg_mean_absolute_error'].std(),
            'train_r2': cv_scores['train_r2'].mean(),
            'overfitting': cv_scores['train_r2'].mean() - cv_scores['test_r2'].mean(),
            'time': end_time - start_time,
            'status': 'Success'
        }
    except Exception as e:
        print(f"  ⚠️  Error: {str(e)}")
        results_linear[name] = {'status': 'Failed', 'error': str(e)}

print("\n" + "="*90)
print("RESULTS SUMMARY")
print("="*90)
print(f"{'Model':<22} {'Test R²':<10} {'Test MAE':<11} {'Overfitting':<13} {'Time (s)':<10}")
print("-" * 90)

# Sort by Test R²
successful_results = {k: v for k, v in results_linear.items() if v.get('status') == 'Success'}
sorted_results = sorted(successful_results.items(), key=lambda x: x[1]['test_r2'], reverse=True)

for name, result in sorted_results:
    overfit_warning = "⚠️" if result['overfitting'] > 0.05 else "✓"
    print(f"{name:<22} {result['test_r2']:.4f}    {result['test_mae']:.2f}       "
          f"{result['overfitting']:>6.4f} {overfit_warning:<5} {result['time']:>6.1f}")

# Identify best model
if sorted_results:
    best_model_name = sorted_results[0][0]
    best_score = sorted_results[0][1]['test_r2']
    best_mae = sorted_results[0][1]['test_mae']
    
    print(f"\n🏆 Best Linear Model: {best_model_name}")
    print(f"   CV R² = {best_score:.4f} (±{sorted_results[0][1]['test_r2_std']:.4f})")
    print(f"   CV MAE = {best_mae:.2f} wins (±{sorted_results[0][1]['test_mae_std']:.2f})")

# Feature importance for best linear model
print(f"\n📊 FEATURE IMPORTANCE ANALYSIS")
print("-" * 50)
print(f"Training {best_model_name} on full dataset for feature importance...")

# Get the best model and train it on full data
best_linear_model = models_linear[best_model_name]
best_linear_model.fit(X_linear, y)

# Handle different model types for feature importance
if 'Polynomial' in best_model_name:
    # For polynomial features, we need to get feature names from the pipeline
    print("⚠️  Note: Polynomial Ridge creates many interaction features")
    print("    Showing top 15 coefficients (may include feature interactions)")
    
    # Get the polynomial feature transformer
    poly_transformer = best_linear_model.named_steps['poly']
    feature_names_poly = poly_transformer.get_feature_names_out(X_linear.columns)
    
    # Get coefficients from the final model
    coefficients = best_linear_model.named_steps['model'].coef_
    
    # Create feature importance DataFrame
    feature_importance_df = pd.DataFrame({
        'feature': feature_names_poly,
        'coefficient': coefficients,
        'abs_coefficient': np.abs(coefficients)
    }).sort_values('abs_coefficient', ascending=False)
    
else:
    # For regular linear models, use original feature names
    coefficients = best_linear_model.named_steps['model'].coef_
    
    # Create feature importance DataFrame using absolute coefficients
    feature_importance_df = pd.DataFrame({
        'feature': X_linear.columns,
        'coefficient': coefficients,
        'abs_coefficient': np.abs(coefficients)
    }).sort_values('abs_coefficient', ascending=False)

print(f"\nTop 15 Most Important Features ({best_model_name}):")
print("-" * 60)
print(f"{'Feature':<25} {'Coefficient':<15} {'Abs Value':<12}")
print("-" * 60)

for i, row in feature_importance_df.head(15).iterrows():
    coef_sign = "+" if row['coefficient'] >= 0 else ""
    print(f"{row['feature']:<25} {coef_sign}{row['coefficient']:<14.4f} {row['abs_coefficient']:<12.4f}")

# Additional insights
print(f"\nFeature Importance Insights:")
print(f"• Positive coefficients increase wins")
print(f"• Negative coefficients decrease wins") 
print(f"• Larger absolute values = stronger influence")

# Count positive vs negative coefficients in top 15
top15_coeffs = feature_importance_df.head(15)['coefficient']
positive_count = (top15_coeffs > 0).sum()
negative_count = (top15_coeffs < 0).sum()
print(f"• Top 15 features: {positive_count} positive, {negative_count} negative coefficients")

LINEAR MODELS COMPARISON
Dataset shape: (1812, 52)
Using 52 engineered features

Testing Linear Regression...
Testing Ridge (alpha=0.1)...
Testing Ridge (alpha=1.0)...
Testing Ridge (alpha=10)...
Testing Lasso (alpha=0.01)...
Testing Lasso (alpha=0.1)...
Testing Elastic Net...
Testing Huber Regressor...
Testing Polynomial Ridge...

RESULTS SUMMARY
Model                  Test R²    Test MAE    Overfitting   Time (s)  
------------------------------------------------------------------------------------------
Lasso (alpha=0.01)     0.9312    2.72       0.0041 ✓        0.2
Ridge (alpha=10)       0.9308    2.73       0.0044 ✓        0.0
Ridge (alpha=1.0)      0.9308    2.73       0.0051 ✓        0.0
Ridge (alpha=0.1)      0.9305    2.73       0.0054 ✓        0.0
Linear Regression      0.9301    2.74       0.0059 ✓        0.1
Huber Regressor        0.9300    2.75       0.0057 ✓        0.5
Polynomial Ridge       0.9246    2.85       0.0260 ✓        0.6
Elastic Net            0.9235    2.88   

# Phase 1: Weighted Ensemble Implementation

Based on your individual model results, we'll now implement a weighted ensemble approach to combine the best performing models. This should help us break below the 3.0 MAE barrier by leveraging the strengths of different model types.

## Strategy:
- Select top performing models from both linear and boosting categories
- Use cross-validation performance to determine optimal weights
- Weight linear regression higher since it performed best on Kaggle (3.05136)
- Generate ensemble predictions for final submission

In [121]:
print("PHASE 1: WEIGHTED ENSEMBLE IMPLEMENTATION")
print("="*60)

from scipy.optimize import minimize
import numpy as np

# First, let's identify our top performing models from both categories
print("\n1. SELECTING TOP MODELS")
print("-" * 40)

# Top 3 linear models (based on CV R²)
top_linear_models = dict(sorted(successful_results.items(), key=lambda x: x[1]['test_r2'], reverse=True)[:3])
print("Top Linear Models (by CV R²):")
for name, result in top_linear_models.items():
    print(f"  {name}: R² = {result['test_r2']:.4f}, MAE = {result['test_mae']:.2f}")

# Top 2 boosting models (from previous results)
top_boosting_models = dict(sorted(cv_results.items(), key=lambda x: x[1]['test_r2'], reverse=True)[:2])
print("\nTop Boosting Models (by CV R²):")
for name, result in top_boosting_models.items():
    print(f"  {name}: R² = {result['test_r2']:.4f}, MAE = {result['test_mae']:.2f}")

# Select our ensemble candidates (top performers from each category)
ensemble_models = {}

# Add top 2 linear models
linear_names = list(top_linear_models.keys())[:2]
for name in linear_names:
    ensemble_models[name] = models_linear[name]

# Add top 1 boosting model
boosting_name = list(top_boosting_models.keys())[0]
ensemble_models[boosting_name] = models[boosting_name]

print(f"\n2. ENSEMBLE COMPOSITION")
print("-" * 40)
print(f"Selected {len(ensemble_models)} models for ensemble:")
for name in ensemble_models.keys():
    print(f"  ✓ {name}")
    
print(f"\nTotal ensemble models: {len(ensemble_models)}")

# Store performance metrics for weight calculation
model_performance = {}
for name in ensemble_models.keys():
    if name in successful_results:  # Linear model
        model_performance[name] = {
            'mae': successful_results[name]['test_mae'],
            'r2': successful_results[name]['test_r2']
        }
    else:  # Boosting model
        model_performance[name] = {
            'mae': cv_results[name]['test_mae'], 
            'r2': cv_results[name]['test_r2']
        }

print(f"\n3. MODEL PERFORMANCE SUMMARY")
print("-" * 40)
for name, perf in model_performance.items():
    print(f"{name}: MAE = {perf['mae']:.2f}, R² = {perf['r2']:.4f}")

PHASE 1: WEIGHTED ENSEMBLE IMPLEMENTATION

1. SELECTING TOP MODELS
----------------------------------------
Top Linear Models (by CV R²):
  Lasso (alpha=0.01): R² = 0.9312, MAE = 2.72
  Ridge (alpha=10): R² = 0.9308, MAE = 2.73
  Ridge (alpha=1.0): R² = 0.9308, MAE = 2.73

Top Boosting Models (by CV R²):
  CatBoost: R² = 0.9120, MAE = 3.07
  XGBoost: R² = 0.9081, MAE = 3.16

2. ENSEMBLE COMPOSITION
----------------------------------------
Selected 3 models for ensemble:
  ✓ Lasso (alpha=0.01)
  ✓ Ridge (alpha=10)
  ✓ CatBoost

Total ensemble models: 3

3. MODEL PERFORMANCE SUMMARY
----------------------------------------
Lasso (alpha=0.01): MAE = 2.72, R² = 0.9312
Ridge (alpha=10): MAE = 2.73, R² = 0.9308
CatBoost: MAE = 3.07, R² = 0.9120


In [122]:
# Generate out-of-fold predictions for weight optimization
print("\n4. GENERATING OUT-OF-FOLD PREDICTIONS")
print("-" * 40)

from sklearn.model_selection import cross_val_predict

# Generate OOF predictions for each model
oof_predictions = {}
model_names = list(ensemble_models.keys())

for name, model in ensemble_models.items():
    print(f"Generating OOF predictions for {name}...")
    
    # Use the same CV strategy as before
    oof_pred = cross_val_predict(model, X_full, y_full, cv=cv, method='predict')
    oof_predictions[name] = oof_pred
    
    # Calculate OOF MAE
    oof_mae = mean_absolute_error(y_full, oof_pred)
    print(f"  OOF MAE: {oof_mae:.3f}")

# Create OOF prediction matrix
oof_matrix = np.column_stack([oof_predictions[name] for name in model_names])
print(f"\nOOF prediction matrix shape: {oof_matrix.shape}")

print("\n5. OPTIMIZING ENSEMBLE WEIGHTS")
print("-" * 40)

def ensemble_mae_objective(weights, predictions, targets):
    """Objective function to minimize: weighted ensemble MAE"""
    weights = np.array(weights)
    weights = weights / weights.sum()  # Normalize to sum to 1
    ensemble_pred = np.dot(predictions, weights)
    return mean_absolute_error(targets, ensemble_pred)

# Initial weights based on inverse MAE (better models get higher weights)
initial_weights = []
for name in model_names:
    mae = model_performance[name]['mae']
    # Inverse weight: lower MAE = higher weight
    weight = 1.0 / mae if mae > 0 else 1.0
    initial_weights.append(weight)

# Normalize initial weights
initial_weights = np.array(initial_weights)
initial_weights = initial_weights / initial_weights.sum()

print("Initial weights (based on inverse MAE):")
for i, name in enumerate(model_names):
    print(f"  {name}: {initial_weights[i]:.3f}")

# Constraint: weights must sum to 1
constraints = ({'type': 'eq', 'fun': lambda w: np.sum(w) - 1.0})

# Bounds: each weight between 0 and 1
bounds = [(0.0, 1.0) for _ in range(len(model_names))]

# Optimize weights
result = minimize(
    ensemble_mae_objective,
    initial_weights,
    args=(oof_matrix, y_full),
    method='SLSQP',
    bounds=bounds,
    constraints=constraints
)

optimal_weights = result.x
optimal_mae = result.fun

print(f"\nOptimization successful: {result.success}")
print(f"Optimal ensemble OOF MAE: {optimal_mae:.4f}")
print("\nOptimal weights:")
for i, name in enumerate(model_names):
    print(f"  {name}: {optimal_weights[i]:.3f}")

# Calculate improvement over best individual model
best_individual_mae = min([model_performance[name]['mae'] for name in model_names])
improvement = best_individual_mae - optimal_mae
print(f"\nImprovement over best individual model:")
print(f"  Best individual MAE: {best_individual_mae:.4f}")
print(f"  Ensemble MAE: {optimal_mae:.4f}")
print(f"  Improvement: {improvement:.4f} ({improvement/best_individual_mae*100:.2f}%)")


4. GENERATING OUT-OF-FOLD PREDICTIONS
----------------------------------------
Generating OOF predictions for Lasso (alpha=0.01)...
  OOF MAE: 2.723
Generating OOF predictions for Ridge (alpha=10)...
  OOF MAE: 2.730
Generating OOF predictions for CatBoost...
  OOF MAE: 2.723
Generating OOF predictions for Ridge (alpha=10)...
  OOF MAE: 2.730
Generating OOF predictions for CatBoost...
  OOF MAE: 3.093

OOF prediction matrix shape: (1812, 3)

5. OPTIMIZING ENSEMBLE WEIGHTS
----------------------------------------
Initial weights (based on inverse MAE):
  Lasso (alpha=0.01): 0.347
  Ridge (alpha=10): 0.346
  CatBoost: 0.307

Optimization successful: True
Optimal ensemble OOF MAE: 2.7231

Optimal weights:
  Lasso (alpha=0.01): 0.952
  Ridge (alpha=10): 0.014
  CatBoost: 0.033

Improvement over best individual model:
  Best individual MAE: 2.7234
  Ensemble MAE: 2.7231
  Improvement: 0.0002 (0.01%)
  OOF MAE: 3.093

OOF prediction matrix shape: (1812, 3)

5. OPTIMIZING ENSEMBLE WEIGHTS
--

In [123]:
# Train final models and generate test predictions
print("\n6. TRAINING FINAL ENSEMBLE MODELS")
print("-" * 40)

# Train each model on the full training dataset
final_models = {}
test_predictions = {}

for name, model in ensemble_models.items():
    print(f"Training {name} on full dataset...")
    
    # Clone and train the model
    final_model = model  # Pipeline already configured
    final_model.fit(X_full, y_full)
    final_models[name] = final_model
    
    # Generate test predictions
    test_pred = final_model.predict(X_test_final)
    test_predictions[name] = test_pred
    
    print(f"  Test predictions range: {test_pred.min():.2f} to {test_pred.max():.2f}")

print(f"\nAll {len(final_models)} models trained successfully!")

# Create test prediction matrix
test_matrix = np.column_stack([test_predictions[name] for name in model_names])
print(f"Test prediction matrix shape: {test_matrix.shape}")

print("\n7. GENERATING ENSEMBLE PREDICTIONS")
print("-" * 40)

# Generate weighted ensemble predictions
ensemble_test_pred = np.dot(test_matrix, optimal_weights)

print(f"Ensemble test predictions:")
print(f"  Range: {ensemble_test_pred.min():.2f} to {ensemble_test_pred.max():.2f}")
print(f"  Mean: {ensemble_test_pred.mean():.2f}")
print(f"  Std: {ensemble_test_pred.std():.2f}")

# Compare with individual model predictions
print(f"\nComparison with individual models:")
for i, name in enumerate(model_names):
    individual_pred = test_predictions[name]
    weight = optimal_weights[i]
    print(f"  {name} (weight={weight:.3f}): mean={individual_pred.mean():.2f}, std={individual_pred.std():.2f}")

print(f"\n8. CREATING SUBMISSION FILE")
print("-" * 40)

# Create submission DataFrame
submission_df = pd.DataFrame({
    'ID': test_df['ID'],  # Use the actual ID column from test.csv
    'W': ensemble_test_pred
})

# Generate timestamp for unique filename
from datetime import datetime
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
submission_filename = f"submission_weighted_ensemble_{timestamp}.csv"
submission_path = SUB_DIR / submission_filename

# Save submission
submission_df.to_csv(submission_path, index=False)

print(f"✅ Submission saved: {submission_filename}")
print(f"📁 Path: {submission_path}")
print(f"📊 Predictions shape: {submission_df.shape}")

# Display first few predictions
print(f"\nFirst 10 predictions:")
print(submission_df.head(10))

print(f"\n9. ENSEMBLE SUMMARY")
print("-" * 40)
print(f"Ensemble Composition:")
for i, name in enumerate(model_names):
    print(f"  {name}: {optimal_weights[i]:.1%}")
print(f"\nExpected Performance:")
print(f"  Cross-validation MAE: {optimal_mae:.4f}")
print(f"  Expected Kaggle score: ~{optimal_mae:.2f}")
print(f"  Improvement vs best individual: {improvement:.4f}")




6. TRAINING FINAL ENSEMBLE MODELS
----------------------------------------
Training Lasso (alpha=0.01) on full dataset...
  Test predictions range: 44.85 to 109.52
Training Ridge (alpha=10) on full dataset...
  Test predictions range: 44.55 to 109.03
Training CatBoost on full dataset...
  Test predictions range: 47.82 to 101.78

All 3 models trained successfully!
Test prediction matrix shape: (453, 3)

7. GENERATING ENSEMBLE PREDICTIONS
----------------------------------------
Ensemble test predictions:
  Range: 44.95 to 109.25
  Mean: 79.08
  Std: 12.03

Comparison with individual models:
  Lasso (alpha=0.01) (weight=0.952): mean=79.08, std=12.05
  Ridge (alpha=10) (weight=0.014): mean=79.11, std=12.01
  CatBoost (weight=0.033): mean=79.07, std=11.77

8. CREATING SUBMISSION FILE
----------------------------------------
✅ Submission saved: submission_weighted_ensemble_20251004_170139.csv
📁 Path: /home/chrisfkh/sctp-ds-ai/mod3/kaggle_moneyball/submissions/submission_weighted_ensemble_2

# Stacked Ensemble Implementation

The weighted ensemble scored 3.04775 on Kaggle vs 2.7190 CV, indicating distribution mismatch between CV and test set. 

## Stacking Strategy:
- **Expand base models**: Include more diverse models (XGBoost, LightGBM, different regularization strengths)
- **Two-level stacking**: Level 1 base models → Level 2 meta-learner 
- **Robust meta-learner**: Use Ridge regression to combine predictions and learn complex relationships
- **Better generalization**: Out-of-fold training prevents overfitting to specific data splits

In [124]:
print("STACKED ENSEMBLE IMPLEMENTATION")
print("="*50)

from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
import warnings
warnings.filterwarnings("ignore")

print(f"\n1. EXPANDING BASE MODEL DIVERSITY") 
print("-" * 40)

# Create more diverse base models for better generalization
stacking_models = {
    # Linear models with different regularization strengths
    'Ridge_weak': Pipeline([
        ('scaler', StandardScaler()),
        ('model', Ridge(alpha=0.1))  # Less regularization
    ]),
    
    'Ridge_strong': Pipeline([
        ('scaler', StandardScaler()), 
        ('model', Ridge(alpha=10.0))  # More regularization
    ]),
    
    'Lasso_weak': Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso(alpha=0.01, max_iter=5000))
    ]),
    
    'Lasso_strong': Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso(alpha=0.1, max_iter=5000))
    ]),
    
    'ElasticNet': Pipeline([
        ('scaler', StandardScaler()),
        ('model', ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=5000))
    ]),
    
    # Tree-based models
    'XGBoost_conservative': XGBRegressor(
        n_estimators=150,
        max_depth=4,  # Shallower for better generalization
        learning_rate=0.08,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbosity=0
    ),
    
    'LightGBM_conservative': LGBMRegressor(
        n_estimators=150,
        max_depth=4,
        learning_rate=0.08,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        verbose=-1
    ),
    
    'CatBoost_conservative': CatBoostRegressor(
        iterations=150,
        depth=4,
        learning_rate=0.08,
        random_seed=42,
        verbose=False
    )
}

print(f"Base models for stacking: {len(stacking_models)}")
for name in stacking_models.keys():
    print(f"  ✓ {name}")

print(f"\n2. IMPLEMENTING STACKED ENSEMBLE")
print("-" * 40)

# Use the same CV folds for all models to ensure consistency  
stacking_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Level 1: Generate out-of-fold predictions from base models
print("Generating Level 1 out-of-fold predictions...")

level1_oof_preds = np.zeros((len(X_full), len(stacking_models)))
level1_test_preds = np.zeros((len(X_test_final), len(stacking_models)))

model_names_stack = list(stacking_models.keys())

for i, (name, model) in enumerate(stacking_models.items()):
    print(f"  Processing {name}...")
    
    # Generate OOF predictions
    oof_pred = cross_val_predict(model, X_full, y_full, cv=stacking_cv, method='predict')
    level1_oof_preds[:, i] = oof_pred
    
    # Train on full dataset and predict test set
    model_clone = clone(model)
    model_clone.fit(X_full, y_full)
    test_pred = model_clone.predict(X_test_final)
    level1_test_preds[:, i] = test_pred
    
    # Calculate individual model OOF MAE
    oof_mae = mean_absolute_error(y_full, oof_pred)
    print(f"    OOF MAE: {oof_mae:.4f}")

print(f"\nLevel 1 OOF predictions shape: {level1_oof_preds.shape}")
print(f"Level 1 test predictions shape: {level1_test_preds.shape}")

STACKED ENSEMBLE IMPLEMENTATION

1. EXPANDING BASE MODEL DIVERSITY
----------------------------------------
Base models for stacking: 8
  ✓ Ridge_weak
  ✓ Ridge_strong
  ✓ Lasso_weak
  ✓ Lasso_strong
  ✓ ElasticNet
  ✓ XGBoost_conservative
  ✓ LightGBM_conservative
  ✓ CatBoost_conservative

2. IMPLEMENTING STACKED ENSEMBLE
----------------------------------------
Generating Level 1 out-of-fold predictions...
  Processing Ridge_weak...
    OOF MAE: 2.7347
  Processing Ridge_strong...
    OOF MAE: 2.7302
  Processing Lasso_weak...
    OOF MAE: 2.7234
  Processing Lasso_strong...
    OOF MAE: 2.8830
  Processing ElasticNet...
    OOF MAE: 2.8773
  Processing XGBoost_conservative...
    OOF MAE: 2.7234
  Processing Lasso_strong...
    OOF MAE: 2.8830
  Processing ElasticNet...
    OOF MAE: 2.8773
  Processing XGBoost_conservative...
    OOF MAE: 3.1026
  Processing LightGBM_conservative...
    OOF MAE: 3.0608
  Processing CatBoost_conservative...
    OOF MAE: 3.1026
  Processing LightGBM_

In [125]:
# Level 2: Train meta-learner
print(f"\n3. TRAINING LEVEL 2 META-LEARNER")
print("-" * 40)

# Import the missing function
from sklearn.model_selection import cross_val_score

# Try multiple meta-learners to find the best one
meta_learners = {
    'Ridge_meta_weak': Ridge(alpha=0.1),
    'Ridge_meta_medium': Ridge(alpha=1.0), 
    'Ridge_meta_strong': Ridge(alpha=10.0),
    'Lasso_meta': Lasso(alpha=0.01, max_iter=5000),
    'ElasticNet_meta': ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=5000)
}

best_meta_mae = float('inf')
best_meta_name = None
best_meta_model = None

print("Evaluating meta-learners:")
for name, meta_model in meta_learners.items():
    # Cross-validate the meta-learner on OOF predictions
    meta_cv_scores = cross_val_score(
        meta_model, level1_oof_preds, y_full,
        cv=5, scoring='neg_mean_absolute_error'
    )
    meta_mae = -meta_cv_scores.mean()
    meta_mae_std = meta_cv_scores.std()
    
    print(f"  {name}: MAE = {meta_mae:.4f} (±{meta_mae_std:.4f})")
    
    if meta_mae < best_meta_mae:
        best_meta_mae = meta_mae
        best_meta_name = name
        best_meta_model = meta_model

print(f"\nBest meta-learner: {best_meta_name}")
print(f"Best meta-learner CV MAE: {best_meta_mae:.4f}")

# Train the best meta-learner on all OOF predictions
print(f"\n4. TRAINING FINAL STACKED MODEL")
print("-" * 40)

final_meta_model = clone(best_meta_model)
final_meta_model.fit(level1_oof_preds, y_full)

# Generate final stacked predictions
stacked_test_pred = final_meta_model.predict(level1_test_preds)

print(f"Stacked ensemble test predictions:")
print(f"  Range: {stacked_test_pred.min():.2f} to {stacked_test_pred.max():.2f}")
print(f"  Mean: {stacked_test_pred.mean():.2f}")
print(f"  Std: {stacked_test_pred.std():.2f}")

# Compare with Phase 1 ensemble
print(f"\n5. COMPARISON WITH PHASE 1")
print("-" * 40)
print(f"Phase 1 ensemble predictions:")
print(f"  Range: {ensemble_test_pred.min():.2f} to {ensemble_test_pred.max():.2f}")
print(f"  Mean: {ensemble_test_pred.mean():.2f}")
print(f"  Std: {ensemble_test_pred.std():.2f}")

print(f"\nPhase 2 stacked predictions:")
print(f"  Range: {stacked_test_pred.min():.2f} to {stacked_test_pred.max():.2f}")  
print(f"  Mean: {stacked_test_pred.mean():.2f}")
print(f"  Std: {stacked_test_pred.std():.2f}")

# Calculate correlation between Phase 1 and Phase 2 predictions
correlation = np.corrcoef(ensemble_test_pred, stacked_test_pred)[0, 1]
print(f"\nCorrelation between Phase 1 and Phase 2: {correlation:.4f}")

print(f"\nPhase 2 (stacked ensemble):")
print(f"  CV MAE: {best_meta_mae:.4f}")
improvement_vs_phase1 = optimal_mae - best_meta_mae
print(f"  Expected improvement vs Phase 1: {improvement_vs_phase1:.4f}")

if best_meta_mae < optimal_mae:
    print(f"  ✅ Phase 2 shows improvement over Phase 1!")
else:
    print(f"  ⚠️  Phase 2 CV did not improve Phase 1")
    


3. TRAINING LEVEL 2 META-LEARNER
----------------------------------------
Evaluating meta-learners:
  Ridge_meta_weak: MAE = 2.7300 (±0.0549)
  Ridge_meta_medium: MAE = 2.7299 (±0.0546)
  Ridge_meta_strong: MAE = 2.7293 (±0.0523)
  Lasso_meta: MAE = 2.7305 (±0.0480)
  ElasticNet_meta: MAE = 2.7313 (±0.0491)

Best meta-learner: Ridge_meta_strong
Best meta-learner CV MAE: 2.7293

4. TRAINING FINAL STACKED MODEL
----------------------------------------
Stacked ensemble test predictions:
  Range: 45.03 to 109.12
  Mean: 79.04
  Std: 12.08

5. COMPARISON WITH PHASE 1
----------------------------------------
Phase 1 ensemble predictions:
  Range: 44.95 to 109.25
  Mean: 79.08
  Std: 12.03

Phase 2 stacked predictions:
  Range: 45.03 to 109.12
  Mean: 79.04
  Std: 12.08

Correlation between Phase 1 and Phase 2: 0.9999

Phase 2 (stacked ensemble):
  CV MAE: 2.7293
  Expected improvement vs Phase 1: -0.0062
  ⚠️  Phase 2 CV did not improve Phase 1


In [126]:
# Create stacked ensemble submission
print(f"\n7. CREATING STACKED ENSEMBLE SUBMISSION")
print("-" * 40)

# Create submission DataFrame for stacked ensemble
stacked_submission_df = pd.DataFrame({
    'ID': test_df['ID'],
    'W': stacked_test_pred
})

# Generate timestamp
timestamp_stacked = datetime.now().strftime("%Y%m%d_%H%M%S")
stacked_submission_filename = f"submission_stacked_ensemble_{timestamp_stacked}.csv"
stacked_submission_path = SUB_DIR / stacked_submission_filename

# Save submission
stacked_submission_df.to_csv(stacked_submission_path, index=False)

print(f"✅ Stacked ensemble submission saved: {stacked_submission_filename}")
print(f"📁 Path: {stacked_submission_path}")
print(f"📊 Predictions shape: {stacked_submission_df.shape}")

# Display first few predictions
print(f"\nFirst 10 predictions:")
print(stacked_submission_df.head(10))

# Final summary
print(f"\n8. STACKED ENSEMBLE SUMMARY")
print("-" * 40)
print(f"Base Models: {len(stacking_models)}")
for name in model_names_stack:
    print(f"  • {name}")

print(f"\nMeta-learner: {best_meta_name}")
print(f"Expected CV MAE: {best_meta_mae:.4f}")



7. CREATING STACKED ENSEMBLE SUBMISSION
----------------------------------------
✅ Stacked ensemble submission saved: submission_stacked_ensemble_20251004_170141.csv
📁 Path: /home/chrisfkh/sctp-ds-ai/mod3/kaggle_moneyball/submissions/submission_stacked_ensemble_20251004_170141.csv
📊 Predictions shape: (453, 2)

First 10 predictions:
     ID          W
0  1756  69.210882
1  1282  74.421222
2   351  84.214536
3   421  87.177143
4    57  93.234779
5  1557  97.552946
6   846  79.052509
7  1658  83.860262
8   112  73.189547
9  2075  83.637957

8. STACKED ENSEMBLE SUMMARY
----------------------------------------
Base Models: 8
  • Ridge_weak
  • Ridge_strong
  • Lasso_weak
  • Lasso_strong
  • ElasticNet
  • XGBoost_conservative
  • LightGBM_conservative
  • CatBoost_conservative

Meta-learner: Ridge_meta_strong
Expected CV MAE: 2.7293
