In [1]:
pip install huggingface_hub PyPDF2

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# ML libraries
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge
import xgboost as xgb

# PDF extraction
import PyPDF2

# HuggingFace data loading
import os
from huggingface_hub import hf_hub_download, snapshot_download

In [3]:
# Hugging Face dataset repository that hosts both the CSV tables and the receipts.
REPO_ID = "lainmn/AgentDS-Healthcare"

# Fetch the train/test tables; each call returns the local path of the downloaded file.
TRAIN_CSV, TEST_CSV = (
    hf_hub_download(REPO_ID, csv_name, repo_type="dataset")
    for csv_name in ("Healthcare/ed_cost_train.csv", "Healthcare/ed_cost_test.csv")
)

# Fetch only the receipt PDFs from the repository snapshot and point at their folder.
dataset_path = snapshot_download(
    REPO_ID,
    repo_type="dataset",
    allow_patterns="Healthcare/receipts_pdf/*",
)
PDF_FOLDER = os.path.join(dataset_path, "Healthcare", "receipts_pdf")




In [4]:
def extract_pdf_features(pdf_path):
    """Parse one receipt PDF into a flat dict of features for a single patient.

    The text of every page is concatenated and scanned with regular
    expressions for the header fields (``Patient ID:``, ``ZIP3:``,
    ``Insurance:``), the ``TOTAL`` amount and the five-digit-code line items.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict with ``patient_id``, ``zip3``, ``insurance``,
        ``pdf_total_cost``, ``pdf_num_line_items`` and the line-item
        aggregates, or ``None`` when no ``Patient ID:`` field is found or the
        file cannot be read/parsed (the failure is printed so that skipped
        receipts are visible).
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # A page without extractable text may yield None/""; treat it as
            # empty rather than letting `str + None` discard the whole receipt.
            text = "".join((page.extract_text() or "") for page in pdf_reader.pages)

        features = {}
        patient_match = re.search(r'Patient ID:\s*(\d+)', text)
        if not patient_match:
            # Without a patient id the receipt cannot be joined to the tables.
            return None
        features['patient_id'] = int(patient_match.group(1))

        zip_match = re.search(r'ZIP3:\s*(\d+)', text)
        features['zip3'] = int(zip_match.group(1)) if zip_match else 0

        ins_match = re.search(r'Insurance:\s*(\w+)', text)
        features['insurance'] = ins_match.group(1).lower() if ins_match else 'unknown'

        total_match = re.search(r'TOTAL\s+([\d,]+\.?\d*)', text)
        features['pdf_total_cost'] = float(total_match.group(1).replace(',', '')) if total_match else 0

        # Line item: 5-digit code, description, integer, then two numeric
        # columns; the last numeric column is used as the line cost.
        # NOTE(review): unlike TOTAL, these amounts are matched without
        # thousands separators -- confirm against the receipt layout.
        cpt_pattern = r'(\d{5})\s+(.+?)\s+(\d+)\s+([\d.]+)\s+([\d.]+)'
        cpt_matches = re.findall(cpt_pattern, text)
        features['pdf_num_line_items'] = len(cpt_matches)

        if cpt_matches:
            cpt_codes = [match[0] for match in cpt_matches]
            # Codes starting with 9928 are counted as ED visits.
            ed_visit_codes = [c for c in cpt_codes if c.startswith('9928')]
            features['pdf_num_ed_visits'] = len(ed_visit_codes)
            features['pdf_high_complexity_visits'] = sum(1 for c in ed_visit_codes if c in ['99284', '99285'])
            features['pdf_num_lab_tests'] = sum(1 for c in cpt_codes if 80000 <= int(c) <= 89999)
            features['pdf_num_imaging'] = sum(1 for c in cpt_codes if 70000 <= int(c) <= 79999)

            line_costs = [float(match[4]) for match in cpt_matches]
            features['pdf_avg_line_cost'] = np.mean(line_costs)
            features['pdf_max_line_cost'] = np.max(line_costs)
            features['pdf_std_line_cost'] = np.std(line_costs)
            features['pdf_avg_cost_per_visit'] = features['pdf_total_cost'] / features['pdf_num_ed_visits'] if features['pdf_num_ed_visits'] > 0 else 0
        else:
            # No line items recognised: zero-fill so every dict has the same keys.
            for key in ['pdf_num_ed_visits', 'pdf_high_complexity_visits', 'pdf_num_lab_tests',
                        'pdf_num_imaging', 'pdf_avg_line_cost', 'pdf_max_line_cost',
                        'pdf_std_line_cost', 'pdf_avg_cost_per_visit']:
                features[key] = 0

        return features
    except Exception as exc:
        # Best effort: one unreadable receipt must not abort the batch. Catching
        # Exception (not a bare `except:`) lets KeyboardInterrupt/SystemExit
        # propagate, and the message makes the skipped file visible.
        print(f"Skipping {pdf_path}: {type(exc).__name__}: {exc}")
        return None

def load_pdf_features(pdf_folder):
    """Build a DataFrame with one row per successfully parsed receipt PDF.

    Every ``*.pdf`` directly inside ``pdf_folder`` is passed to
    ``extract_pdf_features``; falsy results (``None``) are dropped.

    Args:
        pdf_folder: Directory containing the receipt PDFs.

    Returns:
        A DataFrame of the extracted feature dicts, or an empty DataFrame
        (no columns) when nothing could be extracted.
    """
    receipt_paths = [path for path in Path(pdf_folder).glob("*.pdf")]
    print(f"Found {len(receipt_paths)} PDF files")

    # Parse lazily, keeping only receipts that produced a feature dict.
    parsed = (extract_pdf_features(path) for path in receipt_paths)
    rows = [feats for feats in parsed if feats]

    if len(rows) == 0:
        return pd.DataFrame()

    print(f"Extracted features from {len(rows)} PDFs")
    return pd.DataFrame(rows)

In [5]:
def create_features(df, pdf_features_df):
    """Return a copy of ``df`` enriched with PDF-derived and engineered features.

    Args:
        df: Patient table; must contain ``patient_id``,
            ``prior_ed_cost_5y_usd`` and ``prior_ed_visits_5y``.
            ``primary_chronic`` is one-hot encoded when present.
        pdf_features_df: Output of ``load_pdf_features``; may be empty, in
            which case only the table-based features are added.

    Returns:
        A new DataFrame. The input frame is never modified.
    """
    if not pdf_features_df.empty:
        # Left join keeps every patient; receipts that are missing become 0.
        df = df.merge(pdf_features_df, on='patient_id', how='left')
        pdf_cols = [c for c in pdf_features_df.columns if c != 'patient_id']
        df[pdf_cols] = df[pdf_cols].fillna(0)
    else:
        # merge() above already yields a fresh frame; copy here as well so the
        # column assignments below never write into the caller's DataFrame.
        df = df.copy()

    # +1 in the denominators avoids division by zero for patients without visits.
    df['cost_per_visit_ratio'] = df['prior_ed_cost_5y_usd'] / (df['prior_ed_visits_5y'] + 1)
    df['log_prior_cost'] = np.log1p(df['prior_ed_cost_5y_usd'])
    df['log_prior_visits'] = np.log1p(df['prior_ed_visits_5y'])

    if 'pdf_avg_cost_per_visit' in df.columns:
        # Compare what the receipts say with the tabular history.
        df['pdf_vs_table_cost_diff'] = df['pdf_total_cost'] - df['prior_ed_cost_5y_usd']
        df['cost_consistency_ratio'] = df['pdf_total_cost'] / (df['prior_ed_cost_5y_usd'] + 1)
        df['high_complexity_ratio'] = df['pdf_high_complexity_visits'] / (df['pdf_num_ed_visits'] + 1)
        df['lab_per_visit'] = df['pdf_num_lab_tests'] / (df['pdf_num_ed_visits'] + 1)
        df['imaging_per_visit'] = df['pdf_num_imaging'] / (df['pdf_num_ed_visits'] + 1)
        df['cost_variability'] = df['pdf_std_line_cost'] / (df['pdf_avg_line_cost'] + 1)
        df['log_pdf_total_cost'] = np.log1p(df['pdf_total_cost'])

    if 'insurance' in df.columns:
        # Any other value (including 'unknown' or the 0 fill) is all-zero.
        df['insurance_private'] = (df['insurance'] == 'private').astype(int)
        df['insurance_medicare'] = (df['insurance'] == 'medicare').astype(int)
        df['insurance_medicaid'] = (df['insurance'] == 'medicaid').astype(int)

    if 'primary_chronic' in df.columns:
        # NOTE: the dummy columns depend on the categories present in this
        # frame, so train and test may end up with different column sets.
        chronic_dummies = pd.get_dummies(df['primary_chronic'], prefix='chronic')
        df = pd.concat([df, chronic_dummies], axis=1)

    return df

In [6]:
# Load the train/test tables from the paths resolved in the download cell.
print("Loading data...")
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)
print(f"Train: {train_df.shape}, Test: {test_df.shape}")

# Parse every receipt once; the same feature table serves both splits.
print("Extracting PDF features...")
pdf_features = load_pdf_features(PDF_FOLDER)

print("Engineering features...")
train_enhanced = create_features(train_df.copy(), pdf_features)
test_enhanced = create_features(test_df.copy(), pdf_features)

# Everything except identifiers, the target and the raw categoricals is a feature.
target_col = 'ed_cost_next3y_usd'
exclude_cols = ['patient_id', target_col, 'primary_chronic', 'insurance']
feature_cols = [c for c in train_enhanced.columns if c not in exclude_cols]

# create_features one-hot encodes `primary_chronic` per frame, so a category
# that never occurs in the test split has no `chronic_*` column there and the
# selection below would raise KeyError. Add such columns as all-zero.
missing_in_test = [c for c in feature_cols if c not in test_enhanced.columns]
for col in missing_in_test:
    test_enhanced[col] = 0

# Neutralise missing and infinite values before modelling.
train_enhanced[feature_cols] = train_enhanced[feature_cols].fillna(0).replace([np.inf, -np.inf], 0)
test_enhanced[feature_cols] = test_enhanced[feature_cols].fillna(0).replace([np.inf, -np.inf], 0)

X_train = train_enhanced[feature_cols]
y_train = train_enhanced[target_col]
X_test = test_enhanced[feature_cols]

print(f"Features: {len(feature_cols)}")
print(f"Training samples: {X_train.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")

Extracted features from 4000 PDFs
Engineering features...
Features: 29
Training samples: 2000
Test samples: 2000


In [7]:
# Hold out 20% of the training rows to estimate the single-model error.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Standardise with statistics learned on the fitting portion only.
scaler_val = StandardScaler()
X_tr_scaled = scaler_val.fit_transform(X_tr)
X_val_scaled = scaler_val.transform(X_val)

# XGBoost hyper-parameters used for the hold-out check.
val_xgb_params = {
    'n_estimators': 600,
    'learning_rate': 0.04,
    'max_depth': 7,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'random_state': 42,
    'n_jobs': -1,
}
val_model = xgb.XGBRegressor(**val_xgb_params)
val_model.fit(X_tr_scaled, y_tr)

# Score the held-out rows.
val_pred = val_model.predict(X_val_scaled)
val_mae = mean_absolute_error(y_val, val_pred)
print(f"Validation MAE: ${val_mae:.2f}")

Validation MAE: $449.22


In [8]:
print("\nTraining final model on full data...")

# Refit the scaler on all training rows, then apply it to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Best model: XGBoost (same hyper-parameters as the hold-out check).
final_xgb_params = {
    'n_estimators': 600,
    'learning_rate': 0.04,
    'max_depth': 7,
    'min_child_weight': 3,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'reg_alpha': 0.1,
    'reg_lambda': 1.0,
    'random_state': 42,
    'n_jobs': -1,
}
final_model = xgb.XGBRegressor(**final_xgb_params)
final_model.fit(X_train_scaled, y_train)
predictions = final_model.predict(X_test_scaled)

# Create submission: patient ids from the raw test table next to the predictions.
submission_df = test_df[['patient_id']].assign(ed_cost_next3y_usd=predictions)

submission_file = "healthcare_challenge2_predictions.csv"
submission_df.to_csv(submission_file, index=False)

# Quick sanity summary of the predicted distribution.
print(f"\nPredictions saved to {submission_file}")
print(f"Predictions: {len(predictions)}")
print(f"Mean: ${predictions.mean():.2f}, Median: ${np.median(predictions):.2f}")
print(f"Range: ${predictions.min():.2f} to ${predictions.max():.2f}")


Predictions saved to healthcare_challenge2_predictions.csv
Predictions: 2000
Mean: $3918.46, Median: $3565.82
Range: $854.03 to $10885.12


In [9]:
# NOTE: The AgentDS benchmark leaderboard is no longer hosted, so submission
# via BenchmarkClient is no longer available; this cell therefore only prints
# a completion message.
# The score achieved was MAE = $480.33 (recorded by hand; it is not recomputed
# anywhere in this notebook).

print("Done.")

Done.


In [10]:
# Healthcare Challenge 2 - Best Model Solution
# Target: < $450 MAE

# ============================================================
# 2. PDF EXTRACTION
# ============================================================
def extract_pdf_features(pdf_path):
    """Parse one receipt PDF into a flat dict of features for a single patient.

    Redefines the earlier notebook function of the same name, additionally
    emitting ``pdf_med_complexity_visits`` and ``pdf_min_line_cost``.

    The text of every page is concatenated and scanned with regular
    expressions for the header fields (``Patient ID:``, ``ZIP3:``,
    ``Insurance:``), the ``TOTAL`` amount and the five-digit-code line items.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict with ``patient_id``, ``zip3``, ``insurance``,
        ``pdf_total_cost``, ``pdf_num_line_items`` and the line-item
        aggregates, or ``None`` when no ``Patient ID:`` field is found or the
        file cannot be read/parsed (the failure is printed so that skipped
        receipts are visible).
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # A page without extractable text may yield None/""; treat it as
            # empty rather than letting `str + None` discard the whole receipt.
            text = "".join((page.extract_text() or "") for page in pdf_reader.pages)

        features = {}
        patient_match = re.search(r'Patient ID:\s*(\d+)', text)
        if not patient_match:
            # Without a patient id the receipt cannot be joined to the tables.
            return None
        features['patient_id'] = int(patient_match.group(1))

        zip_match = re.search(r'ZIP3:\s*(\d+)', text)
        features['zip3'] = int(zip_match.group(1)) if zip_match else 0

        ins_match = re.search(r'Insurance:\s*(\w+)', text)
        features['insurance'] = ins_match.group(1).lower() if ins_match else 'unknown'

        total_match = re.search(r'TOTAL\s+([\d,]+\.?\d*)', text)
        features['pdf_total_cost'] = float(total_match.group(1).replace(',', '')) if total_match else 0

        # Line item: 5-digit code, description, integer, then two numeric
        # columns; the last numeric column is used as the line cost.
        # NOTE(review): unlike TOTAL, these amounts are matched without
        # thousands separators -- confirm against the receipt layout.
        cpt_pattern = r'(\d{5})\s+(.+?)\s+(\d+)\s+([\d.]+)\s+([\d.]+)'
        cpt_matches = re.findall(cpt_pattern, text)
        features['pdf_num_line_items'] = len(cpt_matches)

        if cpt_matches:
            cpt_codes = [match[0] for match in cpt_matches]
            # Codes starting with 9928 are counted as ED visits.
            ed_visit_codes = [c for c in cpt_codes if c.startswith('9928')]
            features['pdf_num_ed_visits'] = len(ed_visit_codes)
            features['pdf_high_complexity_visits'] = sum(1 for c in ed_visit_codes if c in ['99284', '99285'])
            features['pdf_med_complexity_visits'] = sum(1 for c in ed_visit_codes if c == '99283')
            features['pdf_num_lab_tests'] = sum(1 for c in cpt_codes if 80000 <= int(c) <= 89999)
            features['pdf_num_imaging'] = sum(1 for c in cpt_codes if 70000 <= int(c) <= 79999)

            line_costs = [float(match[4]) for match in cpt_matches]
            features['pdf_avg_line_cost'] = np.mean(line_costs)
            features['pdf_max_line_cost'] = np.max(line_costs)
            features['pdf_min_line_cost'] = np.min(line_costs)
            features['pdf_std_line_cost'] = np.std(line_costs)
            features['pdf_avg_cost_per_visit'] = features['pdf_total_cost'] / features['pdf_num_ed_visits'] if features['pdf_num_ed_visits'] > 0 else 0
        else:
            # No line items recognised: zero-fill so every dict has the same keys.
            for key in ['pdf_num_ed_visits', 'pdf_high_complexity_visits', 'pdf_med_complexity_visits',
                        'pdf_num_lab_tests', 'pdf_num_imaging', 'pdf_avg_line_cost', 'pdf_max_line_cost',
                        'pdf_min_line_cost', 'pdf_std_line_cost', 'pdf_avg_cost_per_visit']:
                features[key] = 0

        return features
    except Exception as exc:
        # Best effort: one unreadable receipt must not abort the batch. Catching
        # Exception (not a bare `except:`) lets KeyboardInterrupt/SystemExit
        # propagate, and the message makes the skipped file visible.
        print(f"Skipping {pdf_path}: {type(exc).__name__}: {exc}")
        return None

def load_pdf_features(pdf_folder):
    """Build a DataFrame with one row per successfully parsed receipt PDF.

    Every ``*.pdf`` directly inside ``pdf_folder`` is passed to
    ``extract_pdf_features``; falsy results (``None``) are dropped.

    Args:
        pdf_folder: Directory containing the receipt PDFs.

    Returns:
        A DataFrame of the extracted feature dicts, or an empty DataFrame
        (no columns) when nothing could be extracted.
    """
    receipt_paths = [path for path in Path(pdf_folder).glob("*.pdf")]
    print(f"Found {len(receipt_paths)} PDF files")

    # Parse lazily, keeping only receipts that produced a feature dict.
    parsed = (extract_pdf_features(path) for path in receipt_paths)
    rows = [feats for feats in parsed if feats]

    if len(rows) == 0:
        return pd.DataFrame()

    print(f"Extracted features from {len(rows)} PDFs")
    return pd.DataFrame(rows)

# ============================================================
# 3. ENHANCED FEATURE ENGINEERING
# ============================================================
def create_features(df, pdf_features_df):
    """Return a copy of ``df`` enriched with PDF-derived and engineered features.

    Redefines the earlier notebook function of the same name with extra
    polynomial, complexity, cost-range and ZIP features.

    Args:
        df: Patient table; must contain ``patient_id``,
            ``prior_ed_cost_5y_usd`` and ``prior_ed_visits_5y``.
            ``primary_chronic`` is one-hot encoded when present.
        pdf_features_df: Output of ``load_pdf_features``; may be empty, in
            which case only the table-based features are added.

    Returns:
        A new DataFrame. The input frame is never modified.
    """
    if not pdf_features_df.empty:
        # Left join keeps every patient; receipts that are missing become 0.
        df = df.merge(pdf_features_df, on='patient_id', how='left')
        pdf_cols = [c for c in pdf_features_df.columns if c != 'patient_id']
        df[pdf_cols] = df[pdf_cols].fillna(0)
    else:
        # merge() above already yields a fresh frame; copy here as well so the
        # column assignments below never write into the caller's DataFrame.
        df = df.copy()

    # Basic ratios (+1 avoids division by zero for patients without visits)
    df['cost_per_visit_ratio'] = df['prior_ed_cost_5y_usd'] / (df['prior_ed_visits_5y'] + 1)

    # Log / root transforms
    df['log_prior_cost'] = np.log1p(df['prior_ed_cost_5y_usd'])
    df['log_prior_visits'] = np.log1p(df['prior_ed_visits_5y'])
    df['sqrt_prior_cost'] = np.sqrt(df['prior_ed_cost_5y_usd'])

    # Polynomial features
    df['prior_cost_squared'] = df['prior_ed_cost_5y_usd'] ** 2
    df['prior_visits_squared'] = df['prior_ed_visits_5y'] ** 2
    df['cost_visit_interaction'] = df['prior_ed_cost_5y_usd'] * df['prior_ed_visits_5y']

    if 'pdf_avg_cost_per_visit' in df.columns:
        # PDF features: compare the receipts with the tabular history
        df['pdf_vs_table_cost_diff'] = df['pdf_total_cost'] - df['prior_ed_cost_5y_usd']
        df['pdf_vs_table_visits_diff'] = df['pdf_num_ed_visits'] - df['prior_ed_visits_5y']
        df['cost_consistency_ratio'] = df['pdf_total_cost'] / (df['prior_ed_cost_5y_usd'] + 1)
        df['high_complexity_ratio'] = df['pdf_high_complexity_visits'] / (df['pdf_num_ed_visits'] + 1)
        df['med_complexity_ratio'] = df['pdf_med_complexity_visits'] / (df['pdf_num_ed_visits'] + 1)
        df['lab_per_visit'] = df['pdf_num_lab_tests'] / (df['pdf_num_ed_visits'] + 1)
        df['imaging_per_visit'] = df['pdf_num_imaging'] / (df['pdf_num_ed_visits'] + 1)
        df['cost_variability'] = df['pdf_std_line_cost'] / (df['pdf_avg_line_cost'] + 1)

        # Service intensity score: hand-picked weights per service type,
        # normalised by the number of ED visits (+1).
        df['service_intensity'] = (
            df['pdf_high_complexity_visits'] * 3 +
            df['pdf_med_complexity_visits'] * 2 +
            df['pdf_num_lab_tests'] * 1 +
            df['pdf_num_imaging'] * 2
        ) / (df['pdf_num_ed_visits'] + 1)

        df['log_pdf_total_cost'] = np.log1p(df['pdf_total_cost'])
        df['log_pdf_avg_cost'] = np.log1p(df['pdf_avg_cost_per_visit'])

        # Cost range features
        df['pdf_cost_range'] = df['pdf_max_line_cost'] - df['pdf_min_line_cost']
        df['pdf_avg_per_table_cost'] = df['pdf_avg_cost_per_visit'] / (df['cost_per_visit_ratio'] + 1)

    if 'insurance' in df.columns:
        # Any other value (including 'unknown' or the 0 fill) is all-zero.
        df['insurance_private'] = (df['insurance'] == 'private').astype(int)
        df['insurance_medicare'] = (df['insurance'] == 'medicare').astype(int)
        df['insurance_medicaid'] = (df['insurance'] == 'medicaid').astype(int)

    if 'primary_chronic' in df.columns:
        # NOTE: the dummy columns depend on the categories present in this
        # frame, so train and test may end up with different column sets.
        chronic_dummies = pd.get_dummies(df['primary_chronic'], prefix='chronic')
        df = pd.concat([df, chronic_dummies], axis=1)

    if 'zip3' in df.columns:
        df['zip3_normalized'] = df['zip3'] / 100.0

    return df

# ============================================================
# 4. LOAD DATA
# ============================================================
# Reload the tables and rebuild the matrices with the enhanced feature set.
print("Loading data...")
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)
print(f"Train: {train_df.shape}, Test: {test_df.shape}")

# Parse every receipt once; the same feature table serves both splits.
print("Extracting PDF features...")
pdf_features = load_pdf_features(PDF_FOLDER)

print("Engineering features...")
train_enhanced = create_features(train_df.copy(), pdf_features)
test_enhanced = create_features(test_df.copy(), pdf_features)

# Everything except identifiers, the target and the raw categoricals is a feature.
target_col = 'ed_cost_next3y_usd'
exclude_cols = ['patient_id', target_col, 'primary_chronic', 'insurance']
feature_cols = [c for c in train_enhanced.columns if c not in exclude_cols]

# create_features one-hot encodes `primary_chronic` per frame, so a category
# that never occurs in the test split has no `chronic_*` column there and the
# selection below would raise KeyError. Add such columns as all-zero.
missing_in_test = [c for c in feature_cols if c not in test_enhanced.columns]
for col in missing_in_test:
    test_enhanced[col] = 0

# Neutralise missing and infinite values before modelling.
train_enhanced[feature_cols] = train_enhanced[feature_cols].fillna(0).replace([np.inf, -np.inf], 0)
test_enhanced[feature_cols] = test_enhanced[feature_cols].fillna(0).replace([np.inf, -np.inf], 0)

X_train = train_enhanced[feature_cols]
y_train = train_enhanced[target_col]
X_test = test_enhanced[feature_cols]

print(f"Features: {len(feature_cols)}")

# ============================================================
# 5. VALIDATION WITH ENSEMBLE
# ============================================================
print("\nValidating ensemble...")
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Scaler statistics come from the fitting portion only.
scaler_val = StandardScaler()
X_tr_scaled = scaler_val.fit_transform(X_tr)
X_val_scaled = scaler_val.transform(X_val)

# The five ensemble members.
xgb_val = xgb.XGBRegressor(
    n_estimators=700, learning_rate=0.03, max_depth=8,
    min_child_weight=2, subsample=0.8, colsample_bytree=0.8,
    gamma=0.1, reg_alpha=0.1, reg_lambda=1.0, random_state=42, n_jobs=-1,
)
gb_val = GradientBoostingRegressor(
    n_estimators=500, learning_rate=0.03, max_depth=7,
    subsample=0.8, min_samples_split=8, random_state=42,
)
rf_val = RandomForestRegressor(
    n_estimators=500, max_depth=20, min_samples_split=6,
    min_samples_leaf=3, random_state=42, n_jobs=-1,
)
et_val = ExtraTreesRegressor(
    n_estimators=500, max_depth=20, min_samples_split=6,
    min_samples_leaf=3, random_state=42, n_jobs=-1,
)
ridge_val = Ridge(alpha=5.0, random_state=42)

# (label, estimator, fitting matrix, hold-out matrix, blend weight).
# XGB, GB and Ridge see scaled inputs; RF and ET see the raw features.
val_plan = [
    ('XGB', xgb_val, X_tr_scaled, X_val_scaled, 0.35),
    ('GB', gb_val, X_tr_scaled, X_val_scaled, 0.30),
    ('RF', rf_val, X_tr, X_val, 0.15),
    ('ET', et_val, X_tr, X_val, 0.15),
    ('Ridge', ridge_val, X_tr_scaled, X_val_scaled, 0.05),
]

models_val = []
for label, estimator, fit_X, holdout_X, blend_w in val_plan:
    estimator.fit(fit_X, y_tr)
    models_val.append((label, estimator, holdout_X, blend_w))

# Per-model hold-out error, collecting the weighted predictions for the blend.
val_preds = []
for label, estimator, holdout_X, blend_w in models_val:
    holdout_pred = estimator.predict(holdout_X)
    model_mae = mean_absolute_error(y_val, holdout_pred)
    print(f"{label} ({blend_w*100:.0f}%): MAE ${model_mae:.2f}")
    val_preds.append(holdout_pred * blend_w)

ensemble_val_pred = np.sum(val_preds, axis=0)
val_mae = mean_absolute_error(y_val, ensemble_val_pred)
print(f"\nEnsemble Validation MAE: ${val_mae:.2f}")

# ============================================================
# 6. TRAIN FINAL ENSEMBLE ON FULL DATA
# ============================================================
print("\nTraining final ensemble on full data...")

# Refit the scaler on all training rows, then apply it to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Same five members and hyper-parameters as in the validation section.
xgb_final = xgb.XGBRegressor(
    n_estimators=700, learning_rate=0.03, max_depth=8,
    min_child_weight=2, subsample=0.8, colsample_bytree=0.8,
    gamma=0.1, reg_alpha=0.1, reg_lambda=1.0, random_state=42, n_jobs=-1,
)
gb_final = GradientBoostingRegressor(
    n_estimators=500, learning_rate=0.03, max_depth=7,
    subsample=0.8, min_samples_split=8, random_state=42,
)
rf_final = RandomForestRegressor(
    n_estimators=500, max_depth=20, min_samples_split=6,
    min_samples_leaf=3, random_state=42, n_jobs=-1,
)
et_final = ExtraTreesRegressor(
    n_estimators=500, max_depth=20, min_samples_split=6,
    min_samples_leaf=3, random_state=42, n_jobs=-1,
)
ridge_final = Ridge(alpha=5.0, random_state=42)

# (estimator, fitting matrix, test matrix, blend weight).
# XGB, GB and Ridge see scaled inputs; RF and ET see the raw features.
final_plan = [
    (xgb_final, X_train_scaled, X_test_scaled, 0.35),
    (gb_final, X_train_scaled, X_test_scaled, 0.30),
    (rf_final, X_train, X_test, 0.15),
    (et_final, X_train, X_test, 0.15),
    (ridge_final, X_train_scaled, X_test_scaled, 0.05),
]

final_models = []
for estimator, fit_X, score_X, blend_w in final_plan:
    estimator.fit(fit_X, y_train)
    final_models.append((estimator, score_X, blend_w))

# Ensemble predictions: weighted sum of the member predictions.
test_preds = [
    estimator.predict(score_X) * blend_w
    for estimator, score_X, blend_w in final_models
]
predictions = np.sum(test_preds, axis=0)

# Create submission: patient ids from the raw test table next to the predictions.
submission_df = test_df[['patient_id']].assign(ed_cost_next3y_usd=predictions)

submission_file = "healthcare_challenge2_ensemble.csv"
submission_df.to_csv(submission_file, index=False)

print(f"\nPredictions saved to {submission_file}")
print(f"Mean: ${predictions.mean():.2f}, Median: ${np.median(predictions):.2f}")

# NOTE: The AgentDS benchmark leaderboard is no longer hosted.
# Submission via BenchmarkClient is no longer available.
# The final score achieved was MAE = $475.03.

print("\nDone.")


Predictions saved to healthcare_challenge2_ensemble.csv
Mean: $3920.70, Median: $3579.17

Done.


In [11]:
# ============================================================
# Score Estimation via 5-Fold Cross-Validation
# (Test labels are not available on HuggingFace, so we estimate
#  the MAE score using cross-validation on training data)
# ============================================================

# The banner emoji was mojibake ("ðŸ“Š" = UTF-8 bytes of the bar-chart emoji
# decoded as cp1252); restored to the intended character.
print("📊 Estimating MAE via 5-fold CV on training data...")
print("   Evaluating: Simple XGBoost vs Weighted Ensemble\n")

# One shuffled, seeded splitter shared by both evaluations so they are scored
# on identical folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# --- Simple XGBoost CV ---
# NOTE(review): X_train at this point is the enhanced feature matrix built in
# the previous cell, not the smaller feature set the single-XGBoost cell was
# originally trained on.
simple_xgb_maes = []
for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train), 1):
    X_tr_f, X_val_f = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_tr_f, y_val_f = y_train.iloc[tr_idx], y_train.iloc[val_idx]

    # Fit the scaler inside the fold so no validation statistics leak in.
    sc = StandardScaler()
    X_tr_sc = sc.fit_transform(X_tr_f)
    X_val_sc = sc.transform(X_val_f)

    m = xgb.XGBRegressor(n_estimators=600, learning_rate=0.04, max_depth=7,
                         min_child_weight=3, subsample=0.8, colsample_bytree=0.8,
                         gamma=0.1, reg_alpha=0.1, reg_lambda=1.0, random_state=42, n_jobs=-1)
    m.fit(X_tr_sc, y_tr_f)
    mae = mean_absolute_error(y_val_f, m.predict(X_val_sc))
    simple_xgb_maes.append(mae)
    print(f"  Fold {fold} - Simple XGBoost MAE: ${mae:.2f}")

print(f"  >> Simple XGBoost Mean MAE: ${np.mean(simple_xgb_maes):.2f} (+/- ${np.std(simple_xgb_maes):.2f})\n")

# --- Weighted Ensemble CV ---
# Re-uses the `kf` splitter, so the folds match the simple-XGBoost run.
ensemble_maes = []
for fold_no, (fit_rows, holdout_rows) in enumerate(kf.split(X_train), start=1):
    X_fit, X_holdout = X_train.iloc[fit_rows], X_train.iloc[holdout_rows]
    y_fit, y_holdout = y_train.iloc[fit_rows], y_train.iloc[holdout_rows]

    # Per-fold scaler for the members that take scaled inputs.
    fold_scaler = StandardScaler()
    X_fit_sc = fold_scaler.fit_transform(X_fit)
    X_holdout_sc = fold_scaler.transform(X_holdout)

    # XGBoost (35%)
    xgb_m = xgb.XGBRegressor(
        n_estimators=700, learning_rate=0.03, max_depth=8,
        min_child_weight=2, subsample=0.8, colsample_bytree=0.8,
        gamma=0.1, reg_alpha=0.1, reg_lambda=1.0, random_state=42, n_jobs=-1,
    )
    xgb_m.fit(X_fit_sc, y_fit)

    # GradientBoosting (30%)
    gb_m = GradientBoostingRegressor(
        n_estimators=500, learning_rate=0.03, max_depth=7,
        subsample=0.8, min_samples_split=8, random_state=42,
    )
    gb_m.fit(X_fit_sc, y_fit)

    # RandomForest (15%) - raw, unscaled features
    rf_m = RandomForestRegressor(
        n_estimators=500, max_depth=20, min_samples_split=6,
        min_samples_leaf=3, random_state=42, n_jobs=-1,
    )
    rf_m.fit(X_fit, y_fit)

    # ExtraTrees (15%) - raw, unscaled features
    et_m = ExtraTreesRegressor(
        n_estimators=500, max_depth=20, min_samples_split=6,
        min_samples_leaf=3, random_state=42, n_jobs=-1,
    )
    et_m.fit(X_fit, y_fit)

    # Ridge (5%)
    ridge_m = Ridge(alpha=5.0)
    ridge_m.fit(X_fit_sc, y_fit)

    # Weighted contributions, summed in the fixed XGB, GB, RF, ET, Ridge order.
    xgb_part = 0.35 * xgb_m.predict(X_holdout_sc)
    gb_part = 0.30 * gb_m.predict(X_holdout_sc)
    rf_part = 0.15 * rf_m.predict(X_holdout)
    et_part = 0.15 * et_m.predict(X_holdout)
    ridge_part = 0.05 * ridge_m.predict(X_holdout_sc)
    ens_pred = xgb_part + gb_part + rf_part + et_part + ridge_part

    fold_mae = mean_absolute_error(y_holdout, ens_pred)
    ensemble_maes.append(fold_mae)
    print(f"  Fold {fold_no} - Ensemble MAE: ${fold_mae:.2f}")

print(f"  >> Ensemble Mean MAE: ${np.mean(ensemble_maes):.2f} (+/- ${np.std(ensemble_maes):.2f})")

# Side-by-side summary of the two cross-validated estimates.
rule = '=' * 60
print(f"\n{rule}")
print(f"  Simple XGBoost  5-Fold CV MAE:  ${np.mean(simple_xgb_maes):.2f}")
print(f"  Weighted Ensemble 5-Fold CV MAE: ${np.mean(ensemble_maes):.2f}")
print(f"{rule}")

  Fold 5 - Ensemble MAE: $454.78
  >> Ensemble Mean MAE: $458.51 (+/- $19.93)

  Simple XGBoost  5-Fold CV MAE:  $467.22
  Weighted Ensemble 5-Fold CV MAE: $458.51
