In [2]:
# ========================================================================
# BREAKTHROUGH TO 0.65+ PIPELINE
# Problem: Adding more models isn't helping (0.6385 → 0.6370)
# Solution: Focus on feature quality + truly diverse models
# ========================================================================

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler

import lightgbm as lgb
from catboost import CatBoostClassifier
import xgboost as xgb

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

print("=" * 80)
print("BREAKTHROUGH PIPELINE - QUALITY OVER QUANTITY")
print("Focus: Better features + Model diversity + Smart ensembling")
print("=" * 80)

BREAKTHROUGH PIPELINE - QUALITY OVER QUANTITY
Focus: Better features + Model diversity + Smart ensembling


In [3]:
# ========================================================================
# STEP 1: LOAD DATA
# ========================================================================
train = pd.read_csv('train1.csv')
test = pd.read_csv('test.csv')

TARGET = 'target'
ID_COL = 'id'

print(f"\nData loaded: {train.shape[0]} train, {test.shape[0]} test")
print(f"Class balance: {train[TARGET].value_counts(normalize=True).to_dict()}")


Data loaded: 296209 train, 126948 test
Class balance: {0: 0.948732145208282, 1: 0.05126785479171801}


In [4]:
# ========================================================================
# STEP 2: INTELLIGENT FEATURE ENGINEERING
# ========================================================================
print("\n" + "=" * 80)
print("STEP 2: INTELLIGENT FEATURE ENGINEERING")
print("=" * 80)

def create_breakthrough_features(df, is_train=True):
    """
    Focus on high-quality, insurance-domain features
    """
    df = df.copy()
    
    print("  Creating domain-specific features...")
    
    # === Risk Profiling Features ===
    
    # 1. Total risk indicators
    risk_cols = [c for c in df.columns if c.startswith('ps_ind_')]
    if risk_cols:
        df['total_ind_features'] = df[risk_cols].sum(axis=1)
        df['avg_ind_features'] = df[risk_cols].mean(axis=1)
        df['nonzero_ind_count'] = (df[risk_cols] != 0).sum(axis=1)
    
    # 2. Car-related risk
    car_cols = [c for c in df.columns if c.startswith('ps_car_')]
    if car_cols:
        df['total_car_features'] = df[car_cols].sum(axis=1)
        df['car_missing_count'] = df[car_cols].isnull().sum(axis=1)
    
    # 3. Registration quality
    if all(c in df.columns for c in ['ps_reg_01', 'ps_reg_02', 'ps_reg_03']):
        df['reg_total'] = df['ps_reg_01'] + df['ps_reg_02'] + df['ps_reg_03']
        df['reg_product'] = df['ps_reg_01'] * df['ps_reg_02'] * df['ps_reg_03']
        df['reg_std'] = df[['ps_reg_01', 'ps_reg_02', 'ps_reg_03']].std(axis=1)
    
    # === Key Interactions (insurance domain) ===
    
    # Driver profile x Car value
    if 'ps_ind_15' in df.columns and 'ps_car_13' in df.columns:
        df['driver_car_interaction'] = df['ps_ind_15'] * df['ps_car_13']
        df['driver_car_ratio'] = df['ps_ind_15'] / (df['ps_car_13'] + 1)
    
    # Car age x Region risk
    if 'ps_car_15' in df.columns and 'ps_reg_03' in df.columns:
        df['car_age_region'] = df['ps_car_15'] * df['ps_reg_03']
    
    # Binary risk flags combined
    bin_cols = [c for c in df.columns if c.endswith('_bin')]
    if len(bin_cols) >= 3:
        # Create risk score from binary features
        df['binary_risk_score'] = df[bin_cols].sum(axis=1)
        df['binary_risk_pct'] = df['binary_risk_score'] / len(bin_cols)
    
    # === Statistical features ===
    
    # Data completeness (important for insurance)
    df['completeness_score'] = df.notna().sum(axis=1) / len(df.columns)
    
    # Calculated features analysis
    calc_cols = [c for c in df.columns if c.startswith('ps_calc_')]
    if calc_cols:
        df['calc_sum'] = df[calc_cols].sum(axis=1)
        df['calc_max'] = df[calc_cols].max(axis=1)
        df['calc_min'] = df[calc_cols].min(axis=1)
        df['calc_range'] = df['calc_max'] - df['calc_min']
    
    # === Categorical transformations ===
    
    # High cardinality categoricals
    if 'ps_car_11_cat' in df.columns:
        df['ps_car_11_cat_freq'] = df.groupby('ps_car_11_cat')['ps_car_11_cat'].transform('count')
    
    return df

print("Applying to train...")
train_fe = create_breakthrough_features(train, is_train=True)

print("Applying to test...")
test_fe = create_breakthrough_features(test, is_train=False)


STEP 2: INTELLIGENT FEATURE ENGINEERING
Applying to train...
  Creating domain-specific features...
Applying to test...
  Creating domain-specific features...


In [5]:
# ========================================================================
# STEP 3: SMART FEATURE SELECTION
# ========================================================================
print("\n" + "=" * 80)
print("STEP 3: SMART FEATURE SELECTION")
print("=" * 80)

X_full = train_fe.drop([TARGET, ID_COL], axis=1)
y_full = train_fe[TARGET]
test_full = test_fe.drop([ID_COL], axis=1, errors='ignore')

# Align
common_features = list(set(X_full.columns) & set(test_full.columns))
X_full = X_full[common_features]
test_full = test_full[common_features]

# Fill missing
X_full = X_full.fillna(-999)
test_full = test_full.fillna(-999)

print(f"Total features: {X_full.shape[1]}")

# Use BOTH LightGBM and permutation importance
print("\nCalculating feature importance (LightGBM)...")
selector = lgb.LGBMClassifier(n_estimators=200, random_state=RANDOM_SEED, verbose=-1, class_weight='balanced')
selector.fit(X_full, y_full)

importance_df = pd.DataFrame({
    'feature': X_full.columns,
    'importance': selector.feature_importances_
}).sort_values('importance', ascending=False)

# Remove zero-importance features
zero_importance = importance_df[importance_df['importance'] == 0]['feature'].tolist()
if zero_importance:
    print(f"Removing {len(zero_importance)} zero-importance features")
    X_full = X_full.drop(zero_importance, axis=1)
    test_full = test_full.drop(zero_importance, axis=1)

# Keep top features
N_FEATURES = min(80, len(X_full.columns))
selected_features = importance_df.head(N_FEATURES)['feature'].tolist()
selected_features = [f for f in selected_features if f in X_full.columns]

X_full = X_full[selected_features]
test_full = test_full[selected_features]

print(f"Selected {len(selected_features)} features")
print("\nTop 15 features:")
for i, row in importance_df.head(15).iterrows():
    print(f"  {row['feature']:30s}: {row['importance']:.1f}")


STEP 3: SMART FEATURE SELECTION
Total features: 84

Calculating feature importance (LightGBM)...


  File "C:\Users\SHEIKHANI LAPTOP\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\joblib\externals\loky\backend\context.py", line 247, in _count_physical_cores
    cpu_count_physical = _count_physical_cores_win32()
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\SHEIKHANI LAPTOP\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\joblib\externals\loky\backend\context.py", line 299, in _count_physical_cores_win32
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py

Removing 7 zero-importance features
Selected 77 features

Top 15 features:
  calc_sum                      : 267.0
  driver_car_ratio              : 250.0
  ps_car_13                     : 237.0
  feature6                      : 231.0
  reg_std                       : 211.0
  feature7                      : 209.0
  ps_ind_03                     : 205.0
  total_car_features            : 190.0
  driver_car_interaction        : 186.0
  ps_car_14                     : 186.0
  feature2                      : 170.0
  feature4                      : 159.0
  car_age_region                : 141.0
  ps_reg_03                     : 131.0
  avg_ind_features              : 125.0


In [6]:
# ========================================================================
# STEP 4: AGGRESSIVE TARGET ENCODING (PROPERLY)
# ========================================================================
print("\n" + "=" * 80)
print("STEP 4: PROPER TARGET ENCODING")
print("=" * 80)

def aggressive_target_encoding(X_train, y_train, X_test, alpha=15):
    """
    Target encode ALL categorical features with proper CV
    """
    X_train_te = X_train.copy()
    X_test_te = X_test.copy()
    
    # Identify categorical columns
    cat_cols = [col for col in X_train.columns 
                if col.endswith('_cat') or X_train[col].nunique() < 50]
    
    print(f"Target encoding {len(cat_cols)} categorical features...")
    
    global_mean = y_train.mean()
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)
    
    for col in cat_cols:
        if col not in X_train.columns:
            continue
        
        X_train_te[f'{col}_te'] = global_mean
        
        # CV encoding for train
        for train_idx, val_idx in kf.split(X_train, y_train):
            X_tr = X_train.iloc[train_idx]
            y_tr = y_train.iloc[train_idx]
            X_vl = X_train.iloc[val_idx]
            
            # Calculate encoding
            encoding = y_tr.groupby(X_tr[col]).agg(['mean', 'count'])
            smoothed = (encoding['count'] * encoding['mean'] + alpha * global_mean) / (encoding['count'] + alpha)
            
            X_train_te.loc[X_train.index[val_idx], f'{col}_te'] = X_vl[col].map(smoothed).fillna(global_mean).values
        
        # Full train for test
        encoding_full = y_train.groupby(X_train[col]).agg(['mean', 'count'])
        smoothed_full = (encoding_full['count'] * encoding_full['mean'] + alpha * global_mean) / (encoding_full['count'] + alpha)
        
        X_test_te[f'{col}_te'] = X_test[col].map(smoothed_full).fillna(global_mean)
    
    return X_train_te, X_test_te

X_full, test_full = aggressive_target_encoding(X_full, y_full, test_full, alpha=15)

print(f"Final feature count: {X_full.shape[1]}")


STEP 4: PROPER TARGET ENCODING
Target encoding 58 categorical features...
Final feature count: 135


In [8]:
# ========================================================================
# STEP 5: TRAIN PROPERLY TUNED MODELS WITH 10-FOLD CV
# ========================================================================
print("\n" + "=" * 80)
print("STEP 5: TRAINING OPTIMIZED MODELS (10-FOLD CV)")
print("=" * 80)

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)

# Model 1: CatBoost (your best)
print("\n1️⃣  CatBoost (optimized)...")
oof_cat = np.zeros(len(X_full))
test_cat = np.zeros(len(test_full))

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_full, y_full)):
    X_tr, X_vl = X_full.iloc[train_idx], X_full.iloc[val_idx]
    y_tr, y_vl = y_full.iloc[train_idx], y_full.iloc[val_idx]
    
    model = CatBoostClassifier(
        iterations=800,
        depth=5,
        learning_rate=0.025,
        l2_leaf_reg=8,
        random_strength=1.5,
        bagging_temperature=0.7,
        border_count=128,
        auto_class_weights='Balanced',
        random_state=RANDOM_SEED + fold,
        verbose=0
    )
    
    model.fit(X_tr, y_tr, eval_set=(X_vl, y_vl), early_stopping_rounds=50, verbose=False)
    
    oof_cat[val_idx] = model.predict_proba(X_vl)[:, 1]
    test_cat += model.predict_proba(test_full)[:, 1] / 10

auc_cat = roc_auc_score(y_full, oof_cat)
print(f"   OOF AUROC: {auc_cat:.4f}")

# Model 2: LightGBM (dart mode for diversity)
print("\n2️⃣  LightGBM (DART boosting)...")
oof_lgb = np.zeros(len(X_full))
test_lgb = np.zeros(len(test_full))

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_full, y_full)):
    X_tr, X_vl = X_full.iloc[train_idx], X_full.iloc[val_idx]
    y_tr, y_vl = y_full.iloc[train_idx], y_full.iloc[val_idx]
    
    model = lgb.LGBMClassifier(
        boosting_type='dart',  # Different from default gbdt
        n_estimators=800,
        num_leaves=48,
        learning_rate=0.025,
        min_child_samples=35,
        subsample=0.85,
        colsample_bytree=0.85,
        reg_alpha=0.3,
        reg_lambda=0.3,
        class_weight='balanced',
        random_state=RANDOM_SEED + fold,
        verbose=-1
    )
    
    model.fit(X_tr, y_tr, eval_set=[(X_vl, y_vl)],
              callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)])
    
    oof_lgb[val_idx] = model.predict_proba(X_vl)[:, 1]
    test_lgb += model.predict_proba(test_full)[:, 1] / 10

auc_lgb = roc_auc_score(y_full, oof_lgb)
print(f"   OOF AUROC: {auc_lgb:.4f}")

# Model 3: XGBoost (regularized)
print("\n3️⃣  XGBoost (heavily regularized)...")
oof_xgb = np.zeros(len(X_full))
test_xgb = np.zeros(len(test_full))

scale_pos_weight = (y_full == 0).sum() / (y_full == 1).sum()

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_full, y_full)):
    X_tr, X_vl = X_full.iloc[train_idx], X_full.iloc[val_idx]
    y_tr, y_vl = y_full.iloc[train_idx], y_full.iloc[val_idx]
    
    model = xgb.XGBClassifier(
        n_estimators=800,
        max_depth=4,
        learning_rate=0.025,
        min_child_weight=6,
        subsample=0.85,
        colsample_bytree=0.85,
        gamma=1.5,
        reg_alpha=0.5,
        reg_lambda=0.5,
        scale_pos_weight=scale_pos_weight,
        random_state=RANDOM_SEED + fold
    )
    
    model.fit(X_tr, y_tr, eval_set=[(X_vl, y_vl)])
    
    oof_xgb[val_idx] = model.predict_proba(X_vl)[:, 1]
    test_xgb += model.predict_proba(test_full)[:, 1] / 10

auc_xgb = roc_auc_score(y_full, oof_xgb)
print(f"   OOF AUROC: {auc_xgb:.4f}")

# Model 4: LightGBM (gbdt with different params)
print("\n4️⃣  LightGBM (GBDT variant)...")
oof_lgb2 = np.zeros(len(X_full))
test_lgb2 = np.zeros(len(test_full))

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_full, y_full)):
    X_tr, X_vl = X_full.iloc[train_idx], X_full.iloc[val_idx]
    y_tr, y_vl = y_full.iloc[train_idx], y_full.iloc[val_idx]
    
    model = lgb.LGBMClassifier(
        boosting_type='gbdt',
        n_estimators=800,
        num_leaves=64,
        learning_rate=0.02,
        min_child_samples=50,
        subsample=0.75,
        colsample_bytree=0.75,
        reg_alpha=1.0,
        reg_lambda=1.0,
        class_weight='balanced',
        random_state=RANDOM_SEED + fold + 100,
        verbose=-1
    )
    
    model.fit(X_tr, y_tr, eval_set=[(X_vl, y_vl)],
              callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)])
    
    oof_lgb2[val_idx] = model.predict_proba(X_vl)[:, 1]
    test_lgb2 += model.predict_proba(test_full)[:, 1] / 10

auc_lgb2 = roc_auc_score(y_full, oof_lgb2)
print(f"   OOF AUROC: {auc_lgb2:.4f}")


STEP 5: TRAINING OPTIMIZED MODELS (10-FOLD CV)

1️⃣  CatBoost (optimized)...
   OOF AUROC: 0.6384

2️⃣  LightGBM (DART boosting)...
   OOF AUROC: 0.6368

3️⃣  XGBoost (heavily regularized)...
[0]	validation_0-logloss:0.69222
[1]	validation_0-logloss:0.69128
[2]	validation_0-logloss:0.69052
[3]	validation_0-logloss:0.68964
[4]	validation_0-logloss:0.68888
[5]	validation_0-logloss:0.68816
[6]	validation_0-logloss:0.68744
[7]	validation_0-logloss:0.68672
[8]	validation_0-logloss:0.68597
[9]	validation_0-logloss:0.68529
[10]	validation_0-logloss:0.68467
[11]	validation_0-logloss:0.68397
[12]	validation_0-logloss:0.68341
[13]	validation_0-logloss:0.68272
[14]	validation_0-logloss:0.68226
[15]	validation_0-logloss:0.68168
[16]	validation_0-logloss:0.68112
[17]	validation_0-logloss:0.68070
[18]	validation_0-logloss:0.68019
[19]	validation_0-logloss:0.67935
[20]	validation_0-logloss:0.67882
[21]	validation_0-logloss:0.67836
[22]	validation_0-logloss:0.67801
[23]	validation_0-logloss:0.67752
[

In [9]:
# ========================================================================
# STEP 6: ANALYZE DIVERSITY
# ========================================================================
print("\n" + "=" * 80)
print("STEP 6: MODEL DIVERSITY ANALYSIS")
print("=" * 80)

predictions = {
    'CatBoost': oof_cat,
    'LightGBM_DART': oof_lgb,
    'XGBoost': oof_xgb,
    'LightGBM_GBDT': oof_lgb2
}

# Correlation matrix
corr_matrix = pd.DataFrame(predictions).corr()
print("\nPrediction Correlations:")
print(corr_matrix.round(3))

avg_corr = corr_matrix.values[np.triu_indices_from(corr_matrix.values, k=1)].mean()
print(f"\nAverage correlation: {avg_corr:.3f}")

if avg_corr > 0.90:
    print("⚠️  WARNING: Models are too similar!")
else:
    print("✅ Good diversity")


STEP 6: MODEL DIVERSITY ANALYSIS

Prediction Correlations:
               CatBoost  LightGBM_DART  XGBoost  LightGBM_GBDT
CatBoost          1.000          0.950    0.866          0.886
LightGBM_DART     0.950          1.000    0.859          0.906
XGBoost           0.866          0.859    1.000          0.844
LightGBM_GBDT     0.886          0.906    0.844          1.000

Average correlation: 0.885
✅ Good diversity


In [10]:
# ========================================================================
# STEP 7: SMART ENSEMBLING
# ========================================================================
print("\n" + "=" * 80)
print("STEP 7: SMART ENSEMBLING")
print("=" * 80)

# Method 1: Power mean (emphasizes agreement)
def power_mean(predictions, power=2):
    return np.mean([p**power for p in predictions], axis=0)**(1/power)

oof_power = power_mean([oof_cat, oof_lgb, oof_xgb, oof_lgb2], power=2)
test_power = power_mean([test_cat, test_lgb, test_xgb, test_lgb2], power=2)
auc_power = roc_auc_score(y_full, oof_power)

# Method 2: Rank average
from scipy.stats import rankdata
oof_rank = np.mean([rankdata(p) for p in [oof_cat, oof_lgb, oof_xgb, oof_lgb2]], axis=0)
test_rank = np.mean([rankdata(p) for p in [test_cat, test_lgb, test_xgb, test_lgb2]], axis=0)
auc_rank = roc_auc_score(y_full, oof_rank)

# Method 3: Weighted by inverse correlation
weights = 1 / corr_matrix.sum(axis=1)
weights = weights / weights.sum()

oof_weighted = sum(w * p for w, p in zip(weights.values, [oof_cat, oof_lgb, oof_xgb, oof_lgb2]))
test_weighted = sum(w * p for w, p in zip(weights.values, [test_cat, test_lgb, test_xgb, test_lgb2]))
auc_weighted = roc_auc_score(y_full, oof_weighted)

# Method 4: Hill climbing optimization
from scipy.optimize import differential_evolution

def neg_auc_objective(weights):
    weights = np.abs(weights)
    weights = weights / weights.sum()
    pred = sum(w * p for w, p in zip(weights, [oof_cat, oof_lgb, oof_xgb, oof_lgb2]))
    return -roc_auc_score(y_full, pred)

result = differential_evolution(
    neg_auc_objective,
    bounds=[(0, 1)] * 4,
    seed=RANDOM_SEED,
    maxiter=500
)

opt_weights = np.abs(result.x)
opt_weights = opt_weights / opt_weights.sum()

oof_optimized = sum(w * p for w, p in zip(opt_weights, [oof_cat, oof_lgb, oof_xgb, oof_lgb2]))
test_optimized = sum(w * p for w, p in zip(opt_weights, [test_cat, test_lgb, test_xgb, test_lgb2]))
auc_optimized = roc_auc_score(y_full, oof_optimized)

print("\n📊 Ensemble Results:")
print(f"   CatBoost alone:        {auc_cat:.4f}")
print(f"   LightGBM DART alone:   {auc_lgb:.4f}")
print(f"   XGBoost alone:         {auc_xgb:.4f}")
print(f"   LightGBM GBDT alone:   {auc_lgb2:.4f}")
print(f"   Power Mean:            {auc_power:.4f}")
print(f"   Rank Average:          {auc_rank:.4f}")
print(f"   Inverse Corr Weighted: {auc_weighted:.4f}")
print(f"   Optimized Weights:     {auc_optimized:.4f}")

print(f"\nOptimized Weights:")
for name, w in zip(['CatBoost', 'LightGBM_DART', 'XGBoost', 'LightGBM_GBDT'], opt_weights):
    print(f"   {name:20s}: {w:.3f}")


STEP 7: SMART ENSEMBLING

📊 Ensemble Results:
   CatBoost alone:        0.6384
   LightGBM DART alone:   0.6368
   XGBoost alone:         0.6291
   LightGBM GBDT alone:   0.6324
   Power Mean:            0.6395
   Rank Average:          0.6392
   Inverse Corr Weighted: 0.6391
   Optimized Weights:     0.6401

Optimized Weights:
   CatBoost            : 0.543
   LightGBM_DART       : 0.203
   XGBoost             : 0.050
   LightGBM_GBDT       : 0.203


In [11]:
# ========================================================================
# STEP 8: SELECT BEST & CREATE SUBMISSIONS
# ========================================================================
print("\n" + "=" * 80)
print("STEP 8: FINAL SELECTION")
print("=" * 80)

results = {
    'CatBoost': (auc_cat, test_cat),
    'Power Mean': (auc_power, test_power),
    'Rank Average': (auc_rank, test_rank),
    'Inverse Corr Weighted': (auc_weighted, test_weighted),
    'Optimized Weights': (auc_optimized, test_optimized)
}

best_method = max(results, key=lambda x: results[x][0])
best_auc, best_preds = results[best_method]

print(f"\n🏆 Best Method: {best_method}")
print(f"📊 OOF AUROC: {best_auc:.4f}")
print(f"📈 Improvement from 0.6385: {best_auc - 0.6385:+.4f}")

if best_auc >= 0.65:
    print("\n🎉🎉🎉 TARGET REACHED! 🎉🎉🎉")
else:
    print(f"\n📊 Gap to 0.65: {0.65 - best_auc:.4f}")

# Create submissions
test_ids = test[ID_COL] if ID_COL in test.columns else range(len(test))

for method_name, (oof_score, test_preds) in sorted(results.items(), key=lambda x: x[1][0], reverse=True):
    submission = pd.DataFrame({
        'id': test_ids,
        'target': test_preds
    })
    
    filename = f'submission_breakthrough_{method_name.lower().replace(" ", "_")}_{oof_score:.4f}.csv'
    submission.to_csv(filename, index=False)
    print(f"✅ {filename}")

print("\n" + "=" * 80)
print("💡 KEY INSIGHTS")
print("=" * 80)

print(f"\nWhat changed:")
print(f"   • 10-fold CV (was 5) → more reliable OOF")
print(f"   • DART boosting → different learning")
print(f"   • Better features → domain knowledge")
print(f"   • Target encoding → captures patterns")
print(f"   • Optimized ensembling → best combination")

print(f"\n🎯 If still below 0.65:")
print(f"   1. Check Kaggle score vs OOF (should be ±0.003)")
print(f"   2. Try 'Rank Average' (most robust)")
print(f"   3. Consider external data or manual features")
print(f"   4. The 0.65 barrier might require domain expertise")

print("\n" + "=" * 80)
print("✨ PIPELINE COMPLETE ✨")
print("=" * 80)


STEP 8: FINAL SELECTION

🏆 Best Method: Optimized Weights
📊 OOF AUROC: 0.6401
📈 Improvement from 0.6385: +0.0016

📊 Gap to 0.65: 0.0099
✅ submission_breakthrough_optimized_weights_0.6401.csv
✅ submission_breakthrough_power_mean_0.6395.csv
✅ submission_breakthrough_rank_average_0.6392.csv
✅ submission_breakthrough_inverse_corr_weighted_0.6391.csv
✅ submission_breakthrough_catboost_0.6384.csv

💡 KEY INSIGHTS

What changed:
   • 10-fold CV (was 5) → more reliable OOF
   • DART boosting → different learning
   • Better features → domain knowledge
   • Target encoding → captures patterns
   • Optimized ensembling → best combination

🎯 If still below 0.65:
   1. Check Kaggle score vs OOF (should be ±0.003)
   2. Try 'Rank Average' (most robust)
   3. Consider external data or manual features
   4. The 0.65 barrier might require domain expertise

✨ PIPELINE COMPLETE ✨
