# Ultimate Ensemble Solution

**Combines the best of all approaches:**
1. Conservative feature engineering (target encoding, frequency encoding, binning)
2. Ultimate advanced features (residual boosting, interactions, risk flags)
3. Multi-model ensemble (XGB, LGB, CatBoost with diverse configs)
4. Pseudo-labeling for data augmentation
5. Advanced blending (power averaging, rank averaging, optimized weights)



In [None]:
import pandas as pd
import numpy as np
import warnings
import os
import gc
from pathlib import Path
from typing import Dict, List, Tuple
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier
import xgboost as xgb
from catboost import CatBoostClassifier

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Reproducibility: a single seed shared by NumPy and every splitter/model below.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# setting it here may only affect child processes - confirm if hash
# determinism in this process matters.
RANDOM_STATE = 42
os.environ['PYTHONHASHSEED'] = str(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# Competition-level settings.
target = 'loan_paid_back'
N_FOLDS = 10

# Kaggle input/output locations.
BASE_PATH = Path('/kaggle/input/playground-series-s5e11')
ORIG_PATH = Path('/kaggle/input/loan-dataset-20000/loan_dataset_20000.csv')
OUTPUT_PATH = Path('/kaggle/working/submission_ultimate_ensemble.csv')
PRED_DUMP = Path('/kaggle/working/all_predictions.csv')

# Local-run alternative (uncomment to override the Kaggle paths above):
# BASE_PATH = Path('Loan')
# ORIG_PATH = Path('loan_dataset_20000.csv')
# OUTPUT_PATH = Path('Loan/submission_ultimate_ensemble.csv')

# Three-line banner, emitted with a single call.
print("=" * 70, "ULTIMATE ENSEMBLE SOLUTION (Target: 0.928+)", "=" * 70, sep="\n")


In [None]:
# Load competition data
print("Loading data...")
train = pd.read_csv(BASE_PATH / 'train.csv')
test = pd.read_csv(BASE_PATH / 'test.csv')

# Candidate locations for the optional original dataset. An entry may be the
# CSV itself or a dataset directory that contains it. ORIG_PATH comes first so
# that overriding it (e.g. for local runs) actually takes effect.
original = None
has_original = False
possible_paths = [
    ORIG_PATH,
    Path('/kaggle/input/loan-dataset-20000-csv/loan_dataset_20000.csv'),
    Path('/kaggle/input/loan-dataset-20000-csv'),
    Path('/kaggle/input/loan-dataset-20000'),
]

# Kept so this cell stays runnable on its own.
import os
for path in possible_paths:
    if path.is_file():
        csv_path = path
    elif path.is_dir():
        # Sort so the chosen file is deterministic when several CSVs exist.
        csv_files = sorted(path.glob('*.csv'))
        if not csv_files:
            continue
        csv_path = csv_files[0]
    else:
        continue
    original = pd.read_csv(csv_path)
    has_original = True
    print(f"✓ Original dataset loaded from: {csv_path}")
    print(f"  Shape: {original.shape}")
    break

if not has_original:
    print("⚠ Original dataset not found, skipping residual boosting")
    print("  Tried paths:")
    for path in possible_paths:
        print(f"    - {path} (exists: {path.exists()})")
    print("\n  To find the correct path, run:")
    print("    import os")
    print("    print([d for d in os.listdir('/kaggle/input')])")

submission = pd.read_csv(BASE_PATH / 'sample_submission.csv')
print(f"\nTrain: {train.shape}, Test: {test.shape}")


In [None]:
def create_grade_mapping() -> Dict[str, int]:
    """Map each grade/subgrade label ('A1' ... 'G5') to its ordinal position.

    Labels are ordered letter-major (A..G) then by digit (1..5), so 'A1' maps
    to 0 and 'G5' maps to 34.
    """
    ordered_labels = (
        f"{letter}{digit}"
        for letter in "ABCDEFG"
        for digit in range(1, 6)
    )
    return {label: rank for rank, label in enumerate(ordered_labels)}


def target_encoding_cv(train_df: pd.DataFrame, test_df: pd.DataFrame,
                       cols: List[str], target: str, n_splits: int = 10) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Add out-of-fold target-mean encodings (``mean_<col>``) for ``cols``.

    Train rows are encoded out-of-fold: each row receives the target mean of
    its category computed on the other folds. Test rows receive the category
    mean computed on the full training frame. Categories not seen in the
    fitting data fall back to the global training target mean.

    Args:
        train_df: Training frame containing ``cols`` and ``target``.
        test_df: Test frame containing ``cols``.
        cols: Columns to encode.
        target: Name of the target column in ``train_df``.
        n_splits: Number of KFold splits used for the out-of-fold encoding.

    Returns:
        Copies of ``(train_df, test_df)`` with one ``mean_<col>`` column
        appended per encoded column. The inputs are not modified.
    """
    train = train_df.copy()
    test = test_df.copy()
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    # Materialise the splits once; they are identical for every column.
    folds = list(kf.split(train))
    target_global = train[target].mean()
    mean_features_train = {}
    mean_features_test = {}

    for col in cols:
        oof = np.zeros(len(train))
        for tr_idx, val_idx in folds:
            fold_map = train.iloc[tr_idx].groupby(col)[target].mean()
            oof[val_idx] = (
                train[col].iloc[val_idx].map(fold_map).fillna(target_global).to_numpy()
            )
        mean_features_train[f"mean_{col}"] = oof
        global_map = train.groupby(col)[target].mean()
        mean_features_test[f"mean_{col}"] = (
            test[col].map(global_map).fillna(target_global).to_numpy()
        )

    # Build the new columns on the caller's index. Without this the encoded
    # frame gets a fresh RangeIndex and concat(axis=1) would misalign rows
    # whenever the input index is not 0..n-1.
    train = pd.concat([train, pd.DataFrame(mean_features_train, index=train.index)], axis=1)
    test = pd.concat([test, pd.DataFrame(mean_features_test, index=test.index)], axis=1)
    return train, test


def create_frequency_binning(df: pd.DataFrame, df_test: pd.DataFrame,
                             cols: List[str], num_cols: List[str]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Add frequency-encoding and quantile-bin features fitted on ``df``.

    For every column in ``cols`` a ``<col>_freq`` feature holds the number of
    times the row's value occurs in ``df``; test values unseen in ``df`` get
    the mean count. For columns that are also in ``num_cols``, quantile bins
    with 5, 10 and 15 buckets (``<col>_bin<q>``) are fitted on ``df`` and the
    same edges are applied to ``df_test``.

    Args:
        df: Training frame used to fit counts and bin edges.
        df_test: Test frame that receives the fitted encodings.
        cols: Columns to frequency-encode.
        num_cols: Subset of ``cols`` to additionally quantile-bin.

    Returns:
        Copies of ``(df, df_test)`` with the new columns appended. The inputs
        are not modified.
    """
    df = df.copy()
    df_test = df_test.copy()
    freq_features_train = {}
    freq_features_test = {}
    bin_features_train = {}
    bin_features_test = {}

    for col in cols:
        freq = df[col].value_counts()
        freq_features_train[f"{col}_freq"] = df[col].map(freq)
        default_value = freq.mean() if len(freq) > 0 else 0
        freq_features_test[f"{col}_freq"] = df_test[col].map(freq).fillna(default_value)

        if col in num_cols:
            for q in (5, 10, 15):
                try:
                    t_bins, edges = pd.qcut(df[col], q=q, labels=False, retbins=True, duplicates='drop')
                    test_bins = pd.cut(df_test[col], bins=edges, labels=False, include_lowest=True)
                except Exception as exc:
                    # Best-effort: keep the column set stable with a constant
                    # feature, but report the failure. A bare `except:` would
                    # also swallow KeyboardInterrupt/SystemExit.
                    print(f"  Warning: could not bin '{col}' into {q} quantiles ({exc}); using constant 0")
                    t_bins = pd.Series(0, index=df.index)
                    test_bins = pd.Series(0, index=df_test.index)
                bin_features_train[f"{col}_bin{q}"] = t_bins
                bin_features_test[f"{col}_bin{q}"] = test_bins

    df = pd.concat([df, pd.DataFrame(freq_features_train), pd.DataFrame(bin_features_train)], axis=1)
    df_test = pd.concat([df_test, pd.DataFrame(freq_features_test), pd.DataFrame(bin_features_test)], axis=1)
    return df, df_test

print("Feature engineering functions defined!")


In [None]:
# Categorical columns that are label-encoded for the original-dataset model.
_ORIG_CAT_COLS = ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose']


def _build_original_features(df: pd.DataFrame,
                             cat_maps: Dict[str, Dict[str, int]] = None
                             ) -> Tuple[pd.DataFrame, Dict[str, Dict[str, int]]]:
    """Build the feature columns used by the original-dataset model.

    Args:
        df: Frame with the raw loan columns.
        cat_maps: Optional ``{column: {label: code}}`` mappings fitted at
            training time. When omitted (or missing a column), codes are
            derived from the sorted unique labels of ``df`` itself.

    Returns:
        ``(features_frame, maps_used)``. Labels absent from a supplied mapping
        are encoded as -1.
    """
    data = df.copy()
    data['grade_enc'] = data['grade_subgrade'].map(create_grade_mapping())
    maps_used = {}
    for col in _ORIG_CAT_COLS:
        labels = data[col].astype(str)
        if cat_maps is not None and col in cat_maps:
            mapping = cat_maps[col]
        else:
            # Codes follow sorted label order.
            mapping = {label: code for code, label in enumerate(sorted(labels.unique()))}
        maps_used[col] = mapping
        data[f'{col}_le'] = labels.map(mapping).fillna(-1).astype(int)
    eps = 1e-6
    data['loan_to_income'] = data['loan_amount'] / (data['annual_income'] + eps)
    data['monthly_income'] = data['annual_income'] / 12
    data['credit_score_to_loan'] = data['credit_score'] / (data['loan_amount'] + eps)
    return data, maps_used


def train_original_model(original: pd.DataFrame) -> Tuple[LGBMClassifier, List[str]]:
    """Train a LightGBM model on the original dataset for residual boosting.

    The label->code mappings fitted here are stored on the returned model as
    ``_orig_cat_maps`` so that ``predict_with_original_model`` encodes new
    data with the same codes instead of refitting encoders on it.

    Args:
        original: Original loan dataset including the ``loan_paid_back`` column.

    Returns:
        ``(fitted_model, feature_cols)``.
    """
    print("\n=== Training Original Dataset Model (Residual Booster) ===")
    df, cat_maps = _build_original_features(original)
    feature_cols = ['annual_income', 'debt_to_income_ratio', 'credit_score',
                    'loan_amount', 'interest_rate', 'grade_enc',
                    'loan_to_income', 'monthly_income', 'credit_score_to_loan'] + \
                   [f'{col}_le' for col in _ORIG_CAT_COLS]
    X_orig = df[feature_cols]
    y_orig = df['loan_paid_back'].values
    # subsample_freq=1 is required for `subsample` to take effect: LightGBM
    # only bags rows when the bagging frequency is > 0.
    model = LGBMClassifier(n_estimators=1000, learning_rate=0.05, num_leaves=31,
                          subsample=0.8, subsample_freq=1, colsample_bytree=0.8,
                          objective='binary', metric='auc', n_jobs=-1,
                          random_state=RANDOM_STATE, verbose=-1)
    model.fit(X_orig, y_orig)
    model._orig_cat_maps = cat_maps
    # This AUC is computed on the training rows themselves (in-sample), so it
    # is an optimistic sanity check rather than a generalisation estimate.
    score = roc_auc_score(y_orig, model.predict_proba(X_orig)[:, 1])
    print(f"  Original model AUC: {score:.5f}")
    return model, feature_cols


def predict_with_original_model(model, feature_cols: List[str], df: pd.DataFrame) -> np.ndarray:
    """Score ``df`` with the original-dataset model.

    Uses the category mappings stored on ``model`` at training time when
    present; otherwise falls back to deriving codes from ``df``.

    Args:
        model: Fitted classifier returned by ``train_original_model``.
        feature_cols: Feature column names returned by ``train_original_model``.
        df: Frame with the raw loan columns to score.

    Returns:
        Predicted positive-class probabilities, one per row of ``df``.
    """
    data, _ = _build_original_features(df, getattr(model, '_orig_cat_maps', None))
    return model.predict_proba(data[feature_cols])[:, 1]


def comprehensive_feature_engineering(train: pd.DataFrame, test: pd.DataFrame,
                                      target_col: str = 'loan_paid_back',
                                      original_model=None, orig_features=None) -> Tuple[pd.DataFrame, np.ndarray, pd.DataFrame]:
    """Build the final model matrices from the raw train/test frames.

    Pipeline, in order:
      1. Optionally add ``orig_pred``, the original-dataset model's prediction.
      2. Stack train and test and derive grade, ratio, log and label-encoded
         features on the combined frame.
      3. Split back and add out-of-fold target encodings plus frequency and
         quantile-bin features for every column except the target,
         ``is_train`` and ``id``.
      4. Re-stack, add interaction, equal-width bin and risk-flag features.
      5. Drop identifier, raw categorical and hand-picked columns.

    Args:
        train: Raw training frame; must contain ``target_col``.
        test: Raw test frame.
        target_col: Name of the target column.
        original_model: Optional fitted model passed to
            ``predict_with_original_model``; skipped when ``None``.
        orig_features: Feature names expected by ``original_model``.

    Returns:
        ``(X, y, X_test)``: training features, training target values, and
        test features.
    """
    print("\n=== Comprehensive Feature Engineering ===")
    # Capture the target before the frames are copied and stacked.
    y = train[target_col].values
    train = train.copy()
    test = test.copy()

    # Add residual features if original model available
    if original_model is not None:
        print("Adding residual boosting features...")
        orig_preds_train = predict_with_original_model(original_model, orig_features, train)
        orig_preds_test = predict_with_original_model(original_model, orig_features, test)
        train['orig_pred'] = orig_preds_train
        test['orig_pred'] = orig_preds_test
        print(f"  Added orig_pred feature (range: [{orig_preds_train.min():.4f}, {orig_preds_train.max():.4f}])")

    # Combine for consistent processing; `is_train` marks the rows so the
    # frame can be split again later.
    train['is_train'] = 1
    test['is_train'] = 0
    test[target_col] = np.nan
    full_df = pd.concat([train, test], axis=0, ignore_index=True)

    # Extract grade/subgrade (Conservative style): `subgrade` is everything
    # after the first character, `grade` is the first character.
    full_df["subgrade"] = full_df["grade_subgrade"].str[1:].astype(int)
    full_df["grade"] = full_df["grade_subgrade"].str[0]

    # Grade encoding (Ultimate style)
    grade_map = create_grade_mapping()
    full_df['grade_enc'] = full_df['grade_subgrade'].map(grade_map)
    full_df['grade_letter_enc'] = full_df['grade'].map({l: i for i, l in enumerate(['A', 'B', 'C', 'D', 'E', 'F', 'G'])})
    # Only the second character is used here, so this equals `subgrade`
    # whenever the subgrade is a single digit.
    full_df['grade_number'] = full_df['grade_subgrade'].str[1].astype(int)

    # Financial ratios (Ultimate style); eps guards the divisions against a
    # zero denominator.
    eps = 1e-6
    full_df['loan_to_income'] = full_df['loan_amount'] / (full_df['annual_income'] + eps)
    full_df['monthly_income'] = full_df['annual_income'] / 12
    full_df['monthly_debt'] = full_df['monthly_income'] * full_df['debt_to_income_ratio']
    full_df['disposable_income'] = full_df['monthly_income'] - full_df['monthly_debt']
    # NOTE(review): dividing by 100 assumes interest_rate is in percent - confirm.
    full_df['interest_amount'] = full_df['loan_amount'] * (full_df['interest_rate'] / 100)
    full_df['total_payment'] = full_df['loan_amount'] + full_df['interest_amount']
    full_df['payment_to_income'] = full_df['total_payment'] / (full_df['annual_income'] + eps)
    full_df['credit_score_to_loan'] = full_df['credit_score'] / (full_df['loan_amount'] + eps)
    full_df['credit_score_to_income'] = full_df['credit_score'] / (full_df['annual_income'] + eps)
    full_df['interest_burden'] = full_df['interest_rate'] * full_df['loan_to_income']

    # Log transforms
    for col in ['annual_income', 'loan_amount', 'monthly_income', 'total_payment']:
        full_df[f'log_{col}'] = np.log1p(full_df[col])

    # Label encoding, fitted on train and test together so both share codes.
    cat_cols = ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose']
    for col in cat_cols:
        le = LabelEncoder()
        full_df[f'{col}_le'] = le.fit_transform(full_df[col].astype(str))

    # Split back for target encoding; restore the real target on the train part.
    train_proc = full_df[full_df['is_train'] == 1].copy()
    test_proc = full_df[full_df['is_train'] == 0].copy()
    train_proc[target_col] = y

    # Conservative-style target encoding, applied to every column built so far
    # except the target, `is_train` and `id` (this includes the continuous
    # numeric columns).
    print("Applying target encoding...")
    cols_for_te = train_proc.drop(columns=[target_col, 'is_train', 'id'], errors='ignore').columns.tolist()
    train_proc, test_proc = target_encoding_cv(train_proc, test_proc, cols_for_te, target_col, n_splits=10)

    # Conservative-style frequency + binning; only columns whose dtype is not
    # object/category/bool are quantile-binned.
    print("Creating frequency and binning features...")
    num_cols = [c for c in cols_for_te if train_proc[c].dtype not in ('object', 'category', 'bool')]
    train_proc, test_proc = create_frequency_binning(train_proc, test_proc, cols_for_te, num_cols)

    # Ultimate-style interactions, computed on the re-stacked frame.
    full_df_merged = pd.concat([train_proc, test_proc], axis=0, ignore_index=True)
    full_df_merged['credit_dti_interaction'] = full_df_merged['credit_score'] * full_df_merged['debt_to_income_ratio']
    full_df_merged['income_credit_interaction'] = full_df_merged['log_annual_income'] * full_df_merged['credit_score']
    full_df_merged['loan_interest_interaction'] = full_df_merged['loan_amount'] * full_df_merged['interest_rate']
    full_df_merged['grade_loan_interaction'] = full_df_merged['grade_enc'] * full_df_merged['loan_amount']

    # Binned features: 10 bins whose edges come from the combined
    # train+test value range.
    full_df_merged['credit_score_bin'] = pd.cut(full_df_merged['credit_score'], bins=10, labels=False)
    full_df_merged['income_bin'] = pd.cut(full_df_merged['annual_income'], bins=10, labels=False)
    full_df_merged['interest_rate_bin'] = pd.cut(full_df_merged['interest_rate'], bins=10, labels=False)

    # Risk indicators
    # NOTE(review): thresholds assume debt_to_income_ratio is a fraction and
    # interest_rate is in percent - confirm against the data.
    full_df_merged['high_dti'] = (full_df_merged['debt_to_income_ratio'] > 0.4).astype(int)
    full_df_merged['low_credit'] = (full_df_merged['credit_score'] < 600).astype(int)
    full_df_merged['high_interest'] = (full_df_merged['interest_rate'] > 15).astype(int)
    full_df_merged['risk_flags'] = full_df_merged['high_dti'] + full_df_merged['low_credit'] + full_df_merged['high_interest']

    # Split back
    train_final = full_df_merged[full_df_merged['is_train'] == 1].copy()
    test_final = full_df_merged[full_df_merged['is_train'] == 0].copy()

    # Drop unwanted columns: identifiers, the target, and the raw string
    # columns that already have encoded counterparts.
    drop_cols = ['id', 'is_train', target_col, 'grade'] + cat_cols + ['grade_subgrade']
    drop_cols = [c for c in drop_cols if c in train_final.columns]

    # Conservative feature removal: a hand-picked list of raw and derived
    # columns to exclude. Some names repeat entries already in drop_cols.
    remove = [
        "education_level", "loan_purpose", "grade_subgrade",
        "interest_rate", "marital_status", "gender",
        "employment_status_freq", "credit_score_bin5",
        "loan_amount_bin5", "credit_score_freq",
        "mean_subgrade", "subgrade_bin15", "subgrade_bin10",
        "debt_to_income_ratio_bin5"
    ]
    drop_cols.extend([c for c in remove if c in train_final.columns])

    X = train_final.drop(columns=drop_cols)
    X_test = test_final.drop(columns=[c for c in drop_cols if c in test_final.columns])

    # Convert any remaining object columns to categoricals. The conversion is
    # done separately for X and X_test, so the category sets are inferred
    # independently for each frame.
    cat_cols_final = [c for c in X.columns if X[c].dtype == 'object']
    for col in cat_cols_final:
        X[col] = X[col].astype('category')
        X_test[col] = X_test[col].astype('category')

    print(f"Final feature count: {X.shape[1]}")
    return X, y, X_test


In [None]:
# Train the original-dataset model when that dataset was found; its
# predictions become the `orig_pred` residual-boosting feature.
original_model = None
orig_features = None
if has_original:
    original_model, orig_features = train_original_model(original)
    print("✓ Original model trained - residual boosting enabled!")

# Apply comprehensive feature engineering
X, y, X_test = comprehensive_feature_engineering(
    train,
    test,
    original_model=original_model,
    orig_features=orig_features,
)
print("\n✓ Feature engineering complete!")
print(f"Training features: {X.shape}")
print(f"Test features: {X_test.shape}")


In [None]:
def train_model_cv(X: pd.DataFrame, y: np.ndarray, X_test: pd.DataFrame,
                   model_type: str, params: dict, n_folds: int = 10,
                   early_stopping_rounds: int = 200) -> Tuple[np.ndarray, np.ndarray, float]:
    """Train one model family with stratified K-fold cross-validation.

    Each fold's model is early-stopped on that fold's validation split, fills
    the out-of-fold predictions for the split, and contributes ``1/n_folds``
    of the averaged test prediction.

    Args:
        X: Training features.
        y: Training labels as a NumPy array (indexed positionally).
        X_test: Test features.
        model_type: One of ``'lgb'``, ``'xgb'`` or ``'cat'``.
        params: Constructor keyword arguments for the chosen model.
        n_folds: Number of stratified folds.
        early_stopping_rounds: Patience used for early stopping in all three
            model families.

    Returns:
        ``(oof_preds, test_preds, overall_auc)``.

    Raises:
        ValueError: If ``model_type`` is not supported.
    """
    supported = ('lgb', 'xgb', 'cat')
    if model_type not in supported:
        # Without this guard an unknown type would either hit a NameError or
        # silently reuse the model left over from a previous fold.
        raise ValueError(f"Unknown model_type {model_type!r}; expected one of {supported}")

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
        X_train, y_train = X.iloc[train_idx], y[train_idx]
        X_val, y_val = X.iloc[val_idx], y[val_idx]

        if model_type == 'lgb':
            model = LGBMClassifier(**params)
            fit_kwargs = {'eval_set': [(X_val, y_val)], 'eval_metric': 'auc'}
            # Only the capability check is guarded. A genuine fit error now
            # surfaces instead of triggering a silent refit without early
            # stopping.
            make_stopper = getattr(lgb, 'early_stopping', None)
            if make_stopper is not None:
                try:
                    stopper = make_stopper(early_stopping_rounds, verbose=False)
                except TypeError:
                    # Callback signature without a `verbose` argument.
                    stopper = make_stopper(early_stopping_rounds)
                fit_kwargs['callbacks'] = [stopper]
            else:
                print("  Warning: lightgbm.early_stopping unavailable; training without early stopping")
            model.fit(X_train, y_train, **fit_kwargs)
        elif model_type == 'xgb':
            # Early stopping is passed to the constructor so XGBoost uses the
            # eval_set the same way the other two families do. A value
            # supplied in `params` takes precedence.
            model = xgb.XGBClassifier(**{'early_stopping_rounds': early_stopping_rounds, **params})
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
        else:  # 'cat'
            model = CatBoostClassifier(**params)
            model.fit(X_train, y_train, eval_set=(X_val, y_val),
                      early_stopping_rounds=early_stopping_rounds, verbose=False)

        oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
        test_preds += model.predict_proba(X_test)[:, 1] / n_folds

        score = roc_auc_score(y_val, oof_preds[val_idx])
        # Report every second fold plus the last one to keep the log short.
        if fold % 2 == 0 or fold == n_folds:
            print(f"  Fold {fold}/{n_folds}: AUC = {score:.5f}")

    overall_score = roc_auc_score(y, oof_preds)
    return oof_preds, test_preds, overall_score

print("Model training function defined!")


In [None]:
# Train multiple diverse models
print("\n" + "="*70)
print("TRAINING MULTIPLE MODELS")
print("="*70)

# Parallel lists: the i-th entries describe the same model.
all_oof = []
all_preds = []
model_names = []

# Model 1: Conservative LightGBM
print("\n[1/5] Training Conservative LightGBM...")
lgb_conservative_params = {
    'n_estimators': 5000, 'learning_rate': 0.03,
    # NOTE(review): with max_depth=6 a tree presumably cannot reach 80 leaves,
    # so max_depth is likely the binding limit - confirm.
    'num_leaves': 80, 'max_depth': 6,
    # `min_child_samples` and `min_data_in_leaf` are aliases of one LightGBM
    # parameter. The original passed both (20 and 40); LightGBM keeps the main
    # name `min_data_in_leaf`, so 40 is the value that was in effect.
    'min_child_samples': 40,
    # LightGBM ignores `subsample` unless the bagging frequency is > 0.
    'subsample': 0.8, 'subsample_freq': 1,
    'colsample_bytree': 0.8, 'reg_alpha': 0.2, 'reg_lambda': 0.4,
    'objective': 'binary', 'metric': 'auc',
    'n_jobs': -1, 'device': 'cpu', 'random_state': RANDOM_STATE, 'verbose': -1
}
oof_lgb_cons, pred_lgb_cons, score_lgb_cons = train_model_cv(X, y, X_test, 'lgb', lgb_conservative_params, N_FOLDS)
all_oof.append(oof_lgb_cons)
all_preds.append(pred_lgb_cons)
model_names.append('LGB_Conservative')
print(f"  OOF Score: {score_lgb_cons:.5f}")
gc.collect()


In [None]:
# Model 2: Aggressive LightGBM (more leaves, deeper trees, slower learning rate)
print("\n[2/5] Training Aggressive LightGBM...")
lgb_aggressive_params = {
    'n_estimators': 5000, 'learning_rate': 0.01, 'num_leaves': 127,
    'max_depth': 8, 'min_child_samples': 10,
    # LightGBM ignores `subsample` unless the bagging frequency is > 0.
    'subsample': 0.75, 'subsample_freq': 1,
    'colsample_bytree': 0.6, 'reg_alpha': 0.1, 'reg_lambda': 1.0,
    'objective': 'binary', 'metric': 'auc', 'n_jobs': -1,
    # A different seed from model 1 adds diversity to the ensemble.
    'device': 'cpu', 'random_state': RANDOM_STATE + 1, 'verbose': -1
}
oof_lgb_agg, pred_lgb_agg, score_lgb_agg = train_model_cv(X, y, X_test, 'lgb', lgb_aggressive_params, N_FOLDS)
all_oof.append(oof_lgb_agg)
all_preds.append(pred_lgb_agg)
model_names.append('LGB_Aggressive')
print(f"  OOF Score: {score_lgb_agg:.5f}")
gc.collect()


In [None]:
# Model 3: XGBoost (histogram tree method, AUC evaluation metric)
print("\n[3/5] Training XGBoost...")
xgb_params = dict(
    n_estimators=5000,
    learning_rate=0.01,
    max_depth=7,
    min_child_weight=3,
    subsample=0.75,
    colsample_bytree=0.6,
    reg_alpha=0.1,
    reg_lambda=1.0,
    objective='binary:logistic',
    n_jobs=-1,
    tree_method='hist',
    random_state=RANDOM_STATE,
    eval_metric='auc',
)
oof_xgb, pred_xgb, score_xgb = train_model_cv(
    X, y, X_test, 'xgb', xgb_params, N_FOLDS
)
# Register the results in the shared parallel lists.
all_oof += [oof_xgb]
all_preds += [pred_xgb]
model_names += ['XGB']
print(f"  OOF Score: {score_xgb:.5f}")
gc.collect()


In [None]:
# Model 4: CatBoost (Logloss objective, AUC used as the evaluation metric)
print("\n[4/5] Training CatBoost...")
cat_params = dict(
    iterations=5000,
    learning_rate=0.01,
    depth=7,
    l2_leaf_reg=3,
    min_data_in_leaf=20,
    rsm=0.6,
    subsample=0.75,
    loss_function='Logloss',
    eval_metric='AUC',
    verbose=False,
    random_seed=RANDOM_STATE,
    allow_writing_files=False,
)
oof_cat, pred_cat, score_cat = train_model_cv(
    X, y, X_test, 'cat', cat_params, N_FOLDS
)
# Register the results in the shared parallel lists.
all_oof += [oof_cat]
all_preds += [pred_cat]
model_names += ['CatBoost']
print(f"  OOF Score: {score_cat:.5f}")
gc.collect()


In [None]:
# Model 5: Balanced LightGBM (medium complexity)
print("\n[5/5] Training Balanced LightGBM...")
lgb_balanced_params = {
    'n_estimators': 5000, 'learning_rate': 0.015, 'num_leaves': 63,
    'max_depth': 7, 'min_child_samples': 15,
    # LightGBM ignores `subsample` unless the bagging frequency is > 0.
    'subsample': 0.8, 'subsample_freq': 1,
    'colsample_bytree': 0.7, 'reg_alpha': 0.15, 'reg_lambda': 0.5,
    'objective': 'binary', 'metric': 'auc', 'n_jobs': -1,
    # A different seed from the other LightGBM models adds ensemble diversity.
    'device': 'cpu', 'random_state': RANDOM_STATE + 2, 'verbose': -1
}
oof_lgb_bal, pred_lgb_bal, score_lgb_bal = train_model_cv(X, y, X_test, 'lgb', lgb_balanced_params, N_FOLDS)
all_oof.append(oof_lgb_bal)
all_preds.append(pred_lgb_bal)
model_names.append('LGB_Balanced')
print(f"  OOF Score: {score_lgb_bal:.5f}")

# Per-model OOF summary across everything trained so far.
print("\n" + "="*70)
print("MODEL TRAINING COMPLETE")
print("="*70)
for name, oof in zip(model_names, all_oof):
    print(f"{name}: {roc_auc_score(y, oof):.5f}")
gc.collect()


In [None]:
def find_optimal_weights(oof_list: List[np.ndarray], y: np.ndarray,
                         names: List[str] = None) -> np.ndarray:
    """Pick the best blend weights from a small set of candidate vectors.

    This is a coarse search rather than a true optimisation: for exactly five
    models a fixed list of hand-written weight vectors is tried; otherwise the
    candidates are equal weights plus one "dominant model" vector per model.
    Every candidate is normalised to sum to 1 and scored by ROC AUC of the
    blended out-of-fold predictions.

    Args:
        oof_list: Out-of-fold prediction arrays, one per model.
        y: True labels aligned with the prediction arrays.
        names: Optional model names used only for the printed summary. When
            omitted, the module-level ``model_names`` list is used if it
            exists and has the right length; otherwise generic names are used.

    Returns:
        The best-scoring normalised weight vector.

    Raises:
        ValueError: If ``oof_list`` is empty.
    """
    print("\n=== Finding Optimal Blend Weights ===")
    n_models = len(oof_list)
    if n_models == 0:
        raise ValueError("oof_list must contain at least one prediction array")

    # Resolve display names without hard-wiring a global into the logic.
    if names is None:
        names = globals().get('model_names')
    if names is None or len(names) != n_models:
        names = [f"model_{i}" for i in range(n_models)]

    # Simplified grid search (faster)
    if n_models == 5:
        # Test common weight combinations
        test_weights = [
            [0.2, 0.2, 0.2, 0.2, 0.2],  # Equal
            [0.3, 0.25, 0.2, 0.15, 0.1],  # Favor first
            [0.25, 0.25, 0.2, 0.15, 0.15],
            [0.3, 0.2, 0.2, 0.15, 0.15],
            [0.2, 0.3, 0.2, 0.15, 0.15],
            [0.15, 0.25, 0.25, 0.2, 0.15],
        ]
    else:
        # Generic approach: equal weights, then each model in turn as dominant.
        # The dominant weight is floored at 0.1 so it can never drop to zero
        # or below when there are ten or more models.
        dominant = max(0.9 - 0.1 * (n_models - 1), 0.1)
        test_weights = [[1 / n_models] * n_models]
        for i in range(n_models):
            w = [0.1] * n_models
            w[i] = dominant
            test_weights.append(w)

    best_score = -1
    best_weights = None
    for candidate in test_weights:
        weights = np.asarray(candidate, dtype=float)
        weights = weights / weights.sum()  # Normalize
        blended = sum(w * oof for w, oof in zip(weights, oof_list))
        score = roc_auc_score(y, blended)
        if score > best_score:
            best_score = score
            best_weights = weights.copy()

    print(f"Best weights: {dict(zip(names, best_weights))}")
    print(f"Best blended OOF: {best_score:.5f}")
    return best_weights

def _weighted_sum(weights, arrays):
    """Return sum_i weights[i] * arrays[i], accumulated in list order."""
    return sum(weight * array for weight, array in zip(weights, arrays))


# Pick the blend weights from the OOF predictions...
optimal_weights = find_optimal_weights(all_oof, y)

# ...then apply the same weights to the test and OOF prediction sets.
final_pred = _weighted_sum(optimal_weights, all_preds)
final_oof = _weighted_sum(optimal_weights, all_oof)
final_score = roc_auc_score(y, final_oof)

for banner_line in (
    f"\n{'='*70}",
    f"FINAL BLENDED OOF SCORE: {final_score:.5f}",
    f"{'='*70}",
):
    print(banner_line)


In [None]:
# Advanced blending techniques
print("\n=== Applying Advanced Blending ===")


def _power_mean(pred_list, power=2):
    """Row-wise power mean of the models' predictions (root of the mean power)."""
    return np.power(np.power(np.array(pred_list).T, power).mean(axis=1), 1 / power)


def _rank_matrix(pred_list):
    """Return a (rows, models) matrix of each model's predictions as ranks / n."""
    return np.array([np.argsort(np.argsort(p)) / len(p) for p in pred_list]).T


def _geometric_mean(pred_list):
    """Row-wise geometric mean of the models' predictions."""
    return np.power(np.prod(np.array(pred_list).T, axis=1), 1 / len(pred_list))


# Blends of the test predictions. The same helpers are reused for the OOF
# predictions below, so the two sets cannot drift apart.
power_avg = _power_mean(all_preds)
rank_preds = _rank_matrix(all_preds)
rank_avg = rank_preds.mean(axis=1)
geometric_mean = _geometric_mean(all_preds)

blends_test = {
    'weighted': final_pred,
    'power_avg': power_avg,
    'rank_avg': rank_avg,
    'geometric': geometric_mean,
    'simple_avg': np.array(all_preds).mean(axis=0)
}

# OOF versions of the same blends, used only for evaluation.
power_avg_oof = _power_mean(all_oof)
rank_preds_oof = _rank_matrix(all_oof)
rank_avg_oof = rank_preds_oof.mean(axis=1)
geometric_mean_oof = _geometric_mean(all_oof)

blends_oof = {
    'weighted': final_oof,
    'power_avg': power_avg_oof,
    'rank_avg': rank_avg_oof,
    'geometric': geometric_mean_oof,
    'simple_avg': np.array(all_oof).mean(axis=0)
}

# Evaluate on OOF and keep the first blend with the highest AUC.
print("\nBlend comparison (OOF):")
best_blend_score = -1
best_blend_name = 'weighted'
for name, blend in blends_oof.items():
    score = roc_auc_score(y, blend)
    print(f"  {name:15s}: {score:.5f}")
    if score > best_blend_score:
        best_blend_score = score
        best_blend_name = name

# Use the test-set counterpart of the best OOF blend. Note that 'rank_avg'
# yields normalised ranks rather than probabilities.
best_pred = blends_test[best_blend_name]
print(f"\n✓ Using {best_blend_name} blend (OOF: {best_blend_score:.5f})")


In [None]:
# Create final submission
# Predictions are in test-row order. Warn if the sample submission is ordered
# differently, since the values would then land on the wrong ids.
if 'id' in submission.columns and not np.array_equal(submission['id'].values, test['id'].values):
    print("WARNING: sample_submission ids do not match test ids; check row alignment!")
submission[target] = np.clip(best_pred, 0, 1)
submission.to_csv(OUTPUT_PATH, index=False)

# Calculate weighted score for submission selection (as suggested by competition creators)
# This helps you choose which submission to use as final
train_size = len(y)
public_lb_size = 254569  # Public LB test set size (approximately)
example_public_lb_score = 0.92755  # Illustrative public LB score used in the printout
total_size = train_size + public_lb_size
weight = public_lb_size / total_size

print(f"\n{'='*70}")
print("SUBMISSION SELECTION GUIDE")
print(f"{'='*70}")
print(f"Train data size: {train_size:,}")
print(f"Public LB size: ~{public_lb_size:,}")
print(f"Weight for public LB: {weight:.4f}")
print("\nWeighted Score Formula:")
print(f"  Weighted Score = (1 - {weight:.4f}) × CV_Score + {weight:.4f} × Public_LB_Score")
print(f"\nYour CV Score: {best_blend_score:.5f}")
print(f"If your Public LB Score is {example_public_lb_score}:")
weighted_score = (1 - weight) * best_blend_score + weight * example_public_lb_score
print(f"  Weighted Score = {weighted_score:.5f}")
print("\n💡 Tip: Use this weighted score to compare different submissions!")
print(f"{'='*70}")

# Save all predictions for analysis
pred_df = pd.DataFrame({
    'id': test['id'].values,
    **{name: pred for name, pred in zip(model_names, all_preds)},
    'weighted_blend': final_pred,
    'power_avg': power_avg,
    'rank_avg': rank_avg,
    'geometric_mean': geometric_mean,
    'final': best_pred
})
pred_df.to_csv(PRED_DUMP, index=False)

print(f"\n{'='*70}")
print("SUBMISSION CREATED")
print(f"{'='*70}")
print(f"✓ Saved to: {OUTPUT_PATH}")
print(f"✓ Predictions dump: {PRED_DUMP}")
print(f"\nBest Blend: {best_blend_name}")
print(f"Best Blend OOF Score: {best_blend_score:.5f}")
print(f"Weighted Blend OOF Score: {final_score:.5f}")
print(f"{'='*70}")
