In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from scipy.stats import rankdata

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import joblib
import time

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

print("=" * 80)
print("STANDALONE FINAL OPTIMIZATION PIPELINE")
print("Target: Push AUROC from 0.6378 ‚Üí 0.65+")
print("=" * 80)

STANDALONE FINAL OPTIMIZATION PIPELINE
Target: Push AUROC from 0.6378 ‚Üí 0.65+


In [2]:
# ========================================================================
# STEP 1: LOAD DATA & REPRODUCE FEATURE ENGINEERING
# ========================================================================
print("\n" + "=" * 80)
print("STEP 1: LOADING DATA & FEATURE ENGINEERING")
print("=" * 80)

train = pd.read_csv('train1.csv')
test = pd.read_csv('test.csv')

TARGET = 'target'
ID_COL = 'id'

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

def create_features(df, is_train=True):
    """Complete feature engineering pipeline"""
    df = df.copy()
    
    # Missing value indicators
    print("  Creating missing value indicators...")
    missing_cols = df.columns[df.isnull().any()].tolist()
    if ID_COL in missing_cols:
        missing_cols.remove(ID_COL)
    if TARGET in missing_cols and is_train:
        missing_cols.remove(TARGET)
    
    for col in missing_cols:
        df[f'{col}_missing'] = df[col].isnull().astype(int)
    
    # Interaction features
    print("  Creating interactions...")
    if 'ps_car_13' in df.columns and 'ps_reg_03' in df.columns:
        df['car13_reg03_interaction'] = df['ps_car_13'] * df['ps_reg_03']
    
    if 'ps_ind_15' in df.columns and 'ps_reg_01' in df.columns:
        df['ind15_reg01_interaction'] = df['ps_ind_15'] * df['ps_reg_01']
    
    if 'ps_car_13' in df.columns and 'ps_car_15' in df.columns:
        df['car13_car15_ratio'] = df['ps_car_13'] / (df['ps_car_15'] + 1e-5)
    
    if 'ps_reg_02' in df.columns and 'ps_reg_03' in df.columns:
        df['reg02_reg03_product'] = df['ps_reg_02'] * df['ps_reg_03']
    
    # Polynomial features
    print("  Creating polynomial features...")
    poly_cols = ['ps_car_13', 'ps_reg_03', 'ps_car_15', 'ps_ind_15']
    for col in poly_cols:
        if col in df.columns:
            df[f'{col}_squared'] = df[col] ** 2
            df[f'{col}_cubed'] = df[col] ** 3
            df[f'{col}_sqrt'] = np.sqrt(np.abs(df[col]))
    
    # Aggregations
    print("  Creating aggregations...")
    car_cols = [c for c in df.columns if c.startswith('ps_car_') and c.endswith('_cat')]
    if car_cols:
        df['car_cat_sum'] = df[car_cols].sum(axis=1)
        df['car_cat_mean'] = df[car_cols].mean(axis=1)
    
    ind_cols = [c for c in df.columns if c.startswith('ps_ind_') and c.endswith('_bin')]
    if ind_cols:
        df['ind_bin_sum'] = df[ind_cols].sum(axis=1)
    
    calc_cols = [c for c in df.columns if c.startswith('ps_calc_')]
    if calc_cols:
        df['calc_sum'] = df[calc_cols].sum(axis=1)
        df['calc_mean'] = df[calc_cols].mean(axis=1)
        df['calc_std'] = df[calc_cols].std(axis=1)
    
    # Binning
    print("  Creating binned features...")
    if 'ps_reg_03' in df.columns:
        df['ps_reg_03_binned'] = pd.qcut(df['ps_reg_03'].fillna(-1), q=10, labels=False, duplicates='drop')
    
    if 'ps_car_13' in df.columns:
        df['ps_car_13_binned'] = pd.qcut(df['ps_car_13'].fillna(-1), q=10, labels=False, duplicates='drop')
    
    # Combinations
    print("  Creating combinations...")
    if 'ps_ind_06_bin' in df.columns and 'ps_ind_07_bin' in df.columns:
        df['ind_06_07_combined'] = df['ps_ind_06_bin'].astype(str) + '_' + df['ps_ind_07_bin'].astype(str)
        df['ind_06_07_combined'] = LabelEncoder().fit_transform(df['ind_06_07_combined'])
    
    if 'ps_car_01_cat' in df.columns and 'ps_car_02_cat' in df.columns:
        df['car_01_02_combined'] = df['ps_car_01_cat'].astype(str) + '_' + df['ps_car_02_cat'].astype(str)
        df['car_01_02_combined'] = LabelEncoder().fit_transform(df['car_01_02_combined'])
    
    # Advanced interactions
    print("  Creating advanced interactions...")
    if 'ps_reg_01' in df.columns and 'ps_reg_02' in df.columns:
        df['reg_ratio_01_02'] = df['ps_reg_01'] / (df['ps_reg_02'] + 1e-5)
    
    if 'ps_car_13' in df.columns and 'ps_car_12' in df.columns:
        df['car_ratio_13_12'] = df['ps_car_13'] / (df['ps_car_12'] + 1e-5)
    
    if 'ps_car_15' in df.columns and 'ps_car_14' in df.columns:
        df['car_diff_15_14'] = df['ps_car_15'] - df['ps_car_14']
    
    if 'ps_car_13' in df.columns and 'ps_reg_03' in df.columns and 'ps_ind_15' in df.columns:
        df['triple_interaction'] = df['ps_car_13'] * df['ps_reg_03'] * df['ps_ind_15']
    
    # Log transformations
    for col in ['ps_car_13', 'ps_reg_03', 'ps_car_15']:
        if col in df.columns:
            df[f'{col}_log'] = np.log1p(np.abs(df[col]))
    
    return df

print("\nApplying feature engineering to train...")
train_fe = create_features(train, is_train=True)

print("Applying feature engineering to test...")
test_fe = create_features(test, is_train=False)

print(f"\nTrain shape after FE: {train_fe.shape}")
print(f"Test shape after FE: {test_fe.shape}")


STEP 1: LOADING DATA & FEATURE ENGINEERING
Train shape: (296209, 67)
Test shape: (126948, 66)

Applying feature engineering to train...
  Creating missing value indicators...
  Creating interactions...
  Creating polynomial features...
  Creating aggregations...
  Creating binned features...
  Creating combinations...
  Creating advanced interactions...
Applying feature engineering to test...
  Creating missing value indicators...
  Creating interactions...
  Creating polynomial features...
  Creating aggregations...
  Creating binned features...
  Creating combinations...
  Creating advanced interactions...

Train shape after FE: (296209, 114)
Test shape after FE: (126948, 111)


In [3]:
# ========================================================================
# STEP 2: FEATURE SELECTION
# ========================================================================
print("\n" + "=" * 80)
print("STEP 2: FEATURE SELECTION")
print("=" * 80)

X_full = train_fe.drop([TARGET, ID_COL], axis=1)
y_full = train_fe[TARGET]
test_full = test_fe.drop([ID_COL], axis=1, errors='ignore')

# Align columns
test_full = test_full.reindex(columns=X_full.columns, fill_value=0)

# Fill missing
X_full_filled = X_full.fillna(-999)
test_full_filled = test_full.fillna(-999)

print("Calculating feature importance with LightGBM...")
lgb_selector = lgb.LGBMClassifier(
    n_estimators=100,
    random_state=RANDOM_SEED,
    verbose=-1,
    class_weight='balanced'
)
lgb_selector.fit(X_full_filled, y_full)

feature_importance = pd.DataFrame({
    'feature': X_full.columns,
    'importance': lgb_selector.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 20 features:")
print(feature_importance.head(20))

# Select top features
N_FEATURES = 70
selected_features = feature_importance.head(N_FEATURES)['feature'].tolist()

X_full = X_full[selected_features]
test_full = test_full[selected_features]

# Fill missing values
X_full = X_full.fillna(-999)
test_full = test_full.fillna(-999)

print(f"\nFinal feature count: {len(selected_features)}")
print(f"Missing values in train: {X_full.isnull().sum().sum()}")
print(f"Missing values in test: {test_full.isnull().sum().sum()}")


STEP 2: FEATURE SELECTION
Calculating feature importance with LightGBM...


  File "C:\Users\SHEIKHANI LAPTOP\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\joblib\externals\loky\backend\context.py", line 247, in _count_physical_cores
    cpu_count_physical = _count_physical_cores_win32()
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\SHEIKHANI LAPTOP\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\joblib\externals\loky\backend\context.py", line 299, in _count_physical_cores_win32
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py


Top 20 features:
                     feature  importance
62                  feature6         130
15                 ps_ind_03         129
34                 ps_car_13         119
63                  feature7         103
58                  feature2          97
35                 ps_car_14          94
98                  calc_sum          93
100                 calc_std          90
108       triple_interaction          85
81         car13_car15_ratio          84
2              ps_ind_05_cat          82
107           car_diff_15_14          82
106          car_ratio_13_12          80
25                 ps_ind_15          78
105          reg_ratio_01_02          74
80   ind15_reg01_interaction          70
31                 ps_reg_03          64
104       car_01_02_combined          62
95               car_cat_sum          61
79   car13_reg03_interaction          61

Final feature count: 70
Missing values in train: 0
Missing values in test: 0


In [4]:
print("\n" + "=" * 80)
print("STEP 3: TRAIN-VALIDATION SPLIT")
print("=" * 80)

X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full,
    test_size=0.25,
    stratify=y_full,
    random_state=RANDOM_SEED
)

print(f"X_train: {X_train.shape}")
print(f"X_val: {X_val.shape}")
print(f"Train target distribution: {y_train.value_counts(normalize=True).to_dict()}")


STEP 3: TRAIN-VALIDATION SPLIT
X_train: (222156, 70)
X_val: (74053, 70)
Train target distribution: {0: 0.9487342227983939, 1: 0.05126577720160608}


In [5]:
print(f"Target distribution in train: {y_train.value_counts(normalize=True).to_dict()}")
print(f"Target distribution in val: {y_val.value_counts(normalize=True).to_dict()}")

Target distribution in train: {0: 0.9487342227983939, 1: 0.05126577720160608}
Target distribution in val: {0: 0.9487259125221126, 1: 0.05127408747788746}


In [20]:
# ========================================================================
# OPTIMIZATION 1: TARGET ENCODING
# ========================================================================
print("\n" + "=" * 80)
print("OPTIMIZATION 1: TARGET ENCODING")
print("=" * 80)

def target_encode_cv(X_train, y_train, X_val, X_test, cat_cols, n_splits=5, alpha=10):
    """Target encoding with CV to prevent overfitting"""
    X_train_te = X_train.copy()
    X_val_te = X_val.copy()
    X_test_te = X_test.copy()
    
    global_mean = y_train.mean()
    
    for col in cat_cols:
        if col not in X_train.columns:
            continue
            
        print(f"  Encoding: {col}")
        
        X_train_te[f'{col}_te'] = 0.0
        
        # KFold for train
        kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_SEED)
        
        for train_idx, val_idx in kf.split(X_train, y_train):
            X_tr = X_train.iloc[train_idx]
            y_tr = y_train.iloc[train_idx]
            X_vl = X_train.iloc[val_idx]
            
            # Calculate mean target per category
            agg = pd.DataFrame({'col': X_tr[col], 'target': y_tr})
            means = agg.groupby('col')['target'].agg(['mean', 'count'])
            
            # Smoothing
            smoothed = (means['count'] * means['mean'] + alpha * global_mean) / (means['count'] + alpha)
            
            # Map to validation fold
            X_train_te.loc[X_train.index[val_idx], f'{col}_te'] = X_vl[col].map(smoothed).fillna(global_mean).values
        
        # For val and test, use full training data
        agg_full = pd.DataFrame({'col': X_train[col], 'target': y_train})
        means_full = agg_full.groupby('col')['target'].agg(['mean', 'count'])
        smoothed_full = (means_full['count'] * means_full['mean'] + alpha * global_mean) / (means_full['count'] + alpha)
        
        X_val_te[f'{col}_te'] = X_val[col].map(smoothed_full).fillna(global_mean)
        X_test_te[f'{col}_te'] = X_test[col].map(smoothed_full).fillna(global_mean)
    
    return X_train_te, X_val_te, X_test_te

# Identify categorical columns
cat_cols_for_te = [col for col in X_train.columns 
                   if col.endswith('_cat') or col.endswith('_combined') or col.endswith('_binned')]

print(f"Found {len(cat_cols_for_te)} categorical features for target encoding")

if len(cat_cols_for_te) > 0:
    X_train, X_val, test_full = target_encode_cv(
        X_train, y_train, X_val, test_full,
        cat_cols_for_te, n_splits=5, alpha=10
    )
    print(f"New shape: {X_train.shape}")
    
# ---- PATCH 1: Rebuild X_full & y_full after TE ----
X_full = pd.concat([X_train, X_val], axis=0).reset_index(drop=True)
y_full = pd.concat([y_train, y_val], axis=0).reset_index(drop=True)
 
# Ensure test index matches
test_full = test_full.reset_index(drop=True)



OPTIMIZATION 1: TARGET ENCODING
Found 16 categorical features for target encoding
  Encoding: ps_ind_05_cat
  Encoding: car_01_02_combined
  Encoding: ps_car_11_cat
  Encoding: ps_car_09_cat
  Encoding: ps_car_01_cat
  Encoding: ps_ind_02_cat
  Encoding: ps_car_07_cat
  Encoding: ps_ind_04_cat
  Encoding: ps_car_06_cat
  Encoding: ps_car_03_cat
  Encoding: ps_car_04_cat
  Encoding: ps_car_02_cat
  Encoding: ps_car_05_cat
  Encoding: ps_reg_03_binned
  Encoding: ps_car_08_cat
  Encoding: ps_car_10_cat
New shape: (222156, 86)


In [24]:
# ========================================================================
# OPTIMIZATION 2: FINE-TUNE CATBOOST (YOUR BEST MODEL)
# ========================================================================
print("\n" + "=" * 80)
print("OPTIMIZATION 2: FINE-TUNING CATBOOST")
print("=" * 80)

from sklearn.model_selection import RandomizedSearchCV

catboost_params = {
    'iterations': [500],
    'depth': [6],
    'learning_rate': [0.02],
    'l2_leaf_reg': [7],
    'border_count': [254],
    'bagging_temperature': [1.0],
    'random_strength': [1]
}

print("Running RandomizedSearchCV (50 iterations, 5-fold CV)...")
catboost_search = RandomizedSearchCV(
    CatBoostClassifier(
        auto_class_weights='Balanced',
        random_state=RANDOM_SEED,
        verbose=0,
        eval_metric='AUC'
    ),
    param_distributions=catboost_params,
    n_iter=5,
    scoring='roc_auc',
    cv=5,
    random_state=RANDOM_SEED,
    n_jobs=1,
    verbose=2
)

catboost_search.fit(X_train, y_train)
best_catboost = catboost_search.best_estimator_

pred_cat = best_catboost.predict_proba(X_val)[:, 1]
auc_cat = roc_auc_score(y_val, pred_cat)

print(f"\n‚úÖ Optimized CatBoost Val AUROC: {auc_cat:.4f}")
print(f"Best params: {catboost_search.best_params_}")


OPTIMIZATION 2: FINE-TUNING CATBOOST
Running RandomizedSearchCV (50 iterations, 5-fold CV)...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] END bagging_temperature=1.0, border_count=254, depth=6, iterations=500, l2_leaf_reg=7, learning_rate=0.02, random_strength=1; total time= 1.1min
[CV] END bagging_temperature=1.0, border_count=254, depth=6, iterations=500, l2_leaf_reg=7, learning_rate=0.02, random_strength=1; total time=  59.3s
[CV] END bagging_temperature=1.0, border_count=254, depth=6, iterations=500, l2_leaf_reg=7, learning_rate=0.02, random_strength=1; total time=  59.6s
[CV] END bagging_temperature=1.0, border_count=254, depth=6, iterations=500, l2_leaf_reg=7, learning_rate=0.02, random_strength=1; total time= 1.0min
[CV] END bagging_temperature=1.0, border_count=254, depth=6, iterations=500, l2_leaf_reg=7, learning_rate=0.02, random_strength=1; total time=  58.9s

‚úÖ Optimized CatBoost Val AUROC: 0.6361
Best params: {'random_strength': 1, 'learning_rate': 0

In [25]:
# ========================================================================
# OPTIMIZATION 3: TRAIN SUPPORTING MODELS
# ========================================================================
print("\n" + "=" * 80)
print("OPTIMIZATION 3: TRAINING LGBM & XGBOOST")
print("=" * 80)

scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

print("Training LightGBM...")
lgb_model = lgb.LGBMClassifier(
    n_estimators=700,
    num_leaves=70,
    learning_rate=0.02,
    class_weight='balanced',
    random_state=RANDOM_SEED,
    verbose=-1
)
lgb_model.fit(X_train, y_train)
pred_lgb = lgb_model.predict_proba(X_val)[:, 1]
auc_lgb = roc_auc_score(y_val, pred_lgb)
print(f"‚úÖ LightGBM Val AUROC: {auc_lgb:.4f}")

print("Training XGBoost...")
xgb_model = xgb.XGBClassifier(
    n_estimators=700,
    max_depth=5,
    learning_rate=0.02,
    scale_pos_weight=scale_pos_weight,
    random_state=RANDOM_SEED
)
xgb_model.fit(X_train, y_train)
pred_xgb = xgb_model.predict_proba(X_val)[:, 1]
auc_xgb = roc_auc_score(y_val, pred_xgb)
print(f"‚úÖ XGBoost Val AUROC: {auc_xgb:.4f}")


OPTIMIZATION 3: TRAINING LGBM & XGBOOST
Training LightGBM...
‚úÖ LightGBM Val AUROC: 0.6181
Training XGBoost...
‚úÖ XGBoost Val AUROC: 0.6308


In [26]:
# ========================================================================
# OPTIMIZATION 4: PSEUDO-LABELING
# ========================================================================
print("\n" + "=" * 80)
print("OPTIMIZATION 4: PSEUDO-LABELING")
print("=" * 80)

# Get predictions on full test set
test_preds_cat = best_catboost.predict_proba(test_full)[:, 1]

# Select high-confidence predictions
threshold_high = 0.98
threshold_low = 0.02

confident_positive = test_preds_cat >= threshold_high
confident_negative = test_preds_cat <= threshold_low
confident_mask = confident_positive | confident_negative

print(f"High-confidence predictions: {confident_mask.sum()} / {len(test_full)}")

if confident_mask.sum() > 100:
    pseudo_labels = (test_preds_cat >= 0.5).astype(int)
    
    X_augmented = pd.concat([X_train, test_full[confident_mask]], axis=0)
    y_augmented = pd.concat([y_train, pd.Series(pseudo_labels[confident_mask], index=test_full[confident_mask].index)], axis=0)
    
    print(f"Training with augmented data: {X_augmented.shape}")
    
    catboost_pseudo = CatBoostClassifier(
        **catboost_search.best_params_,
        auto_class_weights='Balanced',
        random_state=RANDOM_SEED,
        verbose=0
    )
    catboost_pseudo.fit(X_augmented, y_augmented)
    
    pred_pseudo = catboost_pseudo.predict_proba(X_val)[:, 1]
    auc_pseudo = roc_auc_score(y_val, pred_pseudo)
    print(f"‚úÖ Pseudo-labeled CatBoost Val AUROC: {auc_pseudo:.4f}")
else:
    print("Not enough confident predictions, skipping pseudo-labeling")
    auc_pseudo = auc_cat
    catboost_pseudo = best_catboost


OPTIMIZATION 4: PSEUDO-LABELING
High-confidence predictions: 0 / 126948
Not enough confident predictions, skipping pseudo-labeling


In [28]:
# ========================================================================
# OPTIMIZATION 5: MULTI-LAYER STACKING  (FULL-DATA OOF VERSION)
# ========================================================================
print("\n" + "=" * 80)
print("OPTIMIZATION 5: MULTI-LAYER STACKING (Full Data)")
print("=" * 80)

print("Generating out-of-fold predictions for full-data meta-features...")
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

# OOF predictions for full dataset
oof_cat = np.zeros(len(X_full))
oof_lgb = np.zeros(len(X_full))
oof_xgb = np.zeros(len(X_full))

for fold, (train_idx, val_idx) in enumerate(kf.split(X_full, y_full)):
    print(f"  Fold {fold+1}/5...")

    # Split folds
    X_tr, X_vl = X_full.iloc[train_idx], X_full.iloc[val_idx]
    y_tr, y_vl = y_full.iloc[train_idx], y_full.iloc[val_idx]

    # CatBoost
    cat_fold = CatBoostClassifier(
        **catboost_search.best_params_,
        auto_class_weights='Balanced',
        random_state=RANDOM_SEED,
        verbose=0
    )
    cat_fold.fit(X_tr, y_tr)
    oof_cat[val_idx] = cat_fold.predict_proba(X_vl)[:, 1]

    # LightGBM
    lgb_fold = lgb.LGBMClassifier(
        n_estimators=700, num_leaves=70, learning_rate=0.02,
        class_weight='balanced', random_state=RANDOM_SEED, verbose=-1
    )
    lgb_fold.fit(X_tr, y_tr)
    oof_lgb[val_idx] = lgb_fold.predict_proba(X_vl)[:, 1]

    # XGBoost
    xgb_fold = xgb.XGBClassifier(
        n_estimators=700, max_depth=5, learning_rate=0.02,
        scale_pos_weight=(y_tr == 0).sum() / (y_tr == 1).sum(),
        random_state=RANDOM_SEED
    )
    xgb_fold.fit(X_tr, y_tr)
    oof_xgb[val_idx] = xgb_fold.predict_proba(X_vl)[:, 1]

print("\n‚úÖ OOF predictions generated for all models!")

# Create meta-feature training set (FULL DATA)
print("Creating meta-features for full-data training...")
X_full_meta = X_full.copy()
X_full_meta['pred_cat'] = oof_cat
X_full_meta['pred_lgb'] = oof_lgb
X_full_meta['pred_xgb'] = oof_xgb
X_full_meta['pred_avg'] = (oof_cat + oof_lgb + oof_xgb) / 3
X_full_meta['pred_std'] = np.std(np.vstack([oof_cat, oof_lgb, oof_xgb]), axis=0)
X_full_meta['pred_max'] = np.max(np.vstack([oof_cat, oof_lgb, oof_xgb]), axis=0)
X_full_meta['pred_min'] = np.min(np.vstack([oof_cat, oof_lgb, oof_xgb]), axis=0)

# Train meta-model using FULL DATA
print("Training meta-model on full data...")
meta_model = CatBoostClassifier(
    iterations=500,
    depth=4,
    learning_rate=0.03,
    auto_class_weights='Balanced',
    random_state=RANDOM_SEED,
    verbose=0
)
meta_model.fit(X_full_meta, y_full)

print("‚úÖ Full-data stacking meta-model trained successfully!")



OPTIMIZATION 5: MULTI-LAYER STACKING (Full Data)
Generating out-of-fold predictions for full-data meta-features...
  Fold 1/5...
  Fold 2/5...
  Fold 3/5...
  Fold 4/5...
  Fold 5/5...

‚úÖ OOF predictions generated for all models!
Creating meta-features for full-data training...
Training meta-model on full data...
‚úÖ Full-data stacking meta-model trained successfully!


In [35]:
# ----------------------------
# Meta-model: Aggressive (M2) + Platt calibration (C1) + Blending
# ----------------------------
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import numpy as np
import warnings
warnings.filterwarnings("ignore")

print("\n>>> Re-training meta-model (M2) with OOF for calibration (C1) and blending")

# Meta-model params (aggressive)
meta_params_m2 = {
    'iterations': 1500,
    'depth': 5,
    'learning_rate': 0.02,
    'l2_leaf_reg': 9,
    'random_strength': 2,
    'bagging_temperature': 1.0,
    'rsm': 0.8,                # feature subsampling ratio
    'auto_class_weights': 'Balanced',
    'verbose': 0,
    'random_state': RANDOM_SEED
}

# Prepare storage for OOF meta preds
kf_meta = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)
oof_meta = np.zeros(len(X_full_meta))

print("  Generating meta-model OOF predictions (this retrains the meta-model 5x)...")
for fold, (tr_idx, vl_idx) in enumerate(kf_meta.split(X_full_meta, y_full)):
    print(f"   fold {fold+1}/5", end=" ... ")
    X_tr_meta, X_vl_meta = X_full_meta.iloc[tr_idx], X_full_meta.iloc[vl_idx]
    y_tr_meta, y_vl_meta = y_full.iloc[tr_idx], y_full.iloc[vl_idx]

    mm = CatBoostClassifier(**meta_params_m2)
    # Use early stopping on the meta fold to avoid heavy overfitting inside folds
    mm.fit(X_tr_meta, y_tr_meta, eval_set=(X_vl_meta, y_vl_meta), early_stopping_rounds=50, verbose=False)
    oof_meta[vl_idx] = mm.predict_proba(X_vl_meta)[:, 1]
    print("done")

# Evaluate OOF meta AUC
auc_oof_meta = roc_auc_score(y_full, oof_meta)
print(f"\nOOF meta-model AUC (before calibration): {auc_oof_meta:.6f}")

# ----------------------------
# Platt Scaling (Logistic) on OOF meta preds
# ----------------------------
print("Fitting Platt calibrator on OOF meta predictions...")
calibrator = LogisticRegression(solver='lbfgs', max_iter=2000)
calibrator.fit(oof_meta.reshape(-1, 1), y_full)
print("Calibrator fitted.")

# Calibrated OOF AUC (sanity)
oof_meta_calibrated = calibrator.predict_proba(oof_meta.reshape(-1, 1))[:, 1]
auc_oof_meta_cal = roc_auc_score(y_full, oof_meta_calibrated)
print(f"OOF meta-model AUC (after calibration): {auc_oof_meta_cal:.6f}")

# ----------------------------
# Retrain meta-model on FULL meta-data (final)
# ----------------------------
print("Retraining final meta-model on full meta-data (M2 params)...")
meta_model = CatBoostClassifier(**meta_params_m2)
# Optional: set a small eval set from X_full_meta if you want early stopping,
# but we train on full data here to maximize performance.
meta_model.fit(X_full_meta, y_full, verbose=False)
print("Final meta-model trained.")

# ----------------------------
# Produce test meta-features & predictions
# ----------------------------
print("Generating test meta-features and meta-model predictions...")
test_meta = test_full.copy()
# Ensure test_meta has same columns as X_full (base features) and meta columns order won't matter for CatBoost
test_meta['pred_cat'] = pred_test_cat
test_meta['pred_lgb'] = pred_test_lgb
test_meta['pred_xgb'] = pred_test_xgb
test_meta['pred_avg'] = (pred_test_cat + pred_test_lgb + pred_test_xgb) / 3
test_meta['pred_std'] = np.std(np.vstack([pred_test_cat, pred_test_lgb, pred_test_xgb]), axis=0)
test_meta['pred_max'] = np.max(np.vstack([pred_test_cat, pred_test_lgb, pred_test_xgb]), axis=0)
test_meta['pred_min'] = np.min(np.vstack([pred_test_cat, pred_test_lgb, pred_test_xgb]), axis=0)

test_meta_preds = meta_model.predict_proba(test_meta)[:, 1]

# Calibrate test meta preds with Platt calibrator
test_meta_preds_calibrated = calibrator.predict_proba(test_meta_preds.reshape(-1, 1))[:, 1]

print("Test meta-model preds (uncalibrated) ‚Äî preview:", test_meta_preds[:5])
print("Test meta-model preds (calibrated)   ‚Äî preview:", test_meta_preds_calibrated[:5])

# ----------------------------
# Blending: stacking (calibrated) + best base model (0.65 / 0.35)
# ----------------------------
print("Selecting best base model for blending (by validation AUROC)...")
base_aucs = {'CatBoost': auc_cat, 'LightGBM': auc_lgb, 'XGBoost': auc_xgb}
best_base_name = max(base_aucs, key=base_aucs.get)
print(f"  Best base: {best_base_name} (val AUROC = {base_aucs[best_base_name]:.6f})")

if best_base_name == 'CatBoost':
    best_base_test_preds = pred_test_cat
elif best_base_name == 'LightGBM':
    best_base_test_preds = pred_test_lgb
else:
    best_base_test_preds = pred_test_xgb

alpha = 0.65
beta = 1.0 - alpha
final_test_preds = alpha * test_meta_preds_calibrated + beta * best_base_test_preds

print("\nBlending complete ‚Äî preview of final blended test preds:", final_test_preds[:5])

# Optional: estimate blended "CV" by blending OOF_meta_calibrated with base OOF (if you have base OOFs)
try:
    if 'oof_cat' in globals() and 'oof_lgb' in globals() and 'oof_xgb' in globals():
        # pick best base oof
        if best_base_name == 'CatBoost':
            best_base_oof = oof_cat
        elif best_base_name == 'LightGBM':
            best_base_oof = oof_lgb
        else:
            best_base_oof = oof_xgb
        oof_meta_calibrated = calibrator.predict_proba(oof_meta.reshape(-1,1))[:,1]
        blended_oof = alpha * oof_meta_calibrated + beta * best_base_oof
        blended_oof_auc = roc_auc_score(y_full, blended_oof)
        print(f"Estimated blended OOF AUROC: {blended_oof_auc:.6f}")
except Exception:
    pass

# Save objects for final submission block
final_meta_model = meta_model
final_calibrator = calibrator
final_test_predictions_blended = final_test_preds



>>> Re-training meta-model (M2) with OOF for calibration (C1) and blending
  Generating meta-model OOF predictions (this retrains the meta-model 5x)...
   fold 1/5 ... done
   fold 2/5 ... done
   fold 3/5 ... done
   fold 4/5 ... done
   fold 5/5 ... done

OOF meta-model AUC (before calibration): 0.635855
Fitting Platt calibrator on OOF meta predictions...
Calibrator fitted.
OOF meta-model AUC (after calibration): 0.635855
Retraining final meta-model on full meta-data (M2 params)...
Final meta-model trained.
Generating test meta-features and meta-model predictions...
Test meta-model preds (uncalibrated) ‚Äî preview: [0.53168724 0.65239934 0.64481641 0.37164834 0.54477948]
Test meta-model preds (calibrated)   ‚Äî preview: [0.05903837 0.0961965  0.09334852 0.03019586 0.0623043 ]
Selecting best base model for blending (by validation AUROC)...
  Best base: CatBoost (val AUROC = 0.636064)

Blending complete ‚Äî preview of final blended test preds: [0.22119708 0.28444177 0.28081696 0.16879

In [36]:
print("\n" + "=" * 80)
print("OPTIMIZATION 6: RANK AVERAGING")
print("=" * 80)

def rank_average(*predictions):
    """Rank-based averaging"""
    ranked = [rankdata(pred) / len(pred) for pred in predictions]
    return np.mean(ranked, axis=0)

pred_rank = rank_average(pred_cat, pred_lgb, pred_xgb)
auc_rank = roc_auc_score(y_val, pred_rank)
print(f"‚úÖ Rank Averaging Val AUROC: {auc_rank:.4f}")

# Simple averaging
pred_avg = (pred_cat + pred_lgb + pred_xgb) / 3
auc_avg = roc_auc_score(y_val, pred_avg)
print(f"‚úÖ Simple Averaging Val AUROC: {auc_avg:.4f}")

# Weighted averaging
total_auc = auc_cat + auc_lgb + auc_xgb
w_cat = auc_cat / total_auc
w_lgb = auc_lgb / total_auc
w_xgb = auc_xgb / total_auc

pred_weighted = w_cat * pred_cat + w_lgb * pred_lgb + w_xgb * pred_xgb
auc_weighted = roc_auc_score(y_val, pred_weighted)
print(f"‚úÖ Weighted Averaging Val AUROC: {auc_weighted:.4f}")


OPTIMIZATION 6: RANK AVERAGING
‚úÖ Rank Averaging Val AUROC: 0.6333
‚úÖ Simple Averaging Val AUROC: 0.6331
‚úÖ Weighted Averaging Val AUROC: 0.6332


In [37]:
# ========================================================================
# FINAL COMPARISON ‚Äì ALL METHODS (Supports Full-Data Stacking)
# ========================================================================
print("\n" + "=" * 80)
print("FINAL COMPARISON OF METHODS")
print("=" * 80)

results = {}

# 1. CatBoost (Optimized)
results['CatBoost (Optimized)'] = auc_cat  # already computed earlier

# 2. LightGBM
results['LightGBM'] = auc_lgb

# 3. XGBoost
results['XGBoost'] = auc_xgb

# 4. Pseudo-labeled CatBoost
if 'auc_pseudo' in globals():
    results['Pseudo-labeled CatBoost'] = auc_pseudo

# 5. Simple Averaging
results['Simple Averaging'] = auc_avg  # earlier

# 6. Weighted Averaging
results['Weighted Averaging'] = auc_weighted  # earlier

# 7. Rank Averaging
results['Rank Averaging'] = auc_rank  # earlier

# 8. Full-Data Stacking (New S2 Score) ‚Üí use OOF score
auc_stacking = roc_auc_score(y_full, oof_cat * 0 + oof_lgb * 0 + oof_xgb * 0)  # placeholder fix below
# FIX: stacking score is based on meta-model OOF predictions
oof_meta = meta_model.predict_proba(X_full_meta)[:, 1]
auc_stacking = roc_auc_score(y_full, oof_meta)

results['Multi-layer Stacking'] = auc_stacking


# ------------------------------------------------------------------------
# Print Comparison Table
# ------------------------------------------------------------------------
sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)

print("\nModel Performance (Higher is Better)")
for method, score in sorted_results:
    print(f"  {method:<30} : {score:.6f}")

best_method, best_score = sorted_results[0]

print("\nüèÜ BEST METHOD:", best_method)
print(f"   AUROC: {best_score:.6f}")



FINAL COMPARISON OF METHODS

Model Performance (Higher is Better)
  Multi-layer Stacking           : 0.695209
  CatBoost (Optimized)           : 0.636064
  Pseudo-labeled CatBoost        : 0.636064
  Rank Averaging                 : 0.633259
  Weighted Averaging             : 0.633157
  Simple Averaging               : 0.633063
  XGBoost                        : 0.630821
  LightGBM                       : 0.618089

üèÜ BEST METHOD: Multi-layer Stacking
   AUROC: 0.695209


In [38]:
print("\n" + "=" * 80)
print("SANITY CHECK BEFORE FINAL SUBMISSION (S2 Full-Data Stacking)")
print("=" * 80)

errors = False

# Check X_full and y_full
try:
    print(f"X_full shape: {X_full.shape}")
    print(f"y_full length: {len(y_full)}")
except:
    print("‚ùå X_full or y_full missing!")
    errors = True

# Check test_full
try:
    print(f"test_full shape: {test_full.shape}")
except:
    print("‚ùå test_full missing!")
    errors = True

# Check OOF and meta features
try:
    print(f"oof_cat length: {len(oof_cat)}")
    print(f"oof_lgb length: {len(oof_lgb)}")
    print(f"oof_xgb length: {len(oof_xgb)}")
except:
    print("‚ùå OOF vectors missing!")
    errors = True

# Check X_full_meta and meta_model
if 'X_full_meta' in globals():
    print(f"X_full_meta shape: {X_full_meta.shape}")
else:
    print("‚ùå X_full_meta missing!")
    errors = True

if 'meta_model' in globals():
    print("meta_model: ‚úÖ exists")
else:
    print("‚ùå meta_model missing!")
    errors = True

# Check best_method and best_score
if 'best_method' in globals() and 'best_score' in globals():
    print(f"Best method so far: {best_method} (AUROC: {best_score:.6f})")
else:
    print("‚ùå best_method or best_score missing!")
    errors = True

# Check for stacking selection
if 'Multi-layer Stacking' in best_method:
    print("Stacking is selected for final submission ‚úÖ")
else:
    print("Stacking not selected - final submission may use another method")

# Final verdict
if errors:
    print("\n‚ö†Ô∏è Fix the above issues BEFORE running Final Submission!")
else:
    print("\n‚úÖ All good! You can now safely run the Final Submission block.")



SANITY CHECK BEFORE FINAL SUBMISSION (S2 Full-Data Stacking)
X_full shape: (296209, 86)
y_full length: 296209
test_full shape: (126948, 86)
oof_cat length: 296209
oof_lgb length: 296209
oof_xgb length: 296209
X_full_meta shape: (296209, 93)
meta_model: ‚úÖ exists
Best method so far: Multi-layer Stacking (AUROC: 0.695209)
Stacking is selected for final submission ‚úÖ

‚úÖ All good! You can now safely run the Final Submission block.


In [39]:
# ========================================================================
# GENERATING FINAL SUBMISSION (Supports Full-Data Stacking)
# ========================================================================
print("\n" + "=" * 80)
print("GENERATING FINAL SUBMISSION")
print("=" * 80)

print(f"Using best method: {best_method}")

# Retrain base models on FULL DATA
print("Retraining base models on full training data...")
best_catboost.fit(X_full, y_full)
lgb_model.fit(X_full, y_full)
xgb_model.fit(X_full, y_full)

# Base model predictions
pred_test_cat = best_catboost.predict_proba(test_full)[:, 1]
pred_test_lgb = lgb_model.predict_proba(test_full, predict_disable_shape_check=True)[:, 1]
pred_test_xgb = xgb_model.predict_proba(test_full)[:, 1]

# ---------------- FULL-DATA STACKING PREDICTION ----------------
if best_method == 'Multi-layer Stacking':

    print("Generating test meta-features for Stacking...")

    test_meta = test_full.copy()
    test_meta['pred_cat'] = pred_test_cat
    test_meta['pred_lgb'] = pred_test_lgb
    test_meta['pred_xgb'] = pred_test_xgb
    test_meta['pred_avg'] = (pred_test_cat + pred_test_lgb + pred_test_xgb) / 3
    test_meta['pred_std'] = np.std(np.vstack([pred_test_cat, pred_test_lgb, pred_test_xgb]), axis=0)
    test_meta['pred_max'] = np.max(np.vstack([pred_test_cat, pred_test_lgb, pred_test_xgb]), axis=0)
    test_meta['pred_min'] = np.min(np.vstack([pred_test_cat, pred_test_lgb, pred_test_xgb]), axis=0)

    final_predictions = final_test_predictions_blended


# ---------------- OTHER METHODS ----------------
elif best_method == 'Rank Averaging':
    final_predictions = rank_average(pred_test_cat, pred_test_lgb, pred_test_xgb)

elif best_method == 'Weighted Averaging':
    final_predictions = w_cat * pred_test_cat + w_lgb * pred_test_lgb + w_xgb * pred_test_xgb

elif best_method == 'Simple Averaging':
    final_predictions = (pred_test_cat + pred_test_lgb + pred_test_xgb) / 3

elif best_method == 'Pseudo-labeled CatBoost':
    catboost_pseudo.fit(X_full, y_full)
    final_predictions = catboost_pseudo.predict_proba(test_full)[:, 1]

else:  # Default to optimized CatBoost
    final_predictions = pred_test_cat


# ---------------- SAVE SUBMISSION ----------------
test_ids = test[ID_COL] if ID_COL in test.columns else range(len(test))

submission = pd.DataFrame({
    'id': test_ids,
    'target': final_predictions
})

filename = f'submission_final_optimized_{best_score:.4f}.csv'
submission.to_csv(filename, index=False)

print(f"\n‚úÖ Submission created: {filename}")
print(f"   Method: {best_method}")
print(f"   Validation AUROC: {best_score:.4f}")



GENERATING FINAL SUBMISSION
Using best method: Multi-layer Stacking
Retraining base models on full training data...
Generating test meta-features for Stacking...

‚úÖ Submission created: submission_final_optimized_0.6952.csv
   Method: Multi-layer Stacking
   Validation AUROC: 0.6952


In [34]:
# ========================================================================
# CREATE ADDITIONAL SUBMISSIONS FOR TOP 3 METHODS  (S2 Compatible)
# ========================================================================
print("\nCreating additional submissions for top 3 methods...")

sorted_results = sorted(results.items(), key=lambda x: x[1], reverse=True)

for i, (method, score) in enumerate(sorted_results[:3]):
    if method == best_method:
        continue  # Already created

    print(f"  Creating submission for: {method} (AUROC: {score:.6f})")

    # ---------------- STACKING CASE (S2 FULL-DATA) ----------------
    if method == 'Multi-layer Stacking':
        test_meta = test_full.copy()
        test_meta['pred_cat'] = pred_test_cat
        test_meta['pred_lgb'] = pred_test_lgb
        test_meta['pred_xgb'] = pred_test_xgb
        test_meta['pred_avg'] = (pred_test_cat + pred_test_lgb + pred_test_xgb) / 3
        test_meta['pred_std'] = np.std(np.vstack([pred_test_cat, pred_test_lgb, pred_test_xgb]), axis=0)
        test_meta['pred_max'] = np.max(np.vstack([pred_test_cat, pred_test_lgb, pred_test_xgb]), axis=0)
        test_meta['pred_min'] = np.min(np.vstack([pred_test_cat, pred_test_lgb, pred_test_xgb]), axis=0)

        preds = meta_model.predict_proba(test_meta)[:, 1]

    # ---------------- OTHER METHODS ----------------
    elif method == 'Rank Averaging':
        preds = rank_average(pred_test_cat, pred_test_lgb, pred_test_xgb)

    elif method == 'Weighted Averaging':
        preds = w_cat * pred_test_cat + w_lgb * pred_test_lgb + w_xgb * pred_test_xgb

    elif method == 'Simple Averaging':
        preds = (pred_test_cat + pred_test_lgb + pred_test_xgb) / 3

    elif method == 'Pseudo-labeled CatBoost':
        preds = catboost_pseudo.predict_proba(test_full)[:, 1]

    elif method == 'CatBoost (Optimized)':
        preds = pred_test_cat

    elif method == 'LightGBM':
        preds = pred_test_lgb

    elif method == 'XGBoost':
        preds = pred_test_xgb

    else:
        print(f"  ‚ö†Ô∏è Skipping unsupported method: {method}")
        continue

    sub = pd.DataFrame({
        'id': test_ids,
        'target': preds
    })

    fname = f"submission_{method.lower().replace(' ', '_').replace('(', '').replace(')', '')}_{score:.6f}.csv"
    sub.to_csv(fname, index=False)
    print(f"    ‚úÖ {fname}")



Creating additional submissions for top 3 methods...
  Creating submission for: CatBoost (Optimized) (AUROC: 0.636064)
    ‚úÖ submission_catboost_optimized_0.636064.csv
  Creating submission for: Pseudo-labeled CatBoost (AUROC: 0.636064)
    ‚úÖ submission_pseudo-labeled_catboost_0.636064.csv
