In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from scipy.stats import rankdata

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import joblib

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

print("=" * 80)
print("ANTI-OVERFITTING PIPELINE")
print("Addressing: Val=0.651 vs Kaggle=0.642 gap")
print("=" * 80)

ANTI-OVERFITTING PIPELINE
Addressing: Val=0.651 vs Kaggle=0.642 gap


In [2]:
# ========================================================================
# STEP 1: ADVERSARIAL VALIDATION - Detect Distribution Shift
# ========================================================================
print("\n" + "=" * 80)
print("STEP 1: ADVERSARIAL VALIDATION")
print("=" * 80)

train = pd.read_csv('train1.csv')
test = pd.read_csv('test.csv')

TARGET = 'target'
ID_COL = 'id'

# Create adversarial dataset
train_adv = train.drop([TARGET, ID_COL], axis=1, errors='ignore').copy()
test_adv = test.drop([ID_COL], axis=1, errors='ignore').copy()

train_adv['is_test'] = 0
test_adv['is_test'] = 1

# Align columns
common_cols = list(set(train_adv.columns) & set(test_adv.columns))
train_adv = train_adv[common_cols]
test_adv = test_adv[common_cols]

adv_data = pd.concat([train_adv, test_adv], axis=0).reset_index(drop=True)
adv_target = adv_data['is_test']
adv_features = adv_data.drop('is_test', axis=1).fillna(-999)

# Train adversarial model
print("Training adversarial model to detect train/test differences...")
adv_model = lgb.LGBMClassifier(n_estimators=100, random_state=RANDOM_SEED, verbose=-1)
adv_scores = cross_val_score(adv_model, adv_features, adv_target, cv=5, scoring='roc_auc')

print(f"Adversarial AUC: {adv_scores.mean():.4f}")
print("Interpretation:")
if adv_scores.mean() < 0.55:
    print("  ‚úÖ Train and test distributions are very similar")
elif adv_scores.mean() < 0.65:
    print("  ‚ö†Ô∏è  Moderate distribution shift detected")
else:
    print("  üö® Significant distribution shift - high overfitting risk!")

# Get feature importance from adversarial model
adv_model.fit(adv_features, adv_target)
adv_importance = pd.DataFrame({
    'feature': adv_features.columns,
    'importance': adv_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 features causing train/test mismatch:")
print(adv_importance.head(10))

# Identify problematic features (high adversarial importance)
problematic_features = adv_importance[adv_importance['importance'] > adv_importance['importance'].quantile(0.9)]['feature'].tolist()
print(f"\nProblematic features to monitor: {len(problematic_features)}")


STEP 1: ADVERSARIAL VALIDATION
Training adversarial model to detect train/test differences...


  File "C:\Users\SHEIKHANI LAPTOP\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\joblib\externals\loky\backend\context.py", line 247, in _count_physical_cores
    cpu_count_physical = _count_physical_cores_win32()
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\SHEIKHANI LAPTOP\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\joblib\externals\loky\backend\context.py", line 299, in _count_physical_cores_win32
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py

Adversarial AUC: 0.4979
Interpretation:
  ‚úÖ Train and test distributions are very similar

Top 10 features causing train/test mismatch:
          feature  importance
54       feature7         195
18      ps_car_13         161
64       feature6         150
59      ps_reg_03         149
40      ps_car_14         148
27       feature4         143
48       feature2         111
53  ps_car_11_cat         107
63      ps_ind_15          95
24     ps_calc_02          88

Problematic features to monitor: 7


In [3]:
# ========================================================================
# STEP 2: CONSERVATIVE FEATURE ENGINEERING
# ========================================================================
print("\n" + "=" * 80)
print("STEP 2: CONSERVATIVE FEATURE ENGINEERING")
print("=" * 80)

def create_conservative_features(df, is_train=True):
    """
    More conservative feature engineering - avoid overfitting
    """
    df = df.copy()
    
    print("  Creating robust features...")
    
    # Only simple, stable interactions
    if 'ps_car_13' in df.columns and 'ps_reg_03' in df.columns:
        df['car13_reg03'] = df['ps_car_13'] * df['ps_reg_03']
    
    if 'ps_ind_15' in df.columns and 'ps_reg_01' in df.columns:
        df['ind15_reg01'] = df['ps_ind_15'] * df['ps_reg_01']
    
    # Simple aggregations only
    car_cols = [c for c in df.columns if c.startswith('ps_car_') and c.endswith('_cat')]
    if car_cols:
        df['car_cat_sum'] = df[car_cols].sum(axis=1)
    
    ind_cols = [c for c in df.columns if c.startswith('ps_ind_') and c.endswith('_bin')]
    if ind_cols:
        df['ind_bin_sum'] = df[ind_cols].sum(axis=1)
    
    # Missing indicators for key features only
    key_missing = ['ps_car_03_cat', 'ps_car_05_cat', 'ps_reg_03', 'ps_car_11', 'ps_car_14']
    for col in key_missing:
        if col in df.columns:
            df[f'{col}_missing'] = df[col].isnull().astype(int)
    
    # Avoid high-order polynomials and complex interactions
    # They overfit easily
    
    return df

print("Applying to train...")
train_fe = create_conservative_features(train, is_train=True)

print("Applying to test...")
test_fe = create_conservative_features(test, is_train=False)


STEP 2: CONSERVATIVE FEATURE ENGINEERING
Applying to train...
  Creating robust features...
Applying to test...
  Creating robust features...


In [4]:
# ========================================================================
# STEP 3: ROBUST CROSS-VALIDATION STRATEGY
# ========================================================================
print("\n" + "=" * 80)
print("STEP 3: ROBUST CROSS-VALIDATION")
print("=" * 80)

X_full = train_fe.drop([TARGET, ID_COL], axis=1)
y_full = train_fe[TARGET]
test_full = test_fe.drop([ID_COL], axis=1, errors='ignore')

# CRITICAL: Align columns between train and test
print(f"Train features: {X_full.shape[1]}")
print(f"Test features: {test_full.shape[1]}")

# Ensure exact same features
common_features = list(set(X_full.columns) & set(test_full.columns))
print(f"Common features: {len(common_features)}")

X_full = X_full[common_features]
test_full = test_full[common_features]

print(f"Aligned - Train: {X_full.shape[1]}, Test: {test_full.shape[1]}")

# Fill missing
X_full = X_full.fillna(-999)
test_full = test_full.fillna(-999)

# Remove highly adversarial features
features_to_drop = [f for f in problematic_features if f in X_full.columns]
if features_to_drop:
    print(f"\nDropping {len(features_to_drop)} problematic features:")
    print(features_to_drop[:5], "...")
    X_full = X_full.drop(features_to_drop, axis=1)
    test_full = test_full.drop(features_to_drop, axis=1)

print(f"\nFinal feature count: {X_full.shape[1]}")


STEP 3: ROBUST CROSS-VALIDATION
Train features: 74
Test features: 74
Common features: 74
Aligned - Train: 74, Test: 74

Dropping 7 problematic features:
['feature7', 'ps_car_13', 'feature6', 'ps_reg_03', 'ps_car_14'] ...

Final feature count: 67


In [6]:
# ========================================================================
# STEP 4: CONSERVATIVE TARGET ENCODING WITH MORE SMOOTHING
# ========================================================================
print("\n" + "=" * 80)
print("STEP 4: CONSERVATIVE TARGET ENCODING")
print("=" * 80)

def conservative_target_encode(X_train, y_train, X_test, cat_cols, alpha=20):
    """
    Target encoding with HIGH smoothing to prevent overfitting
    alpha=20 (vs 10 before) = more regularization
    """
    X_train_te = X_train.copy()
    X_test_te = X_test.copy()
    
    global_mean = y_train.mean()
    
    # Use stratified 10-fold (more folds = less overfitting)
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)
    
    for col in cat_cols:
        if col not in X_train.columns:
            continue
        
        print(f"  Encoding: {col}")
        
        X_train_te[f'{col}_te'] = global_mean  # Initialize with global mean
        
        for train_idx, val_idx in kf.split(X_train, y_train):
            X_tr = X_train.iloc[train_idx]
            y_tr = y_train.iloc[train_idx]
            X_vl = X_train.iloc[val_idx]
            
            agg = pd.DataFrame({'col': X_tr[col], 'target': y_tr})
            means = agg.groupby('col')['target'].agg(['mean', 'count'])
            
            # Higher alpha = more smoothing towards global mean
            smoothed = (means['count'] * means['mean'] + alpha * global_mean) / (means['count'] + alpha)
            
            X_train_te.loc[X_train.index[val_idx], f'{col}_te'] = X_vl[col].map(smoothed).fillna(global_mean).values
        
        # For test, use full training data
        agg_full = pd.DataFrame({'col': X_train[col], 'target': y_train})
        means_full = agg_full.groupby('col')['target'].agg(['mean', 'count'])
        smoothed_full = (means_full['count'] * means_full['mean'] + alpha * global_mean) / (means_full['count'] + alpha)
        
        X_test_te[f'{col}_te'] = X_test[col].map(smoothed_full).fillna(global_mean)
    
    return X_train_te, X_test_te

cat_cols_for_te = [col for col in X_full.columns if col.endswith('_cat')]

if len(cat_cols_for_te) > 0:
    print(f"Applying conservative target encoding to {len(cat_cols_for_te)} features...")
    X_full, test_full = conservative_target_encode(
        X_full, y_full, test_full,
        cat_cols_for_te, alpha=20  # Increased from 10
    )


STEP 4: CONSERVATIVE TARGET ENCODING
Applying conservative target encoding to 14 features...
  Encoding: ps_car_04_cat
  Encoding: ps_car_08_cat
  Encoding: ps_car_03_cat
  Encoding: ps_ind_05_cat
  Encoding: ps_ind_04_cat
  Encoding: ps_car_05_cat
  Encoding: ps_car_01_cat
  Encoding: ps_car_06_cat
  Encoding: ps_car_09_cat
  Encoding: ps_car_10_cat
  Encoding: ps_car_07_cat
  Encoding: ps_car_02_cat
  Encoding: ps_car_11_cat
  Encoding: ps_ind_02_cat


In [7]:
# ========================================================================
# STEP 5: REGULARIZED MODELS WITH EARLY STOPPING
# ========================================================================
print("\n" + "=" * 80)
print("STEP 5: TRAINING REGULARIZED MODELS")
print("=" * 80)

# Use proper CV instead of single split
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

# Store OOF predictions
oof_cat = np.zeros(len(X_full))
oof_lgb = np.zeros(len(X_full))
oof_xgb = np.zeros(len(X_full))

# Store test predictions
test_cat = np.zeros(len(test_full))
test_lgb = np.zeros(len(test_full))
test_xgb = np.zeros(len(test_full))

# CatBoost with strong regularization
print("\nüìä Training CatBoost with 5-fold CV...")
for fold, (train_idx, val_idx) in enumerate(kfold.split(X_full, y_full)):
    print(f"  Fold {fold+1}/5...")
    
    X_tr, X_vl = X_full.iloc[train_idx], X_full.iloc[val_idx]
    y_tr, y_vl = y_full.iloc[train_idx], y_full.iloc[val_idx]
    
    cat_model = CatBoostClassifier(
        iterations=1000,
        depth=4,  # Reduced from 6-8 to prevent overfitting
        learning_rate=0.02,
        l2_leaf_reg=10,  # Strong regularization
        bagging_temperature=1.0,
        random_strength=2,
        auto_class_weights='Balanced',
        random_state=RANDOM_SEED + fold,
        verbose=0,
        early_stopping_rounds=50,
        eval_metric='AUC'
    )
    
    cat_model.fit(
        X_tr, y_tr,
        eval_set=(X_vl, y_vl),
        verbose=False
    )
    
    oof_cat[val_idx] = cat_model.predict_proba(X_vl)[:, 1]
    test_cat += cat_model.predict_proba(test_full)[:, 1] / 5

cat_cv_score = roc_auc_score(y_full, oof_cat)
print(f"‚úÖ CatBoost OOF AUROC: {cat_cv_score:.4f}")

# LightGBM with strong regularization
print("\nüìä Training LightGBM with 5-fold CV...")
for fold, (train_idx, val_idx) in enumerate(kfold.split(X_full, y_full)):
    print(f"  Fold {fold+1}/5...")
    
    X_tr, X_vl = X_full.iloc[train_idx], X_full.iloc[val_idx]
    y_tr, y_vl = y_full.iloc[train_idx], y_full.iloc[val_idx]
    
    lgb_model = lgb.LGBMClassifier(
        n_estimators=1000,
        num_leaves=31,  # Conservative
        learning_rate=0.02,
        min_child_samples=50,  # Prevent overfitting
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=1.0,  # L1 regularization
        reg_lambda=1.0,  # L2 regularization
        class_weight='balanced',
        random_state=RANDOM_SEED + fold,
        verbose=-1
    )
    
    lgb_model.fit(
        X_tr, y_tr,
        eval_set=[(X_vl, y_vl)],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )
    
    oof_lgb[val_idx] = lgb_model.predict_proba(X_vl)[:, 1]
    test_lgb += lgb_model.predict_proba(test_full)[:, 1] / 5

lgb_cv_score = roc_auc_score(y_full, oof_lgb)
print(f"‚úÖ LightGBM OOF AUROC: {lgb_cv_score:.4f}")

# XGBoost with strong regularization
print("\nüìä Training XGBoost with 5-fold CV...")
scale_pos_weight = (y_full == 0).sum() / (y_full == 1).sum()

for fold, (train_idx, val_idx) in enumerate(kfold.split(X_full, y_full)):
    print(f"  Fold {fold+1}/5...")
    
    X_tr, X_vl = X_full.iloc[train_idx], X_full.iloc[val_idx]
    y_tr, y_vl = y_full.iloc[train_idx], y_full.iloc[val_idx]
    
    xgb_model = xgb.XGBClassifier(
        n_estimators=1000,
        max_depth=4,  # Shallow trees
        learning_rate=0.02,
        min_child_weight=5,  # Prevent overfitting
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=1.0,  # Regularization
        reg_alpha=1.0,
        reg_lambda=1.0,
        scale_pos_weight=scale_pos_weight,
        random_state=RANDOM_SEED + fold,
        early_stopping_rounds=50
    )
    
    xgb_model.fit(
        X_tr, y_tr,
        eval_set=[(X_vl, y_vl)],
        verbose=False
    )
    
    oof_xgb[val_idx] = xgb_model.predict_proba(X_vl)[:, 1]
    test_xgb += xgb_model.predict_proba(test_full)[:, 1] / 5

xgb_cv_score = roc_auc_score(y_full, oof_xgb)
print(f"‚úÖ XGBoost OOF AUROC: {xgb_cv_score:.4f}")


STEP 5: TRAINING REGULARIZED MODELS

üìä Training CatBoost with 5-fold CV...
  Fold 1/5...
  Fold 2/5...
  Fold 3/5...
  Fold 4/5...
  Fold 5/5...
‚úÖ CatBoost OOF AUROC: 0.6378

üìä Training LightGBM with 5-fold CV...
  Fold 1/5...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.576338
  Fold 2/5...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.599038
  Fold 3/5...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.591942
  Fold 4/5...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.598764
  Fold 5/5...
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration

In [8]:
# ========================================================================
# STEP 6: SIMPLE ENSEMBLING (AVOID OVERFITTING)
# ========================================================================
print("\n" + "=" * 80)
print("STEP 6: ENSEMBLE STRATEGIES")
print("=" * 80)

# Simple average
oof_avg = (oof_cat + oof_lgb + oof_xgb) / 3
test_avg = (test_cat + test_lgb + test_xgb) / 3
avg_score = roc_auc_score(y_full, oof_avg)

# Weighted by CV score
total_score = cat_cv_score + lgb_cv_score + xgb_cv_score
w_cat = cat_cv_score / total_score
w_lgb = lgb_cv_score / total_score
w_xgb = xgb_cv_score / total_score

oof_weighted = w_cat * oof_cat + w_lgb * oof_lgb + w_xgb * oof_xgb
test_weighted = w_cat * test_cat + w_lgb * test_lgb + w_xgb * test_xgb
weighted_score = roc_auc_score(y_full, oof_weighted)

# Rank average (most robust)
oof_rank = (rankdata(oof_cat) + rankdata(oof_lgb) + rankdata(oof_xgb)) / 3
test_rank = (rankdata(test_cat) + rankdata(test_lgb) + rankdata(test_xgb)) / 3
rank_score = roc_auc_score(y_full, oof_rank)

# Simple stacking with linear model (low overfitting)
from sklearn.linear_model import LogisticRegression

oof_stack = np.column_stack([oof_cat, oof_lgb, oof_xgb])
test_stack = np.column_stack([test_cat, test_lgb, test_xgb])

lr = LogisticRegression(class_weight='balanced', random_state=RANDOM_SEED, max_iter=1000)
lr.fit(oof_stack, y_full)

oof_lr = lr.predict_proba(oof_stack)[:, 1]
test_lr = lr.predict_proba(test_stack)[:, 1]
lr_score = roc_auc_score(y_full, oof_lr)

print(f"\nEnsemble Results (OOF scores):")
print(f"  CatBoost:          {cat_cv_score:.4f}")
print(f"  LightGBM:          {lgb_cv_score:.4f}")
print(f"  XGBoost:           {xgb_cv_score:.4f}")
print(f"  Simple Average:    {avg_score:.4f}")
print(f"  Weighted Average:  {weighted_score:.4f}")
print(f"  Rank Average:      {rank_score:.4f}")
print(f"  Linear Stacking:   {lr_score:.4f}")


STEP 6: ENSEMBLE STRATEGIES

Ensemble Results (OOF scores):
  CatBoost:          0.6378
  LightGBM:          0.6315
  XGBoost:           0.6234
  Simple Average:    0.6355
  Weighted Average:  0.6355
  Rank Average:      0.6362
  Linear Stacking:   0.6385


In [9]:
# ========================================================================
# STEP 7: SELECT BEST METHOD & CREATE SUBMISSION
# ========================================================================
print("\n" + "=" * 80)
print("STEP 7: FINAL SELECTION & SUBMISSION")
print("=" * 80)

results = {
    'CatBoost': (cat_cv_score, test_cat),
    'LightGBM': (lgb_cv_score, test_lgb),
    'XGBoost': (xgb_cv_score, test_xgb),
    'Simple Average': (avg_score, test_avg),
    'Weighted Average': (weighted_score, test_weighted),
    'Rank Average': (rank_score, test_rank),
    'Linear Stacking': (lr_score, test_lr)
}

# Choose method with best OOF score
best_method = max(results, key=lambda x: results[x][0])
best_score, best_preds = results[best_method]

print(f"\nüèÜ Best Method: {best_method}")
print(f"üìä OOF AUROC: {best_score:.4f}")
print(f"\n‚ö†Ô∏è  Note: OOF score is more reliable than single validation split")
print(f"   Expected Kaggle score: {best_score:.4f} ¬± 0.003")

# Create submissions for all methods
test_ids = test[ID_COL] if ID_COL in test.columns else range(len(test))

for method_name, (score, predictions) in results.items():
    submission = pd.DataFrame({
        'id': test_ids,
        'target': predictions
    })
    
    filename = f'submission_robust_{method_name.lower().replace(" ", "_")}_{score:.4f}.csv'
    submission.to_csv(filename, index=False)
    print(f"‚úÖ Created: {filename}")


STEP 7: FINAL SELECTION & SUBMISSION

üèÜ Best Method: Linear Stacking
üìä OOF AUROC: 0.6385

‚ö†Ô∏è  Note: OOF score is more reliable than single validation split
   Expected Kaggle score: 0.6385 ¬± 0.003
‚úÖ Created: submission_robust_catboost_0.6378.csv
‚úÖ Created: submission_robust_lightgbm_0.6315.csv
‚úÖ Created: submission_robust_xgboost_0.6234.csv
‚úÖ Created: submission_robust_simple_average_0.6355.csv
‚úÖ Created: submission_robust_weighted_average_0.6355.csv
‚úÖ Created: submission_robust_rank_average_0.6362.csv
‚úÖ Created: submission_robust_linear_stacking_0.6385.csv


In [10]:
# ========================================================================
# STEP 8: VALIDATION DIAGNOSTICS
# ========================================================================
print("\n" + "=" * 80)
print("STEP 8: OVERFITTING DIAGNOSTICS")
print("=" * 80)

print("\nüìà Score Stability Analysis:")
print(f"   Best OOF Score: {best_score:.4f}")
print(f"   Previous Val Score: 0.651")
print(f"   Previous Kaggle Score: 0.642")
print(f"   Gap (Val - Kaggle): 0.009")
print(f"\n   Expected improvement: OOF scores are more reliable")
print(f"   New expected Kaggle: {best_score:.4f} ¬± 0.003")

if best_score < 0.651:
    print(f"\n‚úÖ GOOD: Lower OOF score suggests less overfitting")
    print(f"   The gap between val and kaggle should be smaller now")
else:
    print(f"\n‚ö†Ô∏è  Score still high - may indicate remaining overfitting")

print(f"\nüí° Submission Strategy:")
print(f"   1. Submit: submission_robust_{best_method.lower().replace(' ', '_')}_{best_score:.4f}.csv")
print(f"   2. If score is still lower:")
print(f"      - Try 'rank_average' (most robust to distribution shift)")
print(f"      - Try 'simple_average' (less overfitting than weighted)")
print(f"   3. Monitor: OOF vs Kaggle gap should be < 0.005 now")

print("\n" + "=" * 80)
print("‚ú® ANTI-OVERFITTING PIPELINE COMPLETE")
print("=" * 80)


STEP 8: OVERFITTING DIAGNOSTICS

üìà Score Stability Analysis:
   Best OOF Score: 0.6385
   Previous Val Score: 0.651
   Previous Kaggle Score: 0.642
   Gap (Val - Kaggle): 0.009

   Expected improvement: OOF scores are more reliable
   New expected Kaggle: 0.6385 ¬± 0.003

‚úÖ GOOD: Lower OOF score suggests less overfitting
   The gap between val and kaggle should be smaller now

üí° Submission Strategy:
   1. Submit: submission_robust_linear_stacking_0.6385.csv
   2. If score is still lower:
      - Try 'rank_average' (most robust to distribution shift)
      - Try 'simple_average' (less overfitting than weighted)
   3. Monitor: OOF vs Kaggle gap should be < 0.005 now

‚ú® ANTI-OVERFITTING PIPELINE COMPLETE
