In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import joblib
import time

# Reproducible seed
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [2]:
print("=" * 70)
print("STEP 1: LOADING DATA")
print("=" * 70)

train = pd.read_csv('train1.csv')
test = pd.read_csv('test.csv')

TARGET = 'target'
ID_COL = 'id'

print(f"Train shape: {train.shape}")
print(f"Test shape:  {test.shape}")
print(f"Target distribution:\n{train[TARGET].value_counts(normalize=True)}")

# Calculate class imbalance ratio
class_ratio = (train[TARGET] == 0).sum() / (train[TARGET] == 1).sum()
print(f"Class imbalance ratio: {class_ratio:.2f}:1")

STEP 1: LOADING DATA
Train shape: (296209, 67)
Test shape:  (126948, 66)
Target distribution:
target
0    0.948732
1    0.051268
Name: proportion, dtype: float64
Class imbalance ratio: 18.51:1


In [3]:
print("\n" + "=" * 70)
print("STEP 2: FEATURE ENGINEERING")
print("=" * 70)

def create_features(df, is_train=True):
    """
    Comprehensive feature engineering pipeline
    """
    df = df.copy()
    
    # ===== 2.1: Missing Value Indicators =====
    print("Creating missing value indicators...")
    missing_cols = df.columns[df.isnull().any()].tolist()
    if ID_COL in missing_cols:
        missing_cols.remove(ID_COL)
    if TARGET in missing_cols and is_train:
        missing_cols.remove(TARGET)
    
    for col in missing_cols:
        df[f'{col}_missing'] = df[col].isnull().astype(int)
    
    # ===== 2.2: Interaction Features =====
    print("Creating interaction features...")
    
    # High-value interactions based on domain knowledge
    if 'ps_car_13' in df.columns and 'ps_reg_03' in df.columns:
        df['car13_reg03_interaction'] = df['ps_car_13'] * df['ps_reg_03']
    
    if 'ps_ind_15' in df.columns and 'ps_reg_01' in df.columns:
        df['ind15_reg01_interaction'] = df['ps_ind_15'] * df['ps_reg_01']
    
    if 'ps_car_13' in df.columns and 'ps_car_15' in df.columns:
        df['car13_car15_ratio'] = df['ps_car_13'] / (df['ps_car_15'] + 1e-5)
    
    if 'ps_reg_02' in df.columns and 'ps_reg_03' in df.columns:
        df['reg02_reg03_product'] = df['ps_reg_02'] * df['ps_reg_03']
    
    # ===== 2.3: Polynomial Features (key variables) =====
    print("Creating polynomial features...")
    poly_cols = ['ps_car_13', 'ps_reg_03', 'ps_car_15', 'ps_ind_15']
    
    for col in poly_cols:
        if col in df.columns:
            df[f'{col}_squared'] = df[col] ** 2
            df[f'{col}_cubed'] = df[col] ** 3
            df[f'{col}_sqrt'] = np.sqrt(np.abs(df[col]))
    
    # ===== 2.4: Aggregation Features =====
    print("Creating aggregation features...")
    
    # Sum of all car features
    car_cols = [c for c in df.columns if c.startswith('ps_car_') and c.endswith('_cat')]
    if car_cols:
        df['car_cat_sum'] = df[car_cols].sum(axis=1)
        df['car_cat_mean'] = df[car_cols].mean(axis=1)
    
    # Sum of all ind features
    ind_cols = [c for c in df.columns if c.startswith('ps_ind_') and c.endswith('_bin')]
    if ind_cols:
        df['ind_bin_sum'] = df[ind_cols].sum(axis=1)
    
    # Sum of all calc features
    calc_cols = [c for c in df.columns if c.startswith('ps_calc_')]
    if calc_cols:
        df['calc_sum'] = df[calc_cols].sum(axis=1)
        df['calc_mean'] = df[calc_cols].mean(axis=1)
        df['calc_std'] = df[calc_cols].std(axis=1)
    
    # ===== 2.5: Binning Continuous Variables =====
    print("Creating binned features...")
    
    if 'ps_reg_03' in df.columns:
        df['ps_reg_03_binned'] = pd.qcut(df['ps_reg_03'].fillna(-1), q=10, labels=False, duplicates='drop')
    
    if 'ps_car_13' in df.columns:
        df['ps_car_13_binned'] = pd.qcut(df['ps_car_13'].fillna(-1), q=10, labels=False, duplicates='drop')
    
    # ===== 2.6: Combination Features =====
    print("Creating combination features...")
    
    if 'ps_ind_06_bin' in df.columns and 'ps_ind_07_bin' in df.columns:
        df['ind_06_07_combined'] = df['ps_ind_06_bin'].astype(str) + '_' + df['ps_ind_07_bin'].astype(str)
        df['ind_06_07_combined'] = LabelEncoder().fit_transform(df['ind_06_07_combined'])
    
    if 'ps_car_01_cat' in df.columns and 'ps_car_02_cat' in df.columns:
        df['car_01_02_combined'] = df['ps_car_01_cat'].astype(str) + '_' + df['ps_car_02_cat'].astype(str)
        df['car_01_02_combined'] = LabelEncoder().fit_transform(df['car_01_02_combined'])
    
    print(f"Feature engineering complete. New shape: {df.shape}")
    return df

# Apply feature engineering
print("\nApplying to train set...")
train_fe = create_features(train, is_train=True)

print("Applying to test set...")
test_fe = create_features(test, is_train=False)


STEP 2: FEATURE ENGINEERING

Applying to train set...
Creating missing value indicators...
Creating interaction features...
Creating polynomial features...
Creating aggregation features...
Creating binned features...
Creating combination features...
Feature engineering complete. New shape: (296209, 107)
Applying to test set...
Creating missing value indicators...
Creating interaction features...
Creating polynomial features...
Creating aggregation features...
Creating binned features...
Creating combination features...
Feature engineering complete. New shape: (126948, 104)


In [4]:
print("\n" + "=" * 70)
print("STEP 3: FEATURE SELECTION")
print("=" * 70)

# Separate features and target
X_full = train_fe.drop([TARGET, ID_COL], axis=1)
y_full = train_fe[TARGET]
test_final = test_fe.drop([ID_COL], axis=1, errors='ignore')

# Fill missing values temporarily for feature selection
X_full_filled = X_full.fillna(-999)
test_final_filled = test_final.fillna(-999)

# Quick feature importance with LightGBM
print("Calculating feature importance...")
lgb_selector = lgb.LGBMClassifier(
    n_estimators=100,
    random_state=RANDOM_SEED,
    verbose=-1,
    class_weight='balanced'
)
lgb_selector.fit(X_full_filled, y_full)

# Get feature importance
feature_importance = pd.DataFrame({
    'feature': X_full.columns,
    'importance': lgb_selector.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 30 most important features:")
print(feature_importance.head(30))

# Select top N features (keeping more features now due to engineering)
N_FEATURES = 60  # Increased from 22
selected_features = feature_importance.head(N_FEATURES)['feature'].tolist()

print(f"\nSelected {len(selected_features)} features")

# Apply selection
X_full = X_full[selected_features]
test_final = test_final[selected_features]


STEP 3: FEATURE SELECTION
Calculating feature importance...


  File "C:\Users\SHEIKHANI LAPTOP\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\joblib\externals\loky\backend\context.py", line 247, in _count_physical_cores
    cpu_count_physical = _count_physical_cores_win32()
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\SHEIKHANI LAPTOP\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\joblib\externals\loky\backend\context.py", line 299, in _count_physical_cores_win32
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.11_3.11.2544.0_x64__qbz5n2kfra8p0\Lib\subprocess.py


Top 30 most important features:
                     feature  importance
62                  feature6         144
34                 ps_car_13         136
15                 ps_ind_03         125
63                  feature7         117
98                  calc_sum         110
35                 ps_car_14         104
100                 calc_std          99
81         car13_car15_ratio          96
58                  feature2          91
25                 ps_ind_15          90
96              car_cat_mean          83
80   ind15_reg01_interaction          83
2              ps_ind_05_cat          77
60                  feature4          72
79   car13_reg03_interaction          69
104       car_01_02_combined          66
31                 ps_reg_03          66
29                 ps_reg_01          65
95               car_cat_sum          62
82       reg02_reg03_product          59
37                ps_calc_01          49
13             ps_car_11_cat          48
36                 ps_ca

In [5]:
print("\n" + "=" * 70)
print("STEP 4: HANDLING MISSING VALUES")
print("=" * 70)

# For tree-based models, -999 is often better than mean/median
X_full = X_full.fillna(-999)
test_final = test_final.fillna(-999)

print(f"Missing values in train: {X_full.isnull().sum().sum()}")
print(f"Missing values in test: {test_final.isnull().sum().sum()}")


STEP 4: HANDLING MISSING VALUES
Missing values in train: 0
Missing values in test: 0


In [6]:
print("\n" + "=" * 70)
print("STEP 5: CREATING TRAIN/VALIDATION SPLIT")
print("=" * 70)

X_train, X_val, y_train, y_val = train_test_split(
    X_full, y_full, 
    test_size=0.25, 
    stratify=y_full, 
    random_state=RANDOM_SEED
)

print(f"X_train: {X_train.shape}")
print(f"X_val: {X_val.shape}")
print(f"Target distribution in train: {y_train.value_counts(normalize=True).to_dict()}")
print(f"Target distribution in val: {y_val.value_counts(normalize=True).to_dict()}")


STEP 5: CREATING TRAIN/VALIDATION SPLIT
X_train: (222156, 60)
X_val: (74053, 60)
Target distribution in train: {0: 0.9487342227983939, 1: 0.05126577720160608}
Target distribution in val: {0: 0.9487259125221126, 1: 0.05127408747788746}


In [7]:
print("\n" + "=" * 70)
print("STEP 6: DEFINING MODELS WITH CLASS WEIGHTS")
print("=" * 70)

# Calculate scale_pos_weight for XGBoost/LightGBM
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"Scale pos weight: {scale_pos_weight:.2f}")

models = {
    'XGBoost': xgb.XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        random_state=RANDOM_SEED,
        eval_metric='auc',
        tree_method='hist'
    ),
    'LightGBM': lgb.LGBMClassifier(
        class_weight='balanced',
        random_state=RANDOM_SEED,
        verbose=-1,
        metric='auc'
    ),
    'CatBoost': CatBoostClassifier(
        auto_class_weights='Balanced',
        random_state=RANDOM_SEED,
        verbose=0,
        eval_metric='AUC'
    )
}


STEP 6: DEFINING MODELS WITH CLASS WEIGHTS
Scale pos weight: 18.51


In [8]:
print("\n" + "=" * 70)
print("STEP 7: HYPERPARAMETER TUNING (EXPANDED GRIDS)")
print("=" * 70)

param_grids = {
    'XGBoost': {
        'n_estimators': [300, 500, 700],
        'max_depth': [3, 4, 5, 6],
        'learning_rate': [0.01, 0.03, 0.05, 0.07],
        'min_child_weight': [1, 3, 5],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
        'gamma': [0, 0.1, 0.2]
    },
    'LightGBM': {
        'n_estimators': [300, 500, 700],
        'num_leaves': [31, 50, 70, 90],
        'learning_rate': [0.01, 0.03, 0.05, 0.07],
        'min_child_samples': [20, 30, 50],
        'subsample': [0.8, 0.9, 1.0],
        'colsample_bytree': [0.8, 0.9, 1.0],
        'reg_alpha': [0, 0.1, 0.5],
        'reg_lambda': [0, 0.1, 0.5]
    },
    'CatBoost': {
        'iterations': [300, 500, 700],
        'depth': [4, 6, 8, 10],
        'learning_rate': [0.01, 0.03, 0.05, 0.07],
        'l2_leaf_reg': [1, 3, 5, 7],
        'border_count': [32, 64, 128],
        'bagging_temperature': [0, 0.5, 1.0]
    }
}

# Randomized search for efficiency
from sklearn.model_selection import RandomizedSearchCV

tuning_results = []
best_models = {}

for name, model in models.items():
    print(f"\n{'='*50}")
    print(f"Tuning {name}...")
    print(f"{'='*50}")
    
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_grids[name],
        n_iter=30,  # Test 30 random combinations
        scoring='roc_auc',
        cv=5,  # 5-fold CV for robustness
        random_state=RANDOM_SEED,
        n_jobs=-1,
        verbose=1
    )
    
    start = time.time()
    search.fit(X_train, y_train)
    train_time = time.time() - start
    
    # Get best model
    best_model = search.best_estimator_
    best_params = search.best_params_
    
    # Evaluate on validation
    start_pred = time.time()
    y_pred_proba = best_model.predict_proba(X_val)[:, 1]
    pred_time = time.time() - start_pred
    
    val_auc = roc_auc_score(y_val, y_pred_proba)
    
    # Cross-validation score for reliability
    cv_scores = cross_val_score(
        best_model, X_train, y_train,
        cv=5, scoring='roc_auc', n_jobs=-1
    )
    
    print(f"\n{'='*50}")
    print(f"‚úÖ {name} Results:")
    print(f"{'='*50}")
    print(f"Best Params: {best_params}")
    print(f"Validation AUROC: {val_auc:.4f}")
    print(f"CV AUROC: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
    print(f"Train Time: {train_time:.2f}s | Predict Time: {pred_time:.2f}s")
    
    # Save results
    tuning_results.append({
        'Model': name,
        'Best Params': best_params,
        'Val AUROC': round(val_auc, 4),
        'CV AUROC Mean': round(cv_scores.mean(), 4),
        'CV AUROC Std': round(cv_scores.std(), 4),
        'Train Time (s)': round(train_time, 2)
    })
    
    # Store best model
    best_models[name] = best_model
    
    # Save model
    joblib.dump(best_model, f'best_{name.lower()}_enhanced.joblib')

# Results DataFrame
results_df = pd.DataFrame(tuning_results).sort_values('Val AUROC', ascending=False)
print("\n" + "=" * 70)
print("TUNING RESULTS SUMMARY:")
print("=" * 70)
print(results_df.to_string(index=False))


STEP 7: HYPERPARAMETER TUNING (EXPANDED GRIDS)

Tuning XGBoost...
Fitting 5 folds for each of 30 candidates, totalling 150 fits

‚úÖ XGBoost Results:
Best Params: {'subsample': 0.9, 'n_estimators': 700, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.01, 'gamma': 0.1, 'colsample_bytree': 0.8}
Validation AUROC: 0.6353
CV AUROC: 0.6352 (+/- 0.0028)
Train Time: 2286.07s | Predict Time: 0.72s

Tuning LightGBM...
Fitting 5 folds for each of 30 candidates, totalling 150 fits

‚úÖ LightGBM Results:
Best Params: {'subsample': 0.9, 'reg_lambda': 0.1, 'reg_alpha': 0.1, 'num_leaves': 31, 'n_estimators': 500, 'min_child_samples': 50, 'learning_rate': 0.01, 'colsample_bytree': 0.8}
Validation AUROC: 0.6359
CV AUROC: 0.6340 (+/- 0.0024)
Train Time: 4250.89s | Predict Time: 2.00s

Tuning CatBoost...
Fitting 5 folds for each of 30 candidates, totalling 150 fits

‚úÖ CatBoost Results:
Best Params: {'learning_rate': 0.03, 'l2_leaf_reg': 7, 'iterations': 500, 'depth': 4, 'border_count': 128, '

In [9]:
print("\n" + "=" * 70)
print("STEP 8: CREATING STACKING ENSEMBLE")
print("=" * 70)

# Create base estimators from best models
estimators = [
    ('xgb', best_models['XGBoost']),
    ('lgb', best_models['LightGBM']),
    ('cat', best_models['CatBoost'])
]

# Create stacking classifier
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(
        class_weight='balanced',
        random_state=RANDOM_SEED,
        max_iter=1000
    ),
    cv=5,
    n_jobs=-1
)

print("Training stacking ensemble...")
start = time.time()
stacking_model.fit(X_train, y_train)
stack_train_time = time.time() - start

# Evaluate stacking
y_pred_stack = stacking_model.predict_proba(X_val)[:, 1]
stack_auc = roc_auc_score(y_val, y_pred_stack)

# Cross-validation
stack_cv_scores = cross_val_score(
    stacking_model, X_train, y_train,
    cv=5, scoring='roc_auc', n_jobs=-1
)

print(f"\n{'='*50}")
print("‚úÖ STACKING ENSEMBLE Results:")
print(f"{'='*50}")
print(f"Validation AUROC: {stack_auc:.4f}")
print(f"CV AUROC: {stack_cv_scores.mean():.4f} (+/- {stack_cv_scores.std():.4f})")
print(f"Train Time: {stack_train_time:.2f}s")

# Save stacking model
joblib.dump(stacking_model, 'best_stacking_ensemble.joblib')


STEP 8: CREATING STACKING ENSEMBLE
Training stacking ensemble...

‚úÖ STACKING ENSEMBLE Results:
Validation AUROC: 0.6374
CV AUROC: 0.6363 (+/- 0.0022)
Train Time: 504.51s


['best_stacking_ensemble.joblib']

In [10]:
print("\n" + "=" * 70)
print("STEP 9: CREATING WEIGHTED ENSEMBLE")
print("=" * 70)

# Get predictions from all models
pred_xgb = best_models['XGBoost'].predict_proba(X_val)[:, 1]
pred_lgb = best_models['LightGBM'].predict_proba(X_val)[:, 1]
pred_cat = best_models['CatBoost'].predict_proba(X_val)[:, 1]

# Calculate individual AUC scores
auc_xgb = roc_auc_score(y_val, pred_xgb)
auc_lgb = roc_auc_score(y_val, pred_lgb)
auc_cat = roc_auc_score(y_val, pred_cat)

print(f"XGBoost Val AUC: {auc_xgb:.4f}")
print(f"LightGBM Val AUC: {auc_lgb:.4f}")
print(f"CatBoost Val AUC: {auc_cat:.4f}")

# Simple average
pred_avg = (pred_xgb + pred_lgb + pred_cat) / 3
auc_avg = roc_auc_score(y_val, pred_avg)
print(f"\nSimple Average AUC: {auc_avg:.4f}")

# Weighted by performance
total_auc = auc_xgb + auc_lgb + auc_cat
w_xgb = auc_xgb / total_auc
w_lgb = auc_lgb / total_auc
w_cat = auc_cat / total_auc

pred_weighted = w_xgb * pred_xgb + w_lgb * pred_lgb + w_cat * pred_cat
auc_weighted = roc_auc_score(y_val, pred_weighted)
print(f"Weighted Average AUC: {auc_weighted:.4f}")
print(f"Weights: XGB={w_xgb:.3f}, LGB={w_lgb:.3f}, CAT={w_cat:.3f}")


STEP 9: CREATING WEIGHTED ENSEMBLE
XGBoost Val AUC: 0.6353
LightGBM Val AUC: 0.6359
CatBoost Val AUC: 0.6378

Simple Average AUC: 0.6373
Weighted Average AUC: 0.6373
Weights: XGB=0.333, LGB=0.333, CAT=0.334


In [11]:
print("\n" + "=" * 70)
print("STEP 10: FINAL MODEL SELECTION")
print("=" * 70)

# Compare all approaches
final_scores = {
    'XGBoost': auc_xgb,
    'LightGBM': auc_lgb,
    'CatBoost': auc_cat,
    'Stacking': stack_auc,
    'Simple Average': auc_avg,
    'Weighted Average': auc_weighted
}

best_approach = max(final_scores, key=final_scores.get)
best_score = final_scores[best_approach]

print("\nFinal Validation Scores:")
for approach, score in sorted(final_scores.items(), key=lambda x: x[1], reverse=True):
    print(f"  {approach:20s}: {score:.4f}")

print(f"\nüèÜ BEST APPROACH: {best_approach} (AUROC = {best_score:.4f})")


STEP 10: FINAL MODEL SELECTION

Final Validation Scores:
  CatBoost            : 0.6378
  Stacking            : 0.6374
  Weighted Average    : 0.6373
  Simple Average      : 0.6373
  LightGBM            : 0.6359
  XGBoost             : 0.6353

üèÜ BEST APPROACH: CatBoost (AUROC = 0.6378)


In [12]:
print("\n" + "=" * 70)
print("STEP 11: RETRAINING ON FULL DATA & CREATING SUBMISSIONS")
print("=" * 70)

# Retrain best individual models on full data
final_models = {}

for name, model in best_models.items():
    print(f"\nRetraining {name} on full training data...")
    model.fit(X_full, y_full)
    final_models[name] = model

# Retrain stacking on full data
print("\nRetraining Stacking Ensemble on full training data...")
stacking_model.fit(X_full, y_full)

# Generate predictions
print("\nGenerating predictions on test set...")

pred_test_xgb = final_models['XGBoost'].predict_proba(test_final)[:, 1]
pred_test_lgb = final_models['LightGBM'].predict_proba(test_final)[:, 1]
pred_test_cat = final_models['CatBoost'].predict_proba(test_final)[:, 1]
pred_test_stack = stacking_model.predict_proba(test_final)[:, 1]
pred_test_avg = (pred_test_xgb + pred_test_lgb + pred_test_cat) / 3
pred_test_weighted = w_xgb * pred_test_xgb + w_lgb * pred_test_lgb + w_cat * pred_test_cat

# Create submissions
submissions = {
    'xgboost': pred_test_xgb,
    'lightgbm': pred_test_lgb,
    'catboost': pred_test_cat,
    'stacking': pred_test_stack,
    'simple_average': pred_test_avg,
    'weighted_average': pred_test_weighted
}

test_ids = test[ID_COL] if ID_COL in test.columns else range(len(test))

for name, predictions in submissions.items():
    submission = pd.DataFrame({
        'id': test_ids,
        'target': predictions
    })
    filename = f'submission_{name}_enhanced.csv'
    submission.to_csv(filename, index=False)
    print(f"‚úÖ Created: {filename}")


STEP 11: RETRAINING ON FULL DATA & CREATING SUBMISSIONS

Retraining XGBoost on full training data...

Retraining LightGBM on full training data...

Retraining CatBoost on full training data...

Retraining Stacking Ensemble on full training data...

Generating predictions on test set...
‚úÖ Created: submission_xgboost_enhanced.csv
‚úÖ Created: submission_lightgbm_enhanced.csv
‚úÖ Created: submission_catboost_enhanced.csv
‚úÖ Created: submission_stacking_enhanced.csv
‚úÖ Created: submission_simple_average_enhanced.csv
‚úÖ Created: submission_weighted_average_enhanced.csv


In [13]:
print("\n" + "=" * 70)
print("üéâ PIPELINE COMPLETE!")
print("=" * 70)
print(f"\nüìä Final Statistics:")
print(f"  - Features engineered: {X_full.shape[1]} (from original ~65)")
print(f"  - Best validation AUROC: {best_score:.4f}")
print(f"  - Best approach: {best_approach}")
print(f"  - Submissions created: {len(submissions)}")
print(f"\nüí° Recommendations:")
print(f"  1. Submit '{best_approach.lower().replace(' ', '_')}_enhanced.csv' first")
print(f"  2. Try 'weighted_average_enhanced.csv' as alternative")
print(f"  3. Monitor Kaggle leaderboard scores")
print(f"  4. If score < 0.65, consider:")
print(f"     - More feature engineering iterations")
print(f"     - Deeper hyperparameter tuning")
print(f"     - Neural network models")
print(f"     - Advanced ensembling techniques")

print("\n" + "=" * 70)


üéâ PIPELINE COMPLETE!

üìä Final Statistics:
  - Features engineered: 60 (from original ~65)
  - Best validation AUROC: 0.6378
  - Best approach: CatBoost
  - Submissions created: 6

üí° Recommendations:
  1. Submit 'catboost_enhanced.csv' first
  2. Try 'weighted_average_enhanced.csv' as alternative
  3. Monitor Kaggle leaderboard scores
  4. If score < 0.65, consider:
     - More feature engineering iterations
     - Deeper hyperparameter tuning
     - Neural network models
     - Advanced ensembling techniques

