Imports

In [1]:
# --- Imports ---
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.base import clone
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import LabelEncoder


# Models
from sklearn.naive_bayes import CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_curve, roc_auc_score
import time
from matplotlib import pyplot
import warnings
warnings.filterwarnings('ignore')

# reproducible seed
RANDOM_SEED = 42


Fetching and reading

In [2]:
train = pd.read_csv('train1.csv')
test  = pd.read_csv('test.csv')   # used to generate final submission

# quick shape
print("Train shape:", train.shape)
print("Test shape: ", test.shape)


Train shape: (296209, 67)
Test shape:  (126948, 66)


Train-test split

In [4]:
X = train.drop('target', axis=1)
y = train['target']

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)
print("X_train", X_train.shape, "X_val", X_val.shape)

print(f"Target distribution in train: {y_train.value_counts(normalize=True).to_dict()}")
print(f"Target distribution in val: {y_val.value_counts(normalize=True).to_dict()}")

X_train (222156, 66) X_val (74053, 66)
Target distribution in train: {0: 0.9487342227983939, 1: 0.05126577720160608}
Target distribution in val: {0: 0.9487259125221126, 1: 0.05127408747788746}


In [5]:
print(f"Target distribution:\n{y.value_counts()}")

Target distribution:
target
0    281023
1     15186
Name: count, dtype: int64


Imputing

In [9]:
from sklearn.impute import SimpleImputer

bin_cols = [c for c in X_train.columns if '_bin' in c]
cat_cols = [c for c in X_train.columns if '_cat' in c]
num_cols = [c for c in X_train.columns if c not in bin_cols + cat_cols + ['id']]

# Categorical columns ‚Üí mode (most_frequent)
imputer_cat = SimpleImputer(strategy='most_frequent')
X_train[cat_cols] = imputer_cat.fit_transform(X_train[cat_cols])
X_val[cat_cols] = imputer_cat.transform(X_val[cat_cols])

# Fill numeric NaNs with median from training data
median_vals = X_train[num_cols].median()
X_train[num_cols] = X_train[num_cols].fillna(median_vals)
X_val[num_cols] = X_val[num_cols].fillna(median_vals)

# fill binary columns with mode
imputer_bin = SimpleImputer(strategy='most_frequent')
X_train[bin_cols] = imputer_bin.fit_transform(X_train[bin_cols])
X_val[bin_cols] = imputer_bin.transform(X_val[bin_cols])



Slicing

In [10]:
print("Binary cols:", len(bin_cols))
print("Categorical cols:", len(cat_cols))
print("Numeric cols:", len(num_cols))

print("Binary Columns ({}):".format(len(bin_cols)), bin_cols, "\n")
print("Categorical Columns ({}):".format(len(cat_cols)), cat_cols, "\n")
print("Numeric Columns ({}):".format(len(num_cols)), num_cols)


Binary cols: 17
Categorical cols: 14
Numeric cols: 34
Binary Columns (17): ['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin'] 

Categorical Columns (14): ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat'] 

Numeric Columns (34): ['ps_ind_01', 'ps_ind_03', 'ps_ind_14', 'ps_ind_15', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03', 'ps_car_11', 'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06', 'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11', 'ps_calc_12', 'ps_calc_13',

In [12]:
X_train.isna().sum().sum(), X_val.isna().sum().sum() #no more Na columns


(0, 0)

In [13]:
# Convert all categorical (_cat) columns to category dtype
for c in cat_cols:
    X_train[c] = X_train[c].astype('category')
    X_val[c] = X_val[c].astype('category')


In [14]:
X_train.dtypes.value_counts()


int64       37
float64     15
category     6
category     1
category     1
category     1
category     1
category     1
category     1
category     1
category     1
Name: count, dtype: int64

Preprocessors

In [22]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler


preprocessor_cnb = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
    ('bin', 'passthrough', bin_cols)
])


preprocessor_gnb = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('bin', 'passthrough', bin_cols)
])


preprocessor_knn = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols),
    ('bin', 'passthrough', bin_cols)
])


# No scaling or encoding required
preprocessor_tree = ColumnTransformer([
    ('num', 'passthrough', num_cols),
    ('cat', 'passthrough', cat_cols),
    ('bin', 'passthrough', bin_cols)
])


Models

In [23]:
# ===========================
# üß† DEFINE BASE MODELS (no hyperparameters yet)
# ===========================

models = {
    "CategoricalNB": Pipeline([
        ("preprocessor", preprocessor_cnb),
        ("model", CategoricalNB())
    ]),

    "GaussianNB": Pipeline([
        ("preprocessor", preprocessor_gnb),
        ("model", GaussianNB())
    ]),

    "KNN": Pipeline([
        ("preprocessor", preprocessor_knn),
        ("model", KNeighborsClassifier())
    ]),

    "DecisionTree": Pipeline([
        ("preprocessor", preprocessor_tree),
        ("model", DecisionTreeClassifier(random_state=42))
    ]),

    "RandomForest": Pipeline([
        ("preprocessor", preprocessor_tree),
        ("model", RandomForestClassifier(random_state=42))
    ]),

    "AdaBoost": Pipeline([
        ("preprocessor", preprocessor_tree),
        ("model", AdaBoostClassifier(random_state=42))
    ]),

    "XGBoost": Pipeline([
        ("preprocessor", preprocessor_tree),
        ("model", xgb.XGBClassifier(
            random_state=42,
            use_label_encoder=False,
            eval_metric='logloss'
        ))
    ]),

    "LightGBM": Pipeline([
        ("preprocessor", preprocessor_tree),
        ("model", lgb.LGBMClassifier(random_state=42))
    ]),

    "CatBoost": Pipeline([
        ("preprocessor", preprocessor_tree),
        ("model", CatBoostClassifier(
            verbose=0,
            random_state=42
        ))
    ])
}
print("‚úÖ Models defined successfully with preprocessing pipelines.")

‚úÖ Models defined successfully with preprocessing pipelines.


Hyperparameter grids

In [24]:
# ===========================
# üéõÔ∏è DEFINE HYPERPARAMETER GRIDS (lightweight)
# ===========================

param_grids = {

    # --- Categorical Naive Bayes ---
    "CategoricalNB": {
        "model__alpha": [0.5, 1.0, 2.0]
    },

    # --- Gaussian Naive Bayes ---
    "GaussianNB": {
        "model__var_smoothing": [1e-9, 1e-7]
    },

    # --- K-Nearest Neighbors ---
    "KNN": {
        "model__n_neighbors": [5, 7, 9],
        "model__weights": ["uniform", "distance"]
    },

    # --- Decision Tree ---
    "DecisionTree": {
        "model__criterion": ["gini", "entropy"],
        "model__max_depth": [5, 7],
        "model__min_samples_split": [2, 5],
        "model__min_samples_leaf": [1,2]
    },

    # --- Random Forest ---
    "RandomForest": {
        "model__n_estimators": [100],
        "model__max_depth": [5,10],
        "model__min_samples_split": [2, 5]
    },

    # --- AdaBoost ---
    "AdaBoost": {
        "model__n_estimators": [50, 100],
        "model__learning_rate": [0.01, 0.1, 0.5]
    },

    # --- XGBoost ---
    "XGBoost": {
        "model__n_estimators": [100],
        "model__max_depth": [3, 5],
        "model__learning_rate": [0.05, 0.1],
        "model__subsample": [0.8, 1.0]
    },

    # --- LightGBM ---
    "LightGBM": {
        "model__n_estimators": [100],
        "model__num_leaves": [31, 63],
        "model__learning_rate": [0.05, 0.1],
        "model__subsample": [0.8, 1.0]
    },

    # --- CatBoost ---
    "CatBoost": {
        "model__depth": [4, 6, 8],
        "model__learning_rate": [0.03, 0.1],
        "model__iterations": [200]
    }
}


Random Search CV large parameter grid

In [27]:
import time
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
import pandas as pd

results = []

# Loop through each model
for name, (prep, model) in models.items():
    print(f"\n[{time.strftime('%H:%M:%S')}] üîπ Starting {name} tuning with Random Search...")

    pipe = Pipeline([
        ('preprocessor', prep),
        ('model', model)
    ])

    # --- Training time ---
    start_train = time.time()

    random_search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=param_grids[name],
        n_iter=5,                 # number of random combinations to try (tune as needed)
        scoring='roc_auc',
        cv=3,
        n_jobs=1,                 # use all cores for speed
        random_state=42,
        verbose=1
    )

    random_search.fit(X_train, y_train)
    train_time = time.time() - start_train

    # --- Prediction time ---
    best_model = random_search.best_estimator_
    start_pred = time.time()
    y_pred = best_model.predict_proba(X_val)[:, 1]
    pred_time = time.time() - start_pred

    # --- AUROC ---
    auc = roc_auc_score(y_val, y_pred)

    results.append({
        'Model': name,
        'Best AUROC': auc,
        'Train Time (s)': round(train_time, 2),
        'Predict Time (s)': round(pred_time, 2),
        'Best Params': random_search.best_params_
    })

# --- Results summary ---
results_df = pd.DataFrame(results).sort_values(by='Best AUROC', ascending=False).reset_index(drop=True)

print("\n===============================")
print("üèÜ FINAL COMPARISON OF MODELS (Random Search)")
print("===============================")
display(results_df)

best_model_name = results_df.iloc[0]['Model']
best_auc = results_df.iloc[0]['Best AUROC']
print(f"\nüèÜ Best tuned model: {best_model_name} (AUROC = {best_auc:.4f})")



[22:19:29] üîπ Starting CategoricalNB tuning with Random Search...
Fitting 3 folds for each of 3 candidates, totalling 9 fits

[22:20:14] üîπ Starting GaussianNB tuning with Random Search...
Fitting 3 folds for each of 2 candidates, totalling 6 fits

[22:20:24] üîπ Starting KNN tuning with Random Search...
Fitting 3 folds for each of 5 candidates, totalling 15 fits

[22:58:31] üîπ Starting DecisionTree tuning with Random Search...
Fitting 3 folds for each of 5 candidates, totalling 15 fits

[22:59:19] üîπ Starting RandomForest tuning with Random Search...
Fitting 3 folds for each of 4 candidates, totalling 12 fits

[23:05:35] üîπ Starting AdaBoost tuning with Random Search...
Fitting 3 folds for each of 5 candidates, totalling 15 fits

[23:15:43] üîπ Starting XGBoost tuning with Random Search...
Fitting 3 folds for each of 5 candidates, totalling 15 fits

[23:16:40] üîπ Starting LightGBM tuning with Random Search...
Fitting 3 folds for each of 5 candidates, totalling 15 fits
[

Unnamed: 0,Model,Best AUROC,Train Time (s),Predict Time (s),Best Params
0,CatBoost,0.636402,209.99,0.23,"{'model__learning_rate': 0.1, 'model__iteratio..."
1,XGBoost,0.63344,56.28,0.25,"{'model__subsample': 0.8, 'model__n_estimators..."
2,LightGBM,0.633219,58.96,0.37,"{'model__subsample': 1.0, 'model__num_leaves':..."
3,AdaBoost,0.625238,604.79,2.72,"{'model__n_estimators': 100, 'model__learning_..."
4,RandomForest,0.622117,375.65,1.03,"{'model__n_estimators': 100, 'model__min_sampl..."
5,CategoricalNB,0.60572,43.53,1.68,{'model__alpha': 0.5}
6,DecisionTree,0.60235,47.06,0.15,"{'model__min_samples_split': 5, 'model__min_sa..."
7,GaussianNB,0.598768,9.57,0.49,{'model__var_smoothing': 1e-09}
8,KNN,0.521121,2110.85,176.62,"{'model__weights': 'distance', 'model__n_neigh..."



üèÜ Best tuned model: CatBoost (AUROC = 0.6364)


Grid Search CV on narrow hyperparameter grid

In [28]:
import time
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score

# --- Narrow parameter grids ---
cat_param_grid = {
    'model__depth': [5, 6, 7],
    'model__learning_rate': [0.05, 0.1],
    'model__iterations': [300, 500],
    'model__l2_leaf_reg': [3, 5, 7]
}

xgb_param_grid = {
    'model__max_depth': [4, 5, 6],
    'model__learning_rate': [0.05, 0.1],
    'model__n_estimators': [200, 400],
    'model__subsample': [0.8, 1.0],
    'model__colsample_bytree': [0.8, 1.0]
}

lgbm_param_grid = {
    'model__num_leaves': [15, 31, 63],
    'model__learning_rate': [0.05, 0.1],
    'model__n_estimators': [200, 400],
    'model__subsample': [0.8, 1.0]
}

param_grids_top3 = {
    'CatBoost': cat_param_grid,
    'XGBoost': xgb_param_grid,
    'LightGBM': lgbm_param_grid
}

# --- Reuse your preprocessor ---
preprocessor_tree = ColumnTransformer([
    ('num', 'passthrough', num_cols),
    ('cat', 'passthrough', cat_cols),
    ('bin', 'passthrough', bin_cols)
])

# --- Define top 3 models ---
models_top3 = {
    'CatBoost': CatBoostClassifier(verbose=0, random_state=42),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42)
}

# --- Run Grid Search for each ---
results_top3 = []

for name, model in models_top3.items():
    print(f"\nüîπ Fine-tuning {name}...")
    pipe = Pipeline([
        ('preprocessor', preprocessor_tree),
        ('model', model)
    ])
    
    param_grid = param_grids_top3[name]

    # üîç Show parameter combinations being tested
    total_combos = 1
    for k in param_grid:
        total_combos *= len(param_grid[k])
    print(f"  ‚û§ Testing {total_combos} hyperparameter combinations:")
    for param, values in param_grid.items():
        print(f"     {param}: {values}")
    
    grid = GridSearchCV(
        estimator=pipe,
        param_grid=param_grids_top3[name],
        cv=3,
        scoring='roc_auc',
        n_jobs=1,
        verbose=2
    )
    
    start_train = time.time()
    grid.fit(X_train, y_train)
    train_time = time.time() - start_train
    
    # ‚úÖ Show best found hyperparameters
    print(f"\n‚úÖ Best hyperparameters for {name}:")
    for k, v in grid.best_params_.items():
        print(f"     {k}: {v}")


    best_model = grid.best_estimator_

    start_pred = time.time()
    y_pred = best_model.predict_proba(X_val)[:, 1]
    pred_time = time.time() - start_pred

    auc = roc_auc_score(y_val, y_pred)
    print(f"üéØ {name} AUROC on validation set: {auc:.6f}")


    results_top3.append({
        'Model': name,
        'Best AUROC': round(auc, 6),
        'Train Time (s)': round(train_time, 2),
        'Predict Time (s)': round(pred_time, 2),
        'Best Params': grid.best_params_
    })

# --- Show comparison ---
results_top3_df = pd.DataFrame(results_top3).sort_values(by='Best AUROC', ascending=False).reset_index(drop=True)
print("\n===============================")
print("üèÜ FINAL COMPARISON OF TOP 3 MODELS (Fine-Tuned)")
print("===============================")
display(results_top3_df)

best_model_name = results_top3_df.iloc[0]['Model']
print(f"\nüèÖ Best Fine-Tuned Model: {best_model_name} (AUROC = {results_top3_df.iloc[0]['Best AUROC']})")



üîπ Fine-tuning CatBoost...
  ‚û§ Testing 36 hyperparameter combinations:
     model__depth: [5, 6, 7]
     model__learning_rate: [0.05, 0.1]
     model__iterations: [300, 500]
     model__l2_leaf_reg: [3, 5, 7]
Fitting 3 folds for each of 36 candidates, totalling 108 fits
[CV] END model__depth=5, model__iterations=300, model__l2_leaf_reg=3, model__learning_rate=0.05; total time=  23.6s
[CV] END model__depth=5, model__iterations=300, model__l2_leaf_reg=3, model__learning_rate=0.05; total time=  20.7s
[CV] END model__depth=5, model__iterations=300, model__l2_leaf_reg=3, model__learning_rate=0.05; total time=  20.7s
[CV] END model__depth=5, model__iterations=300, model__l2_leaf_reg=3, model__learning_rate=0.1; total time=  20.7s
[CV] END model__depth=5, model__iterations=300, model__l2_leaf_reg=3, model__learning_rate=0.1; total time=  21.7s
[CV] END model__depth=5, model__iterations=300, model__l2_leaf_reg=3, model__learning_rate=0.1; total time=  20.6s
[CV] END model__depth=5, model_

Unnamed: 0,Model,Best AUROC,Train Time (s),Predict Time (s),Best Params
0,CatBoost,0.636566,2945.27,0.57,"{'model__depth': 6, 'model__iterations': 300, ..."
1,LightGBM,0.635346,459.74,0.47,"{'model__learning_rate': 0.05, 'model__n_estim..."
2,XGBoost,0.635004,1186.04,0.31,"{'model__colsample_bytree': 0.8, 'model__learn..."



üèÖ Best Fine-Tuned Model: CatBoost (AUROC = 0.636566)


Full data train + File creation for top 3 models

In [30]:
# === Final Training and Kaggle Submissions for Top 3 Models ===
import joblib

# --- Load full train/test datasets ---
train_full = pd.read_csv("train1.csv")
test_full = pd.read_csv("test.csv")

# Separate target
y_full = train_full['target']
X_full = train_full.drop(['target', 'id'], axis=1)
test_ids = test_full['id']
X_test = test_full.drop(['id'], axis=1)

# --- Define the top 3 best parameter sets from your fine-tuned GridSearch ---
best_params_dict = {
    "CatBoost": {
        'iterations': 300,
        'depth': 6,
        'learning_rate': 0.05,
        'l2_leaf_reg': 3
    },
    "LightGBM": {
        'num_leaves': 15,
        'learning_rate': 0.05,
        'n_estimators': 200,
        'subsample': 0.8
    },
    "XGBoost": {
        'max_depth': 4,
        'learning_rate': 0.05,
        'n_estimators': 200,
        'subsample': 0.8,
        'colsample_bytree': 0.8
    }
}

# --- Initialize models ---
models_top3 = {
    "CatBoost": CatBoostClassifier(verbose=0, random_state=42, **best_params_dict["CatBoost"]),
    "LightGBM": lgb.LGBMClassifier(random_state=42, **best_params_dict["LightGBM"]),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, **best_params_dict["XGBoost"])
}

# --- Loop over each top model for final training ---
for model_name, model in models_top3.items():
    print(f"\nüèÅ Retraining {model_name} on 100% training data with tuned parameters...")

    # Create pipeline with preprocessor
    pipe = Pipeline([
        ('preprocessor', preprocessor_tree),
        ('model', model)
    ])

    # --- Training ---
    start_train = time.time()
    pipe.fit(X_full, y_full)
    train_time = time.time() - start_train
    print(f"‚úÖ {model_name} training complete in {train_time:.2f}s")

    # Save trained model
    joblib.dump(pipe, f"{model_name}_final.pkl")

    # --- Prediction ---
    start_pred = time.time()
    test_preds = pipe.predict_proba(X_test)[:, 1]
    pred_time = time.time() - start_pred
    print(f"‚úÖ {model_name} predictions complete in {pred_time:.2f}s")

    # --- Create submission DataFrame ---
    submission = pd.DataFrame({
        'id': test_ids,
        'target': test_preds
    })

    # --- Save submission file ---
    filename = f"submission_{model_name.lower()}.csv"
    submission.to_csv(filename, index=False)
    print(f"üìÅ {filename} created successfully!")

print("\nüéØ All top 3 models retrained and submission files generated!")



üèÅ Retraining CatBoost on 100% training data with tuned parameters...
‚úÖ CatBoost training complete in 34.24s
‚úÖ CatBoost predictions complete in 0.29s
üìÅ submission_catboost.csv created successfully!

üèÅ Retraining LightGBM on 100% training data with tuned parameters...
[LightGBM] [Info] Number of positive: 15186, number of negative: 281023
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.063914 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2423
[LightGBM] [Info] Number of data points in the train set: 296209, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.051268 -> initscore=-2.918063
[LightGBM] [Info] Start training from score -2.918063
‚úÖ LightGBM training complete in 8.12s
‚úÖ LightGBM predictions complete in 0.88s
üìÅ submission_lightgbm.csv created successfully!

üèÅ Retraining XGBoost 

Stacking Ensemble of the top3 models (CatB + xgb + lgbm)

In [31]:
# === Step 2.8: Stacking Ensemble of Top 3 Tuned Models ===

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# --- Base models (using the tuned hyperparameters) ---
base_models = [
    ('catboost', CatBoostClassifier(verbose=0, random_state=42, **best_params_dict["CatBoost"])),
    ('lightgbm', lgb.LGBMClassifier(random_state=42, **best_params_dict["LightGBM"])),
    ('xgboost', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, **best_params_dict["XGBoost"]))
]

# --- Meta-model (simple and strong for stacking) ---
meta_model = LogisticRegression(max_iter=1000, random_state=42)

# --- Build the stacking ensemble ---
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,                # 5-fold stacking
    stack_method='predict_proba',
    n_jobs=1
)

# --- Create pipeline with preprocessor ---
stack_pipe = Pipeline([
    ('preprocessor', preprocessor_tree),
    ('stacking', stacking_model)
])

# --- Train on training split ---
print("\nü§ù Training Stacking Ensemble (Validation Mode)...")
start_train = time.time()
stack_pipe.fit(X_train, y_train)
train_time = time.time() - start_train
print(f"‚úÖ Ensemble trained in {train_time:.2f}s")

# --- Evaluate on validation split ---
start_pred = time.time()
y_val_pred = stack_pipe.predict_proba(X_val)[:, 1]
pred_time = time.time() - start_pred
val_auc = roc_auc_score(y_val, y_val_pred)

print(f"‚úÖ Validation predictions complete in {pred_time:.2f}s")
print(f"\nüìä Validation AUROC for Stacking Ensemble: {val_auc:.6f}")



ü§ù Training Stacking Ensemble (Validation Mode)...
[LightGBM] [Info] Number of positive: 11389, number of negative: 210767
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.080193 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2414
[LightGBM] [Info] Number of data points in the train set: 222156, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.051266 -> initscore=-2.918105
[LightGBM] [Info] Start training from score -2.918105
[LightGBM] [Info] Number of positive: 9111, number of negative: 168613
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.031646 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2410
[LightGBM] [Info] Number of data points in the train se

In [32]:
# --- Train the ensemble on the full training data ---
print("\nü§ù Training Stacking Ensemble on top of CatBoost, LightGBM, and XGBoost...")

start_train = time.time()
stack_pipe.fit(X_full, y_full)
train_time = time.time() - start_train
print(f"‚úÖ Stacking Ensemble training complete in {train_time:.2f}s")

# --- Save the stacked model ---
joblib.dump(stack_pipe, "Stacking_Ensemble_final.pkl")

# --- Predict on test data ---
start_pred = time.time()
stack_preds = stack_pipe.predict_proba(X_test)[:, 1]
pred_time = time.time() - start_pred
print(f"‚úÖ Ensemble predictions complete in {pred_time:.2f}s")

# --- Create submission DataFrame ---
submission_stack = pd.DataFrame({
    'id': test_ids,
    'target': stack_preds
})

# --- Save submission file ---
submission_file = "submission_stacking_ensemble.csv"
submission_stack.to_csv(submission_file, index=False)
print(f"üìÅ {submission_file} created successfully!")

print("\nüèÜ Stacking Ensemble completed and ready for submission!")


ü§ù Training Stacking Ensemble on top of CatBoost, LightGBM, and XGBoost...
[LightGBM] [Info] Number of positive: 15186, number of negative: 281023
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.094490 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2423
[LightGBM] [Info] Number of data points in the train set: 296209, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.051268 -> initscore=-2.918063
[LightGBM] [Info] Start training from score -2.918063
[LightGBM] [Info] Number of positive: 12149, number of negative: 224818
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.043443 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2425
[LightGBM] [Info] Number of da

Fine tuning stacking ensemble meta model

In [35]:
# === Step: Stacking Ensemble Validation with Meta-Model Fine-Tuning ===


# --- Define base learners (each with preprocessing) ---
estimators = [
    ('cat', Pipeline([
        ('pre', preprocessor_tree),
        ('model', models_top3['CatBoost'])
    ])),
    ('lgb', Pipeline([
        ('pre', preprocessor_tree),
        ('model', models_top3['LightGBM'])
    ])),
    ('xgb', Pipeline([
        ('pre', preprocessor_tree),
        ('model', models_top3['XGBoost'])
    ]))
]

# --- Meta-model hyperparameter grid (Logistic Regression) ---
meta_param_grid = {
    'final_estimator__C': [0.01, 0.1, 1],
    'final_estimator__solver': ['lbfgs', 'saga'],
    'final_estimator__max_iter': [1000, 1500],
    'final_estimator__class_weight': [None, 'balanced'],
    'final_estimator__penalty': ['l2','l1']
    
}

# --- Define stacking classifier with placeholder meta-model ---
stack_base = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(random_state=42),
    stack_method='predict_proba',
    n_jobs=1
)

# --- Grid search to tune Logistic Regression meta-model ---
print("\nüîç Tuning meta-model (Logistic Regression) inside stacking ensemble...")
start_tune = time.time()
grid_meta = GridSearchCV(
    estimator=stack_base,
    param_grid=meta_param_grid,
    scoring='roc_auc',
    cv=2,
    verbose=2,
    n_jobs=1
)

grid_meta.fit(X_train, y_train)
tune_time = time.time() - start_tune
print(f"\n‚úÖ Meta-model tuning complete in {tune_time:.2f}s")

# --- Show best meta-model hyperparameters ---
print("\nüèÜ Best meta-model hyperparameters:")
for k, v in grid_meta.best_params_.items():
    print(f"   {k}: {v}")

# --- Evaluate tuned stacking model ---
best_stack = grid_meta.best_estimator_

start_pred = time.time()
val_preds = best_stack.predict_proba(X_val)[:, 1]
pred_time = time.time() - start_pred

auc = roc_auc_score(y_val, val_preds)
print(f"\nüéØ Validation AUROC (stacked ensemble): {auc:.5f}")
print(f"‚è±Ô∏è Prediction time: {pred_time:.2f}s")



üîç Tuning meta-model (Logistic Regression) inside stacking ensemble...
Fitting 2 folds for each of 48 candidates, totalling 96 fits
[LightGBM] [Info] Number of positive: 5695, number of negative: 105383
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029256 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2393
[LightGBM] [Info] Number of data points in the train set: 111078, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.051270 -> initscore=-2.918013
[LightGBM] [Info] Start training from score -2.918013
[LightGBM] [Info] Number of positive: 4556, number of negative: 84306
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017982 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM]

In [37]:
# === Final Training and Submission for Stacking Ensemble (Tuned Logistic Regression) ===

# --- Tuned meta-model parameters ---
meta_model = LogisticRegression(
    C=0.1,
    class_weight=None,
    max_iter=1000,
    penalty='l1',
    solver='saga',
    random_state=42
)

# --- Define stacking ensemble using your top 3 models ---
final_stack_model = StackingClassifier(
    estimators=[
        ('cat', Pipeline([
            ('pre', preprocessor_tree),
            ('model', models_top3['CatBoost'])
        ])),
        ('lgb', Pipeline([
            ('pre', preprocessor_tree),
            ('model', models_top3['LightGBM'])
        ])),
        ('xgb', Pipeline([
            ('pre', preprocessor_tree),
            ('model', models_top3['XGBoost'])
        ]))
    ],
    final_estimator=meta_model,
    stack_method='predict_proba',
    n_jobs=1
)

# --- Train the stacking model ---
print("\nüèÅ Training final stacked model on 100% training data...")
start_train = time.time()
final_stack_model.fit(X_full, y_full)
train_time = time.time() - start_train
print(f"‚úÖ Training complete in {train_time:.2f}s")

# --- Save model ---
joblib.dump(final_stack_model, "final_stacking_model.pkl")
print("üíæ Model saved as 'final_stacking_model.pkl'")

# --- Predict on test set ---
print("\nüìä Generating predictions for submission...")
start_pred = time.time()
stack_test_preds = final_stack_model.predict_proba(X_test)[:, 1]
pred_time = time.time() - start_pred
print(f"‚úÖ Predictions complete in {pred_time:.2f}s")

# --- Create submission file ---
submission = pd.DataFrame({
    'id': test_ids,
    'target': stack_test_preds
})
filename = "submission_stacking_ensemble_tuned.csv"
submission.to_csv(filename, index=False)

print(f"üìÅ {filename} created successfully!")
print("\nüéØ Final stacked model with tuned Logistic Regression is ready for submission!")



üèÅ Training final stacked model on 100% training data...
[LightGBM] [Info] Number of positive: 15186, number of negative: 281023
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.062404 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2423
[LightGBM] [Info] Number of data points in the train set: 296209, number of used features: 64
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.051268 -> initscore=-2.918063
[LightGBM] [Info] Start training from score -2.918063
[LightGBM] [Info] Number of positive: 12149, number of negative: 224818
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.045551 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2425
[LightGBM] [Info] Number of data points in the t