In [1]:
# --- Imports ---
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.base import clone
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import LabelEncoder


# Models
from sklearn.naive_bayes import CategoricalNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_curve, roc_auc_score
import time
from matplotlib import pyplot
import warnings
warnings.filterwarnings('ignore')

# reproducible seed (your ERP)
RANDOM_SEED = 42


In [6]:
train = pd.read_csv('train1.csv')
test  = pd.read_csv('test.csv')   # used to generate final submission

# Identify target and id (adjust names if different)
TARGET = 'target'   # change if different
ID_COL = 'id'       # change if different

# quick shape
print("Train shape:", train.shape)
print("Test shape: ", test.shape)


Train shape: (296209, 67)
Test shape:  (126948, 66)


In [7]:
# heuristics: columns ending with "_cat" are categorical (as your assignment says)
cat_cols = [c for c in train.columns if c.endswith('_cat')]
# But also some integer columns might be binary categorical (0/1)
binary_cols = [c for c in train.columns if train[c].dropna().nunique() == 2 and c != TARGET and c != ID_COL and c not in cat_cols]

# Continuous numeric columns = numeric columns excluding ID and target and categorical/binary detected above
num_cols = [c for c in train.select_dtypes(include=['int64','float64']).columns
            if c not in cat_cols + [TARGET, ID_COL]]

# If some cat_cols are numeric type, keep them in cat_cols
print("Categorical (explicit):", cat_cols)
print("Binary cols:", binary_cols)
print("Numeric cols:", num_cols)


Categorical (explicit): ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat']
Binary cols: ['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin']
Numeric cols: ['ps_ind_01', 'ps_ind_03', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03', 'ps_car_11', 'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01', 'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 

In [11]:
#X = train.drop([TARGET, ID_COL], axis=1)
selected_features = ['ps_ind_05_cat', 'ps_car_04_cat', 'ps_ind_15', 'ps_reg_01', 'ps_reg_02',
                     'ps_reg_03', 'ps_car_12', 'ps_car_13', 'ps_car_15', 'feature2', 'feature4', 'ps_ind_04_cat',
                     'ps_car_02_cat', 'ps_car_05_cat', 'ps_car_08_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
                     'ps_ind_09_bin', 'ps_ind_12_bin', 'ps_ind_16_bin', 'ps_ind_17_bin']

X = train[selected_features]
y = train[TARGET]

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)
print("X_train", X_train.shape, "X_val", X_val.shape)


X_train (222156, 22) X_val (74053, 22)


In [18]:
# --- Re-identify columns for the reduced dataset ---
# Make sure this runs AFTER you select your 22 feature columns
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# If you have binary columns (0/1) that you don't want scaled, you can detect them automatically:
binary_cols = [col for col in num_cols if X_train[col].nunique() == 2]

# Remove binary cols from numeric list to avoid duplication
num_cols = [col for col in num_cols if col not in binary_cols]

print(f"Numeric columns: {num_cols}")
print(f"Binary columns: {binary_cols}")
print(f"Categorical columns: {cat_cols}")

Numeric columns: ['ps_ind_05_cat', 'ps_car_04_cat', 'ps_ind_15', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03', 'ps_car_12', 'ps_car_13', 'ps_car_15', 'feature2', 'feature4']
Binary columns: ['ps_ind_04_cat', 'ps_car_02_cat', 'ps_car_05_cat', 'ps_car_08_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_12_bin', 'ps_ind_16_bin', 'ps_ind_17_bin']
Categorical columns: []


In [23]:
# --- Define imputers ---
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')
bin_imputer = SimpleImputer(strategy='most_frequent')  # new addition

# --- Combined preprocessing transformers ---

# For KNN and Naive Bayes (scaling required)
preprocessor_scaled = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', num_imputer),
        ('scaler', MinMaxScaler())
    ]), num_cols),
    ('bin', Pipeline([
        ('imputer', bin_imputer)  # handle NaNs in binary features
    ]), binary_cols),
    ('cat', Pipeline([
        ('imputer', cat_imputer),
        ('encoder', OrdinalEncoder())
    ]), cat_cols)
])

# For tree-based models (no scaling needed)
preprocessor_trees = ColumnTransformer([
    ('num', Pipeline([
        ('imputer', num_imputer)
    ]), num_cols),
    ('bin', Pipeline([
        ('imputer', bin_imputer)
    ]), binary_cols),
    ('cat', Pipeline([
        ('imputer', cat_imputer),
        ('encoder', OrdinalEncoder())
    ]), cat_cols)
])


In [24]:
# --- Define models with appropriate preprocessing ---
models = {
    # Distance/probability-based models (require scaling)
    'KNN': Pipeline([
        ('pre', preprocessor_scaled),
        ('knn', KNeighborsClassifier(n_neighbors=7))
    ]),
    'Naive Bayes': Pipeline([
        ('pre', preprocessor_scaled),
        ('nb', GaussianNB())
    ]),

    # Tree-based models (no scaling required)
    'Decision Tree': Pipeline([
        ('pre', preprocessor_trees),
        ('dt', DecisionTreeClassifier(max_depth=7, random_state=42))
    ]),
    'Random Forest': Pipeline([
        ('pre', preprocessor_trees),
        ('rf', RandomForestClassifier(random_state=42))
    ]),
    'Extra Trees': Pipeline([
        ('pre', preprocessor_trees),
        ('et', ExtraTreesClassifier(random_state=42))
    ]),
    'AdaBoost': Pipeline([
        ('pre', preprocessor_trees),
        ('ada', AdaBoostClassifier(random_state=42))
    ]),
    'XGBoost': Pipeline([
        ('pre', preprocessor_trees),
        ('xgb', xgb.XGBClassifier(random_state=42, eval_metric='logloss'))
    ]),
    'LightGBM': Pipeline([
        ('pre', preprocessor_trees),
        ('lgb', lgb.LGBMClassifier(random_state=42, verbose=-1))
    ]),
    'CatBoost': Pipeline([
        ('pre', preprocessor_trees),
        ('cat', CatBoostClassifier(random_state=42, verbose=0))
    ])
}

print("‚úÖ Models defined successfully with preprocessing pipelines.")

‚úÖ Models defined successfully with preprocessing pipelines.


In [25]:
# Lightweight hyperparameter grids aligned with pipeline step names
param_distributions = {
    'KNN': {
        'knn__n_neighbors': [7, 9],
        'knn__weights': ['uniform', 'distance']
    },
    'Naive Bayes': {
        # GaussianNB uses var_smoothing (small positive float)
        'nb__var_smoothing': [1e-9, 1e-7]
    },
    'Decision Tree': {
        'dt__max_depth': [5],
        'dt__min_samples_leaf': [1],
        'dt__min_samples_split': [2],
        'dt__criterion': ['entropy']
    },
    'Random Forest': {
        'rf__n_estimators': [100],
        'rf__max_depth': [10, None],
        'rf__min_samples_leaf': [1, 2]
    },
    'Extra Trees': {
        'et__n_estimators': [100],
        'et__max_depth': [10, None],
        'et__min_samples_leaf': [1, 2]
    },
    'AdaBoost': {
        'ada__n_estimators': [50, 100],
        'ada__learning_rate': [0.05, 0.1, 0.5]
    },
    'XGBoost': {
        'xgb__n_estimators': [100],
        'xgb__max_depth': [3, 6],
        'xgb__learning_rate': [0.05, 0.1]
    },
    'LightGBM': {
        'lgb__n_estimators': [100],
        'lgb__num_leaves': [31, 63],
        'lgb__learning_rate': [0.05, 0.1]
    },
    'CatBoost': {
        'cat__iterations': [200],
        'cat__depth': [4, 6],
        'cat__learning_rate': [0.05, 0.1]
    }
}


In [27]:
# --- Imports ---
import joblib  # for saving trained models
joblib.parallel_backend('threading', n_jobs=1)



# --- Model tuning setup ---
tuning_results = []

print("üöÄ Starting RandomizedSearchCV for all models...\n")

# Loop through models & their param grids
for name, model in models.items():
    if name not in param_distributions:
        print(f"‚ö†Ô∏è Skipping {name} ‚Äî no param grid defined.\n")
        continue

    print(f"[{time.strftime('%H:%M:%S')}] üîπ Tuning {name}...")
    start_train = time.time()

    # Randomized search setup
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_distributions[name],
        n_iter=5,                     # fewer iterations for speed
        scoring='roc_auc',
        cv=3,                         # 3-fold CV (good balance)
        random_state=42,
        n_jobs=1,                    # parallel processing
        verbose=1
    )

    # --- Fit model ---
    search.fit(X_train, y_train)
    train_time = time.time() - start_train

    # --- Best estimator and params ---
    best_model = search.best_estimator_
    best_params = search.best_params_

    # --- Evaluate on validation set ---
    start_pred = time.time()
    if hasattr(best_model, "predict_proba"):
        y_pred = best_model.predict_proba(X_val)[:, 1]
    else:
        y_pred = best_model.decision_function(X_val)
    pred_time = time.time() - start_pred

    auroc = roc_auc_score(y_val, y_pred)

    # --- Print summary ---
    print(f"‚úÖ {name} done")
    print(f"   Best Params: {best_params}")
    print(f"   Validation AUROC: {auroc:.4f}")
    print(f"   Training Time: {train_time:.2f}s | Prediction Time: {pred_time:.2f}s\n")

    # --- Save best model to file (optional) ---
    filename = f"best_{name.replace(' ', '_').lower()}.joblib"
    joblib.dump(best_model, filename)

    # --- Store results ---
    tuning_results.append({
        'Model': name,
        'Best Params': best_params,
        'AUROC': round(auroc, 4),
        'Train Time (s)': round(train_time, 2),
        'Predict Time (s)': round(pred_time, 2),
        'Model File': filename
    })

# --- Results summary ---
results_df = pd.DataFrame(tuning_results).sort_values(by='AUROC', ascending=False).reset_index(drop=True)

print("\nüìä Model Comparison After Hyperparameter Tuning:")
display(results_df)

best_model_name = results_df.iloc[0]['Model']
print(f"\nüèÜ Best tuned model: {best_model_name} (AUROC = {results_df.iloc[0]['AUROC']})")

# Optional: load best model later if needed
best_model = joblib.load(results_df.iloc[0]['Model File'])


üöÄ Starting RandomizedSearchCV for all models...

[20:56:23] üîπ Tuning KNN...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
‚úÖ KNN done
   Best Params: {'knn__weights': 'distance', 'knn__n_neighbors': 9}
   Validation AUROC: 0.5339
   Training Time: 585.43s | Prediction Time: 68.40s

[21:07:17] üîπ Tuning Naive Bayes...
Fitting 3 folds for each of 2 candidates, totalling 6 fits
‚úÖ Naive Bayes done
   Best Params: {'nb__var_smoothing': 1e-07}
   Validation AUROC: 0.6075
   Training Time: 6.98s | Prediction Time: 0.23s

[21:07:24] üîπ Tuning Decision Tree...
Fitting 3 folds for each of 1 candidates, totalling 3 fits
‚úÖ Decision Tree done
   Best Params: {'dt__min_samples_split': 2, 'dt__min_samples_leaf': 1, 'dt__max_depth': 5, 'dt__criterion': 'entropy'}
   Validation AUROC: 0.6029
   Training Time: 8.63s | Prediction Time: 0.11s

[21:07:33] üîπ Tuning Random Forest...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
‚úÖ Random Forest done
   Best Para

Unnamed: 0,Model,Best Params,AUROC,Train Time (s),Predict Time (s),Model File
0,CatBoost,"{'cat__learning_rate': 0.1, 'cat__iterations':...",0.6246,110.75,0.29,best_catboost.joblib
1,LightGBM,"{'lgb__num_leaves': 31, 'lgb__n_estimators': 1...",0.6242,37.36,0.35,best_lightgbm.joblib
2,XGBoost,"{'xgb__n_estimators': 100, 'xgb__max_depth': 3...",0.6233,34.39,0.17,best_xgboost.joblib
3,AdaBoost,"{'ada__n_estimators': 100, 'ada__learning_rate...",0.621,351.06,1.54,best_adaboost.joblib
4,Random Forest,"{'rf__n_estimators': 100, 'rf__min_samples_lea...",0.6189,559.0,1.28,best_random_forest.joblib
5,Extra Trees,"{'et__n_estimators': 100, 'et__min_samples_lea...",0.6164,383.18,1.1,best_extra_trees.joblib
6,Naive Bayes,{'nb__var_smoothing': 1e-07},0.6075,6.98,0.23,best_naive_bayes.joblib
7,Decision Tree,"{'dt__min_samples_split': 2, 'dt__min_samples_...",0.6029,8.63,0.11,best_decision_tree.joblib
8,KNN,"{'knn__weights': 'distance', 'knn__n_neighbors...",0.5339,585.43,68.4,best_knn.joblib



üèÜ Best tuned model: CatBoost (AUROC = 0.6246)


In [30]:
from sklearn.model_selection import GridSearchCV

# Narrow grids around best random search results
fine_grids = {
    'CatBoost': {
        'cat__iterations': [100, 150, 200],
        'cat__depth': [6, 7, 8],
        'cat__learning_rate': [0.01, 0.05, 0.1]
    },
    'LightGBM': {
        'lgb__n_estimators': [100, 150, 200],
        'lgb__num_leaves': [31, 50, 63],
        'lgb__learning_rate': [0.01, 0.05, 0.1]
    },
    'XGBoost': {
        'xgb__n_estimators': [100, 150, 200],
        'xgb__max_depth': [3, 4, 5],
        'xgb__learning_rate': [0.01, 0.05, 0.1]
    },
    'Random Forest': {
        'rf__n_estimators': [100, 200],
        'rf__max_depth': [10, 15, 20],
        'rf__min_samples_leaf': [1, 2]
    },
    'AdaBoost': {
        'ada__n_estimators': [100, 150, 200],
        'ada__learning_rate': [0.01, 0.05, 0.1, 0.5]
    }
}

grid_results = []

print("üéØ Starting fine-tuned GridSearchCV on top models...\n")

for name in fine_grids.keys():
    print(f"üîπ Grid searching {name}...")

    grid = GridSearchCV(
        estimator=models[name],
        param_grid=fine_grids[name],
        scoring='roc_auc',
        cv=3,
        n_jobs=1,
        verbose=1
    )

    start = time.time()
    grid.fit(X_train, y_train)
    train_time = time.time() - start

    # --- Save AUROC for each parameter combo ---
    results_df = pd.DataFrame(grid.cv_results_)
    results_df = results_df[['params', 'mean_test_score', 'std_test_score']].sort_values(by='mean_test_score', ascending=False)
    print("\nüìä AUROC for each parameter combination:")
    print(results_df)

    # Save to individual CSV for this model
    csv_name = f"grid_results_{name.replace(' ', '_')}.csv"
    results_df.to_csv(csv_name, index=False)
    print(f"üíæ Saved detailed results to: {csv_name}\n")

    # --- Evaluate best estimator ---
    best_est = grid.best_estimator_
    best_params = grid.best_params_

    start_pred = time.time()
    y_pred = best_est.predict_proba(X_val)[:, 1]
    pred_time = time.time() - start_pred
    auroc = roc_auc_score(y_val, y_pred)

    print(f"‚úÖ {name} ‚Äî Best Params: {best_params}")
    print(f"   AUROC: {auroc:.4f} | Train Time: {train_time:.2f}s | Predict Time: {pred_time:.2f}s\n")

    grid_results.append({
        'Model': name,
        'Best Params': best_params,
        'AUROC': round(auroc, 4),
        'Train Time (s)': round(train_time, 2),
        'Predict Time (s)': round(pred_time, 2)
    })

# --- Combine all best results ---
grid_results_df = pd.DataFrame(grid_results).sort_values(by='AUROC', ascending=False)
display(grid_results_df)

# Save combined summary
grid_results_df.to_csv("grid_results_summary.csv", index=False)
print("üíæ Saved summary of best results to: grid_results_summary.csv")

# Print top model
best_model_name = grid_results_df.iloc[0]['Model']
print(f"\nüèÜ Final best model after fine-tuning: {best_model_name} (AUROC = {grid_results_df.iloc[0]['AUROC']})")


üéØ Starting fine-tuned GridSearchCV on top models...

üîπ Grid searching CatBoost...
Fitting 3 folds for each of 27 candidates, totalling 81 fits

üìä AUROC for each parameter combination:
                                               params  mean_test_score  \
7   {'cat__depth': 6, 'cat__iterations': 200, 'cat...         0.625974   
2   {'cat__depth': 6, 'cat__iterations': 100, 'cat...         0.625777   
4   {'cat__depth': 6, 'cat__iterations': 150, 'cat...         0.625216   
5   {'cat__depth': 6, 'cat__iterations': 150, 'cat...         0.625033   
13  {'cat__depth': 7, 'cat__iterations': 150, 'cat...         0.624981   
16  {'cat__depth': 7, 'cat__iterations': 200, 'cat...         0.624816   
22  {'cat__depth': 8, 'cat__iterations': 150, 'cat...         0.624806   
19  {'cat__depth': 8, 'cat__iterations': 100, 'cat...         0.624792   
10  {'cat__depth': 7, 'cat__iterations': 100, 'cat...         0.624325   
1   {'cat__depth': 6, 'cat__iterations': 100, 'cat...         0.624

Unnamed: 0,Model,Best Params,AUROC,Train Time (s),Predict Time (s)
0,CatBoost,"{'cat__depth': 6, 'cat__iterations': 200, 'cat...",0.6253,1035.95,0.27
2,XGBoost,"{'xgb__learning_rate': 0.05, 'xgb__max_depth':...",0.6243,254.93,0.25
1,LightGBM,"{'lgb__learning_rate': 0.05, 'lgb__n_estimator...",0.6242,343.79,0.49
4,AdaBoost,"{'ada__learning_rate': 0.5, 'ada__n_estimators...",0.621,1895.33,3.03
3,Random Forest,"{'rf__max_depth': 10, 'rf__min_samples_leaf': ...",0.6201,2508.03,2.75


üíæ Saved summary of best results to: grid_results_summary.csv

üèÜ Final best model after fine-tuning: CatBoost (AUROC = 0.6253)


In [32]:
# === Step 2.7: Final Training and Kaggle Submissions for Top 3 Models ===

# Load full train/test datasets
train_full = pd.read_csv("train1.csv")
test_full = pd.read_csv("test.csv")

# Separate target
y_full = train_full['target']
X_full = train_full.drop('target', axis=1)

# Use same feature set as in training
X_full = X_full[selected_features]
test_full = test_full[selected_features]

# --- Define the top 3 best parameter sets from your GridSearch results ---
best_params_dict = {
    "CatBoost": {'cat__iterations': 200, 'cat__depth': 6, 'cat__learning_rate': 0.05},
    "LightGBM": {'lgb__n_estimators': 100, 'lgb__num_leaves': 31, 'lgb__learning_rate': 0.05},
    "XGBoost": {'xgb__n_estimators': 200, 'xgb__max_depth': 3, 'xgb__learning_rate': 0.05}
}

# --- Loop over each top model for final training ---
for model_name, params in best_params_dict.items():
    print(f"\nüèÅ Retraining {model_name} on 100% training data with tuned parameters...")
    
    model = models[model_name]
    model.set_params(**params)
    
    start_train = time.time()
    model.fit(X_full, y_full)
    train_time = time.time() - start_train
    
    print(f"‚úÖ {model_name} training complete in {train_time:.2f}s")

    # Predict on Kaggle test data
    start_pred = time.time()
    test_preds = model.predict_proba(test_full)[:, 1]
    pred_time = time.time() - start_pred

    print(f"‚úÖ Predictions complete in {pred_time:.2f}s")

    # Create submission DataFrame
    submission = pd.DataFrame({
        'id': test_full.index,   # replace with test_full['id'] if available
        'target': test_preds
    })
    
    # Save submission file
    filename = f"submission_{model_name.lower()}.csv"
    submission.to_csv(filename, index=False)
    print(f"üìÅ {filename} created successfully!")



üèÅ Retraining CatBoost on 100% training data with tuned parameters...


‚úÖ CatBoost training complete in 20.53s
‚úÖ Predictions complete in 0.21s
üìÅ submission_catboost.csv created successfully!

üèÅ Retraining LightGBM on 100% training data with tuned parameters...
‚úÖ LightGBM training complete in 7.61s
‚úÖ Predictions complete in 0.63s
üìÅ submission_lightgbm.csv created successfully!

üèÅ Retraining XGBoost on 100% training data with tuned parameters...
‚úÖ XGBoost training complete in 6.31s
‚úÖ Predictions complete in 0.41s
üìÅ submission_xgboost.csv created successfully!
