In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import os
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.base import clone
import joblib
import gc

# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.set_option('display.max_columns', None)

# Plot styling: white-grid background and seaborn's "deep" colour palette.
sns.set_style("whitegrid")
sns.set_palette("deep")

CWD = os.getcwd()
print(f'CWD: {CWD}')

# ROOT is two directory levels above the working directory. It is put on the
# import path before the `utils.*` imports that follow.
ROOT = os.path.abspath(os.path.dirname(os.path.dirname(CWD)))
if ROOT not in sys.path:  # re-running this cell must not append duplicate entries
    sys.path.append(ROOT)

from utils.visualization import boxplot_by_category, stacked_barplot_by_category
from utils.tuning import instantiate_model
from utils.utils import Dataloader

  from .autonotebook import tqdm as notebook_tqdm


CWD: /data_analysis/Insurance/src


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# The project directory is the parent of the working directory; data lives in <PROJ>/data.
PROJ = os.path.dirname(CWD)
# Index 0 is the file loaded as `train` below; index 1 is a second training file.
TRAIN_PATH = [os.path.join(PROJ, 'data', file_name) for file_name in ('train.csv', 'train_orig.csv')]
TEST_PATH = os.path.join(PROJ, 'data', 'test.csv')

# Column roles used when building the model pipeline.
target = 'Response'
num_cols = ['Age', 'Annual_Premium', 'Vintage']
cat_cols = ['Gender', 'Driving_License', 'Region_Code', 'Previously_Insured', 'Vehicle_Damage', 'Policy_Sales_Channel', 'Vehicle_Age']

# Only the first training file is used. Disabled alternative that stacks both files:
# train = pd.concat([pd.read_csv(PATH, index_col='id') for PATH in TRAIN_PATH]).reset_index(drop=True)
train = pd.read_csv(TRAIN_PATH[0], index_col='id')
test = pd.read_csv(TEST_PATH, index_col='id')

# Cast every categorical feature to the pandas `category` dtype in both frames.
category_dtypes = {col: 'category' for col in cat_cols}
train = train.astype(category_dtypes)
test = test.astype(category_dtypes)

# Disabled: down-sample for a quick smoke test.
# train = train.sample(1000)

In [3]:
N_FOLDS = 5
MODEL = 'XGBoost'
def objective(trial):
    """Optuna objective: cross-validated ROC AUC of the model built from `trial`.

    Parameters
    ----------
    trial
        Optuna trial handed to `instantiate_model`, which builds the candidate
        estimator from it.

    Returns
    -------
    float
        ROC AUC averaged over the `N_FOLDS` stratified validation folds
        (the study maximises this value).
    """
    # Build the (unfitted) candidate model for this trial.
    model = instantiate_model(trial, num_cols, cat_cols, datetime_columns=None, string_columns=None, models=[MODEL])

    X = train.drop(target, axis=1)
    y = train[target]

    # Fixed random_state: every trial is scored on the same fold assignment.
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

    score = 0
    for train_index, val_index in skf.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Fit a fresh clone so no fitted state carries over between folds.
        m = clone(model)
        m.fit(X_train, y_train)

        # Class probabilities on the validation fold; column 1 is used as the score.
        y_pred = m.predict_proba(X_val)

        # Accumulate this fold's share of the mean ROC AUC.
        score += roc_auc_score(y_val, y_pred[:, 1])/N_FOLDS

        # Release the fitted model before the next fold to limit memory use.
        del m
        gc.collect()

    return score

## Hyperparameter optimization

In [4]:
N_TRIALS = 100      # number of optimisation trials; lower it for a quicker run
SAMPLER_SEED = 42   # seed for the hyperparameter sampler so the search can be repeated

# Create the study with a seeded TPE sampler (TPE is Optuna's default
# single-objective sampler). Without a seed each run suggests different values.
# NOTE(review): results are only fully repeatable if the model built by
# `instantiate_model` is itself seeded -- confirm in utils.tuning.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
)

# Seed using parameters from public notebook
# NOTE(review): 'xgb_gamma' and 'xgb_max_bin' are not among the parameters
# logged for trial 0, so they are presumably not part of the search space and
# have no effect -- confirm against utils.tuning.
if MODEL == 'XGBoost':
    study.enqueue_trial(
        {
            'xgb_alpha': 1.302348865795227e-06,
            'xgb_max_depth': 15,
            'xgb_learning_rate': 0.061800451723613786,
            'xgb_subsample': 0.7098803046786328,
            'xgb_colsample_bytree': 0.2590672912533101,
            'xgb_min_child_weight': 10,
            'xgb_gamma': 0.8399887056014855,
            'xgb_reg_alpha': 0.0016943548302122801,
            'xgb_max_bin': 71284,
            'xgb_n_estimators': 1200,
        }
    )

# gc_after_trial=True runs garbage collection after every trial.
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True, gc_after_trial=True)

[I 2024-07-23 01:31:32,147] A new study created in memory with name: no-name-8478d28e-411f-4360-940a-62da9308b90e
Best trial: 0. Best value: 0.842797:   1%|          | 1/100 [05:45<9:30:16, 345.62s/it]

[I 2024-07-23 01:37:17,696] Trial 0 finished with value: 0.8427965426416549 and parameters: {'numerical_strategy': 'median', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 37.95696709232721, 'min_samples_leaf': 26, 'clf': 'XGBoost', 'xgb_alpha': 1.302348865795227e-06, 'xgb_max_depth': 15, 'xgb_learning_rate': 0.061800451723613786, 'xgb_n_estimators': 1200, 'xgb_min_child_weight': 10, 'xgb_subsample': 0.7098803046786328, 'xgb_colsample_bytree': 0.2590672912533101, 'xgb_reg_alpha': 0.0016943548302122801, 'xgb_reg_lambda': 0.14372827107324893, 'percentile': 47}. Best is trial 0 with value: 0.8427965426416549.


Best trial: 1. Best value: 0.86257:   2%|▏         | 2/100 [17:35<15:14:02, 559.61s/it]

[I 2024-07-23 01:49:07,114] Trial 1 finished with value: 0.8625701498569582 and parameters: {'numerical_strategy': 'median', 'with_centering': True, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 66.77746736625397, 'min_samples_leaf': 33, 'clf': 'XGBoost', 'xgb_alpha': 0.0639877641744776, 'xgb_max_depth': 18, 'xgb_learning_rate': 0.17289588872212988, 'xgb_n_estimators': 971, 'xgb_min_child_weight': 3, 'xgb_subsample': 0.9919503912765959, 'xgb_colsample_bytree': 0.9550823556271376, 'xgb_reg_alpha': 4.232175400270493, 'xgb_reg_lambda': 8.70492781399606, 'percentile': 70}. Best is trial 1 with value: 0.8625701498569582.


Best trial: 1. Best value: 0.86257:   3%|▎         | 3/100 [21:24<11:01:05, 408.92s/it]

[I 2024-07-23 01:52:56,721] Trial 2 finished with value: 0.8617774862797997 and parameters: {'numerical_strategy': 'median', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 85.4706342996679, 'min_samples_leaf': 5, 'clf': 'XGBoost', 'xgb_alpha': 0.07377972719608539, 'xgb_max_depth': 8, 'xgb_learning_rate': 0.016552414534946942, 'xgb_n_estimators': 105, 'xgb_min_child_weight': 9, 'xgb_subsample': 0.9966131354753043, 'xgb_colsample_bytree': 0.9183042543520682, 'xgb_reg_alpha': 3.9559689133439737, 'xgb_reg_lambda': 3.5651749718903725, 'percentile': 80}. Best is trial 1 with value: 0.8625701498569582.


Best trial: 1. Best value: 0.86257:   4%|▍         | 4/100 [27:02<10:09:05, 380.69s/it]

[I 2024-07-23 01:58:34,114] Trial 3 finished with value: 0.8479185865146126 and parameters: {'numerical_strategy': 'most_frequent', 'with_centering': False, 'with_scaling': True, 'categorical_strategy': 'constant', 'smoothing': 97.61004465134432, 'min_samples_leaf': 14, 'clf': 'XGBoost', 'xgb_alpha': 0.014054164734604156, 'xgb_max_depth': 8, 'xgb_learning_rate': 0.026921551095895814, 'xgb_n_estimators': 1077, 'xgb_min_child_weight': 3, 'xgb_subsample': 0.8131935107265755, 'xgb_colsample_bytree': 0.4186477715838023, 'xgb_reg_alpha': 9.357540533015358, 'xgb_reg_lambda': 6.491528677308648, 'percentile': 47}. Best is trial 1 with value: 0.8625701498569582.


Best trial: 1. Best value: 0.86257:   5%|▌         | 5/100 [30:59<8:41:05, 329.11s/it] 

[I 2024-07-23 02:02:31,756] Trial 4 finished with value: 0.8350569850372203 and parameters: {'numerical_strategy': 'mean', 'with_centering': True, 'with_scaling': False, 'categorical_strategy': 'constant', 'smoothing': 89.2812682619048, 'min_samples_leaf': 41, 'clf': 'XGBoost', 'xgb_alpha': 0.052671290147041167, 'xgb_max_depth': 3, 'xgb_learning_rate': 0.23987863957513814, 'xgb_n_estimators': 812, 'xgb_min_child_weight': 1, 'xgb_subsample': 0.5601300643123939, 'xgb_colsample_bytree': 0.4875897887045233, 'xgb_reg_alpha': 2.761050969086293, 'xgb_reg_lambda': 7.640229496845828, 'percentile': 25}. Best is trial 1 with value: 0.8625701498569582.


Best trial: 5. Best value: 0.863034:   6%|▌         | 6/100 [36:43<8:43:27, 334.12s/it]

[I 2024-07-23 02:08:15,616] Trial 5 finished with value: 0.8630339910980382 and parameters: {'numerical_strategy': 'mean', 'with_centering': True, 'with_scaling': False, 'categorical_strategy': 'constant', 'smoothing': 69.26075467689968, 'min_samples_leaf': 15, 'clf': 'XGBoost', 'xgb_alpha': 0.07751545645237722, 'xgb_max_depth': 16, 'xgb_learning_rate': 0.13627821851268462, 'xgb_n_estimators': 292, 'xgb_min_child_weight': 6, 'xgb_subsample': 0.7360280148799636, 'xgb_colsample_bytree': 0.959460512298792, 'xgb_reg_alpha': 3.0810818518604486, 'xgb_reg_lambda': 4.7339427517340855, 'percentile': 75}. Best is trial 5 with value: 0.8630339910980382.


Best trial: 5. Best value: 0.863034:   7%|▋         | 7/100 [41:58<8:27:58, 327.73s/it]

[I 2024-07-23 02:13:30,186] Trial 6 finished with value: 0.8625855969456426 and parameters: {'numerical_strategy': 'median', 'with_centering': False, 'with_scaling': True, 'categorical_strategy': 'constant', 'smoothing': 60.86484659522777, 'min_samples_leaf': 32, 'clf': 'XGBoost', 'xgb_alpha': 0.060912046379332126, 'xgb_max_depth': 2, 'xgb_learning_rate': 0.11084849806978626, 'xgb_n_estimators': 1312, 'xgb_min_child_weight': 9, 'xgb_subsample': 0.8553475307656567, 'xgb_colsample_bytree': 0.5004868639665583, 'xgb_reg_alpha': 8.476957823146531, 'xgb_reg_lambda': 6.10494533762102, 'percentile': 73}. Best is trial 5 with value: 0.8630339910980382.


Best trial: 7. Best value: 0.863996:   8%|▊         | 8/100 [52:32<10:52:00, 425.22s/it]

[I 2024-07-23 02:24:04,149] Trial 7 finished with value: 0.8639958463154103 and parameters: {'numerical_strategy': 'mean', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 88.38110821526305, 'min_samples_leaf': 34, 'clf': 'XGBoost', 'xgb_alpha': 0.013091131610297478, 'xgb_max_depth': 13, 'xgb_learning_rate': 0.03797973514828191, 'xgb_n_estimators': 1178, 'xgb_min_child_weight': 1, 'xgb_subsample': 0.6897229833980643, 'xgb_colsample_bytree': 0.8121407648494744, 'xgb_reg_alpha': 3.497121633052192, 'xgb_reg_lambda': 2.685496517908201, 'percentile': 74}. Best is trial 7 with value: 0.8639958463154103.


Best trial: 7. Best value: 0.863996:   9%|▉         | 9/100 [57:06<9:33:37, 378.21s/it] 

[I 2024-07-23 02:28:39,009] Trial 8 finished with value: 0.7738560440752336 and parameters: {'numerical_strategy': 'most_frequent', 'with_centering': True, 'with_scaling': True, 'categorical_strategy': 'most_frequent', 'smoothing': 63.4577229548944, 'min_samples_leaf': 36, 'clf': 'XGBoost', 'xgb_alpha': 0.07667387859850233, 'xgb_max_depth': 5, 'xgb_learning_rate': 0.1456248971075232, 'xgb_n_estimators': 1143, 'xgb_min_child_weight': 10, 'xgb_subsample': 0.8008619387680036, 'xgb_colsample_bytree': 0.6172256825493287, 'xgb_reg_alpha': 6.722732966071998, 'xgb_reg_lambda': 5.125661413236317, 'percentile': 4}. Best is trial 7 with value: 0.8639958463154103.


Best trial: 7. Best value: 0.863996:  10%|█         | 10/100 [1:01:31<8:34:29, 342.99s/it]

[I 2024-07-23 02:33:03,134] Trial 9 finished with value: 0.7625954504392283 and parameters: {'numerical_strategy': 'constant', 'with_centering': True, 'with_scaling': True, 'categorical_strategy': 'constant', 'smoothing': 9.071740903639087, 'min_samples_leaf': 24, 'clf': 'XGBoost', 'xgb_alpha': 0.04735865826502196, 'xgb_max_depth': 16, 'xgb_learning_rate': 0.2619138254546821, 'xgb_n_estimators': 1281, 'xgb_min_child_weight': 2, 'xgb_subsample': 0.7456255104644873, 'xgb_colsample_bytree': 0.3518803935323162, 'xgb_reg_alpha': 7.513533241025843, 'xgb_reg_lambda': 8.416295135322601, 'percentile': 17}. Best is trial 7 with value: 0.8639958463154103.


Best trial: 10. Best value: 0.879942:  11%|█         | 11/100 [1:08:18<8:58:00, 362.71s/it]

[I 2024-07-23 02:39:50,542] Trial 10 finished with value: 0.879941679978261 and parameters: {'numerical_strategy': 'mean', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 36.547868835435324, 'min_samples_leaf': 44, 'clf': 'XGBoost', 'xgb_alpha': 0.03506467562866786, 'xgb_max_depth': 12, 'xgb_learning_rate': 0.08051442239654633, 'xgb_n_estimators': 532, 'xgb_min_child_weight': 5, 'xgb_subsample': 0.572980829156482, 'xgb_colsample_bytree': 0.7838470818667488, 'xgb_reg_alpha': 0.7427926240129499, 'xgb_reg_lambda': 1.5496985366138354, 'percentile': 100}. Best is trial 10 with value: 0.879941679978261.


Best trial: 11. Best value: 0.880073:  12%|█▏        | 12/100 [1:14:53<9:06:20, 372.51s/it]

[I 2024-07-23 02:46:25,442] Trial 11 finished with value: 0.880072787971681 and parameters: {'numerical_strategy': 'mean', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 33.14312548440427, 'min_samples_leaf': 46, 'clf': 'XGBoost', 'xgb_alpha': 0.027839859626778405, 'xgb_max_depth': 12, 'xgb_learning_rate': 0.07935218564149366, 'xgb_n_estimators': 524, 'xgb_min_child_weight': 6, 'xgb_subsample': 0.5735279773872959, 'xgb_colsample_bytree': 0.7740236716742344, 'xgb_reg_alpha': 0.5670872945488754, 'xgb_reg_lambda': 1.739195515357646, 'percentile': 99}. Best is trial 11 with value: 0.880072787971681.


Best trial: 11. Best value: 0.880073:  13%|█▎        | 13/100 [1:20:54<8:55:18, 369.18s/it]

[I 2024-07-23 02:52:26,981] Trial 12 finished with value: 0.8799284139556449 and parameters: {'numerical_strategy': 'mean', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 34.33579626873251, 'min_samples_leaf': 49, 'clf': 'XGBoost', 'xgb_alpha': 0.03430995091981854, 'xgb_max_depth': 11, 'xgb_learning_rate': 0.09431588583533049, 'xgb_n_estimators': 505, 'xgb_min_child_weight': 6, 'xgb_subsample': 0.5047542650070769, 'xgb_colsample_bytree': 0.7328439476371698, 'xgb_reg_alpha': 0.019148177892418294, 'xgb_reg_lambda': 0.774569102719939, 'percentile': 99}. Best is trial 11 with value: 0.880072787971681.


Best trial: 11. Best value: 0.880073:  14%|█▍        | 14/100 [1:27:22<8:56:59, 374.65s/it]

[I 2024-07-23 02:58:54,272] Trial 13 finished with value: 0.8780877840424647 and parameters: {'numerical_strategy': 'mean', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 20.627685307374318, 'min_samples_leaf': 50, 'clf': 'XGBoost', 'xgb_alpha': 0.032632715576579596, 'xgb_max_depth': 11, 'xgb_learning_rate': 0.19555168935031897, 'xgb_n_estimators': 543, 'xgb_min_child_weight': 5, 'xgb_subsample': 0.6209920889616273, 'xgb_colsample_bytree': 0.7290955638417447, 'xgb_reg_alpha': 1.7025800367814226, 'xgb_reg_lambda': 1.9546487273297042, 'percentile': 99}. Best is trial 11 with value: 0.880072787971681.


Best trial: 11. Best value: 0.880073:  15%|█▌        | 15/100 [1:45:48<14:03:24, 595.34s/it]

[I 2024-07-23 03:17:21,067] Trial 14 finished with value: 0.8701913664475731 and parameters: {'numerical_strategy': 'constant', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 41.593261072807344, 'min_samples_leaf': 43, 'clf': 'XGBoost', 'xgb_alpha': 0.09643352406323016, 'xgb_max_depth': 20, 'xgb_learning_rate': 0.08198290894280769, 'xgb_n_estimators': 585, 'xgb_min_child_weight': 5, 'xgb_subsample': 0.6132299437499197, 'xgb_colsample_bytree': 0.8156805248777197, 'xgb_reg_alpha': 1.6157224681624567, 'xgb_reg_lambda': 1.604104336075758, 'percentile': 90}. Best is trial 11 with value: 0.880072787971681.


Best trial: 11. Best value: 0.880073:  16%|█▌        | 16/100 [1:49:50<11:24:13, 488.74s/it]

[I 2024-07-23 03:21:22,237] Trial 15 finished with value: 0.8481075497524455 and parameters: {'numerical_strategy': 'mean', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 21.532782494956685, 'min_samples_leaf': 43, 'clf': 'XGBoost', 'xgb_alpha': 0.0387083302439675, 'xgb_max_depth': 8, 'xgb_learning_rate': 0.06447951384566435, 'xgb_n_estimators': 328, 'xgb_min_child_weight': 7, 'xgb_subsample': 0.5088001196813369, 'xgb_colsample_bytree': 0.6341436620541973, 'xgb_reg_alpha': 5.66657288879858, 'xgb_reg_lambda': 3.378397184984418, 'percentile': 54}. Best is trial 11 with value: 0.880072787971681.


Best trial: 11. Best value: 0.880073:  17%|█▋        | 17/100 [1:58:45<11:35:18, 502.64s/it]

[I 2024-07-23 03:30:17,204] Trial 16 finished with value: 0.8647286026024321 and parameters: {'numerical_strategy': 'mean', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 47.34355340189095, 'min_samples_leaf': 46, 'clf': 'XGBoost', 'xgb_alpha': 0.021047870838130728, 'xgb_max_depth': 13, 'xgb_learning_rate': 0.11844528666551898, 'xgb_n_estimators': 766, 'xgb_min_child_weight': 4, 'xgb_subsample': 0.6315675339715431, 'xgb_colsample_bytree': 0.8458416700201935, 'xgb_reg_alpha': 1.3550659682219472, 'xgb_reg_lambda': 1.3463008736594873, 'percentile': 88}. Best is trial 11 with value: 0.880072787971681.


Best trial: 11. Best value: 0.880073:  18%|█▊        | 18/100 [2:08:27<11:59:35, 526.53s/it]

[I 2024-07-23 03:39:59,363] Trial 17 finished with value: 0.8634875788984391 and parameters: {'numerical_strategy': 'mean', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 26.021411455241626, 'min_samples_leaf': 39, 'clf': 'XGBoost', 'xgb_alpha': 0.02029971851979583, 'xgb_max_depth': 13, 'xgb_learning_rate': 0.29849678874760577, 'xgb_n_estimators': 1474, 'xgb_min_child_weight': 7, 'xgb_subsample': 0.5725822912643073, 'xgb_colsample_bytree': 0.6176312547103794, 'xgb_reg_alpha': 5.452462491500006, 'xgb_reg_lambda': 2.6827273162834353, 'percentile': 57}. Best is trial 11 with value: 0.880072787971681.


Best trial: 11. Best value: 0.880073:  19%|█▉        | 19/100 [2:15:14<11:02:37, 490.83s/it]

[I 2024-07-23 03:46:47,021] Trial 18 finished with value: 0.8785366819400549 and parameters: {'numerical_strategy': 'most_frequent', 'with_centering': False, 'with_scaling': True, 'categorical_strategy': 'most_frequent', 'smoothing': 2.602742543568258, 'min_samples_leaf': 27, 'clf': 'XGBoost', 'xgb_alpha': 0.02780103687210756, 'xgb_max_depth': 10, 'xgb_learning_rate': 0.1826391427915862, 'xgb_n_estimators': 742, 'xgb_min_child_weight': 7, 'xgb_subsample': 0.667013837645917, 'xgb_colsample_bytree': 0.7234830267925478, 'xgb_reg_alpha': 0.9918122836548252, 'xgb_reg_lambda': 0.12746283173009743, 'percentile': 89}. Best is trial 11 with value: 0.880072787971681.


Best trial: 11. Best value: 0.880073:  20%|██        | 20/100 [2:19:10<9:12:13, 414.17s/it] 

[I 2024-07-23 03:50:42,535] Trial 19 finished with value: 0.8353101958255998 and parameters: {'numerical_strategy': 'constant', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 54.469541964640825, 'min_samples_leaf': 19, 'clf': 'XGBoost', 'xgb_alpha': 0.00017084452197397432, 'xgb_max_depth': 6, 'xgb_learning_rate': 0.05333962858722254, 'xgb_n_estimators': 354, 'xgb_min_child_weight': 4, 'xgb_subsample': 0.5502386264611364, 'xgb_colsample_bytree': 0.871964527513916, 'xgb_reg_alpha': 2.4730105073597466, 'xgb_reg_lambda': 4.363552050049914, 'percentile': 33}. Best is trial 11 with value: 0.880072787971681.


Best trial: 11. Best value: 0.880073:  21%|██        | 21/100 [2:22:41<7:44:56, 353.12s/it]

[I 2024-07-23 03:54:13,312] Trial 20 finished with value: 0.8637717165781842 and parameters: {'numerical_strategy': 'mean', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 29.668247697888262, 'min_samples_leaf': 46, 'clf': 'XGBoost', 'xgb_alpha': 0.04395772823768882, 'xgb_max_depth': 14, 'xgb_learning_rate': 0.08856236594258271, 'xgb_n_estimators': 50, 'xgb_min_child_weight': 8, 'xgb_subsample': 0.8819067170854464, 'xgb_colsample_bytree': 0.6579164533142623, 'xgb_reg_alpha': 0.679740841171554, 'xgb_reg_lambda': 2.426822764965296, 'percentile': 61}. Best is trial 11 with value: 0.880072787971681.


Best trial: 11. Best value: 0.880073:  22%|██▏       | 22/100 [2:28:44<7:42:57, 356.12s/it]

[I 2024-07-23 04:00:16,431] Trial 21 finished with value: 0.8800516294443426 and parameters: {'numerical_strategy': 'mean', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 34.15766687094393, 'min_samples_leaf': 50, 'clf': 'XGBoost', 'xgb_alpha': 0.033054754850799985, 'xgb_max_depth': 11, 'xgb_learning_rate': 0.08951317619238054, 'xgb_n_estimators': 507, 'xgb_min_child_weight': 6, 'xgb_subsample': 0.5156929216016412, 'xgb_colsample_bytree': 0.7430041164406271, 'xgb_reg_alpha': 0.1766104390019867, 'xgb_reg_lambda': 1.0515329907912794, 'percentile': 98}. Best is trial 11 with value: 0.880072787971681.


Best trial: 22. Best value: 0.880098:  23%|██▎       | 23/100 [2:35:06<7:47:10, 364.03s/it]

[I 2024-07-23 04:06:38,878] Trial 22 finished with value: 0.8800976147593484 and parameters: {'numerical_strategy': 'mean', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 12.497106861739162, 'min_samples_leaf': 50, 'clf': 'XGBoost', 'xgb_alpha': 0.02647289806590801, 'xgb_max_depth': 10, 'xgb_learning_rate': 0.11205809043008402, 'xgb_n_estimators': 617, 'xgb_min_child_weight': 6, 'xgb_subsample': 0.5937288324214606, 'xgb_colsample_bytree': 0.760659569079914, 'xgb_reg_alpha': 2.1374993936138735, 'xgb_reg_lambda': 1.063613718685042, 'percentile': 100}. Best is trial 22 with value: 0.8800976147593484.


Best trial: 22. Best value: 0.880098:  24%|██▍       | 24/100 [2:41:33<7:49:35, 370.74s/it]

[I 2024-07-23 04:13:05,292] Trial 23 finished with value: 0.8676087424160184 and parameters: {'numerical_strategy': 'mean', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 14.366923521652609, 'min_samples_leaf': 50, 'clf': 'XGBoost', 'xgb_alpha': 0.02450772401551447, 'xgb_max_depth': 10, 'xgb_learning_rate': 0.12686512472442993, 'xgb_n_estimators': 674, 'xgb_min_child_weight': 6, 'xgb_subsample': 0.5328656175127711, 'xgb_colsample_bytree': 0.6850832932092469, 'xgb_reg_alpha': 2.56317420907345, 'xgb_reg_lambda': 0.8583699228413941, 'percentile': 83}. Best is trial 22 with value: 0.8800976147593484.


Best trial: 24. Best value: 0.880392:  25%|██▌       | 25/100 [2:46:36<7:17:56, 350.35s/it]

[I 2024-07-23 04:18:08,084] Trial 24 finished with value: 0.8803920313223899 and parameters: {'numerical_strategy': 'mean', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 11.369939471970994, 'min_samples_leaf': 39, 'clf': 'XGBoost', 'xgb_alpha': 0.010410018666236438, 'xgb_max_depth': 9, 'xgb_learning_rate': 0.16258909131474306, 'xgb_n_estimators': 420, 'xgb_min_child_weight': 8, 'xgb_subsample': 0.642818730371998, 'xgb_colsample_bytree': 0.564852759525231, 'xgb_reg_alpha': 1.9048115191690633, 'xgb_reg_lambda': 3.5444562530159986, 'percentile': 94}. Best is trial 24 with value: 0.8803920313223899.


Best trial: 24. Best value: 0.880392:  26%|██▌       | 26/100 [2:52:29<7:13:20, 351.35s/it]

[I 2024-07-23 04:24:01,774] Trial 25 finished with value: 0.8803526574268663 and parameters: {'numerical_strategy': 'mean', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 11.813755875621906, 'min_samples_leaf': 38, 'clf': 'XGBoost', 'xgb_alpha': 0.009321748372918978, 'xgb_max_depth': 6, 'xgb_learning_rate': 0.21809620253759643, 'xgb_n_estimators': 889, 'xgb_min_child_weight': 8, 'xgb_subsample': 0.6568510984105924, 'xgb_colsample_bytree': 0.5541474672317693, 'xgb_reg_alpha': 2.001005810311109, 'xgb_reg_lambda': 3.772369452069552, 'percentile': 91}. Best is trial 24 with value: 0.8803920313223899.


Best trial: 24. Best value: 0.880392:  27%|██▋       | 27/100 [2:57:46<6:54:58, 341.08s/it]

[I 2024-07-23 04:29:18,874] Trial 26 finished with value: 0.8643225283067304 and parameters: {'numerical_strategy': 'mean', 'with_centering': True, 'with_scaling': True, 'categorical_strategy': 'constant', 'smoothing': 2.1604150234150783, 'min_samples_leaf': 37, 'clf': 'XGBoost', 'xgb_alpha': 0.00961226329133488, 'xgb_max_depth': 6, 'xgb_learning_rate': 0.21419187710574633, 'xgb_n_estimators': 883, 'xgb_min_child_weight': 8, 'xgb_subsample': 0.6578352539506586, 'xgb_colsample_bytree': 0.5439909128991353, 'xgb_reg_alpha': 4.553532506094161, 'xgb_reg_lambda': 3.876705942436258, 'percentile': 65}. Best is trial 24 with value: 0.8803920313223899.


Best trial: 24. Best value: 0.880392:  28%|██▊       | 28/100 [3:03:14<6:44:30, 337.09s/it]

[I 2024-07-23 04:34:46,653] Trial 27 finished with value: 0.8792472571257642 and parameters: {'numerical_strategy': 'most_frequent', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 13.282540209473778, 'min_samples_leaf': 31, 'clf': 'XGBoost', 'xgb_alpha': 0.00906052055708545, 'xgb_max_depth': 4, 'xgb_learning_rate': 0.15686279627651012, 'xgb_n_estimators': 933, 'xgb_min_child_weight': 8, 'xgb_subsample': 0.6057967853100638, 'xgb_colsample_bytree': 0.5530508357276738, 'xgb_reg_alpha': 2.122959611179023, 'xgb_reg_lambda': 9.999843699109977, 'percentile': 90}. Best is trial 24 with value: 0.8803920313223899.


Best trial: 24. Best value: 0.880392:  29%|██▉       | 29/100 [3:06:49<5:55:34, 300.49s/it]

[I 2024-07-23 04:38:21,762] Trial 28 finished with value: 0.8363991952638603 and parameters: {'numerical_strategy': 'constant', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 8.721636109614288, 'min_samples_leaf': 40, 'clf': 'XGBoost', 'xgb_alpha': 0.00509724862973272, 'xgb_max_depth': 7, 'xgb_learning_rate': 0.2171801955859604, 'xgb_n_estimators': 207, 'xgb_min_child_weight': 9, 'xgb_subsample': 0.661922085112359, 'xgb_colsample_bytree': 0.40343126034823695, 'xgb_reg_alpha': 2.0080005586030296, 'xgb_reg_lambda': 5.863081057791678, 'percentile': 39}. Best is trial 24 with value: 0.8803920313223899.


In [None]:
# Print the best parameters and best value
print('Best trial:')
trial = study.best_trial
print('  Value: ', trial.value)
print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

# Encode the score in the file name, e.g. 0.880392 -> '88.039' -> 'XGBoost_88039'.
final_score = f'{trial.value*100:.3f}'.replace('.', '')
FILE_NAME = f'{MODEL}_{final_score}'

# Make sure the output directory exists *before* the expensive refit, so a
# missing folder cannot make the save fail after all the work is done.
model_filename = os.path.join(PROJ, 'out', f'{FILE_NAME}.joblib')
os.makedirs(os.path.dirname(model_filename), exist_ok=True)

# Rebuild the model from the best trial's parameters.
best_model = instantiate_model(trial, num_cols, cat_cols, datetime_columns=None, string_columns=None, models=[MODEL])

# Re-fit the final model on the full training set and score the test set.
X = train.drop(target, axis=1)
y = train[target]
best_model.fit(X, y)
y_preds_proba = best_model.predict_proba(test)

# NOTE(review): this persists the best *trial* (the recipe passed to
# `instantiate_model`), not the fitted `best_model`, despite the variable name.
# Kept as-is so existing artifacts stay compatible -- confirm that is intended.
joblib.dump(trial, model_filename)

## Create Submission file

The first file contains the model's predicted probabilities.
The second file exploits an observation about the synthetic data: it often contains the same rows as the original training data it was generated from, but with the response flipped. For every test row that matches a row in the original data set, the model prediction from the first file is overridden with the flipped original response.

In [None]:
# Create submission file
SUB_PATH = os.path.join(PROJ, 'data', 'sample_submission.csv')
OUT_PATH = os.path.join(PROJ, 'out', f'{FILE_NAME}.csv')
sub = pd.read_csv(SUB_PATH)

# Align predictions with the submission rows by id rather than by position, so
# a different row order in sample_submission.csv cannot silently misassign them.
if not sub['id'].isin(test.index).all():
    raise ValueError('sample_submission.csv contains ids that are missing from test.csv')
proba_by_id = pd.Series(y_preds_proba[:, 1], index=test.index)  # positive-class probability per test id
sub['Response'] = sub['id'].map(proba_by_id)
sub.to_csv(OUT_PATH, index=False)

sub.shape


In [None]:
# Find every test row whose feature values exactly match a row of the original
# training data.
orig_train = pd.read_csv(TRAIN_PATH[1], index_col='id')
match_columns = sorted(set(test.columns) - {'id'})
test_merged = (
    pd.merge(
        test.reset_index(),
        orig_train[match_columns + ['Response']],
        on=match_columns,
        how='inner',
    )
    # Collapse identical matches (same id, same original response).
    .drop_duplicates()
    # An id still present more than once matched originals with conflicting
    # responses. Drop it entirely: the override would be ambiguous, and
    # duplicate index labels would make the aligned assignment below fail.
    .drop_duplicates(subset='id', keep=False)
    .set_index('id')
)
# The override value is the flipped original response.
test_merged['Response'] = 1 - test_merged['Response']


# Override the model predictions for the matched ids. A re-indexed copy is
# used so that `sub` keeps its 'id' column if anything fails part-way.
sub_by_id = sub.set_index('id')
matching_ids = sub_by_id.index.intersection(test_merged.index)
sub_by_id.loc[matching_ids, 'Response'] = test_merged.loc[matching_ids, 'Response']
sub = sub_by_id.reset_index()

OUT_PATH = os.path.join(PROJ, 'out', f'{FILE_NAME}_override.csv')
sub.to_csv(OUT_PATH, index=False)
sub.shape