In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import os
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score
import joblib

# Set display options to show all columns
pd.set_option('display.max_columns', None)

# Set up white-grid plot style
sns.set_style("whitegrid")

# Optional: Set a color palette (you can choose a different one if you prefer)
sns.set_palette("deep")

CWD = os.getcwd()
print(f'CWD: {CWD}')

ROOT = os.path.abspath(os.path.dirname(os.path.dirname(CWD)))
sys.path.append(ROOT)

from utils.visualization import boxplot_by_category, stacked_barplot_by_category
from utils.tuning import instantiate_model

  from .autonotebook import tqdm as notebook_tqdm


CWD: /data_analysis/Insurance/src


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
PROJ = os.path.dirname(CWD)
TRAIN_PATH = os.path.join(PROJ, 'data', 'train.csv')
TEST_PATH = os.path.join(PROJ, 'data', 'test.csv')

train = pd.read_csv(TRAIN_PATH, low_memory=False, na_values='?', index_col='id')
test = pd.read_csv(TEST_PATH, low_memory=False, na_values='?', index_col='id')

target = 'Response'
num_cols = ['Age', 'Annual_Premium', 'Vintage']
cat_cols = ['Gender', 'Driving_License', 'Region_Code', 'Previously_Insured', 'Vehicle_Damage', 'Policy_Sales_Channel', 'Vehicle_Age']

train[cat_cols] = train[cat_cols].astype('category')
test[cat_cols] = test[cat_cols].astype('category')

# train = train.sample(1000)

In [3]:
def objective(trial):
    # Call the function to instantiate the model
    model = instantiate_model(trial, num_cols, cat_cols, datetime_columns=None, string_columns=None)

    X = train.drop(target, axis=1)
    y = train[target]

    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

    f1_scores = []

    for train_index, val_index in skf.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Fit the model
        model.fit(X_train, y_train)

        # Predict on validation set
        y_pred = model.predict_proba(X_val)

        # Calculate F1 score
        f1 = roc_auc_score(y_val, y_pred[:, 1])
        f1_scores.append(f1)

    return sum(f1_scores)/len(f1_scores)

# Create a study object and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)  # You can adjust the number of trials

# Print the best parameters and best value
print('Best trial:')
trial = study.best_trial
print('  Value: ', trial.value)
print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

# You can now use the best parameters to create your final model
best_model = instantiate_model(trial, num_cols, cat_cols, datetime_columns=None, string_columns=None)

model_filename = os.path.join(PROJ, 'out', 'best_model.joblib')
joblib.dump(best_model, model_filename)

[I 2024-07-20 11:41:06,855] A new study created in memory with name: no-name-5bdd5f1e-6978-4bbb-874f-c33068e027f5
[I 2024-07-20 11:44:26,331] Trial 0 finished with value: 0.8479752374689987 and parameters: {'numerical_strategy': 'most_frequent', 'with_centering': False, 'with_scaling': False, 'categorical_strategy': 'constant', 'smoothing': 76.10029868156795, 'min_samples_leaf': 32, 'clf': 'XGBoost', 'xgb_max_depth': 4, 'xgb_learning_rate': 0.13084903534841466, 'xgb_n_estimators': 676, 'xgb_min_child_weight': 6, 'xgb_subsample': 0.5896428971730541, 'xgb_colsample_bytree': 0.9109517251964876, 'xgb_reg_alpha': 6.487386653800083, 'xgb_reg_lambda': 5.348713952413967, 'percentile': 53}. Best is trial 0 with value: 0.8479752374689987.
[I 2024-07-20 11:47:17,444] Trial 1 finished with value: 0.7738560441623157 and parameters: {'numerical_strategy': 'mean', 'with_centering': True, 'with_scaling': False, 'categorical_strategy': 'most_frequent', 'smoothing': 67.5998044036548, 'min_samples_leaf':

Best trial:
  Value:  0.8805687408908682
  Params: 
    numerical_strategy: median
    with_centering: False
    with_scaling: True
    categorical_strategy: constant
    smoothing: 22.71602308255698
    min_samples_leaf: 26
    clf: XGBoost
    xgb_max_depth: 9
    xgb_learning_rate: 0.2570236380980364
    xgb_n_estimators: 379
    xgb_min_child_weight: 7
    xgb_subsample: 0.9034556837727007
    xgb_colsample_bytree: 0.5043125948856921
    xgb_reg_alpha: 6.437758025125069
    xgb_reg_lambda: 0.9310201217343881
    percentile: 92


['/data_analysis/Insurance/out/best_model.joblib']