In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sys
import os
import optuna
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score
import joblib

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Plot styling: white-grid background with seaborn's "deep" colour palette.
sns.set_style("whitegrid")
sns.set_palette("deep")

CWD = os.getcwd()
print(f'CWD: {CWD}')

# ROOT is two directory levels above the working directory. It is put on
# sys.path so the project-local `utils` package imported below can be resolved
# (assumes `utils` lives directly under ROOT).
ROOT = os.path.abspath(os.path.dirname(os.path.dirname(CWD)))
# Guarded so that re-running this cell does not keep appending duplicate entries.
if ROOT not in sys.path:
    sys.path.append(ROOT)

from utils.visualization import boxplot_by_category, stacked_barplot_by_category
from utils.tuning import instantiate_model
from utils.utils import Dataloader

  from .autonotebook import tqdm as notebook_tqdm


CWD: /data_analysis/Insurance/src


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# The project directory is the parent of the notebook's working directory.
PROJ = os.path.dirname(CWD)
TRAIN_PATH = [os.path.join(PROJ, 'data', x) for x in ['train.csv', 'train_orig.csv']]
TEST_PATH = os.path.join(PROJ, 'data', 'test.csv')

# Column roles used throughout the notebook.
target = 'Response'
num_cols = ['Age', 'Annual_Premium', 'Vintage']
cat_cols = ['Gender', 'Driving_License', 'Region_Code', 'Previously_Insured', 'Vehicle_Damage', 'Policy_Sales_Channel', 'Vehicle_Age']

# Stack both training files into one frame. Each file's 'id' index is dropped
# and replaced by a fresh 0..n-1 index so rows from the two files cannot collide.
train = pd.concat([pd.read_csv(path, index_col='id') for path in TRAIN_PATH]).reset_index(drop=True)
test = pd.read_csv(TEST_PATH, index_col='id')

# Cast categorical columns with ONE dtype shared by train and test.
# Calling astype('category') on each frame separately derives the category set
# from that frame alone, so the same value can end up with a different integer
# code (or be missing from the categories) in train vs. test.
for col in cat_cols:
    shared_dtype = pd.CategoricalDtype(
        pd.concat([train[col], test[col]], ignore_index=True).astype('category').cat.categories
    )
    train[col] = train[col].astype(shared_dtype)
    test[col] = test[col].astype(shared_dtype)

In [6]:
N_FOLDS = 5


def objective(trial):
    """Optuna objective: mean validation ROC AUC over stratified K-fold CV.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial handed in by ``study.optimize``; it is forwarded to
        ``instantiate_model``, which builds the candidate model from it.

    Returns
    -------
    float
        ROC AUC of the positive-class probabilities, averaged over the
        ``N_FOLDS`` validation folds (higher is better).
    """
    # Build the candidate model for this trial, restricted to XGBoost.
    model = instantiate_model(trial, num_cols, cat_cols, datetime_columns=None, string_columns=None, models=['XGBoost'])

    X = train.drop(target, axis=1)
    y = train[target]

    # Fixed random_state: every trial is scored on the same fold assignment,
    # so trial scores are directly comparable.
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

    fold_aucs = []
    for train_index, val_index in skf.split(X, y):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # The same model object is refit on each fold.
        # NOTE(review): assumes fit() retrains from scratch, as scikit-learn
        # estimators do; confirm for the object instantiate_model returns.
        model.fit(X_train, y_train)

        # Column 1 of predict_proba is used as the positive-class score.
        y_pred = model.predict_proba(X_val)
        fold_aucs.append(roc_auc_score(y_val, y_pred[:, 1]))

    return sum(fold_aucs) / len(fold_aucs)

# Create a study object and optimize the objective function.
# The sampler is seeded so the sequence of suggested hyperparameters is
# repeatable between runs (TPE is also Optuna's default sampler).
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=30)  # You can adjust the number of trials

# Print the best parameters and best value
print('Best trial:')
trial = study.best_trial
print('  Value: ', trial.value)
print('  Params: ')
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

# Rebuild the model from the best trial. The same `models` restriction used
# during tuning is passed so the stored parameters are replayed against the
# same search space they were sampled from.
best_model = instantiate_model(trial, num_cols, cat_cols, datetime_columns=None, string_columns=None, models=['XGBoost'])

# Fit on the full training set before saving; otherwise the dumped artifact is
# an untrained estimator that cannot predict after being loaded.
best_model.fit(train.drop(target, axis=1), train[target])

model_filename = os.path.join(PROJ, 'out', 'best_xgb_model.joblib')
# Make sure the output directory exists so the dump cannot fail after the
# whole search has already been paid for.
os.makedirs(os.path.dirname(model_filename), exist_ok=True)
joblib.dump(best_model, model_filename)

[I 2024-07-21 00:12:23,771] A new study created in memory with name: no-name-c73f50bc-afd3-4636-902b-9bf59589d73b
