In [2]:
import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

In [5]:
train_orig = pd.read_csv('./input/train.csv', index_col='id')
test_orig = pd.read_csv('./input/test.csv', index_col='id')
sample_submission = pd.read_csv('./input/sample_submission.csv')
X_train_orig = pd.DataFrame(train_orig.drop(['defects'], axis=1))
y_train_orig = pd.Series(train_orig.defects)

In [7]:
# Define an objective function for Optuna
def objective(trial):
    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear']),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
    }
    # Initialize StratifiedKFold for cross-validation
    n_splits = 5  # Number of folds
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    # Initialize an array to store cross-validated ROC AUC scores
    roc_auc_scores = []

    for train_idx, val_idx in kf.split(X_train_orig, y_train_orig):
        X_train, X_val = X_train_orig.iloc[train_idx], X_train_orig.iloc[val_idx]
        y_train, y_val = y_train_orig.iloc[train_idx], y_train_orig.iloc[val_idx]

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dval = xgb.DMatrix(X_val, label=y_val)

        # Train and evaluate the XGBoost model on each fold
        bst = xgb.train(params, dtrain, evals=[(dval, 'eval')], verbose_eval=False)
        y_pred = bst.predict(dval)

        # Calculate ROC AUC score for the fold and append it to the list
        fold_roc_auc = roc_auc_score(y_val, y_pred)
        roc_auc_scores.append(fold_roc_auc)

    # Return the mean ROC AUC score across all folds
    return np.mean(roc_auc_scores)

# Create an Optuna study and optimize
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Print the best hyperparameters and ROC AUC score
best_params = study.best_params
best_roc_auc = study.best_value
print("Best Hyperparameters:", best_params)
print("Best Mean ROC AUC Score:", best_roc_auc)


[I 2023-10-08 20:22:20,934] A new study created in memory with name: no-name-b14db22a-c3ef-4762-888a-56e3b79f86ac
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
  'subsample': trial.suggest_uniform('subsample', 0.6, 1.0),
  'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.6, 1.0),
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
Parameters: { "colsample_bytree", "max_depth", "min_child_weight", "subsample" } are not used.

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_

Best Hyperparameters: {'booster': 'gbtree', 'max_depth': 6, 'learning_rate': 0.15922609320044884, 'subsample': 0.9885524793197935, 'colsample_bytree': 0.9003308377177487, 'min_child_weight': 2}
Best Mean ROC AUC Score: 0.7912506090315479


In [11]:
# Use the found hyperparameters to train a model
best_dtrain = xgb.DMatrix(X_train_orig, label=y_train_orig)
best_model = xgb.train(best_params, best_dtrain, num_boost_round=100)

# Make predictions on new data
dtest = xgb.DMatrix(test_orig)
y_pred_proba = best_model.predict(dtest)

submission = pd.Series(data=y_pred_proba, index=test_orig.index, name='defects')
submission.to_csv('solution-v2_0.csv')

  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
  if is_sparse(data):
  if is_sparse(dtype):
  is_categorical_dtype(dtype) or is_pa_ext_categorical_dtype(dtype)
  if is_categorical_dtype(dtype):
  return is_int or is_bool or is_float or is_categorical_dtype(dtype)
