# Spaceship Titanic - Kaggle competition

https://www.kaggle.com/competitions/spaceship-titanic

This notebook serves as an introduction for the class on MLflow and Optuna.
The students are introduced to the concepts of hyperparameter tuning and experiment tracking with MLflow.

In [1]:
# pip install pandas

We load the train data. The PassengerId column is used as the index of the dataframe

In [2]:
import pandas as pd
import numpy as np

# Read the training set, using PassengerId as the row index, then display it.
train = pd.read_csv(
    "Data/train.csv",
    index_col="PassengerId",
)
train

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [3]:
# Summarise column dtypes and non-null counts to spot incomplete columns.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 0001_01 to 9280_02
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Cabin         8494 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Name          8493 non-null   object 
 12  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(6)
memory usage: 891.4+ KB


## Preprocessing Pipeline

We identified null values in every feature column (only the target, Transported, is complete). We will handle them by column type.

In [4]:
# Count the missing values in each column.
train.isna().sum()

HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [5]:
# pip install scikit-learn

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Reload the raw training data.
train = pd.read_csv("Data/train.csv", index_col="PassengerId")

# --- Numerical features: fill missing values with the column mean ---
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])

# --- Categorical features: one-hot encode, ignoring categories unseen at fit time ---
categorical_cols = ['HomePlanet', 'Destination', 'VIP', 'CryoSleep']
categorical_transformer = Pipeline(steps=[('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group to its transformer; every column not listed above is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='drop',
)

# Wrap the ColumnTransformer in a Pipeline so further steps can be appended later.
preprocessing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit on the full training frame and keep the transformed feature matrix.
X_preprocessed = preprocessing_pipeline.fit_transform(train)

preprocessing_pipeline

In [7]:
# Rebuild a labelled DataFrame from the array returned by the preprocessing pipeline.
fitted_preprocessor = preprocessing_pipeline.named_steps['preprocessor']
fitted_encoder = fitted_preprocessor.named_transformers_['cat'].named_steps['encoder']
onehot_encoder_feature_names = list(fitted_encoder.get_feature_names_out())

# The ColumnTransformer lists 'num' before 'cat', so the numeric columns come first.
column_order = numerical_cols + onehot_encoder_feature_names

# Features keep the PassengerId index of `train`; the target is taken from the raw frame.
X_preprocessed = pd.DataFrame(X_preprocessed, columns=column_order, index=train.index)
y = train['Transported']

X_preprocessed.head()

Unnamed: 0_level_0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_nan,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_nan,VIP_False,VIP_True,VIP_nan,CryoSleep_False,CryoSleep_True,CryoSleep_nan
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0001_01,39.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
0002_01,24.0,109.0,9.0,25.0,549.0,44.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
0003_01,58.0,43.0,3576.0,0.0,6715.0,49.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
0003_02,33.0,0.0,1283.0,371.0,3329.0,193.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
0004_01,16.0,303.0,70.0,151.0,565.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


## Hyperparameter tuning of a Decision Tree Classifier

We extend the pipeline with a decision tree classifier to predict the Transported variable. 

In [None]:
# pip install optuna
# pip install plotly
# pip install jupyter anywidget

In [9]:
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

model = DecisionTreeClassifier(criterion='entropy', random_state=42)

def objective(trial):
    """Score one Optuna trial for the decision tree.

    Samples `max_depth`, `min_samples_split` and `min_samples_leaf` from the
    trial, applies them to the module-level `model`, and returns the mean
    accuracy of a 5-fold cross-validation on `X_preprocessed` / `y`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean cross-validated accuracy (the value the study maximizes).
    """
    params = {
        # trial parameters to optimize
        'max_depth' : trial.suggest_int('max_depth', 3, 40, log=True),
        # Float values are interpreted by scikit-learn as a fraction of the samples.
        'min_samples_split' : trial.suggest_float('min_samples_split', 1e-6, 1e-3, log=True),
        'min_samples_leaf' : trial.suggest_float('min_samples_leaf', 1e-6, 1e-3, log=True)
    }

    model.set_params(**params)

    # NOTE(review): X_preprocessed was fitted on the full training set, so the
    # imputation statistics are shared across folds and scores are slightly optimistic.
    cv_score = cross_val_score(model, X_preprocessed, y, cv=5, scoring='accuracy', n_jobs=-1).mean()

    return cv_score

# Seed the sampler so that Restart & Run All reproduces the same sequence of trials
# (an unseeded sampler yields different best_params on every run).
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=200)

print("--------------------------------------")
print("best_params =", study.best_params, "with cross_validation_score =", study.best_value)

[I 2025-04-14 14:56:03,851] A new study created in memory with name: no-name-e7153958-e9de-4925-8885-39b189eaee3c
[I 2025-04-14 14:56:05,575] Trial 0 finished with value: 0.7372606110015213 and parameters: {'max_depth': 37, 'min_samples_split': 0.0001784841789098473, 'min_samples_leaf': 2.2720590400610583e-06}. Best is trial 0 with value: 0.7372606110015213.
[I 2025-04-14 14:56:06,155] Trial 1 finished with value: 0.7818958027145477 and parameters: {'max_depth': 7, 'min_samples_split': 2.8658028218241053e-05, 'min_samples_leaf': 9.956359680145311e-06}. Best is trial 1 with value: 0.7818958027145477.
[I 2025-04-14 14:56:06,724] Trial 2 finished with value: 0.750143893128003 and parameters: {'max_depth': 18, 'min_samples_split': 0.0005575645677071511, 'min_samples_leaf': 0.0006984529567714103}. Best is trial 1 with value: 0.7818958027145477.
[I 2025-04-14 14:56:06,880] Trial 3 finished with value: 0.7832761047412272 and parameters: {'max_depth': 8, 'min_samples_split': 0.0007155590321713

--------------------------------------
best_params = {'max_depth': 8, 'min_samples_split': 0.00020185973981485638, 'min_samples_leaf': 0.00019464219873075093} with cross_validation_score = 0.7836210644451959


In [10]:
from optuna.visualization import plot_optimization_history
# Show how the objective value evolved over the trials of `study`.
plot_optimization_history(study)

In [11]:
from optuna.visualization import plot_contour
# Contour plots of the objective value over the tuned hyperparameters of `study`.
plot_contour(study)

In [12]:
# Same contour plot, restricted to the two minimum-sample hyperparameters.
plot_contour(study, params=['min_samples_split', 'min_samples_leaf'])

In [13]:
from optuna.visualization import plot_slice
# Slice plot of the objective value against each tuned hyperparameter of `study`.
plot_slice(study)

# Evaluation on Kaggle

We will now submit to Kaggle the tree with the best max_depth, min_samples_leaf and min_samples_split found by the study.

In [14]:
# Reload both splits from disk.
train = pd.read_csv("Data/train.csv", index_col="PassengerId")
test = pd.read_csv("Data/test.csv", index_col="PassengerId")

# Separate the target from the features; the test frame is used as-is.
y = train['Transported']
X = train.drop(columns=['Transported'])
X_test = test

# Decision tree configured with the best hyperparameters found by the Optuna study.
model = DecisionTreeClassifier(criterion='entropy', random_state=42).set_params(**study.best_params)

# Chain preprocessing and classifier so train and test go through the same transformations.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model),
])
model_pipeline.fit(X, y)

y_pred = model_pipeline.predict(X_test)

# Writing the submission DataFrame to a CSV file
#kaggle_submission = pd.DataFrame(y_pred, columns=['Transported'], index=X_test.index)
#kaggle_submission.to_csv("Data/optuna_optimal_decision_tree.csv", index=True)

## Hyperparameter tuning of an XGBoost Classifier

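We extend the pipeline with an XGBoost classifier to predict the Transported variable. Since it uses some regularization, we adapt the pipeline and add a StandardScaler after the ColumnTransformer. (Note: tree-based boosters are insensitive to feature scaling, so this step is optional for XGBoost; it matters for models whose penalty acts directly on feature coefficients.)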

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Reload the raw training data.
train = pd.read_csv("Data/train.csv", index_col= "PassengerId")

# Step 1: Define transformers for different column types
# Numerical columns: model-based imputation (each feature is estimated from the others).
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
numeric_transformer = Pipeline(
    steps=[
        ("imputer", IterativeImputer(random_state=0))]
)

# Categorical columns: one-hot encoding. handle_unknown='ignore' matches the first
# preprocessor of this notebook and keeps predict() from raising when the data to
# score contains a category that was absent during fit.
categorical_cols = ['HomePlanet', 'Destination', 'VIP', 'CryoSleep']
categorical_transformer = Pipeline(
    steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Step 2: Create a ColumnTransformer that applies the transformations
# Columns that are not listed in either group are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='drop'
)

# Step 3: Assemble the preprocessing pipeline
# The scaler runs on the full ColumnTransformer output (numeric and one-hot columns).
preprocessing_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler())
])

# Fit and transform the DataFrame
X_preprocessed = preprocessing_pipeline.fit_transform(train)

preprocessing_pipeline

In [None]:
# pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.0-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.0-py3-none-macosx_12_0_arm64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.0
Note: you may need to restart the kernel to use updated packages.


In [18]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

model = XGBClassifier(objective='binary:logistic', random_state=42)

def objective(trial):
    """Score one Optuna trial for the XGBoost classifier.

    Samples the boosting hyperparameters from the trial, applies them to the
    module-level `model`, and returns the mean accuracy of a 5-fold
    cross-validation on `X_preprocessed` / `y`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean cross-validated accuracy (the value the study maximizes).
    """
    params = {
        # trial parameters to optimize for XGBoost
        # (native names: eta = learning rate, alpha / lambda = L1 / L2 regularization)
        'eta': trial.suggest_float('eta', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 50, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'alpha': trial.suggest_float('alpha', 1e-6, 100, log=True),
        'lambda': trial.suggest_float('lambda', 1e-6, 100, log=True)
    }

    model.set_params(**params)

    # NOTE(review): X_preprocessed was fitted on the full training set, so the
    # imputer and scaler statistics are shared across folds (slightly optimistic scores).
    cv_score = cross_val_score(model, X_preprocessed, y, cv=5, scoring='accuracy', n_jobs=-1).mean()

    return cv_score

# Seed the sampler so that Restart & Run All reproduces the same sequence of trials
# (an unseeded sampler yields different best_params on every run).
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=250)

print("--------------------------------------")
print("best_params =", study.best_params, "with cross_validation_score =", study.best_value)

[I 2025-04-14 15:10:31,033] A new study created in memory with name: no-name-ef1cb17e-a581-4837-ad89-efd448139560
[I 2025-04-14 15:10:32,980] Trial 0 finished with value: 0.7950092344382675 and parameters: {'eta': 0.07569264793708867, 'n_estimators': 817, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.9550079793856718, 'colsample_bytree': 0.5194289889797268, 'gamma': 0.21808108496123174, 'alpha': 0.0005774691013508212, 'lambda': 2.2782415968387436e-06}. Best is trial 0 with value: 0.7950092344382675.
[I 2025-04-14 15:10:33,632] Trial 1 finished with value: 0.7946643409072711 and parameters: {'eta': 0.16783752552837353, 'n_estimators': 181, 'max_depth': 23, 'min_child_weight': 7, 'subsample': 0.8540747971419969, 'colsample_bytree': 0.8346258631475696, 'gamma': 0.4519636579040343, 'alpha': 3.499358914340897, 'lambda': 3.536600610729209e-06}. Best is trial 0 with value: 0.7950092344382675.
[I 2025-04-14 15:10:34,292] Trial 2 finished with value: 0.7955850054691961 and parameters: {

--------------------------------------
best_params = {'eta': 0.027948907134121702, 'n_estimators': 258, 'max_depth': 38, 'min_child_weight': 5, 'subsample': 0.5474871916189095, 'colsample_bytree': 0.9232016561722225, 'gamma': 0.030650413074285975, 'alpha': 0.0009363765260398734, 'lambda': 15.75199199808453} with cross_validation_score = 0.8012212883745338


In [19]:
# Reload both splits from disk.
train = pd.read_csv("Data/train.csv", index_col="PassengerId")
test = pd.read_csv("Data/test.csv", index_col="PassengerId")

# Separate the target from the features; the test frame is used as-is.
y = train['Transported']
X = train.drop(columns=['Transported'])
X_test = test

# XGBoost classifier configured with the best hyperparameters found by the Optuna study.
model = XGBClassifier(objective='binary:logistic', random_state=42)
model.set_params(**study.best_params)

# Column transformer and scaler, followed by the classifier.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('classifier', model),
])
model_pipeline.fit(X, y)

# Cast the predictions to bool before storing them.
y_pred = model_pipeline.predict(X_test).astype(bool)

# Writing the submission DataFrame to a CSV file
#kaggle_submission = pd.DataFrame(y_pred, columns=['Transported'], index=X_test.index)
#kaggle_submission.to_csv("Data/optuna_XGBoost.csv", index=True)

## Random Forest

We extend the pipeline with a RandomForest classifier to predict the Transported variable. 

In [20]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

model = RandomForestClassifier(random_state=42, n_jobs=-1)

def objective(trial):
    """Score one Optuna trial for the random forest.

    Samples the forest hyperparameters from the trial, applies them to the
    module-level `model`, and returns the mean accuracy of a 5-fold
    cross-validation on `X_preprocessed` / `y`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean cross-validated accuracy (the value the study maximizes).
    """
    params = {
        # trial parameters to optimize for RandomForestClassifier
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 50, log=True),
        # Float values are interpreted by scikit-learn as a fraction of the samples.
        'min_samples_split': trial.suggest_float('min_samples_split', 1e-6, 1e-3, log=True),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 1e-6, 1e-3, log=True),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0)
    }

    model.set_params(**params)

    # NOTE(review): X_preprocessed is whatever the most recent preprocessing cell
    # produced, fitted on the full training set (slightly optimistic fold scores).
    cv_score = cross_val_score(model, X_preprocessed, y, cv=5, scoring='accuracy', n_jobs=-1).mean()

    return cv_score

# Seed the sampler so that Restart & Run All reproduces the same sequence of trials
# (an unseeded sampler yields different best_params on every run).
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=100)

print("--------------------------------------")
print("best_params =", study.best_params, "with cross_validation_score =", study.best_value)

[I 2025-04-14 15:14:05,884] A new study created in memory with name: no-name-3d9568a0-d56b-4b40-82ff-b505ad36dd6f
[I 2025-04-14 15:14:07,643] Trial 0 finished with value: 0.7947802097815564 and parameters: {'n_estimators': 536, 'max_depth': 9, 'min_samples_split': 0.0006874249230241148, 'min_samples_leaf': 1.1281369971123427e-05, 'max_features': 0.2363428946520311}. Best is trial 0 with value: 0.7947802097815564.
[I 2025-04-14 15:14:10,141] Trial 1 finished with value: 0.7968504312161733 and parameters: {'n_estimators': 479, 'max_depth': 10, 'min_samples_split': 4.927954479436587e-06, 'min_samples_leaf': 1.4260561570591731e-05, 'max_features': 0.5116734845930702}. Best is trial 1 with value: 0.7968504312161733.
[I 2025-04-14 15:14:10,539] Trial 2 finished with value: 0.784311314717994 and parameters: {'n_estimators': 100, 'max_depth': 7, 'min_samples_split': 3.5099681604718553e-06, 'min_samples_leaf': 3.421454737209137e-05, 'max_features': 0.18881050241002537}. Best is trial 1 with val

--------------------------------------
best_params = {'n_estimators': 427, 'max_depth': 10, 'min_samples_split': 6.954256866653626e-06, 'min_samples_leaf': 0.0008320733165917644, 'max_features': 0.7778801929239431} with cross_validation_score = 0.8007611215260017


In [21]:
# Random forest configured with the best hyperparameters found by the Optuna study.
model = RandomForestClassifier(random_state=42, n_jobs=-1).set_params(**study.best_params)

# Column transformer and scaler, followed by the classifier.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('classifier', model),
])

# X, y and X_test are the frames prepared in an earlier cell.
model_pipeline.fit(X, y)
y_pred = model_pipeline.predict(X_test)

# Writing the submission DataFrame to a CSV file
#kaggle_submission = pd.DataFrame(y_pred, columns=['Transported'], index=X_test.index)
#kaggle_submission.to_csv("Data/optuna_RandomForest.csv", index=True)

## CatBoost

We extend the pipeline with a CatBoost classifier to predict the Transported variable. 

In [None]:
#pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp313-cp313-macosx_11_0_universal2.whl.metadata (1.4 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)
Collecting matplotlib (from catboost)
  Downloading matplotlib-3.10.1-cp313-cp313-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib->catboost)
  Downloading contourpy-1.3.1-cp313-cp313-macosx_11_0_arm64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib->catboost)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib->catboost)
  Downloading fonttools-4.57.0-cp313-cp313-macosx_10_13_universal2.whl.metadata (102 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib->catboost)
  Downloading kiwisolver-1.4.8-cp313-cp313-macosx_11_0_arm64.whl.metadata (6.2 kB)
Collecting pillow>=8 (from matplotlib->catboost)
  Downloading pillow-11.2.1-cp313-cp313-macosx_11_0_arm64.whl.metadata (8.9 kB)
Coll

In [23]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

# Reload the raw training data.
train = pd.read_csv("Data/train.csv", index_col="PassengerId")

# --- Numerical features: mean imputation followed by standardization ---
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# --- Categorical features: map each category to an ordinal code ---
categorical_cols = ['HomePlanet', 'Destination', 'VIP', 'CryoSleep']
categorical_transformer = Pipeline(steps=[('encoder', OrdinalEncoder())])

# Route each column group to its transformer; every column not listed above is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='drop',
)

# Wrap the ColumnTransformer in a Pipeline, as in the earlier preprocessing cells.
preprocessing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit on the full training frame and keep the transformed feature matrix.
X_preprocessed = preprocessing_pipeline.fit_transform(train)

preprocessing_pipeline

In [24]:
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score

model = CatBoostClassifier(random_state=42, verbose=False)

def objective(trial):
    """Score one Optuna trial for the CatBoost classifier.

    Samples the boosting hyperparameters from the trial, applies them to the
    module-level `model`, and returns the mean accuracy of a 5-fold
    cross-validation on `X_preprocessed` / `y`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean cross-validated accuracy (the value the study maximizes).
    """
    params = {
        # trial parameters to optimize for CatBoostClassifier
        'iterations' : trial.suggest_int("iterations", 100, 1000, log=True),
        'learning_rate' : trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        'depth' : trial.suggest_int("depth", 3, 8),
        'l2_leaf_reg' : trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        'bagging_temperature': trial.suggest_float("bagging_temperature", 0.0, 10.0),
        'random_strength' : trial.suggest_float("random_strength", 1e-8, 10.0, log=True)
    }

    model.set_params(**params)

    # NOTE(review): X_preprocessed was fitted on the full training set, so the
    # imputer and scaler statistics are shared across folds (slightly optimistic scores).
    cv_score = cross_val_score(model, X_preprocessed, y, cv=5, scoring='accuracy', n_jobs=-1).mean()

    return cv_score

# Seed the sampler so that Restart & Run All reproduces the same sequence of trials
# (an unseeded sampler yields different best_params on every run).
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=200)

print("--------------------------------------")
print("best_params =", study.best_params, "with cross_validation_score =", study.best_value)

[I 2025-04-14 15:19:57,535] A new study created in memory with name: no-name-213a63bf-f41b-4687-8551-be8111b14712
[I 2025-04-14 15:19:58,057] Trial 0 finished with value: 0.7935135929210801 and parameters: {'iterations': 122, 'learning_rate': 0.05349207150737952, 'depth': 5, 'l2_leaf_reg': 0.43978161020997475, 'bagging_temperature': 1.7931192800562823, 'random_strength': 5.431653362195306}. Best is trial 0 with value: 0.7935135929210801.
[I 2025-04-14 15:19:59,666] Trial 1 finished with value: 0.795124441582831 and parameters: {'iterations': 822, 'learning_rate': 0.011920302073527253, 'depth': 4, 'l2_leaf_reg': 7.351809739108348e-07, 'bagging_temperature': 9.151957260101977, 'random_strength': 1.524979959377485}. Best is trial 1 with value: 0.795124441582831.
[I 2025-04-14 15:20:02,080] Trial 2 finished with value: 0.7809747411147895 and parameters: {'iterations': 797, 'learning_rate': 0.21204563508135188, 'depth': 7, 'l2_leaf_reg': 11.214112433782253, 'bagging_temperature': 2.66990326

--------------------------------------
best_params = {'iterations': 875, 'learning_rate': 0.01798631678440163, 'depth': 7, 'l2_leaf_reg': 0.18078667201650117, 'bagging_temperature': 7.06393723015598, 'random_strength': 6.626189330754125} with cross_validation_score = 0.799266141738536


In [25]:
# CatBoost classifier configured with the best hyperparameters found by the Optuna study.
model = CatBoostClassifier(random_state=42, verbose=False)
model.set_params(**study.best_params)

# Column transformer followed directly by the classifier (no separate scaler step).
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model),
])

# X, y and X_test are the frames prepared in an earlier cell.
model_pipeline.fit(X, y)
y_pred = model_pipeline.predict(X_test)

# Writing the submission DataFrame to a CSV file
#kaggle_submission = pd.DataFrame(y_pred, columns=['Transported'], index=X_test.index)
#kaggle_submission.to_csv("Data/optuna_CatBoost.csv", index=True)