# Spaceship Titanic - Kaggle competition

https://www.kaggle.com/competitions/spaceship-titanic

This notebook serves as an introduction for the class about MLFlow and Optuna.
The students are introduced to the concept of Hyperparameter Tuning and experiment tracking with MLFlow.

In [1]:
# pip install pandas

We load the train data. The PassengerId column is used as the index of the dataframe

In [2]:
import pandas as pd
import numpy as np

# Read the training split; each passenger's id becomes the row label.
train = pd.read_csv(
    "Data/train.csv",
    index_col="PassengerId",
)
train

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [3]:
# Print each column's dtype and non-null count to spot columns with missing data.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 0001_01 to 9280_02
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Cabin         8494 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Name          8493 non-null   object 
 12  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(6)
memory usage: 891.4+ KB


## Preprocessing Pipeline

We identified null values in every column except the target `Transported`. We will clean these by type.

In [4]:
# Count the missing values in each column.
train.isna().sum()

HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [5]:
# pip install scikit-learn

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Reload the raw training data so this cell starts from an untouched frame.
train = pd.read_csv("Data/train.csv", index_col="PassengerId")

# Numeric features: missing entries are replaced by the column mean.
numerical_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="mean"))])

# Categorical features: one-hot encoded; categories unseen during fit are ignored
# at transform time instead of raising.
categorical_cols = ["HomePlanet", "Destination", "VIP", "CryoSleep"]
categorical_transformer = Pipeline(
    steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

# Route each column group to its transformer; every other column
# (Cabin, Name, Transported, ...) is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="drop",
)

# Wrap the column transformer in a single-step pipeline.
preprocessing_pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

# Learn the imputation values / categories from the training data and apply them.
X_preprocessed = preprocessing_pipeline.fit_transform(train)

preprocessing_pipeline

In [7]:
# Rebuild a labelled DataFrame from the transformed array.
# The one-hot column names come from the fitted encoder inside the pipeline;
# the numeric columns keep their names and come first, matching the
# ColumnTransformer's output order.
onehot_encoder_feature_names = list(
    preprocessing_pipeline
    .named_steps["preprocessor"]
    .named_transformers_["cat"]
    .named_steps["encoder"]
    .get_feature_names_out()
)
column_order = [*numerical_cols, *onehot_encoder_feature_names]

# Features keep the PassengerId index; the target is taken from the raw frame.
X_preprocessed = pd.DataFrame(
    data=X_preprocessed,
    index=train.index,
    columns=column_order,
)
y = train["Transported"]

X_preprocessed.head()

Unnamed: 0_level_0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_nan,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_nan,VIP_False,VIP_True,VIP_nan,CryoSleep_False,CryoSleep_True,CryoSleep_nan
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0001_01,39.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
0002_01,24.0,109.0,9.0,25.0,549.0,44.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
0003_01,58.0,43.0,3576.0,0.0,6715.0,49.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
0003_02,33.0,0.0,1283.0,371.0,3329.0,193.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
0004_01,16.0,303.0,70.0,151.0,565.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


## Hyperparameters tuning of a Decision Tree Classifier

We extend the pipeline with a decision tree classifier to predict the Transported variable. 

In [8]:
# pip install optuna

In [9]:
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Base estimator whose hyperparameters are overwritten at the start of every trial.
model = DecisionTreeClassifier(criterion='entropy', random_state=42)


def objective(trial):
    """Score one Optuna trial for the decision tree.

    Samples `max_depth`, `min_samples_split` and `min_samples_leaf`, applies them
    to the module-level `model`, and evaluates it with 5-fold cross-validation on
    `X_preprocessed` / `y`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyperparameter values.

    Returns
    -------
    float
        Mean cross-validated accuracy (the study maximizes this value).
    """
    params = {
        # trial parameters to optimize
        'max_depth': trial.suggest_int('max_depth', 3, 40, log=True),
        # Floats are interpreted by scikit-learn as fractions of the number of samples.
        'min_samples_split': trial.suggest_float('min_samples_split', 1e-6, 1e-3, log=True),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 1e-6, 1e-3, log=True),
    }

    model.set_params(**params)

    return cross_val_score(
        model, X_preprocessed, y, cv=5, scoring='accuracy', n_jobs=-1
    ).mean()


# Seed the sampler so that re-running the notebook proposes the same sequence of
# trials; without it every run yields different "best" hyperparameters.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

print("--------------------------------------")
print("best_params =", study.best_params, "with cross_validation_score =", study.best_value)

  from .autonotebook import tqdm as notebook_tqdm
[I 2024-03-08 09:51:58,729] A new study created in memory with name: no-name-6b803135-68f3-42cb-a1f1-8e00ad12bbbc
[I 2024-03-08 09:51:59,666] Trial 0 finished with value: 0.7182813423319752 and parameters: {'max_depth': 3, 'min_samples_split': 7.597281284497825e-05, 'min_samples_leaf': 1.9205330728421652e-06}. Best is trial 0 with value: 0.7182813423319752.
[I 2024-03-08 09:52:00,181] Trial 1 finished with value: 0.7818958027145477 and parameters: {'max_depth': 7, 'min_samples_split': 1.1370277670399297e-06, 'min_samples_leaf': 1.1407763276657445e-05}. Best is trial 1 with value: 0.7818958027145477.
[I 2024-03-08 09:52:00,688] Trial 2 finished with value: 0.7599252510106267 and parameters: {'max_depth': 4, 'min_samples_split': 1.6735374343183501e-06, 'min_samples_leaf': 2.1410087918408277e-06}. Best is trial 1 with value: 0.7818958027145477.
[I 2024-03-08 09:52:00,831] Trial 3 finished with value: 0.7599252510106267 and parameters: {'ma

--------------------------------------
best_params = {'max_depth': 8, 'min_samples_split': 2.5348787574042338e-06, 'min_samples_leaf': 4.035669104116147e-05} with cross_validation_score = 0.7832761709141994


# Evaluation on Kaggle

We will now submit to Kaggle the decision tree trained with the best max_depth, min_samples_leaf and min_samples_split found by Optuna.

In [10]:
# Reload both splits from disk.
train = pd.read_csv("Data/train.csv", index_col="PassengerId")
test = pd.read_csv("Data/test.csv", index_col="PassengerId")

# Separate features from the target; the test split has no target column.
X = train.drop(columns=["Transported"])
y = train["Transported"]
X_test = test

# Decision tree configured with the best hyperparameters found by the study.
model = DecisionTreeClassifier(criterion="entropy", random_state=42).set_params(
    **study.best_params
)

# Chain preprocessing and the classifier so the raw frames can be passed directly.
model_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", model),
    ]
)

# Train on the full training set, then predict the test passengers.
y_pred = model_pipeline.fit(X, y).predict(X_test)

# Writing the submission DataFrame to a CSV file
kaggle_submission = pd.DataFrame({"Transported": y_pred}, index=X_test.index)
kaggle_submission.to_csv("Data/optuna_optimal_decision_tree.csv", index=True)

## Hyperparameters tuning of a XGBoost Classifier

We extend the pipeline with an XGBoost classifier to predict the Transported variable. Since it uses some regularization, we need to adapt the pipeline and add a StandardScaler after the ColumnTransformer.

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Reload the raw training data so this cell starts from an untouched frame.
train = pd.read_csv("Data/train.csv", index_col= "PassengerId")

# Step 1: Define transformers for different column types
# Numeric features: missing values are estimated from the other numeric columns.
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
numeric_transformer = Pipeline(
    steps=[
        ("imputer", IterativeImputer(random_state=0))]
)

# Categorical features: one-hot encoded. handle_unknown='ignore' matches the first
# preprocessing pipeline of this notebook, so a category that only appears in the
# test set is encoded as all zeros instead of raising at predict time.
categorical_cols = ['HomePlanet', 'Destination', 'VIP', 'CryoSleep']
categorical_transformer = Pipeline(
    steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Step 2: Create a ColumnTransformer that applies the transformations
# Columns not listed above (Cabin, Name, Transported, ...) are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='drop'
)

# Step 3: Assemble the preprocessing pipeline
# The scaler standardizes every output column of the ColumnTransformer.
preprocessing_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler())
])

# Fit and transform the DataFrame
X_preprocessed = preprocessing_pipeline.fit_transform(train)

preprocessing_pipeline

In [12]:
# pip install xgboost

In [13]:
import optuna
import xgboost as xgb
from sklearn.model_selection import cross_val_score

# Base estimator whose hyperparameters are overwritten at the start of every trial.
model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)


def objective(trial):
    """Score one Optuna trial for the XGBoost classifier.

    Samples the boosting, tree-shape, subsampling and regularization
    hyperparameters listed below, applies them to the module-level `model`, and
    evaluates it with 5-fold cross-validation on `X_preprocessed` / `y`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyperparameter values.

    Returns
    -------
    float
        Mean cross-validated accuracy (the study maximizes this value).
    """
    params = {
        # trial parameters to optimize for XGBoost
        'eta': trial.suggest_float('eta', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 50, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'alpha': trial.suggest_float('alpha', 1e-6, 100, log=True),
        'lambda': trial.suggest_float('lambda', 1e-6, 100, log=True),
    }

    model.set_params(**params)

    return cross_val_score(
        model, X_preprocessed, y, cv=5, scoring='accuracy', n_jobs=-1
    ).mean()


# Seed the sampler so that re-running the notebook proposes the same sequence of
# trials; without it every run yields different "best" hyperparameters.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=500)

print("--------------------------------------")
print("best_params =", study.best_params, "with cross_validation_score =", study.best_value)

[I 2024-03-08 09:52:17,061] A new study created in memory with name: no-name-3bbc91f9-c294-4b5f-9a53-a577d045d62d
[I 2024-03-08 09:52:17,321] Trial 0 finished with value: 0.7919043324106615 and parameters: {'eta': 0.18761841933452134, 'n_estimators': 232, 'max_depth': 12, 'min_child_weight': 10, 'subsample': 0.8545467179447319, 'colsample_bytree': 0.9058303439041682, 'gamma': 0.15282100904764823, 'alpha': 56.44486773959078, 'lambda': 4.357719062669205}. Best is trial 0 with value: 0.7919043324106615.
[I 2024-03-08 09:52:17,790] Trial 1 finished with value: 0.7791337428558005 and parameters: {'eta': 0.13453623843929136, 'n_estimators': 528, 'max_depth': 16, 'min_child_weight': 9, 'subsample': 0.9719642545890281, 'colsample_bytree': 0.5839714007960962, 'gamma': 0.17388622833345585, 'alpha': 0.005005392771705609, 'lambda': 0.3695536968819174}. Best is trial 0 with value: 0.7919043324106615.
[I 2024-03-08 09:52:18,170] Trial 2 finished with value: 0.799266141738536 and parameters: {'eta': 

--------------------------------------
best_params = {'eta': 0.01772669860488179, 'n_estimators': 780, 'max_depth': 6, 'min_child_weight': 2, 'subsample': 0.9883228661476542, 'colsample_bytree': 0.6776668438685005, 'gamma': 0.39195960614040065, 'alpha': 1.6199948471894543, 'lambda': 0.0001609765917268194} with cross_validation_score = 0.8006461790733269


In [14]:
# XGBoost classifier configured with the best hyperparameters found by the study.
model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
model.set_params(**study.best_params)

# The full pipeline must be bound to `model_pipeline`, the object that is fitted
# and used for prediction below; binding it to any other name would leave the
# previously fitted pipeline (a different model) producing this submission.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('classifier', model)
    ])

# Train on the raw training features/target defined in the decision-tree submission cell.
model_pipeline.fit(X, y)

# NOTE(review): depending on the xgboost version, predict() may return 0/1 integers
# for a boolean target; the cast keeps the CSV in True/False form like the target
# column and is a no-op if the predictions are already boolean.
y_pred = model_pipeline.predict(X_test).astype(bool)

# Writing the submission DataFrame to a CSV file
kaggle_submission = pd.DataFrame(y_pred, columns=['Transported'], index=X_test.index)
kaggle_submission.to_csv("Data/optuna_XGBoost.csv", index=True)

## Random Forest

We extend the pipeline with a RandomForest classifier to predict the Transported variable. 

In [15]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Base estimator whose hyperparameters are overwritten at the start of every trial.
model = RandomForestClassifier(random_state=42, n_jobs=-1)


def objective(trial):
    """Score one Optuna trial for the random forest.

    Samples the forest size, tree depth, split/leaf size fractions and the
    fraction of features per split, applies them to the module-level `model`, and
    evaluates it with 5-fold cross-validation on `X_preprocessed` / `y`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyperparameter values.

    Returns
    -------
    float
        Mean cross-validated accuracy (the study maximizes this value).
    """
    params = {
        # trial parameters to optimize for RandomForestClassifier
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 50, log=True),
        # Floats are interpreted by scikit-learn as fractions of the number of samples.
        'min_samples_split': trial.suggest_float('min_samples_split', 1e-6, 1e-3, log=True),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 1e-6, 1e-3, log=True),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0),
    }

    model.set_params(**params)

    return cross_val_score(
        model, X_preprocessed, y, cv=5, scoring='accuracy', n_jobs=-1
    ).mean()


# Seed the sampler so that re-running the notebook proposes the same sequence of
# trials; without it every run yields different "best" hyperparameters.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

print("--------------------------------------")
print("best_params =", study.best_params, "with cross_validation_score =", study.best_value)

[I 2024-03-08 09:57:03,669] A new study created in memory with name: no-name-4d4ddafc-0fec-42e3-a159-4c0808811c3a
[I 2024-03-08 09:57:05,117] Trial 0 finished with value: 0.7930540878022698 and parameters: {'n_estimators': 220, 'max_depth': 14, 'min_samples_split': 1.1332791700149848e-05, 'min_samples_leaf': 9.974695473822727e-05, 'max_features': 0.4526441178742635}. Best is trial 0 with value: 0.7930540878022698.
[I 2024-03-08 09:57:06,218] Trial 1 finished with value: 0.7799393326191064 and parameters: {'n_estimators': 100, 'max_depth': 36, 'min_samples_split': 5.559559047032465e-06, 'min_samples_leaf': 2.0122378685468686e-05, 'max_features': 0.8076638549047728}. Best is trial 0 with value: 0.7930540878022698.
[I 2024-03-08 09:57:06,962] Trial 2 finished with value: 0.7768346291104168 and parameters: {'n_estimators': 181, 'max_depth': 4, 'min_samples_split': 2.511267114459126e-05, 'min_samples_leaf': 0.000963267918186857, 'max_features': 0.6002834187155344}. Best is trial 0 with valu

--------------------------------------
best_params = {'n_estimators': 729, 'max_depth': 9, 'min_samples_split': 0.00022932261803507484, 'min_samples_leaf': 2.3221977955596838e-05, 'max_features': 0.5847127611706968} with cross_validation_score = 0.7996111014425046


In [16]:
# Random forest configured with the best hyperparameters found by the study.
model = RandomForestClassifier(random_state=42, n_jobs=-1)
model.set_params(**study.best_params)

# The full pipeline must be bound to `model_pipeline`, the object that is fitted
# and used for prediction below; binding it to any other name would leave the
# previously fitted pipeline (a different model) producing this submission.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('classifier', model)
    ])

# Train on the raw training features/target defined in the decision-tree submission cell.
model_pipeline.fit(X, y)

y_pred = model_pipeline.predict(X_test)

# Writing the submission DataFrame to a CSV file
kaggle_submission = pd.DataFrame(y_pred, columns=['Transported'], index=X_test.index)
kaggle_submission.to_csv("Data/optuna_RandomForest.csv", index=True)

## Cat Boost

We extend the pipeline with a CatBoost classifier to predict the Transported variable. 

In [17]:
#pip install catboost

In [18]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

# Reload the raw training data so this cell starts from an untouched frame.
train = pd.read_csv("Data/train.csv", index_col="PassengerId")

# Numeric features: mean-impute the gaps, then standardize.
numerical_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical features: each column is mapped to ordinal codes.
categorical_cols = ["HomePlanet", "Destination", "VIP", "CryoSleep"]
categorical_transformer = Pipeline(steps=[("encoder", OrdinalEncoder())])

# Route each column group to its transformer; every other column is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_cols),
        ("cat", categorical_transformer, categorical_cols),
    ],
    remainder="drop",
)

# Wrap the column transformer in a single-step pipeline.
preprocessing_pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

# Learn the preprocessing statistics from the training data and apply them.
X_preprocessed = preprocessing_pipeline.fit_transform(train)

preprocessing_pipeline

In [19]:
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score

# Base estimator whose hyperparameters are overwritten at the start of every trial.
model = CatBoostClassifier(random_state=42, verbose=False)


def objective(trial):
    """Score one Optuna trial for the CatBoost classifier.

    Samples the hyperparameters listed below, applies them to the module-level
    `model`, and evaluates it with 5-fold cross-validation on
    `X_preprocessed` / `y`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyperparameter values.

    Returns
    -------
    float
        Mean cross-validated accuracy (the study maximizes this value).
    """
    params = {
        # trial parameters to optimize for CatBoostClassifier
        'iterations': trial.suggest_int("iterations", 100, 1000, log=True),
        'learning_rate': trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        'depth': trial.suggest_int("depth", 3, 8),
        'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        'bagging_temperature': trial.suggest_float("bagging_temperature", 0.0, 10.0),
        'random_strength': trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
    }

    model.set_params(**params)

    return cross_val_score(
        model, X_preprocessed, y, cv=5, scoring='accuracy', n_jobs=-1
    ).mean()


# Seed the sampler so that re-running the notebook proposes the same sequence of
# trials; without it every run yields different "best" hyperparameters.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=200)

print("--------------------------------------")
print("best_params =", study.best_params, "with cross_validation_score =", study.best_value)

[I 2024-03-08 10:00:32,325] A new study created in memory with name: no-name-5f23a74a-d9c1-416c-a09a-9762326ce2df
[I 2024-03-08 10:00:34,027] Trial 0 finished with value: 0.7799399281758561 and parameters: {'iterations': 292, 'learning_rate': 0.17530492004711823, 'depth': 8, 'l2_leaf_reg': 0.5980723401318083, 'bagging_temperature': 4.586065313934702, 'random_strength': 2.591806836662259e-08}. Best is trial 0 with value: 0.7799399281758561.
[I 2024-03-08 10:00:35,890] Trial 1 finished with value: 0.7955854686800014 and parameters: {'iterations': 347, 'learning_rate': 0.01030338789181496, 'depth': 8, 'l2_leaf_reg': 1.4521578340262686e-06, 'bagging_temperature': 0.8855580365773474, 'random_strength': 0.0007053837113010888}. Best is trial 1 with value: 0.7955854686800014.
[I 2024-03-08 10:00:37,005] Trial 2 finished with value: 0.7852332365663904 and parameters: {'iterations': 325, 'learning_rate': 0.0014183265264815568, 'depth': 6, 'l2_leaf_reg': 9.096078851764123e-05, 'bagging_temperatur

--------------------------------------
best_params = {'iterations': 127, 'learning_rate': 0.16649315238746995, 'depth': 7, 'l2_leaf_reg': 1.2139153469425277, 'bagging_temperature': 4.932266122780948, 'random_strength': 1.0088304950996625e-08} with cross_validation_score = 0.7988054455062267


In [20]:
# CatBoost classifier configured with the best hyperparameters found by the study.
model = CatBoostClassifier(random_state=42, verbose=False)
model.set_params(**study.best_params)

# The full pipeline must be bound to `model_pipeline`, the object that is fitted
# and used for prediction below; binding it to any other name would leave the
# previously fitted pipeline (a different model) producing this submission.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model)
    ])

# Train on the raw training features/target defined in the decision-tree submission cell.
model_pipeline.fit(X, y)

y_pred = model_pipeline.predict(X_test)

# Writing the submission DataFrame to a CSV file
kaggle_submission = pd.DataFrame(y_pred, columns=['Transported'], index=X_test.index)
kaggle_submission.to_csv("Data/optuna_CatBoost.csv", index=True)