# Spaceship Titanic - Kaggle competition

https://www.kaggle.com/competitions/spaceship-titanic

This notebook serves as an introduction for the class on MLflow and Optuna.
Students are introduced to the concepts of hyperparameter tuning with Optuna and experiment tracking with MLflow.

In [1]:
# pip install pandas

We load the training data. The PassengerId column is used as the index of the DataFrame.

In [2]:
import pandas as pd
import numpy as np

# Load the labelled training set; PassengerId becomes the row index.
train = pd.read_csv(
    "Data/train.csv",
    index_col="PassengerId",
)
train

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [3]:
# Summarize each column's dtype and non-null count to spot missing data.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 0001_01 to 9280_02
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Cabin         8494 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Name          8493 non-null   object 
 12  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(6)
memory usage: 891.4+ KB


## Preprocessing Pipeline

We identified null values in every column except the target, `Transported`. We will clean these by type.

In [4]:
# Count the missing values in each column.
train.isna().sum()

HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [5]:
# pip install scikit-learn

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Reload the raw training data used to fit the preprocessing steps.
train = pd.read_csv("Data/train.csv", index_col="PassengerId")

# Numerical features: missing values are replaced by the column mean.
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy="mean"))])

# Categorical features: one-hot encoded without prior imputation, so missing
# values end up in their own `<column>_nan` indicator column.
categorical_cols = ['HomePlanet', 'Destination', 'VIP', 'CryoSleep']
categorical_transformer = Pipeline([('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group to its transformer. Every column that is not listed
# (e.g. Cabin, Name and the Transported target) is dropped.
column_routes = [
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(column_routes, remainder='drop')

# Wrap the ColumnTransformer in a pipeline; the step name 'preprocessor' is
# looked up again when the feature names are recovered.
preprocessing_pipeline = Pipeline([('preprocessor', preprocessor)])

# Fit on the full training frame and keep the transformed feature matrix.
X_preprocessed = preprocessing_pipeline.fit_transform(train)

preprocessing_pipeline

In [7]:
# Rebuild a labelled DataFrame from the array returned by the pipeline.
# The ColumnTransformer emits the numerical columns first, followed by the
# one-hot columns, so the column labels are assembled in that same order.
fitted_preprocessor = preprocessing_pipeline.named_steps['preprocessor']
fitted_encoder = fitted_preprocessor.named_transformers_['cat'].named_steps['encoder']
onehot_encoder_feature_names = list(fitted_encoder.get_feature_names_out())
column_order = numerical_cols + onehot_encoder_feature_names

# Keep the PassengerId index so the rows stay aligned with the target.
X_preprocessed = pd.DataFrame(
    data=X_preprocessed,
    index=train.index,
    columns=column_order,
)
y = train['Transported']

X_preprocessed.head()

Unnamed: 0_level_0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_nan,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_nan,VIP_False,VIP_True,VIP_nan,CryoSleep_False,CryoSleep_True,CryoSleep_nan
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0001_01,39.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
0002_01,24.0,109.0,9.0,25.0,549.0,44.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
0003_01,58.0,43.0,3576.0,0.0,6715.0,49.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
0003_02,33.0,0.0,1283.0,371.0,3329.0,193.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
0004_01,16.0,303.0,70.0,151.0,565.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


## Hyperparameter tuning of a Decision Tree Classifier

We extend the pipeline with a decision tree classifier to predict the Transported variable. 

In [8]:
# pip install optuna

In order to have a local tracking server run the following command in your terminal:

mlflow server --host 127.0.0.1 --port 8080

In [9]:
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Base estimator; each trial injects its hyperparameters through set_params.
model = DecisionTreeClassifier(criterion='entropy', random_state=42)


def objective(trial):
    """Score one Optuna trial of the decision tree.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample `max_depth`, `min_samples_split` and
        `min_samples_leaf`.

    Returns
    -------
    float
        Mean accuracy over a 5-fold cross-validation on `X_preprocessed` / `y`
        (both defined in the preceding preprocessing cells).
    """
    params = {
        # trial parameters to optimize
        'max_depth': trial.suggest_int('max_depth', 3, 40, log=True),
        # Floats are passed here; scikit-learn interprets float values of these
        # two parameters as fractions of the number of training samples.
        'min_samples_split': trial.suggest_float('min_samples_split', 1e-6, 1e-3, log=True),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 1e-6, 1e-3, log=True)
    }

    model.set_params(**params)

    cv_score = cross_val_score(model, X_preprocessed, y, cv=5, scoring='accuracy', n_jobs=-1).mean()

    return cv_score


# Seed the sampler so that the sequence of suggested hyperparameters (and
# therefore the best trial) is reproducible after Restart Kernel -> Run All.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=200)

print("--------------------------------------")
print("best_params =", study.best_params, "with cross_validation_score =", study.best_value)

  from .autonotebook import tqdm as notebook_tqdm
[I 2024-03-18 23:52:20,164] A new study created in memory with name: no-name-b78adc15-5b06-4f5e-afc1-0043800939f1
[I 2024-03-18 23:52:21,018] Trial 0 finished with value: 0.7625702508815895 and parameters: {'max_depth': 5, 'min_samples_split': 1.666157047860217e-05, 'min_samples_leaf': 0.0003250707036980528}. Best is trial 0 with value: 0.7625702508815895.
[I 2024-03-18 23:52:21,586] Trial 1 finished with value: 0.7785598246680928 and parameters: {'max_depth': 10, 'min_samples_split': 7.570028269588529e-06, 'min_samples_leaf': 4.043775061400592e-06}. Best is trial 1 with value: 0.7785598246680928.
[I 2024-03-18 23:52:22,144] Trial 2 finished with value: 0.7557802422063128 and parameters: {'max_depth': 17, 'min_samples_split': 0.00014400203958473323, 'min_samples_leaf': 0.0009115904925067541}. Best is trial 1 with value: 0.7785598246680928.
[I 2024-03-18 23:52:22,309] Trial 3 finished with value: 0.73944689982934 and parameters: {'max_de

--------------------------------------
best_params = {'max_depth': 8, 'min_samples_split': 2.6917675062676835e-06, 'min_samples_leaf': 0.00019334776173225604} with cross_validation_score = 0.7836210644451959


### Using MLflow for evaluation

The best model is saved in MLflow for later use. First we need to start a local tracking server. After installing mlflow, execute the following command in the terminal:

mlflow server --host 127.0.0.1 --port 8080

In [10]:
# pip install mlflow
# pip install setuptools

In [13]:
import mlflow
from mlflow.models import infer_signature

# Point the MLflow client at the local tracking server started from the terminal,
# then select the experiment that groups the runs of this notebook
# (MLflow creates it on first use, as the log output below shows).
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")
mlflow.set_experiment("Spaceship_Titanic")

2024/03/18 23:58:35 INFO mlflow.tracking.fluent: Experiment with name 'Spaceship_Titanic' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/927528599907738554', creation_time=1710802715345, experiment_id='927528599907738554', last_update_time=1710802715345, lifecycle_stage='active', name='Spaceship_Titanic', tags={}>

In [15]:
# Reload the raw data: the logged pipeline must accept the untransformed columns.
train = pd.read_csv("Data/train.csv", index_col="PassengerId")

X = train.drop(['Transported'], axis=1)
y = train['Transported']

# Start an MLflow run
with mlflow.start_run():
    # Fit the model with the best hyperparameters from the study
    # (`study` is the Optuna study of the decision-tree tuning cell).
    model = DecisionTreeClassifier(criterion='entropy', random_state=42)
    model.set_params(**study.best_params)

    # Bundle preprocessing and classifier so the registered model can be fed
    # raw passenger records directly.
    model_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])

    model_pipeline.fit(X, y)

    # Log the hyperparameters
    mlflow.log_params(study.best_params)

    # Log the best mean cross-validated accuracy found by the study
    # (this is a score to maximize, not a loss).
    mlflow.log_metric("accuracy", study.best_value)

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Simple Decision Tree Classifier")

    # Infer the model signature from the raw inputs and the pipeline predictions
    signature = infer_signature(X, model_pipeline.predict(X))

    # Log the model. Only a handful of rows are stored as the input example:
    # passing the whole training frame would serialize the entire dataset
    # alongside every model version.
    model_info = mlflow.sklearn.log_model(
        sk_model=model_pipeline,
        signature=signature,
        input_example=X.head(),
        registered_model_name="decision_tree_model",
        artifact_path="decision_tree_model"
    )

Registered model 'decision_tree_model' already exists. Creating a new version of this model...
2024/03/19 00:13:51 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: decision_tree_model, version 2
Created version '2' of model 'decision_tree_model'.


## Hyperparameter tuning of an XGBoost Classifier

We extend the pipeline with an XGBoost classifier to predict the Transported variable. Since it uses some regularization, we need to adapt the pipeline and add a StandardScaler after the ColumnTransformer. The numerical columns are now imputed with an IterativeImputer instead of the column mean.

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Reload the raw training data used to fit the preprocessing steps.
train = pd.read_csv("Data/train.csv", index_col="PassengerId")

# Step 1: Define transformers for different column types
# Numerical columns: IterativeImputer (experimental scikit-learn API, enabled by
# the `enable_iterative_imputer` import) replaces the mean imputation used earlier.
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
numeric_transformer = Pipeline(
    steps=[
        ("imputer", IterativeImputer(random_state=0))]
)

# Categorical columns: one-hot encoded. `handle_unknown='ignore'` matches the
# first preprocessing pipeline and keeps the logged model from raising when it
# meets a category at inference time that was absent from the training data.
categorical_cols = ['HomePlanet', 'Destination', 'VIP', 'CryoSleep']
categorical_transformer = Pipeline(
    steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Step 2: Create a ColumnTransformer that applies the transformations
# (columns not listed above are dropped).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='drop'
)

# Step 3: Assemble the preprocessing pipeline, standardizing all output columns
preprocessing_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler())
])

# Fit and transform the DataFrame
X_preprocessed = preprocessing_pipeline.fit_transform(train)

preprocessing_pipeline

In [None]:
# pip install xgboost

In [17]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

# Base estimator; each trial injects its hyperparameters through set_params.
model = XGBClassifier(objective='binary:logistic', random_state=42)


def objective(trial):
    """Score one Optuna trial of the XGBoost classifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the booster hyperparameters below.

    Returns
    -------
    float
        Mean accuracy over a 5-fold cross-validation on `X_preprocessed`
        (output of the preceding preprocessing cell) and `y` (target defined
        in an earlier cell).
    """
    params = {
        # trial parameters to optimize for XGBoost
        'eta': trial.suggest_float('eta', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 50, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'alpha': trial.suggest_float('alpha', 1e-6, 100, log=True),
        'lambda': trial.suggest_float('lambda', 1e-6, 100, log=True)
    }

    model.set_params(**params)

    cv_score = cross_val_score(model, X_preprocessed, y, cv=5, scoring='accuracy', n_jobs=-1).mean()

    return cv_score


# Seed the sampler so that the sequence of suggested hyperparameters is
# reproducible after Restart Kernel -> Run All.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=250)

print("--------------------------------------")
print("best_params =", study.best_params, "with cross_validation_score =", study.best_value)

[I 2024-03-19 00:14:08,393] A new study created in memory with name: no-name-ece7ee15-1c3e-4930-a02b-c02890a2d17b
[I 2024-03-19 00:14:10,085] Trial 0 finished with value: 0.77165725576714 and parameters: {'eta': 0.1437429292765166, 'n_estimators': 921, 'max_depth': 23, 'min_child_weight': 7, 'subsample': 0.8045086191159545, 'colsample_bytree': 0.7389136340185377, 'gamma': 0.0495141526123839, 'alpha': 0.03420147684406142, 'lambda': 0.20020357524800164}. Best is trial 0 with value: 0.77165725576714.
[I 2024-03-19 00:14:11,124] Trial 1 finished with value: 0.7850018296826807 and parameters: {'eta': 0.10716623269317768, 'n_estimators': 333, 'max_depth': 30, 'min_child_weight': 2, 'subsample': 0.6238387065882023, 'colsample_bytree': 0.9228330054588263, 'gamma': 0.06572877023213392, 'alpha': 2.36333939256243, 'lambda': 5.602040990547645}. Best is trial 1 with value: 0.7850018296826807.
[I 2024-03-19 00:14:11,960] Trial 2 finished with value: 0.7780991946087557 and parameters: {'eta': 0.14997

--------------------------------------
best_params = {'eta': 0.03361230763576166, 'n_estimators': 151, 'max_depth': 10, 'min_child_weight': 3, 'subsample': 0.7819222727599687, 'colsample_bytree': 0.9267119294058667, 'gamma': 0.034591442974199776, 'alpha': 4.578034912326603, 'lambda': 0.5793614707452585} with cross_validation_score = 0.8012216192393946


In [18]:
# Reload the raw data: the logged pipeline must accept the untransformed columns.
train = pd.read_csv("Data/train.csv", index_col="PassengerId")

X = train.drop(['Transported'], axis=1)
y = train['Transported']

# Start an MLflow run
with mlflow.start_run():
    # Fit the model with the best hyperparameters from the study
    # (`study` is the Optuna study of the XGBoost tuning cell).
    model = XGBClassifier(objective='binary:logistic', random_state=42)
    model.set_params(**study.best_params)

    # Same preprocessing + scaling as used for the cross-validated matrix,
    # bundled with the classifier so the registered model accepts raw records.
    model_pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('scaler', StandardScaler()),
            ('classifier', model)
        ])

    model_pipeline.fit(X, y)

    # Log the hyperparameters
    mlflow.log_params(study.best_params)

    # Log the best mean cross-validated accuracy found by the study
    # (this is a score to maximize, not a loss).
    mlflow.log_metric("accuracy", study.best_value)

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "XGBoost Classifier")

    # Infer the model signature from the raw inputs and the pipeline predictions
    signature = infer_signature(X, model_pipeline.predict(X))

    # Log the model. Only a handful of rows are stored as the input example:
    # passing the whole training frame would serialize the entire dataset
    # alongside every model version.
    model_info = mlflow.sklearn.log_model(
        sk_model=model_pipeline,
        signature=signature,
        input_example=X.head(),
        registered_model_name="XGBoost_model",
        artifact_path="XGBoost_model"
    )

Successfully registered model 'XGBoost_model'.
2024/03/19 00:16:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: XGBoost_model, version 1
Created version '1' of model 'XGBoost_model'.


## Random Forest

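We extend the pipeline with a RandomForest classifier to predict the Transported variable. The preprocessing from the XGBoost section (iterative imputation, one-hot encoding and standard scaling) is reused unchanged.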

In [19]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Base estimator; each trial injects its hyperparameters through set_params.
model = RandomForestClassifier(random_state=42, n_jobs=-1)


def objective(trial):
    """Score one Optuna trial of the random forest.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the forest hyperparameters below.

    Returns
    -------
    float
        Mean accuracy over a 5-fold cross-validation on `X_preprocessed` / `y`.
        No preprocessing is redone in this section: `X_preprocessed` is
        whatever the most recently executed preprocessing cell produced.
    """
    params = {
        # trial parameters to optimize for RandomForestClassifier
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 50, log=True),
        # Floats are passed here; scikit-learn interprets float values of these
        # parameters as fractions (of samples, or of features for max_features).
        'min_samples_split': trial.suggest_float('min_samples_split', 1e-6, 1e-3, log=True),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 1e-6, 1e-3, log=True),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0)
    }

    model.set_params(**params)

    cv_score = cross_val_score(model, X_preprocessed, y, cv=5, scoring='accuracy', n_jobs=-1).mean()

    return cv_score


# Seed the sampler so that the sequence of suggested hyperparameters is
# reproducible after Restart Kernel -> Run All.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

print("--------------------------------------")
print("best_params =", study.best_params, "with cross_validation_score =", study.best_value)

[I 2024-03-19 00:20:24,105] A new study created in memory with name: no-name-2d95a428-b5ff-4550-bd76-43fe89a2dee3
[I 2024-03-19 00:20:25,276] Trial 0 finished with value: 0.7423237697948174 and parameters: {'n_estimators': 589, 'max_depth': 3, 'min_samples_split': 6.066134126699176e-06, 'min_samples_leaf': 1.994219971772114e-06, 'max_features': 0.16839304211453318}. Best is trial 0 with value: 0.7423237697948174.
[I 2024-03-19 00:20:26,225] Trial 1 finished with value: 0.7907537167704148 and parameters: {'n_estimators': 203, 'max_depth': 21, 'min_samples_split': 0.000856834446460495, 'min_samples_leaf': 6.459669547653005e-06, 'max_features': 0.17866570136904392}. Best is trial 1 with value: 0.7907537167704148.
[I 2024-03-19 00:20:26,912] Trial 2 finished with value: 0.7802870054149343 and parameters: {'n_estimators': 167, 'max_depth': 4, 'min_samples_split': 4.065099596768056e-05, 'min_samples_leaf': 0.00021869235326431172, 'max_features': 0.6831354417525116}. Best is trial 1 with valu

--------------------------------------
best_params = {'n_estimators': 908, 'max_depth': 9, 'min_samples_split': 1.9286857153060706e-05, 'min_samples_leaf': 7.083051075025364e-05, 'max_features': 0.5803462002221914} with cross_validation_score = 0.7992658770466473


In [20]:
# Reload the raw data: the logged pipeline must accept the untransformed columns.
train = pd.read_csv("Data/train.csv", index_col="PassengerId")

X = train.drop(['Transported'], axis=1)
y = train['Transported']

# Start an MLflow run
with mlflow.start_run():
    # Fit the model with the best hyperparameters from the study
    # (`study` is the Optuna study of the random-forest tuning cell).
    model = RandomForestClassifier(random_state=42, n_jobs=-1)
    model.set_params(**study.best_params)

    # Same preprocessing + scaling as used for the cross-validated matrix,
    # bundled with the classifier so the registered model accepts raw records.
    model_pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('scaler', StandardScaler()),
            ('classifier', model)
        ])

    model_pipeline.fit(X, y)

    # Log the hyperparameters
    mlflow.log_params(study.best_params)

    # Log the best mean cross-validated accuracy found by the study
    # (this is a score to maximize, not a loss).
    mlflow.log_metric("accuracy", study.best_value)

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Random Forest Classifier")

    # Infer the model signature from the raw inputs and the pipeline predictions
    signature = infer_signature(X, model_pipeline.predict(X))

    # Log the model. Only a handful of rows are stored as the input example:
    # passing the whole training frame would serialize the entire dataset
    # alongside every model version.
    model_info = mlflow.sklearn.log_model(
        sk_model=model_pipeline,
        signature=signature,
        input_example=X.head(),
        registered_model_name="random_forest_model",
        artifact_path="random_forest_model"
    )

Successfully registered model 'random_forest_model'.
2024/03/19 00:24:54 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random_forest_model, version 1
Created version '1' of model 'random_forest_model'.
