# Spaceship Titanic - Kaggle competition

https://www.kaggle.com/competitions/spaceship-titanic

This notebook serves as an introduction for the class on MLflow and Optuna.
Students are introduced to the concepts of hyperparameter tuning (with Optuna) and experiment tracking (with MLflow).

In [1]:
# pip install pandas

We load the training data. The `PassengerId` column is used as the index of the DataFrame.

In [2]:
import pandas as pd
import numpy as np

# Read the Kaggle training set; PassengerId becomes the row index rather than a column.
train = pd.read_csv(
    "Data/train.csv",
    index_col="PassengerId",
)
# Bare last expression so the notebook renders a (truncated) preview of the frame.
train

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [3]:
# Summarize dtypes and non-null counts per column; columns with fewer non-null
# entries than the index length contain missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8693 entries, 0001_01 to 9280_02
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Cabin         8494 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Name          8493 non-null   object 
 12  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(6)
memory usage: 891.4+ KB


## Preprocessing Pipeline

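We identified null values in every column except the target `Transported`. We will handle them by column type: missing numerical values are imputed, while missing categorical values are kept as their own category during encoding.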

In [4]:
# Count the missing values in each column.
train.isna().sum()

HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [5]:
# pip install scikit-learn

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Re-read the raw training data so this cell starts from the file on disk.
train = pd.read_csv("Data/train.csv", index_col="PassengerId")

# Step 1: one transformer per column type.
# Numerical columns: fill missing values with the column mean.
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
])

# Categorical columns: one-hot encode. Categories not seen during fit are
# ignored at transform time instead of raising an error.
categorical_cols = ['HomePlanet', 'Destination', 'VIP', 'CryoSleep']
categorical_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Step 2: route each group of columns to its transformer. Columns that are not
# listed (e.g. Cabin, Name and the Transported target) are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='drop',
)

# Step 3: wrap the ColumnTransformer in a pipeline.
preprocessing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit on the training frame and transform it in one go.
X_preprocessed = preprocessing_pipeline.fit_transform(train)

# Bare last expression: display the pipeline.
preprocessing_pipeline

In [7]:
# Convert the transformed matrix back to a labelled pandas DataFrame.
# The ColumnTransformer emits the numerical columns first, followed by the
# one-hot columns, so the column names are rebuilt in that same order.
fitted_preprocessor = preprocessing_pipeline.named_steps['preprocessor']
fitted_encoder = fitted_preprocessor.named_transformers_['cat'].named_steps['encoder']
onehot_encoder_feature_names = list(fitted_encoder.get_feature_names_out())
column_order = numerical_cols + onehot_encoder_feature_names

# Features keep the PassengerId index of the raw frame; the target is taken
# directly from the raw frame.
X_preprocessed = pd.DataFrame(
    X_preprocessed,
    columns=column_order,
    index=train.index,
)
y = train['Transported']

# Show the first rows of the cleaned DataFrame.
X_preprocessed.head()

Unnamed: 0_level_0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,HomePlanet_nan,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_nan,VIP_False,VIP_True,VIP_nan,CryoSleep_False,CryoSleep_True,CryoSleep_nan
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0001_01,39.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
0002_01,24.0,109.0,9.0,25.0,549.0,44.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
0003_01,58.0,43.0,3576.0,0.0,6715.0,49.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
0003_02,33.0,0.0,1283.0,371.0,3329.0,193.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
0004_01,16.0,303.0,70.0,151.0,565.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


## Hyperparameter tuning of a Decision Tree Classifier

We extend the pipeline with a decision tree classifier to predict the Transported variable. 

In [None]:
# pip install optuna

In [10]:
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Base estimator; its hyperparameters are overwritten at the start of every trial.
model = DecisionTreeClassifier(criterion='entropy', random_state=42)


def objective(trial):
    """Score one Optuna trial for the decision tree.

    Samples `max_depth`, `min_samples_split` and `min_samples_leaf`, applies
    them to the module-level `model`, and evaluates it with 5-fold
    cross-validation on `X_preprocessed` / `y`.

    Args:
        trial: the `optuna.trial.Trial` used to sample the hyperparameters.

    Returns:
        The mean cross-validated accuracy (the value the study maximizes).
    """
    params = {
        # trial parameters to optimize
        'max_depth': trial.suggest_int('max_depth', 3, 40, log=True),
        # Float values are interpreted by scikit-learn as a fraction of the
        # number of training samples.
        'min_samples_split': trial.suggest_float('min_samples_split', 1e-6, 1e-3, log=True),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 1e-6, 1e-3, log=True),
    }

    model.set_params(**params)

    cv_score = cross_val_score(model, X_preprocessed, y, cv=5, scoring='accuracy', n_jobs=-1).mean()

    return cv_score


# Seed the sampler: without it every run explores different hyperparameters,
# so the best parameters (and the Kaggle submission built from them) would not
# be reproducible even though the estimator itself uses random_state=42.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=200)

print("--------------------------------------")
print("best_params =", study.best_params, "with cross_validation_score =", study.best_value)

[I 2024-03-11 13:44:11,592] A new study created in memory with name: no-name-1071573e-ddbd-4af7-a72d-77d31ea04b4b
[I 2024-03-11 13:44:11,784] Trial 0 finished with value: 0.7372606110015213 and parameters: {'max_depth': 37, 'min_samples_split': 1.0810413355451296e-05, 'min_samples_leaf': 1.905023308885468e-05}. Best is trial 0 with value: 0.7372606110015213.
[I 2024-03-11 13:44:11,938] Trial 1 finished with value: 0.773842618173348 and parameters: {'max_depth': 12, 'min_samples_split': 9.519073805355795e-06, 'min_samples_leaf': 4.5802414064889195e-06}. Best is trial 1 with value: 0.773842618173348.
[I 2024-03-11 13:44:12,113] Trial 2 finished with value: 0.7396773141184668 and parameters: {'max_depth': 31, 'min_samples_split': 2.1813284186803713e-06, 'min_samples_leaf': 0.00040670412240233314}. Best is trial 1 with value: 0.773842618173348.
[I 2024-03-11 13:44:12,280] Trial 3 finished with value: 0.7368008411908222 and parameters: {'max_depth': 33, 'min_samples_split': 4.80469802162836

--------------------------------------
best_params = {'max_depth': 8, 'min_samples_split': 0.0005986305306660823, 'min_samples_leaf': 0.0002413823058985668} with cross_validation_score = 0.7837362054167871


# Evaluation on Kaggle

We will now submit to Kaggle the predictions of the tree that uses the best `max_depth`, `min_samples_leaf` and `min_samples_split` found by Optuna.

In [11]:
# Load the raw train and test sets, both indexed by PassengerId.
train = pd.read_csv("Data/train.csv", index_col="PassengerId")
test = pd.read_csv("Data/test.csv", index_col="PassengerId")

# Split the training frame into target and features; the test set has no target.
y = train['Transported']
X = train.drop(columns=['Transported'])
X_test = test

# Decision tree configured with the best hyperparameters found by the study.
model = DecisionTreeClassifier(criterion='entropy', random_state=42)
model.set_params(**study.best_params)

# Preprocessing and classifier are chained, so the preprocessor is fitted on
# the training features and then re-applied unchanged to the test features.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model),
])
model_pipeline.fit(X, y)

y_pred = model_pipeline.predict(X_test)

# Writing the submission DataFrame to a CSV file
kaggle_submission = pd.DataFrame({'Transported': y_pred}, index=X_test.index)
kaggle_submission.to_csv("Data/optuna_optimal_decision_tree.csv", index=True)

## Hyperparameter tuning of an XGBoost Classifier

We extend the pipeline with an XGBoost classifier to predict the Transported variable. Since it uses some regularization, we adapt the pipeline and add a StandardScaler after the ColumnTransformer.

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Re-read the raw training data so this cell starts from the file on disk.
train = pd.read_csv("Data/train.csv", index_col="PassengerId")

# Step 1: Define transformers for different column types
# Numerical columns: missing values are estimated by IterativeImputer
# (seeded for reproducibility).
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
numeric_transformer = Pipeline(
    steps=[
        ("imputer", IterativeImputer(random_state=0)),
    ]
)

# Categorical columns: one-hot encode. handle_unknown='ignore' matches the
# first preprocessing pipeline of this notebook: a category that only appears
# at predict time (e.g. in the Kaggle test set) is encoded as all zeros
# instead of raising an error. The encoding of the training data is unchanged.
categorical_cols = ['HomePlanet', 'Destination', 'VIP', 'CryoSleep']
categorical_transformer = Pipeline(
    steps=[
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Step 2: Create a ColumnTransformer that applies the transformations.
# Columns not listed above are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='drop',
)

# Step 3: Assemble the preprocessing pipeline; all output columns are
# standardized after the column-wise transformations.
preprocessing_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
])

# Fit and transform the DataFrame
X_preprocessed = preprocessing_pipeline.fit_transform(train)

# Bare last expression: display the pipeline.
preprocessing_pipeline

In [None]:
# pip install xgboost

In [18]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

# Base estimator; its hyperparameters are overwritten at the start of every trial.
model = XGBClassifier(objective='binary:logistic', random_state=42)


def objective(trial):
    """Score one Optuna trial for the XGBoost classifier.

    Samples the boosting, tree-shape, subsampling and regularization
    hyperparameters, applies them to the module-level `model`, and evaluates
    it with 5-fold cross-validation on `X_preprocessed` / `y`.

    Args:
        trial: the `optuna.trial.Trial` used to sample the hyperparameters.

    Returns:
        The mean cross-validated accuracy (the value the study maximizes).
    """
    params = {
        # trial parameters to optimize for XGBoost
        'eta': trial.suggest_float('eta', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 50, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 0.5),
        'alpha': trial.suggest_float('alpha', 1e-6, 100, log=True),
        'lambda': trial.suggest_float('lambda', 1e-6, 100, log=True),
    }

    model.set_params(**params)

    # NOTE(review): X_preprocessed was imputed and scaled on the full training
    # set before this cross-validation, so the scores may be slightly optimistic.
    cv_score = cross_val_score(model, X_preprocessed, y, cv=5, scoring='accuracy', n_jobs=-1).mean()

    return cv_score


# Seed the sampler: without it every run explores different hyperparameters,
# so the best parameters (and the Kaggle submission built from them) would not
# be reproducible even though the estimator itself uses random_state=42.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=250)

print("--------------------------------------")
print("best_params =", study.best_params, "with cross_validation_score =", study.best_value)

[I 2024-03-11 13:54:31,385] A new study created in memory with name: no-name-04a0adeb-0b69-4ffb-aeb2-021b319fdb0c
[I 2024-03-11 13:54:31,595] Trial 0 finished with value: 0.787877442361687 and parameters: {'eta': 0.1834501260098997, 'n_estimators': 185, 'max_depth': 33, 'min_child_weight': 3, 'subsample': 0.727370589100657, 'colsample_bytree': 0.9449280167374343, 'gamma': 0.050087368849585034, 'alpha': 94.76528906714671, 'lambda': 0.0013716278279390392}. Best is trial 0 with value: 0.787877442361687.
[I 2024-03-11 13:54:31,904] Trial 1 finished with value: 0.7938592805277427 and parameters: {'eta': 0.0733776919334365, 'n_estimators': 267, 'max_depth': 7, 'min_child_weight': 4, 'subsample': 0.98028325754109, 'colsample_bytree': 0.987195071129926, 'gamma': 0.2548463320050759, 'alpha': 0.8230159683493552, 'lambda': 0.005357138982436045}. Best is trial 1 with value: 0.7938592805277427.
[I 2024-03-11 13:54:33,106] Trial 2 finished with value: 0.7843111823720497 and parameters: {'eta': 0.016

--------------------------------------
best_params = {'eta': 0.016395101936645933, 'n_estimators': 594, 'max_depth': 28, 'min_child_weight': 5, 'subsample': 0.9620760293430018, 'colsample_bytree': 0.7344765409252414, 'gamma': 0.3685991587496929, 'alpha': 4.643067403064722, 'lambda': 0.1610111170790045} with cross_validation_score = 0.800877056573259


In [20]:
# Load the raw train and test sets, both indexed by PassengerId.
train = pd.read_csv("Data/train.csv", index_col="PassengerId")
test = pd.read_csv("Data/test.csv", index_col="PassengerId")

# Split the training frame into target and features; the test set has no target.
y = train['Transported']
X = train.drop(columns=['Transported'])
X_test = test

# XGBoost classifier configured with the best hyperparameters found by the study.
model = XGBClassifier(objective='binary:logistic', random_state=42)
model.set_params(**study.best_params)

# Same steps as the preprocessing pipeline used during tuning (column
# transformer followed by a scaler), with the classifier appended.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('classifier', model),
])
model_pipeline.fit(X, y)

# Cast the predictions to booleans so the CSV contains True/False values.
y_pred = model_pipeline.predict(X_test).astype(bool)

# Writing the submission DataFrame to a CSV file
kaggle_submission = pd.DataFrame({'Transported': y_pred}, index=X_test.index)
kaggle_submission.to_csv("Data/optuna_XGBoost.csv", index=True)

## Random Forest

We extend the pipeline with a RandomForest classifier to predict the Transported variable. 

In [15]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Base estimator; its hyperparameters are overwritten at the start of every trial.
model = RandomForestClassifier(random_state=42, n_jobs=-1)


def objective(trial):
    """Score one Optuna trial for the random forest.

    Samples the forest size, tree-shape and feature-subsampling
    hyperparameters, applies them to the module-level `model`, and evaluates
    it with 5-fold cross-validation on `X_preprocessed` / `y` (both defined
    by earlier cells).

    Args:
        trial: the `optuna.trial.Trial` used to sample the hyperparameters.

    Returns:
        The mean cross-validated accuracy (the value the study maximizes).
    """
    params = {
        # trial parameters to optimize for RandomForestClassifier
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 50, log=True),
        # Float values are interpreted by scikit-learn as a fraction of the
        # number of training samples (or of features, for max_features).
        'min_samples_split': trial.suggest_float('min_samples_split', 1e-6, 1e-3, log=True),
        'min_samples_leaf': trial.suggest_float('min_samples_leaf', 1e-6, 1e-3, log=True),
        'max_features': trial.suggest_float('max_features', 0.1, 1.0),
    }

    model.set_params(**params)

    cv_score = cross_val_score(model, X_preprocessed, y, cv=5, scoring='accuracy', n_jobs=-1).mean()

    return cv_score


# Seed the sampler: without it every run explores different hyperparameters,
# so the best parameters (and the Kaggle submission built from them) would not
# be reproducible even though the estimator itself uses random_state=42.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

print("--------------------------------------")
print("best_params =", study.best_params, "with cross_validation_score =", study.best_value)

[I 2024-03-11 13:48:25,475] A new study created in memory with name: no-name-d32bb126-a030-437d-b1ee-a6004a6028eb
[I 2024-03-11 13:48:30,062] Trial 0 finished with value: 0.7855760787352494 and parameters: {'n_estimators': 943, 'max_depth': 39, 'min_samples_split': 1.1323501846307218e-05, 'min_samples_leaf': 2.45738334001676e-06, 'max_features': 0.3072622437168274}. Best is trial 0 with value: 0.7855760787352494.
[I 2024-03-11 13:48:31,081] Trial 1 finished with value: 0.793513923785941 and parameters: {'n_estimators': 113, 'max_depth': 14, 'min_samples_split': 6.697019026766778e-05, 'min_samples_leaf': 5.186329911516327e-05, 'max_features': 0.19706136575688965}. Best is trial 1 with value: 0.793513923785941.
[I 2024-03-11 13:48:32,512] Trial 2 finished with value: 0.7908683283582287 and parameters: {'n_estimators': 172, 'max_depth': 13, 'min_samples_split': 0.00018077994388962794, 'min_samples_leaf': 5.557513981955184e-05, 'max_features': 0.782504000680964}. Best is trial 1 with value

--------------------------------------
best_params = {'n_estimators': 437, 'max_depth': 10, 'min_samples_split': 0.00017401163180449155, 'min_samples_leaf': 0.0006272004327245007, 'max_features': 0.6124539122248343} with cross_validation_score = 0.8000711359450923


In [16]:
# Random forest configured with the best hyperparameters found by the study.
# X, y, X_test and preprocessor are the objects defined by earlier cells.
model = RandomForestClassifier(random_state=42, n_jobs=-1)
model.set_params(**study.best_params)

# Column transformer, scaler and classifier chained into a single estimator.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('classifier', model),
])
model_pipeline.fit(X, y)

y_pred = model_pipeline.predict(X_test)

# Writing the submission DataFrame to a CSV file
kaggle_submission = pd.DataFrame({'Transported': y_pred}, index=X_test.index)
kaggle_submission.to_csv("Data/optuna_RandomForest.csv", index=True)

## CatBoost

We extend the pipeline with a CatBoost classifier to predict the Transported variable. 

In [None]:
#pip install catboost

In [21]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

# Re-read the raw training data so this cell starts from the file on disk.
train = pd.read_csv("Data/train.csv", index_col="PassengerId")

# Step 1: Define transformers for different column types
# Numerical columns: fill missing values with the column mean, then standardize.
numerical_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical columns: map each category to an ordinal code.
# By default OrdinalEncoder raises on a category it did not see during fit,
# which would break prediction on the Kaggle test set if it contained one.
# Unknown categories are therefore encoded as NaN, the same value the encoder
# already emits for missing entries. The encoding of the training data is
# unchanged.
categorical_cols = ['HomePlanet', 'Destination', 'VIP', 'CryoSleep']
categorical_transformer = Pipeline(
    steps=[
        ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)),
    ]
)

# Step 2: Create a ColumnTransformer that applies the transformations.
# Columns not listed above are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='drop',
)

# Step 3: Assemble the preprocessing pipeline
preprocessing_pipeline = Pipeline([
    ('preprocessor', preprocessor)
])

# Fit and transform the DataFrame
X_preprocessed = preprocessing_pipeline.fit_transform(train)

# Bare last expression: display the pipeline.
preprocessing_pipeline

In [22]:
import optuna
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score

# Base estimator; its hyperparameters are overwritten at the start of every trial.
model = CatBoostClassifier(random_state=42, verbose=False)


def objective(trial):
    """Score one Optuna trial for the CatBoost classifier.

    Samples the boosting, tree-depth, regularization and randomization
    hyperparameters, applies them to the module-level `model`, and evaluates
    it with 5-fold cross-validation on `X_preprocessed` / `y`.

    Args:
        trial: the `optuna.trial.Trial` used to sample the hyperparameters.

    Returns:
        The mean cross-validated accuracy (the value the study maximizes).
    """
    params = {
        # trial parameters to optimize for CatBoostClassifier
        'iterations': trial.suggest_int("iterations", 100, 1000, log=True),
        'learning_rate': trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        'depth': trial.suggest_int("depth", 3, 8),
        'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        'bagging_temperature': trial.suggest_float("bagging_temperature", 0.0, 10.0),
        'random_strength': trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
    }

    model.set_params(**params)

    cv_score = cross_val_score(model, X_preprocessed, y, cv=5, scoring='accuracy', n_jobs=-1).mean()

    return cv_score


# Seed the sampler: without it every run explores different hyperparameters,
# so the best parameters (and the Kaggle submission built from them) would not
# be reproducible even though the estimator itself uses random_state=42.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=200)

print("--------------------------------------")
print("best_params =", study.best_params, "with cross_validation_score =", study.best_value)

[I 2024-03-11 16:12:16,835] A new study created in memory with name: no-name-ebc68530-5e26-4412-8db4-2180dc0213c1
[I 2024-03-11 16:12:17,969] Trial 0 finished with value: 0.7945487367248747 and parameters: {'iterations': 119, 'learning_rate': 0.27072505865871777, 'depth': 3, 'l2_leaf_reg': 2.5638482746465385e-06, 'bagging_temperature': 5.9278544952522, 'random_strength': 2.618371216395463}. Best is trial 0 with value: 0.7945487367248747.
[I 2024-03-11 16:12:18,937] Trial 1 finished with value: 0.7544023224066316 and parameters: {'iterations': 285, 'learning_rate': 0.0027643565159609818, 'depth': 3, 'l2_leaf_reg': 1.4067685357241614e-06, 'bagging_temperature': 3.9438492608079665, 'random_strength': 2.3954632989995392e-05}. Best is trial 0 with value: 0.7945487367248747.
[I 2024-03-11 16:12:19,787] Trial 2 finished with value: 0.7969655060147923 and parameters: {'iterations': 286, 'learning_rate': 0.038277831703432415, 'depth': 6, 'l2_leaf_reg': 0.0010113350394207296, 'bagging_temperatur

--------------------------------------
best_params = {'iterations': 667, 'learning_rate': 0.020145902947252298, 'depth': 8, 'l2_leaf_reg': 5.3402103207967375, 'bagging_temperature': 6.448131536756745, 'random_strength': 7.5176314334223245e-06} with cross_validation_score = 0.7984606843211746


In [23]:
# CatBoost classifier configured with the best hyperparameters found by the study.
# X, y, X_test and preprocessor are the objects defined by earlier cells.
model = CatBoostClassifier(random_state=42, verbose=False)
model.set_params(**study.best_params)

# Column transformer and classifier chained into a single estimator.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model),
])
model_pipeline.fit(X, y)

y_pred = model_pipeline.predict(X_test)

# Writing the submission DataFrame to a CSV file
kaggle_submission = pd.DataFrame(
    y_pred,
    columns=['Transported'],
    index=X_test.index,
)
kaggle_submission.to_csv("Data/optuna_CatBoost.csv", index=True)