In [1]:
import joblib
import json
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from catboost import CatBoostClassifier
from catboost import Pool

# Load dataset

In [2]:
# Read the raw training and test CSVs (paths are relative to the notebook's working directory).
# NOTE(review): main_test is not referenced again in the cells shown here.
data = pd.read_csv('../data/raw/train.csv')
main_test = pd.read_csv('../data/raw/test.csv')

In [3]:
# Preview the raw training frame.
data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


# Feature engineering

In [4]:
class TransformBooleanToNumeric(BaseEstimator, TransformerMixin):
    """Stateless transformer that converts boolean values to 0/1.

    Every column is multiplied by 1: ``True``/``False`` become ``1``/``0``,
    numbers and missing values are unchanged, and strings are unchanged as
    well (``'abc' * 1 == 'abc'``).

    Parameters
    ----------
    transform_boolean : bool, default=True
        When False the input is passed through untouched.
    """

    def __init__(self, transform_boolean=True):
        self.transform_boolean = transform_boolean

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` with booleans turned into integers.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to convert.

        Returns
        -------
        pandas.DataFrame
            A new converted frame, or ``X`` itself when ``transform_boolean``
            is False.
        """
        if not self.transform_boolean:
            # The disabled path used to fall off the end of the method and
            # return None, which broke every downstream step.
            return X
        return X.apply(lambda column: column * 1)

In [5]:
class TransformCategoricalValues(BaseEstimator, TransformerMixin):
    """Split the composite ``PassengerId`` and ``Cabin`` columns into parts.

    ``PassengerId`` (e.g. ``'0003_02'``) is split on ``'_'`` into
    ``PassengerGroupId`` and ``PassengerGroupCount``; ``Cabin``
    (e.g. ``'A/0/S'``) is split on ``'/'`` into ``CabinDesc``,
    ``CabinNumber`` and ``CabinSide``. The two source columns are dropped and
    ``PassengerGroupId``, ``PassengerGroupCount`` and ``CabinNumber`` are
    converted with ``pd.to_numeric``.

    Parameters
    ----------
    transform_categorical : bool, default=True
        When False the input is passed through untouched.
    """

    def __init__(self, transform_categorical=True):
        self.transform_categorical = transform_categorical

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the split columns added.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing string columns ``PassengerId`` and ``Cabin``.

        Returns
        -------
        pandas.DataFrame
            A new frame; the caller's frame is left unmodified. When
            ``transform_categorical`` is False, ``X`` itself is returned.
        """
        if not self.transform_categorical:
            # The disabled path used to return None (the else branch was
            # commented out).
            return X

        # Work on a copy so the caller's frame is never mutated.
        X = X.copy()

        # Split each source column once. reindex() guarantees that the
        # expected part columns exist (filled with NaN) even when no row in
        # the batch contains every part, e.g. all cabins missing.
        id_parts = X['PassengerId'].str.split('_', expand=True).reindex(columns=range(2))
        cabin_parts = X['Cabin'].str.split('/', expand=True).reindex(columns=range(3))

        X['PassengerGroupId'] = id_parts[0]
        X['PassengerGroupCount'] = id_parts[1]
        X['CabinDesc'] = cabin_parts[0]
        X['CabinNumber'] = cabin_parts[1]
        X['CabinSide'] = cabin_parts[2]

        X = X.drop(columns=['PassengerId', 'Cabin'])

        numeric_columns = ['PassengerGroupId', 'PassengerGroupCount', 'CabinNumber']
        X[numeric_columns] = X[numeric_columns].apply(pd.to_numeric)
        return X

In [6]:
def preprocess_pipeline(df):
    """Apply the two basic cleaning steps to ``df`` and return the result.

    Booleans are converted to 0/1 first, then ``PassengerId`` and ``Cabin``
    are split into their component columns.
    """
    basic_steps = Pipeline([
        ('boolean_to_num', TransformBooleanToNumeric()),
        ('categorical_split', TransformCategoricalValues()),
    ])
    return basic_steps.fit_transform(df)

In [7]:
# Apply the boolean conversion and the PassengerId/Cabin split to the training frame,
# then preview the result.
data_prepared = preprocess_pipeline(data)
data_prepared

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,PassengerGroupId,PassengerGroupCount,CabinDesc,CabinNumber,CabinSide
0,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,1,1,B,0.0,P
1,Earth,0,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,2,1,F,0.0,S
2,Europa,0,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,3,1,A,0.0,S
3,Europa,0,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,3,2,A,0.0,S
4,Earth,0,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,4,1,F,1.0,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,0,55 Cancri e,41.0,1,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,0,9276,1,A,98.0,P
8689,Earth,1,PSO J318.5-22,18.0,0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,0,9278,1,G,1499.0,S
8690,Earth,0,TRAPPIST-1e,26.0,0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,1,9279,1,G,1500.0,S
8691,Europa,0,55 Cancri e,32.0,0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,0,9280,1,E,608.0,S


In [8]:
# Columns that get most-frequent imputation only.
categorical_features = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'VIP',
    'PassengerGroupId',
    'PassengerGroupCount',
    'CabinDesc',
    'CabinNumber',
    'CabinSide',
]

# Columns that get median imputation followed by standard scaling.
numerical_features = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]

In [9]:
# Numeric columns: fill gaps with the median, then standardize.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value.
categorical_pipeline = Pipeline([
    ('no_data', SimpleImputer(missing_values=pd.NA, strategy='most_frequent')),
])

# Columns not listed in either feature list are passed through unchanged, and
# output column names are kept without a transformer prefix.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

In [10]:
def creat_fin_data(df, fit=True):
    """Run the module-level ``preprocessor`` on ``df`` and return a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame produced by ``preprocess_pipeline``.
    fit : bool, default=True
        When True (the original behaviour) the preprocessor is re-fitted on
        ``df`` before transforming. Pass False to reuse the statistics of an
        already-fitted preprocessor, e.g. for held-out or test data, so that
        imputation and scaling values are not re-estimated on that data.

    Returns
    -------
    pandas.DataFrame
        Transformed values with the preprocessor's output column names and
        the index of ``df``; ``CabinNumber`` is cast to int.
    """
    values = preprocessor.fit_transform(df) if fit else preprocessor.transform(df)

    prepared = pd.DataFrame(
        data=values,
        columns=preprocessor.get_feature_names_out(),
        index=df.index)

    # CabinNumber is parsed as float upstream (it can contain NaN before
    # imputation); cast it to int because it is used as a categorical
    # feature later in the notebook.
    prepared['CabinNumber'] = prepared['CabinNumber'].astype('int')

    return prepared

In [11]:
# Impute and scale the prepared frame.
# NOTE(review): data_prepared is reassigned from itself, so re-running only this cell feeds
# already-processed data back through the preprocessor; re-run the preprocess_pipeline cell first.
data_prepared = creat_fin_data(data_prepared)

In [12]:
# Save features (the DataFrame index is not written to the CSV).
features_path = '../data/processed/featured_spaceship-titanic.csv'
data_prepared.to_csv(features_path, index=False)

# Split dataset

In [13]:
# Hold out 15% of the rows for validation, stratified on the target so both
# parts keep the same Transported ratio; the seed makes the split repeatable.
train_set, validation_set = train_test_split(
    data_prepared,
    train_size=0.85,
    stratify=data_prepared['Transported'],
    random_state=42,
)

In [14]:
# Save train and validation sets (to_csv keeps the index here, i.e. the original row labels).
train_set_path = '../data/processed/train_spaceship-titanic.csv'
validation_set_path = '../data/processed/validation_spaceship-titanic.csv'

train_set.to_csv(train_set_path)
validation_set.to_csv(validation_set_path)

# Train

In [15]:
# Model inputs: the numeric columns followed by the categorical ones. Both lists
# are derived from the feature lists defined for the preprocessor so the two
# places cannot drift apart. 'Name' is not used as a feature.
X = numerical_features + categorical_features
# Separate list object so later edits to one list do not silently affect the other.
cat_features = list(categorical_features)
y = ['Transported']

In [16]:
# Wrap each split in a CatBoost Pool, declaring which columns are categorical.
train_data = Pool(
    train_set[X],
    label=train_set[y],
    cat_features=cat_features,
)

valid_data = Pool(
    validation_set[X],
    label=validation_set[y],
    cat_features=cat_features,
)

In [17]:
# CatBoost settings: log every 100 iterations, fixed seed, Accuracy as eval metric.
params = dict(
    verbose=100,
    random_seed=42,
    learning_rate=0.1,
    eval_metric='Accuracy',
)

In [18]:
# Build the classifier from the params dict defined in the previous cell.
model = CatBoostClassifier(**params)

In [19]:
# Fit on the training pool; valid_data is supplied as eval_set, and the log below reports the
# Accuracy eval metric for both (the "learn" and "test" columns).
model.fit(train_data, eval_set=valid_data)

0:	learn: 0.7383949	test: 0.7216258	best: 0.7216258 (0)	total: 63.8ms	remaining: 1m 3s
100:	learn: 0.8209501	test: 0.8044479	best: 0.8082822 (68)	total: 1.14s	remaining: 10.1s
200:	learn: 0.8432806	test: 0.8006135	best: 0.8082822 (68)	total: 2.92s	remaining: 11.6s
300:	learn: 0.8581675	test: 0.8059816	best: 0.8082822 (68)	total: 3.93s	remaining: 9.12s
400:	learn: 0.8731899	test: 0.8059816	best: 0.8090491 (305)	total: 5.1s	remaining: 7.62s
500:	learn: 0.8826634	test: 0.8090491	best: 0.8113497 (461)	total: 5.97s	remaining: 5.94s
600:	learn: 0.8911896	test: 0.8144172	best: 0.8167178 (569)	total: 6.85s	remaining: 4.55s
700:	learn: 0.9014752	test: 0.8082822	best: 0.8167178 (569)	total: 7.63s	remaining: 3.25s
800:	learn: 0.9112194	test: 0.8151840	best: 0.8167178 (569)	total: 8.32s	remaining: 2.07s
900:	learn: 0.9200162	test: 0.8105828	best: 0.8167178 (569)	total: 8.95s	remaining: 983ms
999:	learn: 0.9281364	test: 0.8052147	best: 0.8167178 (569)	total: 9.56s	remaining: 0us

bestTest = 0.81671

<catboost.core.CatBoostClassifier at 0x16e1c7370>

In [20]:
# Serialize the fitted classifier to disk with joblib.
model_path = '../models/model.joblib'
joblib.dump(model, model_path)

['../models/model.joblib']

# Evaluate

In [26]:
# Best metric values recorded during training, for the learn and validation sets.
model.get_best_score()

{'learn': {'Accuracy': 0.928136419001218, 'Logloss': 0.21539589347804486},
 'validation': {'Accuracy': 0.816717791411043, 'Logloss': 0.3951064255883237}}

In [31]:
# Write the best learn/validation scores to a JSON report.
metrics_file = '../reports/metrics.json'

with open(metrics_file, 'w') as metrics_out:
    json.dump(model.get_best_score(), metrics_out, indent=4)