In [1]:
import joblib
import json
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from catboost import CatBoostClassifier
from catboost import Pool

# Load dataset

In [2]:
# Load the two raw CSV files: `data` (train.csv) is used for fitting and
# validation, `main_test` (test.csv) is scored in the Predict section.
data = pd.read_csv('../data/raw/train.csv')
main_test = pd.read_csv('../data/raw/test.csv')

In [3]:
# Display the raw training frame (pandas renders a truncated head/tail view).
data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


# Feature engineering

In [4]:
class TransformBooleanToNumeric(BaseEstimator, TransformerMixin):
    """Stateless transformer that converts boolean values to 0/1.

    Each column of the input frame is multiplied by 1. ``True``/``False``
    become ``1``/``0``, while numbers, strings and missing values come out
    unchanged.

    Parameters
    ----------
    transform_boolean : bool, default=True
        Switch for the conversion. When False, ``transform`` hands the input
        back untouched.
    """

    def __init__(self, transform_boolean=True):
        self.transform_boolean = transform_boolean

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame with booleans converted to 0/1.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to convert.

        Returns
        -------
        pandas.DataFrame
            A converted copy of ``X``, or ``X`` itself when
            ``transform_boolean`` is False.
        """
        # A disabled step must still pass the data through; falling off the
        # end of the method would return None and break the pipeline.
        if not self.transform_boolean:
            return X
        return X.apply(lambda column: column * 1)

In [5]:
class TransformCategoricalValues(BaseEstimator, TransformerMixin):
    """Stateless transformer that splits the composite id columns.

    ``PassengerId`` (``"<group>_<number>"``) and ``Cabin``
    (``"<deck>/<number>/<side>"``) are split into separate columns:

    * ``PassengerGroupId``, ``PassengerGroupCount`` - numeric parts of
      ``PassengerId``;
    * ``CabinDesc``, ``CabinNumber``, ``CabinSide`` - parts of ``Cabin``,
      with ``CabinNumber`` converted to a number.

    The source columns are kept.

    Parameters
    ----------
    transform_categorical : bool, default=True
        Switch for the split. When False, ``transform`` hands the input back
        untouched.
    """

    def __init__(self, transform_categorical=True):
        self.transform_categorical = transform_categorical

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the derived columns appended.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing string columns ``PassengerId`` and ``Cabin``.

        Returns
        -------
        pandas.DataFrame
            A new frame with the five derived columns, or ``X`` itself when
            ``transform_categorical`` is False.
        """
        # A disabled step must still pass the data through instead of
        # implicitly returning None.
        if not self.transform_categorical:
            return X

        # Work on a copy so the caller's frame is never modified in place.
        X = X.copy()

        # Split each composite column once and reuse the pieces.
        passenger_parts = X['PassengerId'].str.split('_', expand=True)
        cabin_parts = X['Cabin'].str.split('/', expand=True)

        X['PassengerGroupId'] = passenger_parts[0]
        X['PassengerGroupCount'] = passenger_parts[1]
        X['CabinDesc'] = cabin_parts[0]
        X['CabinNumber'] = cabin_parts[1]
        X['CabinSide'] = cabin_parts[2]

        numeric_columns = ['PassengerGroupId', 'PassengerGroupCount', 'CabinNumber']
        X[numeric_columns] = X[numeric_columns].apply(pd.to_numeric)
        return X

In [6]:
def preprocess_pipeline(df):
    """Apply the basic, fit-free transformations to a raw frame.

    Steps, in order:

    1. convert boolean values to 0/1 (``TransformBooleanToNumeric``);
    2. split ``PassengerId`` and ``Cabin`` into their parts
       (``TransformCategoricalValues``);
    3. cast ``PassengerId``, ``Cabin`` and ``Name`` to the pandas ``string``
       dtype and replace missing ``Cabin`` / ``Name`` values with
       ``'no_data'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least ``PassengerId``, ``Cabin`` and ``Name``.

    Returns
    -------
    pandas.DataFrame
        The transformed frame.
    """
    boolean_transform = Pipeline([('boolean_to_num', TransformBooleanToNumeric())])
    categorical_split = Pipeline([('categorical_split', TransformCategoricalValues())])

    df = boolean_transform.fit_transform(df)
    df = categorical_split.fit_transform(df)

    text_columns = ['PassengerId', 'Cabin', 'Name']
    df[text_columns] = df[text_columns].astype('string')

    # Assign the filled columns back instead of calling
    # `df[col].fillna(..., inplace=True)`: the latter operates on a temporary
    # Series, which triggers a chained-assignment warning on pandas 2.x and
    # leaves `df` unchanged when Copy-on-Write is enabled.
    fill_columns = ['Cabin', 'Name']
    df[fill_columns] = df[fill_columns].fillna('no_data')

    return df

In [7]:
# Run the basic transformations on the training frame and display the result
# so the derived PassengerGroup*/Cabin* columns can be inspected.
data_prepared = preprocess_pipeline(data)
data_prepared

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,PassengerGroupId,PassengerGroupCount,CabinDesc,CabinNumber,CabinSide
0,0001_01,Europa,0,B/0/P,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,1,1,B,0.0,P
1,0002_01,Earth,0,F/0/S,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,2,1,F,0.0,S
2,0003_01,Europa,0,A/0/S,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,3,1,A,0.0,S
3,0003_02,Europa,0,A/0/S,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,3,2,A,0.0,S
4,0004_01,Earth,0,F/1/S,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,4,1,F,1.0,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,0,A/98/P,55 Cancri e,41.0,1,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,0,9276,1,A,98.0,P
8689,9278_01,Earth,1,G/1499/S,PSO J318.5-22,18.0,0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,0,9278,1,G,1499.0,S
8690,9279_01,Earth,0,G/1500/S,TRAPPIST-1e,26.0,0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,1,9279,1,G,1500.0,S
8691,9280_01,Europa,0,E/608/S,55 Cancri e,32.0,0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,0,9280,1,E,608.0,S


In [8]:
# Column groups used by the ColumnTransformer defined in the next cell:
# `categorical_features` go through the most-frequent imputer,
# `numerical_features` through median imputation followed by standard scaling.
categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'PassengerGroupId',
                        'PassengerGroupCount', 'CabinDesc', 'CabinNumber', 'CabinSide']
numerical_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [9]:
# Numeric columns: fill gaps with the column median, then standardise.
numeric_pipeline = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='median')),
           ('std_scaler', StandardScaler())]
)

# Categorical columns: fill gaps with the most frequent value. No encoding is
# applied here; the columns are later passed to CatBoost as `cat_features`.
categorical_pipeline = Pipeline(
    steps=[('no_data', SimpleImputer(strategy='most_frequent', missing_values=pd.NA))]
)

# Route each column group to its pipeline. Columns not listed in either group
# are kept as-is (`remainder='passthrough'`), and output names are not
# prefixed with the transformer name (`verbose_feature_names_out=False`).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)],
    verbose_feature_names_out=False, remainder='passthrough')

In [10]:
def creat_fin_data(df):
    """Fit the module-level ``preprocessor`` on ``df`` and return the result as a frame.

    The transformed values are wrapped in a DataFrame that keeps ``df``'s
    index and takes its column names from
    ``preprocessor.get_feature_names_out()``. ``CabinNumber`` is cast to
    ``int`` afterwards.

    Note that ``preprocessor`` is refitted on every call, so the imputation
    values and scaling statistics always come from the frame passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Output of ``preprocess_pipeline``.

    Returns
    -------
    pandas.DataFrame
        Imputed / scaled frame with named columns.
    """
    transformed_values = preprocessor.fit_transform(df)
    output_columns = preprocessor.get_feature_names_out()

    final_frame = pd.DataFrame(transformed_values, index=df.index, columns=output_columns)
    final_frame['CabinNumber'] = final_frame['CabinNumber'].astype('int')
    return final_frame

In [11]:
# Impute/scale the training frame. This rebinds `data_prepared` to the output
# of a function applied to itself, so re-running this cell alone feeds
# already-transformed data back in; re-run from the preprocess cell instead.
data_prepared = creat_fin_data(data_prepared)

# Split dataset

In [12]:
# 85/15 train/validation split, stratified on the target with a fixed seed.
# NOTE(review): `preprocessor` was already fitted on the whole frame (via
# creat_fin_data) before this split, so validation rows contributed to the
# imputation and scaling statistics.
train_set, validation_set = train_test_split(data_prepared, train_size=0.85, random_state=42,
                                             stratify=data_prepared['Transported'])

# Train

In [13]:
# Model inputs: numeric columns first, then the categorical ones, in the same
# order as the preprocessing column groups defined earlier.
X = numerical_features + categorical_features
# Columns CatBoost should treat as categorical (a separate list object).
cat_features = list(categorical_features)
# Free-text columns; currently not passed to the model.
text_features = ['PassengerId', 'Cabin', 'Name']
# Target column, kept as a list so that frame[y] stays a DataFrame.
y = ['Transported']

In [14]:
# Wrap the training rows in a CatBoost Pool: feature columns `X`, target `y`,
# and the names of the columns to treat as categorical.
train_data = Pool(data=train_set[X],
                  label=train_set[y],
                  cat_features=cat_features,
                  # text_features=text_features
                  )

# Same construction for the validation rows; used as `eval_set` during fit.
valid_data = Pool(data=validation_set[X],
                  label=validation_set[y],
                  cat_features=cat_features,
                  # text_features=text_features
                  )

In [36]:
# CatBoost settings: log every 100 iterations, fixed seed, learning rate 0.1,
# and accuracy as the metric reported on the evaluation set.
params = dict(
    verbose=100,
    random_seed=42,
    learning_rate=0.1,
    eval_metric='Accuracy',
)

In [37]:
# Build the (unfitted) classifier from the settings above.
model = CatBoostClassifier(**params)

In [38]:
# Train on the training pool while reporting the eval metric on the
# validation pool.
# NOTE(review): with an eval_set, CatBoost presumably keeps the best
# validation iteration (use_best_model) - confirm; if so, the validation
# score is optimistic because the iteration count was chosen on that set.
model.fit(train_data, eval_set=valid_data)

0:	learn: 0.7383949	test: 0.7216258	best: 0.7216258 (0)	total: 6.9ms	remaining: 6.89s
100:	learn: 0.8209501	test: 0.8044479	best: 0.8082822 (68)	total: 766ms	remaining: 6.82s
200:	learn: 0.8432806	test: 0.8006135	best: 0.8082822 (68)	total: 1.5s	remaining: 5.96s
300:	learn: 0.8581675	test: 0.8059816	best: 0.8082822 (68)	total: 2.17s	remaining: 5.04s
400:	learn: 0.8731899	test: 0.8059816	best: 0.8090491 (305)	total: 2.86s	remaining: 4.27s
500:	learn: 0.8826634	test: 0.8090491	best: 0.8113497 (461)	total: 3.56s	remaining: 3.55s
600:	learn: 0.8911896	test: 0.8144172	best: 0.8167178 (569)	total: 4.25s	remaining: 2.82s
700:	learn: 0.9014752	test: 0.8082822	best: 0.8167178 (569)	total: 4.94s	remaining: 2.11s
800:	learn: 0.9112194	test: 0.8151840	best: 0.8167178 (569)	total: 5.63s	remaining: 1.4s
900:	learn: 0.9200162	test: 0.8105828	best: 0.8167178 (569)	total: 6.29s	remaining: 691ms
999:	learn: 0.9281364	test: 0.8052147	best: 0.8167178 (569)	total: 7.01s	remaining: 0us

bestTest = 0.8167177

<catboost.core.CatBoostClassifier at 0x16ca3be80>

# Predict

In [39]:
# Run the test set through the same two preparation steps as the training data.
# NOTE(review): creat_fin_data calls `preprocessor.fit_transform`, so the
# imputation values and scaling statistics are re-estimated from the test set
# here instead of being reused from the training fit.
main_test_prepared = preprocess_pipeline(main_test)
main_test_prepared = creat_fin_data(main_test_prepared)

In [40]:
# Score the test rows on the model's feature columns and store the predicted
# label next to them in a new `Transported` column.
main_test_prepared['Transported'] = model.predict(main_test_prepared[X])

In [41]:
# Display the prepared test frame together with the predictions.
main_test_prepared

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet,CryoSleep,Destination,VIP,PassengerGroupId,PassengerGroupCount,CabinDesc,CabinNumber,CabinSide,PassengerId,Cabin,Name,Transported
0,-0.114147,-0.357339,-0.28384,-0.312173,-0.267841,-0.246712,Earth,1,TRAPPIST-1e,0,13,1,G,3,S,0013_01,G/3/S,Nelly Carsoning,1
1,-0.684313,-0.357339,-0.277879,-0.312173,2.287504,-0.246712,Earth,0,TRAPPIST-1e,0,18,1,F,4,S,0018_01,F/4/S,Lerome Peckers,0
2,0.170937,-0.357339,-0.28384,-0.312173,-0.267841,-0.246712,Europa,1,55 Cancri e,0,19,1,C,0,S,0019_01,C/0/S,Sabih Unhearfus,1
3,0.669832,-0.357339,4.121518,-0.312173,-0.104002,0.226648,Europa,0,TRAPPIST-1e,0,21,1,C,1,S,0021_01,C/1/S,Meratz Caltilter,1
4,-0.613042,-0.340723,-0.28384,0.832122,-0.267841,-0.246712,Earth,0,TRAPPIST-1e,0,23,1,F,5,S,0023_01,F/5/S,Brence Harperez,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,0.384749,-0.357339,-0.28384,-0.312173,-0.267841,-0.246712,Earth,1,TRAPPIST-1e,0,9266,2,G,1496,S,9266_02,G/1496/S,Jeron Peter,1
4273,0.954916,-0.357339,0.277095,-0.281538,-0.25879,-0.130193,Earth,0,TRAPPIST-1e,0,9269,1,F,4,S,9269_01,no_data,Matty Scheron,1
4274,-0.185417,-0.357339,-0.28384,-0.312173,-0.267841,-0.246712,Mars,1,55 Cancri e,0,9271,1,D,296,P,9271_01,D/296/P,Jayrin Pore,1
4275,-0.185417,-0.357339,1.491019,-0.312173,-0.267841,0.176479,Europa,0,TRAPPIST-1e,0,9273,1,D,297,P,9273_01,D/297/P,Kitakan Conale,1


In [56]:
# Build the submission file: passenger id plus the prediction.
results = main_test_prepared[['PassengerId', 'Transported']].copy()
# Convert the 1/0 predictions to True/False; any value other than 1 or 0
# would become NaN under this mapping.
results['Transported'] = results['Transported'].map({1: True, 0: False})
results.to_csv('../data/processed/submissions.csv', index=False)

# Evaluate

In [69]:
# Iteration at which the eval metric on the validation pool was best.
model.get_best_iteration()

569

In [76]:
# Hard-coded score; presumably copied by hand from the Kaggle leaderboard
# after uploading the submission file - update it for every new submission.
kaggle_public_score = 0.79214

# Summary of the run: model settings, best validation metrics and iteration,
# and the gap between the Kaggle score and the best validation accuracy.
reports = {
    'params': model.get_params(),
    'validation_best_score': model.get_best_score()['validation'],
    'validation_best_iteration': model.get_best_iteration(),
    'kaggle_public_score': kaggle_public_score,
    'kaggle_score_minus_validation_score': kaggle_public_score - model.get_best_score()['validation']['Accuracy']
}

In [79]:
# Persist the run summary as pretty-printed JSON (4-space indent).
metrics_file = '../reports/metrics.json'

with open(metrics_file, 'w') as mf:
    mf.write(json.dumps(reports, indent=4))

In [80]:
# Save model
# The fitted classifier is serialised with joblib; the returned list of
# written file names is what the cell displays.
model_path = '../models/model.joblib'
joblib.dump(model, model_path)

['../models/model.joblib']

In [81]:
# Save features
# The full prepared training frame is written without its index.
features_path = '../data/processed/featured_spaceship-titanic.csv'
data_prepared.to_csv(features_path, index=False)

# Save train and test sets
# Unlike the features file, these two are written with the index column, so
# each row keeps the index label it had in `data_prepared`.
train_set_path = '../data/processed/train_spaceship-titanic.csv'
validation_set_path = '../data/processed/validation_spaceship-titanic.csv'
train_set.to_csv(train_set_path)
validation_set.to_csv(validation_set_path)