In [1]:
import joblib
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from catboost import CatBoostClassifier
from catboost import Pool

# Load dataset

In [45]:
data = pd.read_csv('../data/raw/train.csv')

In [46]:
data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


# Features engineering

In [47]:
class TransformBooleanToNumeric(BaseEstimator, TransformerMixin):
    def __init__(self, transform_boolean=True):
        self.transform_boolean = transform_boolean

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if self.transform_boolean:
            X = X.apply(lambda x: x * 1)
            return X

In [51]:
class TransformCategoricalValues(BaseEstimator, TransformerMixin):
    def __init__(self, transform_categorical=True):
        self.transform_categorical = transform_categorical

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if self.transform_categorical:
            X = X.copy()
            X['PassengerGroupId'] = X['PassengerId'].str.split('_', expand=True)[0]
            X['PassengerGroupCount'] = X['PassengerId'].str.split('_', expand=True)[1]
            X['CabinDesc'] = X['Cabin'].str.split('/', expand=True)[0]
            X['CabinNumber'] = X['Cabin'].str.split('/', expand=True)[1]
            X['CabinSide'] = X['Cabin'].str.split('/', expand=True)[2]

            # X[['PassengerGroupId', 'PassengerGroupCount', 'CabinNumber']] = X[
            #     ['PassengerGroupId', 'PassengerGroupCount', 'CabinNumber']].apply(pd.to_numeric)
            return X
        else:
            return X

In [86]:
# TODO:
# - Сохранять названия метрик при трансформации данных
# - Удалять из вывода столбцы "доноры"

tmp = ColumnTransformer(
    transformers=[
        ('first', TransformBooleanToNumeric(), ['CryoSleep', 'VIP', 'Transported']),
        ('tho', TransformCategoricalValues(), ['PassengerId', 'Cabin'])
    ],
    verbose_feature_names_out=True,
    remainder='passthrough'
)
x = tmp.fit_transform(data)

In [87]:
tmp.get_feature_names_out()

AttributeError: Transformer first (type TransformBooleanToNumeric) does not provide get_feature_names_out.

In [18]:
# # Костыль :(
# def prepared_raw_data(data):
#
#     transform_boolean = TransformBooleanToNumeric()
#     transform_categorical = TransformCategoricalValues()
#     data_prepared = transform_boolean.transform(data)
#     data_prepared = transform_categorical.transform(data_prepared)
#
#     return data_prepared

In [19]:
# data_prepared = prepared_raw_data(data)
# data_prepared.set_index('PassengerId', inplace=True)
# data_prepared.drop(['Cabin'], axis=1, inplace=True)
# data_prepared

In [8]:
categorical_features = ['HomePlanet', 'Destination', 'CabinDesc', 'CabinSide', 'Transported']
numeric_features = ['CryoSleep', 'VIP', 'PassengerGroupId', 'PassengerGroupCount', 'CabinNumber', 'Age', 'RoomService',
                    'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', ]

In [9]:
numeric_pipeline = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='median')),
           # ('std_scaler', StandardScaler())
           ]
)

categorical_pipeline = Pipeline(
    steps=[('no_data', SimpleImputer(strategy='most_frequent'))]
)

preprocessor = ColumnTransformer(
    transformers=[('num', numeric_pipeline, numeric_features),
                  ('cat', categorical_pipeline, categorical_features)],
    verbose_feature_names_out=False, remainder='passthrough'
)

In [10]:
# Костыль :(
def prepared_data(data):
    num_data = numeric_pipeline.fit_transform(data[numeric_features])
    num_data = pd.DataFrame(num_data, columns=numeric_pipeline.get_feature_names_out(), index=data.index)
    cat_data = categorical_pipeline.fit_transform(data[categorical_features])
    cat_data = pd.DataFrame(cat_data, columns=categorical_pipeline.get_feature_names_out(), index=data.index)

    result = pd.concat([num_data, cat_data], axis=1)

    return result

In [11]:
data_prepared = prepared_data(data_prepared)

In [14]:
# Костыль
data_prepared[['CryoSleep', 'VIP', 'PassengerGroupId', 'PassengerGroupCount', 'CabinNumber', ]] = data_prepared[
    ['CryoSleep', 'VIP', 'PassengerGroupId', 'PassengerGroupCount', 'CabinNumber', ]].astype('int')

In [15]:
data_prepared

Unnamed: 0_level_0,CryoSleep,VIP,PassengerGroupId,PassengerGroupCount,CabinNumber,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet,Destination,CabinDesc,CabinSide,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0001_01,0,0,1,1,0,39.0,0.0,0.0,0.0,0.0,0.0,Europa,TRAPPIST-1e,B,P,0
0002_01,0,0,2,1,0,24.0,109.0,9.0,25.0,549.0,44.0,Earth,TRAPPIST-1e,F,S,1
0003_01,0,1,3,1,0,58.0,43.0,3576.0,0.0,6715.0,49.0,Europa,TRAPPIST-1e,A,S,0
0003_02,0,0,3,2,0,33.0,0.0,1283.0,371.0,3329.0,193.0,Europa,TRAPPIST-1e,A,S,0
0004_01,0,0,4,1,1,16.0,303.0,70.0,151.0,565.0,2.0,Earth,TRAPPIST-1e,F,S,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,0,1,9276,1,98,41.0,0.0,6819.0,0.0,1643.0,74.0,Europa,55 Cancri e,A,P,0
9278_01,1,0,9278,1,1499,18.0,0.0,0.0,0.0,0.0,0.0,Earth,PSO J318.5-22,G,S,0
9279_01,0,0,9279,1,1500,26.0,0.0,0.0,1872.0,1.0,0.0,Earth,TRAPPIST-1e,G,S,1
9280_01,0,0,9280,1,608,32.0,0.0,1049.0,0.0,353.0,3235.0,Europa,55 Cancri e,E,S,0


In [16]:
# Save features
features_path = '../data/processed/featured_spaceship-titanic.csv'
data_prepared.to_csv(features_path, index=False)

# Split dataset

In [18]:
train_set, validation_set = train_test_split(data_prepared, train_size=0.85, random_state=42,
                                             stratify=data_prepared['Transported'])

In [19]:
# Save train and test sets
train_set_path = '../data/processed/train_spaceship-titanic.csv'
validation_set_path = '../data/processed/validation_spaceship-titanic.csv'

train_set.to_csv(train_set_path)
validation_set.to_csv(validation_set_path)

# Train

In [21]:
X = ['CryoSleep', 'VIP', 'PassengerGroupId', 'PassengerGroupCount',
     'CabinNumber', 'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa',
     'VRDeck', 'HomePlanet', 'Destination', 'CabinDesc', 'CabinSide']
cat_features = ['CryoSleep', 'VIP', 'PassengerGroupId', 'PassengerGroupCount', 'CabinNumber', 'HomePlanet',
                'Destination', 'CabinDesc', 'CabinSide']
text_features = []
y = ['Transported']

In [22]:
train_data = Pool(data=train_set[X],
                  label=train_set[y],
                  cat_features=cat_features,
                  text_features=text_features
                  )

valid_data = Pool(data=validation_set[X],
                  label=validation_set[y],
                  cat_features=cat_features,
                  text_features=text_features
                  )

In [23]:
params = {'verbose': 100,
          'random_seed': 42,
          'learning_rate': 0.01,
          'eval_metric': 'Accuracy'}

In [24]:
model = CatBoostClassifier(**params)

In [25]:
model.fit(train_data, eval_set=valid_data)

0:	learn: 0.7386656	test: 0.7285276	best: 0.7285276 (0)	total: 65.1ms	remaining: 1m 5s
100:	learn: 0.7957775	test: 0.7806748	best: 0.7806748 (98)	total: 1.15s	remaining: 10.2s
200:	learn: 0.8033563	test: 0.7929448	best: 0.7937117 (177)	total: 3.81s	remaining: 15.1s
300:	learn: 0.8087698	test: 0.7944785	best: 0.7952454 (286)	total: 5.17s	remaining: 12s
400:	learn: 0.8113412	test: 0.7952454	best: 0.7975460 (348)	total: 5.86s	remaining: 8.75s
500:	learn: 0.8129652	test: 0.7960123	best: 0.7975460 (348)	total: 6.55s	remaining: 6.53s
600:	learn: 0.8141832	test: 0.7944785	best: 0.7975460 (348)	total: 7.48s	remaining: 4.96s
700:	learn: 0.8160780	test: 0.7983129	best: 0.7998466 (640)	total: 9.44s	remaining: 4.03s
800:	learn: 0.8185140	test: 0.8021472	best: 0.8021472 (796)	total: 11.1s	remaining: 2.75s
900:	learn: 0.8213561	test: 0.8021472	best: 0.8044479 (864)	total: 12.1s	remaining: 1.33s
999:	learn: 0.8241981	test: 0.8036810	best: 0.8044479 (864)	total: 12.8s	remaining: 0us

bestTest = 0.8044

<catboost.core.CatBoostClassifier at 0x28c554550>

In [27]:
model_path = '../models/model.joblib'
joblib.dump(model, model_path)

['../models/model.joblib']

# Evaluate

In [41]:
model.get_best_score()

{'learn': {'Accuracy': 0.8244688049803762, 'Logloss': 0.38164370247464685},
 'validation': {'Accuracy': 0.8044478527607362,
  'Logloss': 0.40142678500743273}}

In [55]:
# Save metrics
metrics_file = '../reports/metrics.json'

with open(metrics_file, 'w') as mf:
    json.dump(
        obj=model.get_best_score(),
        fp=mf,
        # indent=4
    )