In [28]:
import json
from pathlib import Path

import joblib
import pandas as pd
import yaml
from catboost import CatBoostClassifier
from catboost import Pool
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Config

In [2]:
%cd ..

/Users/skv/PycharmProjects/kaggle-spaceship-titanic


In [29]:
# Load the project configuration (data paths, random seed, model parameters)
# from params.yaml in the current working directory.
with open('params.yaml') as config_file:
    config = yaml.safe_load(config_file)

# Echo the parsed configuration so the notebook records the settings used.
print(config)

{'base': {'random_state': 42}, 'data': {'train_data': 'data/raw/train.csv', 'test_data': 'data/raw/test.csv', 'features_path': 'data/processed/featured.csv', 'train_set_path': 'data/processed/train_set.csv', 'validation_set_path': 'data/processed/validation_set.csv', 'submission_data': 'data/processed/submissions.csv'}, 'train': {'train_size': 0.85, 'model_params': {'verbose': 100, 'learning_rate': 0.1, 'eval_metric': 'Accuracy'}, 'model_path': 'models/model.joblib'}, 'reports': {'metrics_file': 'reports/metrics.json'}}


# Load dataset

In [30]:
# Read the raw train and test tables from the paths given in the config.
train_data = pd.read_csv(config['data']['train_data'])
test_data = pd.read_csv(config['data']['test_data'])

In [31]:
# Preview the raw training data.
train_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


# Feature engineering

In [32]:
class TransformBooleanToNumeric(BaseEstimator, TransformerMixin):
    """Turn boolean values into integers by multiplying every column by 1.

    ``True``/``False`` become ``1``/``0``; numbers and strings are left as
    they are by the multiplication.

    Parameters
    ----------
    transform_boolean : bool, default=True
        When False the transformer is a no-op and ``transform`` returns its
        input unchanged.
    """

    def __init__(self, transform_boolean=True):
        self.transform_boolean = transform_boolean

    def fit(self, X, y=None):
        """Nothing to learn; present for scikit-learn API compatibility."""
        return self

    def transform(self, X):
        """Return ``X`` with every column multiplied by 1.

        Always returns a DataFrame. Previously the disabled case fell off the
        end of the method and returned ``None``, which broke any later
        pipeline step.
        """
        if not self.transform_boolean:
            return X
        return X.apply(lambda column: column * 1)

In [33]:
class TransformCategoricalValues(BaseEstimator, TransformerMixin):
    """Split the composite ``PassengerId`` and ``Cabin`` columns into parts.

    Added columns:

    * ``PassengerGroupId``, ``PassengerGroupCount`` - the two parts of
      ``PassengerId`` around ``'_'``, converted to numbers.
    * ``CabinDesc``, ``CabinNumber``, ``CabinSide`` - the three parts of
      ``Cabin`` around ``'/'``; ``CabinNumber`` is converted to a number.

    The source columns ``PassengerId`` and ``Cabin`` are kept.

    Parameters
    ----------
    transform_categorical : bool, default=True
        When False the transformer is a no-op and ``transform`` returns its
        input unchanged.
    """

    def __init__(self, transform_categorical=True):
        self.transform_categorical = transform_categorical

    def fit(self, X, y=None):
        """Nothing to learn; present for scikit-learn API compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the split-out columns appended.

        Always returns a DataFrame. Previously the disabled case returned
        ``None``. The caller's frame is no longer modified.
        """
        if not self.transform_categorical:
            return X

        # Work on a copy so the frame passed in by the caller is left untouched.
        X = X.copy()

        # Split each composite column once. reindex pads with missing values
        # when a split yields fewer parts than expected, so the positional
        # lookups below cannot raise KeyError.
        id_parts = X['PassengerId'].str.split('_', expand=True).reindex(columns=range(2))
        cabin_parts = X['Cabin'].str.split('/', expand=True).reindex(columns=range(3))

        X['PassengerGroupId'] = pd.to_numeric(id_parts[0])
        X['PassengerGroupCount'] = pd.to_numeric(id_parts[1])
        X['CabinDesc'] = cabin_parts[0]
        X['CabinNumber'] = pd.to_numeric(cabin_parts[1])
        X['CabinSide'] = cabin_parts[2]

        return X

In [34]:
def preprocess_pipeline(df):
    """Apply the basic, model-independent transformations to a raw frame.

    Steps, in order:

    1. Convert boolean values to 1/0 (``TransformBooleanToNumeric``).
    2. Split ``PassengerId`` and ``Cabin`` into their parts
       (``TransformCategoricalValues``).
    3. Cast ``PassengerId``, ``Cabin`` and ``Name`` to the pandas ``string``
       dtype and replace missing ``Cabin`` / ``Name`` values with
       ``'no_data'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least ``PassengerId``, ``Cabin`` and ``Name``.

    Returns
    -------
    pandas.DataFrame
        The transformed frame.
    """
    boolean_transform = Pipeline([('boolean_to_num', TransformBooleanToNumeric())])
    categorical_split = Pipeline([('categorical_split', TransformCategoricalValues())])

    df = boolean_transform.fit_transform(df)
    df = categorical_split.fit_transform(df)

    text_columns = ['PassengerId', 'Cabin', 'Name']
    df[text_columns] = df[text_columns].astype('string')

    # Assign the filled columns back instead of df[col].fillna(..., inplace=True):
    # the in-place form works on an intermediate object and is not guaranteed
    # to update df (it has no effect under pandas Copy-on-Write).
    fill_columns = ['Cabin', 'Name']
    df[fill_columns] = df[fill_columns].fillna('no_data')

    return df

In [35]:
# Apply the basic feature engineering to the training data and preview the result.
train_data_prepared = preprocess_pipeline(train_data)
train_data_prepared

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,PassengerGroupId,PassengerGroupCount,CabinDesc,CabinNumber,CabinSide
0,0001_01,Europa,0,B/0/P,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,1,1,B,0.0,P
1,0002_01,Earth,0,F/0/S,TRAPPIST-1e,24.0,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,2,1,F,0.0,S
2,0003_01,Europa,0,A/0/S,TRAPPIST-1e,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,3,1,A,0.0,S
3,0003_02,Europa,0,A/0/S,TRAPPIST-1e,33.0,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,3,2,A,0.0,S
4,0004_01,Earth,0,F/1/S,TRAPPIST-1e,16.0,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,4,1,F,1.0,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,0,A/98/P,55 Cancri e,41.0,1,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,0,9276,1,A,98.0,P
8689,9278_01,Earth,1,G/1499/S,PSO J318.5-22,18.0,0,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,0,9278,1,G,1499.0,S
8690,9279_01,Earth,0,G/1500/S,TRAPPIST-1e,26.0,0,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,1,9279,1,G,1500.0,S
8691,9280_01,Europa,0,E/608/S,55 Cancri e,32.0,0,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,0,9280,1,E,608.0,S


In [36]:
# Columns handled by the categorical branch of the preprocessor.
categorical_features = [
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'VIP',
    'PassengerGroupId',
    'PassengerGroupCount',
    'CabinDesc',
    'CabinNumber',
    'CabinSide',
]

# Columns handled by the numeric branch of the preprocessor.
numerical_features = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]

In [37]:
# Numeric columns: fill gaps with the column median, then standardise.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler()),
])

# Categorical columns: fill missing entries with the most frequent value.
categorical_pipeline = Pipeline(steps=[
    ('no_data', SimpleImputer(strategy='most_frequent', missing_values=pd.NA)),
])

# Route each column group through its pipeline. Columns in neither group are
# passed through untouched, and output names carry no 'num__'/'cat__' prefix.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
)

In [38]:
def creat_fin_data(df):
    """Fit the global ``preprocessor`` on ``df`` and return the result as a DataFrame.

    The transformed array is wrapped in a DataFrame that uses the
    preprocessor's output feature names as columns and keeps the index of
    ``df``. ``CabinNumber`` is then cast to ``int``.

    NOTE(review): this always calls ``fit_transform``, so each call re-fits
    the imputers and the scaler on whatever frame is passed in - confirm this
    is intended for data other than the training set.
    """
    transformed = preprocessor.fit_transform(df)
    feature_names = preprocessor.get_feature_names_out()

    result = pd.DataFrame(transformed, index=df.index, columns=feature_names)
    result['CabinNumber'] = result['CabinNumber'].astype('int')
    return result

In [39]:
# Impute and scale the engineered training features.
# NOTE(review): train_data_prepared is rebound to a transformed version of
# itself, so re-running this cell alone feeds already-transformed data back
# into the preprocessor.
train_data_prepared = creat_fin_data(train_data_prepared)

In [40]:
# Save features to the path from the config; the row index is not written.
train_data_prepared.to_csv(config['data']['features_path'], index=False)

# Split dataset

In [41]:
# Hold out a validation set. The split is stratified on the target column and
# seeded from the config so it is repeatable.
train_set, validation_set = train_test_split(
    train_data_prepared, train_size=config['train']['train_size'],
    random_state=config['base']['random_state'],
    stratify=train_data_prepared['Transported'])

In [42]:
# Save train and validation sets
# NOTE(review): unlike the features file, these are written with the row
# index as the first column - confirm that is intended.
train_set.to_csv(config['data']['train_set_path'])
validation_set.to_csv(config['data']['validation_set_path'])

# Train

In [43]:
# Feature columns passed to the model (numeric columns first, then categorical).
X = [
    'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
    'PassengerGroupId', 'PassengerGroupCount',
    'CabinDesc', 'CabinNumber', 'CabinSide',
]

# Subset of X that CatBoost should treat as categorical.
cat_features = [
    'HomePlanet', 'CryoSleep', 'Destination', 'VIP',
    'PassengerGroupId', 'PassengerGroupCount',
    'CabinDesc', 'CabinNumber', 'CabinSide',
]

# Free-text columns; not part of X.
text_features = ['PassengerId', 'Cabin', 'Name']

# Target column.
y = ['Transported']

In [44]:
# Wrap the training and validation features/labels in CatBoost Pools,
# declaring which feature columns are categorical.
# NOTE(review): `train_data` held the raw training DataFrame earlier in the
# notebook and is rebound to a Pool here, so earlier cells that expect the
# DataFrame cannot be re-run without reloading the data.
train_data = Pool(data=train_set[X],
                  label=train_set[y],
                  cat_features=cat_features,
                  # text_features=text_features
                  )

valid_data = Pool(data=validation_set[X],
                  label=validation_set[y],
                  cat_features=cat_features,
                  # text_features=text_features
                  )

In [45]:
# CatBoost settings, all taken from the config; the seed is the project-wide
# random_state.
params = {
    'verbose': config['train']['model_params']['verbose'],
    'random_seed': config['base']['random_state'],
    'learning_rate': config['train']['model_params']['learning_rate'],
    'eval_metric': config['train']['model_params']['eval_metric'],
}

In [46]:
# Build the classifier from the settings assembled in `params`.
model = CatBoostClassifier(**params)

In [47]:
# Train on the training Pool, evaluating on the validation Pool as training progresses.
model.fit(train_data, eval_set=valid_data)

0:	learn: 0.7383949	test: 0.7216258	best: 0.7216258 (0)	total: 7.53ms	remaining: 7.52s
100:	learn: 0.8209501	test: 0.8044479	best: 0.8082822 (68)	total: 679ms	remaining: 6.05s
200:	learn: 0.8432806	test: 0.8006135	best: 0.8082822 (68)	total: 1.5s	remaining: 5.97s
300:	learn: 0.8581675	test: 0.8059816	best: 0.8082822 (68)	total: 2.24s	remaining: 5.21s
400:	learn: 0.8731899	test: 0.8059816	best: 0.8090491 (305)	total: 2.88s	remaining: 4.31s
500:	learn: 0.8826634	test: 0.8090491	best: 0.8113497 (461)	total: 3.55s	remaining: 3.54s
600:	learn: 0.8911896	test: 0.8144172	best: 0.8167178 (569)	total: 4.2s	remaining: 2.79s
700:	learn: 0.9014752	test: 0.8082822	best: 0.8167178 (569)	total: 4.87s	remaining: 2.08s
800:	learn: 0.9112194	test: 0.8151840	best: 0.8167178 (569)	total: 5.55s	remaining: 1.38s
900:	learn: 0.9200162	test: 0.8105828	best: 0.8167178 (569)	total: 6.21s	remaining: 682ms
999:	learn: 0.9281364	test: 0.8052147	best: 0.8167178 (569)	total: 6.85s	remaining: 0us

bestTest = 0.816717

<catboost.core.CatBoostClassifier at 0x13f8d35b0>

In [48]:
# Save the fitted model to the path given in the config.
joblib.dump(model, config['train']['model_path'])

['models/model.joblib']

# Predict

In [49]:
# Run the test data through the same two preparation steps as the training data.
# NOTE(review): creat_fin_data calls preprocessor.fit_transform, so the
# imputers and the scaler are re-fitted on the test data here rather than
# reusing what was fitted on the training data - confirm that is intended.
test_data_prepared = preprocess_pipeline(test_data)
test_data_prepared = creat_fin_data(test_data_prepared)

In [50]:
# Predict the target for every test row and store it next to the features.
test_data_prepared['Transported'] = model.predict(test_data_prepared[X])

In [51]:
# Build the submission file: passenger id plus the prediction converted from
# 1/0 back to True/False.
# NOTE(review): map() produces NaN for any value that is not 1 or 0 -
# presumably predict() returns integer labels here; verify.
submission = test_data_prepared[['PassengerId', 'Transported']].copy()
submission['Transported'] = submission['Transported'].map({1: True, 0: False})
submission.to_csv(config['data']['submission_data'], index=False)

# Reports

In [52]:
# Hard-coded score; presumably copied from the Kaggle public leaderboard after
# submitting - update it for each new submission.
kaggle_public_score = 0.79214

# Collect the model settings, the best validation result and the gap between
# the Kaggle score and the best validation accuracy.
reports = {
    'params': model.get_params(),
    'validation_best_score': model.get_best_score()['validation'],
    'validation_best_iteration': model.get_best_iteration(),
    'kaggle_public_score': kaggle_public_score,
    'kaggle_score_minus_validation_score': kaggle_public_score - model.get_best_score()['validation']['Accuracy']
}

In [54]:
# Write the report as indented JSON to the metrics file named in the config.
with open(config['reports']['metrics_file'], 'w') as mf:
    json.dump(reports, mf, indent=4)