In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import tensorflow as tf
tf.random.set_seed(42)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


In [2]:
#sklearn imports
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier, StackingClassifier
# from sklearn. import 

In [3]:
#Function declaration
def analyze(df):
    print(df.info(), df.describe(), "\n")
    
def drop_cols(df, df_test, cols):
    df.drop(cols, axis=1, inplace=True)
    df_test.drop(cols, axis=1, inplace=True)

def check_corr(df, x, y):
    plt.scatter(df[x], y)
    plt.xlabel(x)
    plt.ylabel(y.name)
    plt.show()
    print(df[x].corr(y))
    
def score_model(X_train, X_valid, y_train, y_valid, model):
    model.fit(X_train, y_train)
    y_predt = model.predict(X_train)
    y_predv = model.predict(X_valid)
    print("train accuracy/f1:", accuracy_score(y_train, y_predt), f1_score(y_train, y_predt))
    print("valid accuracy/f1:", accuracy_score(y_valid, y_predv), f1_score(y_valid, y_predv))
    print("")
    
def tune_hyperparameter(X, y, search):
    search.fit(X, y)
    print("best score:", search.best_score_)
    print("best params:", search.best_params_)
    print("")

In [4]:
#Dataset preparation
X = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
y = X.pop('Transported').astype('int64')
X_test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
drop_cols(X, X_test, ['PassengerId', 'Name'])

num_cols = X.select_dtypes(['number']).columns
cat_cols = X.select_dtypes(['object']).columns

print(X.shape, y.shape)
print("num_cols:", num_cols, "cat_cols:", cat_cols, sep="\n")
analyze(X)
analyze(y)
y.value_counts()

(8693, 11) (8693,)
num_cols:
Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'], dtype='object')
cat_cols:
Index(['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP'], dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Cabin         8494 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
dtypes: float64(6), object(5)
memory usage: 747.2+ KB
None                Age   RoomService     FoodCourt  ShoppingMall  

Transported
1    4378
0    4315
Name: count, dtype: int64

In [5]:
#Feature engineering
def feature_engineer(df):
    df_ = df.drop(['Cabin'] ,axis=1)
    df_['RoomService'] = df.RoomService.apply(lambda x: 1 if x > 0 else 0 if x == 0 else None)
    df_['FoodCourt'] = df.FoodCourt.apply(lambda x: 1 if x > 0 else 0 if x == 0 else None)
    df_['ShoppingMall'] = df.ShoppingMall.apply(lambda x: 1 if x > 0 else 0 if x == 0 else None)
    df_['Spa'] = df.Spa.apply(lambda x: 1 if x > 0 else 0 if x == 0 else None)
    df_['VRDeck'] = df.VRDeck.apply(lambda x: 1 if x > 0 else 0 if x == 0 else None)
    df_['TotalBilled'] = df.FoodCourt + df.ShoppingMall - df.VRDeck - df.Spa - df.RoomService
    df_['NumberBilled'] = df_[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
    df_['CryoSleep'] = df.CryoSleep.astype('Int64')
    df_['CabinDeck'] = df.Cabin.apply(lambda x: x[0] if type(x) == str else x)
    df_['CabinSide'] = df.Cabin.apply(lambda x: x[-1] if type(x) == str else x)
    df_['VIP'] = df.VIP.astype('Int64')
    return df_

X_eng = feature_engineer(X)
print(X_eng.join(y).corr(numeric_only=True).Transported.sort_values(ascending=False)[1:])
# check_corr(X_eng, 'TotalBilled', y)
# print(X_eng.TotalBilled.value_counts())

# print(X_eng.shape, y.shape)
# analyze(X_eng)

CryoSleep       0.468645
TotalBilled     0.339555
VIP            -0.037650
Age            -0.075026
FoodCourt      -0.235484
ShoppingMall   -0.267085
VRDeck         -0.337230
Spa            -0.350125
RoomService    -0.354925
NumberBilled   -0.440170
Name: Transported, dtype: float64


In [6]:
#Data preprocessing
num_pipeline1 = make_pipeline(
    SimpleImputer(strategy='mean'), 
    StandardScaler()
)
num_pipeline2 = make_pipeline(
    SimpleImputer(strategy='most_frequent'), 
)
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'), 
    OneHotEncoder(handle_unknown='ignore', sparse_output=False)
)
col_transformer = ColumnTransformer([
    ('num1', num_pipeline1, ['Age', 'TotalBilled', 'NumberBilled']), 
    ('num2', num_pipeline2, ['CryoSleep', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']), 
    ('cat', cat_pipeline, make_column_selector(dtype_include=object))
])

col_transformer.fit(X_eng)
X_prep = pd.DataFrame(col_transformer.transform(X_eng), columns=col_transformer.get_feature_names_out())

# print(X_prep.shape, y.shape)
# analyze(X_prep)

In [7]:
#Train & Test sets preparation
X_train, X_valid, y_train, y_valid = train_test_split(X_prep, y, test_size=0.2)

print(X_train.shape, y_train.shape)
print(X_valid.shape, y_valid.shape)
print(y_train.value_counts())
print(y_valid.value_counts())

(6954, 26) (6954,)
(1739, 26) (1739,)
Transported
1    3499
0    3455
Name: count, dtype: int64
Transported
1    879
0    860
Name: count, dtype: int64


In [8]:
#Model selection
rfc = RandomForestClassifier(random_state=42)
abc = AdaBoostClassifier(random_state=42)
gbc = GradientBoostingClassifier(random_state=42)
lr = LogisticRegression(random_state=42)
svc = SVC(random_state=42)

score_model(X_train, X_valid, y_train, y_valid, rfc)
score_model(X_train, X_valid, y_train, y_valid, abc)
score_model(X_train, X_valid, y_train, y_valid, gbc)
score_model(X_train, X_valid, y_train, y_valid, lr)
score_model(X_train, X_valid, y_train, y_valid, svc)

train accuracy/f1: 0.952689099798677 0.9537466610431603
valid accuracy/f1: 0.7596319723979299 0.7555555555555555

train accuracy/f1: 0.7900488927236123 0.7974472807991121
valid accuracy/f1: 0.7780333525014376 0.7850779510022272

train accuracy/f1: 0.8123382226056945 0.8169962137147665
valid accuracy/f1: 0.7889591719378953 0.7955431754874651

train accuracy/f1: 0.7907679033649698 0.795099281791297
valid accuracy/f1: 0.7780333525014376 0.7836322869955156

train accuracy/f1: 0.8081679608858211 0.8104036384309267
valid accuracy/f1: 0.7791834387579069 0.7823129251700681



In [9]:
#Hyperparameter tuning
param_rfc = {'n_estimators': np.arange(50, 150), 'max_depth': np.arange(2, 20), 'min_samples_split': np.arange(2, 15)}
param_abc = {'n_estimators': np.arange(10, 50), 'learning_rate': [0.1, 0.5, 1.0, 5.0, 10]}
param_gbc = {'n_estimators': np.arange(50, 150), 'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0]}
param_lr = {'C': [0.1, 0.5, 1.0, 5.0, 10]}
param_svc = {'C': [0.1, 0.5, 1.0, 5.0, 10]}

rs_rfc = RandomizedSearchCV(rfc, param_rfc, cv=3, n_iter=50, random_state=42)
rs_abc = RandomizedSearchCV(abc, param_abc, cv=3, n_iter=50, random_state=42)
rs_gbc = RandomizedSearchCV(gbc, param_gbc, cv=3, n_iter=50, random_state=42)
gs_lr = GridSearchCV(lr, param_lr, cv=5)
gs_svc = GridSearchCV(svc, param_svc, cv=5)

tune_hyperparameter(X_prep, y, rs_rfc)
tune_hyperparameter(X_prep, y, rs_abc)
tune_hyperparameter(X_prep, y, rs_gbc)
tune_hyperparameter(X_prep, y, gs_lr)
tune_hyperparameter(X_prep, y, gs_svc)

best score: 0.7967344672256801
best params: {'n_estimators': 66, 'min_samples_split': 8, 'max_depth': 10}

best score: 0.7820095655937832
best params: {'n_estimators': 34, 'learning_rate': 1.0}

best score: 0.7961588815095442
best params: {'n_estimators': 130, 'learning_rate': 0.1}

best score: 0.7854596143042144
best params: {'C': 5.0}

best score: 0.7930544186671307
best params: {'C': 0.5}



In [10]:
#Ensemble selection
rfc.set_params(**rs_rfc.best_params_)
abc.set_params(**rs_abc.best_params_)
gbc.set_params(**rs_gbc.best_params_)
lr.set_params(**gs_lr.best_params_)
svc.set_params(**gs_svc.best_params_)
svc.probability = True

vc = VotingClassifier([
    ('rfc', rfc), 
#     ('abc', abc), 
    ('gbc', gbc), 
#     ('lr', lr), 
    ('svc', svc)
], voting='soft')
sc = StackingClassifier([
    ('rfc', rfc), 
    ('abc', abc), 
    ('gbc', gbc), 
    ('lr', lr), 
    ('svc', svc), 
    ('vc', vc)
], cv=3)

score_model(X_train, X_valid, y_train, y_valid, vc)
score_model(X_train, X_valid, y_train, y_valid, sc)

train accuracy/f1: 0.8180903077365546 0.8213024438480011
valid accuracy/f1: 0.7826336975273146 0.7876404494382021

train accuracy/f1: 0.8288754673569169 0.8334266517357224
valid accuracy/f1: 0.7912593444508338 0.7986688851913477



In [11]:
#Refit models
rfc.fit(X_prep, y)
abc.fit(X_prep, y)
gbc.fit(X_prep, y)
lr.fit(X_prep, y)
svc.fit(X_prep, y)
sc.fit(X_prep, y)
print(rfc.score(X_prep, y), sc.score(X_prep, y))

0.8363050730472794 0.823536178534453


In [12]:
#Test data preparation
X_test_eng = feature_engineer(X_test)
X_test_prep = pd.DataFrame(col_transformer.transform(X_test_eng), columns=col_transformer.get_feature_names_out())

print(X_test_prep.shape)
analyze(X_test_prep)

(4277, 26)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 26 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   num1__Age                       4277 non-null   float64
 1   num1__TotalBilled               4277 non-null   float64
 2   num1__NumberBilled              4277 non-null   float64
 3   num2__CryoSleep                 4277 non-null   float64
 4   num2__VIP                       4277 non-null   float64
 5   num2__RoomService               4277 non-null   float64
 6   num2__FoodCourt                 4277 non-null   float64
 7   num2__ShoppingMall              4277 non-null   float64
 8   num2__Spa                       4277 non-null   float64
 9   num2__VRDeck                    4277 non-null   float64
 10  cat__HomePlanet_Earth           4277 non-null   float64
 11  cat__HomePlanet_Europa          4277 non-null   float64
 12  cat__HomePlanet_Mars   

In [13]:
#Final Prediction
sample = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')

submission = pd.DataFrame({"PassengerId": sample.PassengerId, "Transported": rfc.predict(X_test_prep).astype('bool')})
submission.to_csv('submission.csv', index=False)
pd.read_csv('/kaggle/working/submission.csv')

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,True
...,...,...
4272,9266_02,True
4273,9269_01,True
4274,9271_01,True
4275,9273_01,True


In [14]:
#Model building
from functools import partial

dense_layer = partial(tf.keras.layers.Dense, activation='relu', kernel_initializer='he_normal')
model = tf.keras.Sequential([
    dense_layer(128, input_shape=[26]), 
    dense_layer(64), 
    dense_layer(32), 
    dense_layer(1, activation='sigmoid', kernel_initializer='glorot_uniform')
])

In [15]:
#Model training & evaluation
model.compile(optimizer='sgd', loss='binary_crossentropy', metrics=['accuracy'])
early_stopping = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
model.fit(X_train, y_train, epochs=100, validation_data=(X_valid, y_valid), callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100


<keras.src.callbacks.History at 0x7cb933126ce0>

In [16]:
#Refit model
model.fit(X_prep, y, epochs=10)
model.evaluate(X_prep, y)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


[0.39730536937713623, 0.8093868494033813]

In [17]:
#Final prediction 2
sample = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')
prediction = model.predict(X_test_prep).reshape(-1)
prediction = map(lambda x: True if x > 0.5 else False, prediction)

submission2 = pd.DataFrame({"PassengerId": sample.PassengerId, "Transported": prediction})
submission2.to_csv('submission2.csv', index=False)
pd.read_csv('/kaggle/working/submission2.csv')



Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
...,...,...
4272,9266_02,True
4273,9269_01,False
4274,9271_01,True
4275,9273_01,True
