In [5]:
import numpy as np
import pandas as pd

In [6]:
# Read the train and test splits from the local data/ directory in one pass.
train_data, test_data = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

# Preview the first rows of the training frame as the cell's rich output.
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [7]:
# Per-column count of missing values in the training frame.
train_data.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [8]:
from sklearn.impute import KNNImputer

# Numeric columns whose gaps are filled by the KNN imputer.
KNN_COLS = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'CryoSleep', 'Num']

# Constant fill values for the categorical columns.
CATEGORICAL_FILL = {
    'HomePlanet': 'Earth',
    'Destination': 'TRAPPIST-1e',
    'Deck': 'F',
    'Side': 'P',
}

# Turn the boolean target into 0/1 (only the training frame has it).
train_data['Transported'] = train_data['Transported'] * 1

imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

# Train is processed first so the imputer is fitted before test is transformed.
for frame, is_train in ((train_data, True), (test_data, False)):
    # True/False -> 1/0; missing entries stay NaN for the imputer.
    frame['CryoSleep'] = frame['CryoSleep'] * 1

    # Cabin is "Deck/Num/Side"; a missing Cabin yields NaN in all three parts.
    frame[['Deck', 'Num', 'Side']] = frame['Cabin'].str.split('/', expand=True)
    # The split returns strings; make Num numeric explicitly instead of
    # relying on the imputer's implicit coercion.
    frame['Num'] = pd.to_numeric(frame['Num'])

    # Fit the imputer on the training rows only and reuse it for test, so both
    # frames are imputed from the same neighbours (the test set is not re-fit).
    if is_train:
        frame[KNN_COLS] = imputer.fit_transform(frame[KNN_COLS])
    else:
        frame[KNN_COLS] = imputer.transform(frame[KNN_COLS])

    # Assign the result back: chained `frame[col].fillna(..., inplace=True)`
    # acts on a temporary under pandas Copy-on-Write and can leave the frame
    # unchanged.
    for col, fill_value in CATEGORICAL_FILL.items():
        frame[col] = frame[col].fillna(fill_value)

In [9]:
from sklearn.preprocessing import LabelEncoder

# Encode each cabin-derived column with ONE encoder shared by train and test.
# Fitting separate encoders per frame (as before) maps the same category to
# different integers whenever the two frames do not contain exactly the same
# set of values. Fitting on the combined values keeps the codes consistent and
# avoids "unseen label" errors when transforming the test frame.
for col in ('Deck', 'Num', 'Side'):
    encoder = LabelEncoder()
    encoder.fit(pd.concat([train_data[col], test_data[col]], ignore_index=True))
    train_data[col] = encoder.transform(train_data[col])
    test_data[col] = encoder.transform(test_data[col])

In [10]:
# Amenity columns that make up a passenger's total spend, in summation order.
SPEND_COLS = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Add an 'AllSpending' feature to both frames by summing the amenity columns
# left to right with plain `+` (so any NaN would propagate into the total).
for frame in (train_data, test_data):
    running_total = frame[SPEND_COLS[0]]
    for spend_col in SPEND_COLS[1:]:
        running_total = running_total + frame[spend_col]
    frame['AllSpending'] = running_total

In [11]:
from sklearn.preprocessing import StandardScaler

# Target vector and the feature subset fed to the model.
train_label = train_data['Transported']
features = ['CryoSleep', 'RoomService', 'Spa', 'VRDeck', 'Deck', 'Side', 'AllSpending']

# Spending columns that are standardised.
cols_to_norm = ['RoomService', 'Spa', 'VRDeck', 'AllSpending']
scaler = StandardScaler()

# Fit the scaler on the training frame only and reuse it for the test frame.
train_data[cols_to_norm] = scaler.fit_transform(train_data[cols_to_norm])
test_data[cols_to_norm] = scaler.transform(test_data[cols_to_norm])

train_input = pd.get_dummies(train_data[features])
# One-hot encoding each frame independently does not guarantee identical
# columns, so align the test matrix to the training columns: indicators
# missing from test are filled with 0, and columns never seen in training
# are dropped.
test_input = pd.get_dummies(test_data[features]).reindex(
    columns=train_input.columns, fill_value=0
)

In [12]:
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Fit a CatBoost classifier with default hyper-parameters; verbose=0 silences
# the per-iteration training log.
cat_model = CatBoostClassifier(verbose=0)
cat_model.fit(train_input, train_label)

# NOTE: these are in-sample metrics (predictions on the rows the model was
# trained on), so they are an optimistic estimate of generalisation.
train_pred = cat_model.predict(train_input)
print(accuracy_score(train_label, train_pred))
print(confusion_matrix(train_label, train_pred))

test_pred = cat_model.predict(test_input)

# Build the submission. Cast only the prediction column to bool instead of
# running a DataFrame-wide `replace({1: True, 0: False})`, which would also
# touch any other column holding 0/1 and relies on deprecated downcasting.
# Assumes the model predicts the 0/1 integer labels it was trained on.
output = pd.DataFrame({
    'PassengerId': test_data.PassengerId,
    'Transported': np.ravel(test_pred).astype(bool),
})
output.to_csv('submission.csv', index=False)

0.8386057747613022
[[3454  861]
 [ 542 3836]]
