In [1]:
import pandas as pd
import numpy as np

train = pd.read_csv('train.csv')

In [34]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Destination   8693 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Name          8493 non-null   object 
 12  Transported   8693 non-null   bool   
 13  Cabin_deck    8693 non-null   object 
 14  Cabin_number  8693 non-null   int32  
 15  Cabin_side    8693 non-null   object 
 16  group         8693 non-null   int32  
 17  group_size    8693 non-null   int64  
dtypes: bool(1), float64(6), int3

In [3]:
train['Cabin'].isnull().sum()

199

## Cabin

Cabin : 객실 종류 및 번호 (port : 좌현, starboard : 우현)

### '/' 형태로 되어있는 Cabin 칼럼을 Deck, number, side로 바꿔주어

### -> Cabin_deck / Cabin_number / Cabin_side 칼럼 생성

In [4]:
# '/'로 split하기 위해서 '/'가 있는 형태로 바꿔주고 다시 NULL값 채우기
train['Cabin'].fillna('Z/9999/Z', inplace=True)

train['Cabin_deck'] = train['Cabin'].apply(lambda x: x.split('/')[0])
train['Cabin_number'] = train['Cabin'].apply(lambda x: x.split('/')[1]).astype(int)
train['Cabin_side'] = train['Cabin'].apply(lambda x: x.split('/')[2])

In [5]:
#다시 NULL 값 채우기
train.loc[train['Cabin_deck']=='Z', 'Cabin_deck']=np.nan
train.loc[train['Cabin_number']==9999, 'Cabin_number']=np.nan
train.loc[train['Cabin_side']=='Z', 'Cabin_side']=np.nan

### Cabin 칼럼 삭제

In [6]:
train = train.drop('Cabin', axis = 1)

### PassengerId 앞 부분을 따와 'group' 칼럼 생성

In [7]:
train['group'] = train['PassengerId'].apply(lambda x: x.split('_')[0]).astype(int)

### group_size 칼럼을 생성해 각 승객별 속한 그룹의 인원수를 체크

In [8]:
train['group_size'] = train['group'].apply(lambda x: train['group'].value_counts()[x])

### 같은 그룹이면 같은 Cabin_deck/number/side임을 보기 위해 group_size가 1보다 큰 그룹들만 추출

### !! 같은 group이면 같은 모두 Cabin_side임을 확인하였음.

### 그룹별로 Cabin deck/number/side 데이터 종류를 분류한 데이터프레임 생성

### -> 같은 그룹인데 다른 Cabin ? 에 속했는지 여부를 따지기 위함

In [9]:
train_g_cd = train.groupby('group')['Cabin_deck'].value_counts()
train_g_cd = pd.DataFrame(train_g_cd)
train_g_cd.columns = ['count']

train_g_cd = train_g_cd.reset_index()
train_g_cd = train_g_cd.drop_duplicates('group')

In [10]:
train_g_cn = train.groupby('group')['Cabin_number'].value_counts()
train_g_cn = pd.DataFrame(train_g_cn)
train_g_cn.columns = ['count']

train_g_cn = train_g_cn.reset_index()
train_g_cn = train_g_cn.drop_duplicates('group')

In [11]:
train_g_cs = train.groupby('group')['Cabin_side'].value_counts()
train_g_cs = pd.DataFrame(train_g_cs)
train_g_cs.columns = ['count']

train_g_cs = train_g_cs.reset_index()
train_g_cs = train_g_cs.drop_duplicates('group')

### Cabin_deck/number/side 칼럼 결측치 채우기

### Cabin에서 NULL값이었던 인덱스

In [12]:
cab_null_idx = train[train['Cabin_deck'].isnull()].index
cab_null_idx

Int64Index([  15,   93,  103,  222,  227,  251,  260,  272,  280,  295,
            ...
            8043, 8066, 8110, 8168, 8202, 8209, 8475, 8485, 8509, 8656],
           dtype='int64', length=199)

### 같은 group이면 -> 같은 Cabin ?에 속하게 만들기

In [13]:
cab_null_idx = train[train['Cabin_number'].isnull()].index
cab_null_idx

Int64Index([  15,   93,  103,  222,  227,  251,  260,  272,  280,  295,
            ...
            8043, 8066, 8110, 8168, 8202, 8209, 8475, 8485, 8509, 8656],
           dtype='int64', length=199)

#### ** 결측치 데이터만 한 group에만 속할 수 있다 -> if/else문으로 처리해주기

In [14]:
train.loc[cab_null_idx, 'Cabin_deck'] = train.loc[cab_null_idx, 'group'].apply(lambda x: train_g_cd['Cabin_deck'].mode()[0] if sum(train_g_cd['group']==x)==0 else train_g_cd[train_g_cd['group']==x]['Cabin_deck'].values[0])
train.loc[cab_null_idx, 'Cabin_number'] = train.loc[cab_null_idx, 'group'].apply(lambda x: train_g_cn['Cabin_number'].mode()[0] if sum(train_g_cn['group']==x)==0 else train_g_cn[train_g_cn['group']==x]['Cabin_number'].values[0])
train.loc[cab_null_idx, 'Cabin_side'] = train.loc[cab_null_idx, 'group'].apply(lambda x: train_g_cs['Cabin_side'].mode()[0] if sum(train_g_cs['group']==x)==0 else train_g_cs[train_g_cs['group']==x]['Cabin_side'].values[0])

In [15]:
train['Cabin_number'] = train['Cabin_number'].astype(int)

In [16]:
print(train['Cabin_deck'].isnull().sum())
print(train['Cabin_number'].isnull().sum())
print(train['Cabin_side'].isnull().sum())

0
0
0


## Destination -> 결측치 182개 존재

In [17]:
train['Destination'].value_counts(dropna = False)

TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
NaN               182
Name: Destination, dtype: int64

In [18]:
train

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Cabin_deck,Cabin_number,Cabin_side,group,group_size
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P,1,1
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S,2,1
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S,3,2
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S,3,2
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,A,98,P,9276,1
8689,9278_01,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,G,1499,S,9278,1
8690,9279_01,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,G,1500,S,9279,1
8691,9280_01,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,E,608,S,9280,2


In [19]:
train.groupby(['HomePlanet'])['Destination'].value_counts()

HomePlanet  Destination  
Earth       TRAPPIST-1e      3101
            PSO J318.5-22     712
            55 Cancri e       690
Europa      TRAPPIST-1e      1189
            55 Cancri e       886
            PSO J318.5-22      19
Mars        TRAPPIST-1e      1475
            55 Cancri e       193
            PSO J318.5-22      49
Name: Destination, dtype: int64

### HomePlanet이 Earth, Mars일 경우 대부분 TRAPPIST-1e에 도착

### -> HomePlanet이 Europa인 경우도 TRAPPIST-1e에 가장 많은 사람이 도착하므로 최빈값인 TRAPPIST-1e로 결측치 채우기

In [20]:
train['Destination'].fillna(train['Destination'].mode()[0], inplace = True)

### 전처리를 위해 생성했던 칼럼 ['group', 'group_size'] 삭제

### 최종 NULL값 확인(Destinaiton / Cabin)  & row 수 확인

In [21]:
train.isnull().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Destination       0
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
Cabin_deck        0
Cabin_number      0
Cabin_side        0
group             0
group_size        0
dtype: int64

In [22]:
import pandas as pd

traindf = pd.read_csv("Encoding_train.csv")
traindf.drop(traindf.columns[0], axis = 1, inplace = True)
traindf.drop(['PassengerId', 'Age_group'], axis = 1, inplace = True)

train_x = traindf.drop('Transported', axis = 1)
train_y = traindf['Transported']

train_x.shape

(8693, 21)

In [45]:
import pandas as pd

test = pd.read_csv("Age_test.csv")
test.drop(test.columns[0], axis = 1, inplace = True)
test.drop(['PassengerId','Name'], axis = 1, inplace = True)

#test.loc[null_index, 'Cabin_number'] = train.loc[cab_null_idx, 'group'].apply(lambda x: train_g_cn['Cabin_number'].mode()[0] if sum(train_g_cn['group']==x)==0 else train_g_cn[train_g_cn['group']==x]['Cabin_number'].values[0])

In [24]:
test.isnull().sum()

Age                            0
RoomService                   82
FoodCourt                    106
ShoppingMall                  98
Spa                          101
VRDeck                        80
Cabin_number                 100
HomePlanet_Europa              0
HomePlanet_Mars                0
CryoSleep_True                 0
Destination_PSO J318.5-22      0
Destination_TRAPPIST-1e        0
Cabin_deck_B                   0
Cabin_deck_C                   0
Cabin_deck_D                   0
Cabin_deck_E                   0
Cabin_deck_F                   0
Cabin_deck_G                   0
Cabin_deck_T                   0
Cabin_side_S                   0
VIP_True                       0
dtype: int64

In [50]:
test['RoomService'].fillna(test['RoomService'].mode()[0], inplace = True)
test['FoodCourt'].fillna(test['FoodCourt'].mode()[0], inplace = True)
test['ShoppingMall'].fillna(test['ShoppingMall'].mode()[0], inplace = True)
test['Spa'].fillna(test['Spa'].mode()[0], inplace = True)
test['VRDeck'].fillna(test['VRDeck'].mode()[0], inplace = True)
test['Cabin_number'].fillna(test['Cabin_number'].mode()[0], inplace = True)

In [68]:
test.isnull().sum()

Age                          0
RoomService                  0
FoodCourt                    0
ShoppingMall                 0
Spa                          0
VRDeck                       0
Cabin_number                 0
HomePlanet_Europa            0
HomePlanet_Mars              0
CryoSleep_True               0
Destination_PSO J318.5-22    0
Destination_TRAPPIST-1e      0
Cabin_deck_B                 0
Cabin_deck_C                 0
Cabin_deck_D                 0
Cabin_deck_E                 0
Cabin_deck_F                 0
Cabin_deck_G                 0
Cabin_deck_T                 0
Cabin_side_S                 0
VIP_True                     0
dtype: int64

In [46]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(train_x, train_y)

knn = KNeighborsClassifier()
knn.fit(X_train , y_train)

y_pred = knn.predict(X_test)

accuracy_score(y_test, y_pred)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


0.7470101195952162

In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(train_x, train_y)

lr = LogisticRegression()
lr.fit(X_train , y_train)

y_pred = lr.predict(X_test)

accuracy_score(y_test, y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8012879484820608

In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(train_x, train_y)

dt = DecisionTreeClassifier()
dt.fit(X_train , y_train)

y_pred = dt.predict(X_test)

accuracy_score(y_test, y_pred)

0.7677092916283349

## -----------------------------------

In [66]:
from sklearn.model_selection import GridSearchCV

params={'solver':['liblinear', 'lbfgs'],
        'penalty':['l2', 'l1'],
        'C':[0.01, 0.1, 1, 1, 5, 10]}

lr = LogisticRegression()
lr = GridSearchCV(lr, param_grid=params, scoring='accuracy', cv=3 )
lr.fit(X_train, y_train)
print('{1:.3f}'.format(lr.best_params_, lr.best_score_))

0.798


In [67]:
from sklearn.model_selection import GridSearchCV

params= {
    'n_neighbors' : list(range(1,20)),
    'weights' : ["uniform", "distance"],
    'metric' : ['euclidean', 'manhattan', 'minkowski']
}

knn = KNeighborsClassifier()
grid_clf = GridSearchCV(knn, param_grid=params, scoring='accuracy', cv=3 )
grid_clf.fit(X_train, y_train)
print('{1:.3f}'.format(grid_clf.best_params_,grid_clf.best_score_))

0.788


In [42]:
pd.DataFrame(grid_clf.predict(test))

Feature names must be in the same order as they were in fit.

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Unnamed: 0,0
0,1
1,1
2,1
3,0
4,1
...,...
4272,0
4273,0
4274,0
4275,0


In [51]:
transported_pred = pd.DataFrame(grid_clf.predict(test))

test_df = pd.read_csv("Age_test.csv")
passengerId = pd.DataFrame(test_df.iloc[:,1])

submission_df = pd.concat([passengerId, transported_pred], axis = 1)
submission_df.rename(columns={0:'Transported'}, inplace = True)
submission_df['Transported'] = submission_df['Transported'].astype(bool)

Feature names must be in the same order as they were in fit.

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [44]:
submission_df.to_csv("C:/Users/kyw97/Downloads/spaceship-titanic/knn_submission_csv.csv", index = False)

In [53]:
traindf = pd.read_csv("Encoding_train.csv")
traindf.drop(traindf.columns[0], axis = 1, inplace = True)
traindf.drop(['PassengerId', 'Age_group'], axis = 1, inplace = True)
traindf

Unnamed: 0,Cabin_number,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,HomePlanet_Europa,HomePlanet_Mars,...,Destination_TRAPPIST-1e,Cabin_deck_B,Cabin_deck_C,Cabin_deck_D,Cabin_deck_E,Cabin_deck_F,Cabin_deck_G,Cabin_deck_T,Cabin_side_S,VIP_True
0,0,39.0,0.0,0.0,0.0,0.0,0.0,0,1,0,...,1,1,0,0,0,0,0,0,0,0
1,0,24.0,109.0,9.0,25.0,549.0,44.0,1,0,0,...,1,0,0,0,0,1,0,0,1,0
2,0,58.0,43.0,3576.0,0.0,6715.0,49.0,0,1,0,...,1,0,0,0,0,0,0,0,1,1
3,0,33.0,0.0,1283.0,371.0,3329.0,193.0,0,1,0,...,1,0,0,0,0,0,0,0,1,0
4,1,16.0,303.0,70.0,151.0,565.0,2.0,1,0,0,...,1,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,98,41.0,0.0,6819.0,0.0,1643.0,74.0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
8689,1499,18.0,0.0,0.0,0.0,0.0,0.0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
8690,1500,26.0,0.0,0.0,1872.0,1.0,0.0,1,0,0,...,1,0,0,0,0,0,1,0,1,0
8691,608,32.0,0.0,1049.0,0.0,353.0,3235.0,0,1,0,...,0,0,0,0,1,0,0,0,1,0


In [54]:
from pycaret.classification import *

clf = setup(data = traindf, target = 'Transported')
best_5 = compare_models(sort = 'Accuracy', n_select = 5)

Unnamed: 0,Description,Value
0,Session id,8919
1,Target,Transported
2,Target type,Binary
3,Original data shape,"(8693, 22)"
4,Transformed data shape,"(8693, 22)"
5,Transformed train set shape,"(6085, 22)"
6,Transformed test set shape,"(2608, 22)"
7,Numeric features,21
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.8105,0.8987,0.8457,0.7923,0.818,0.6208,0.6226,1.153
lightgbm,Light Gradient Boosting Machine,0.8097,0.9033,0.8124,0.8103,0.8113,0.6194,0.6195,1.369
rf,Random Forest Classifier,0.8031,0.8825,0.7677,0.8286,0.7969,0.6065,0.6082,1.217
xgboost,Extreme Gradient Boosting,0.8,0.8935,0.7892,0.8093,0.799,0.6001,0.6005,1.209
ada,Ada Boost Classifier,0.7995,0.8858,0.833,0.7831,0.8071,0.5988,0.6003,0.905
lr,Logistic Regression,0.7965,0.8807,0.8228,0.7841,0.8029,0.5929,0.5939,2.455
et,Extra Trees Classifier,0.7956,0.8692,0.7631,0.8187,0.7899,0.5913,0.5928,1.264
lda,Linear Discriminant Analysis,0.7694,0.857,0.7135,0.8063,0.7569,0.5392,0.5431,0.774
ridge,Ridge Classifier,0.7691,0.0,0.7132,0.806,0.7565,0.5386,0.5424,0.683
knn,K Neighbors Classifier,0.7594,0.8174,0.7905,0.7468,0.768,0.5186,0.5196,0.732


Processing:   0%|          | 0/69 [00:00<?, ?it/s]

In [57]:
gbc = create_model('gbc')
tuned_gbc = tune_model(gbc, optimize = 'Accuracy', n_iter = 10)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7898,0.8981,0.8436,0.764,0.8019,0.5792,0.5825
1,0.7882,0.8754,0.8143,0.7764,0.7949,0.5762,0.5769
2,0.7882,0.886,0.8143,0.7764,0.7949,0.5762,0.5769
3,0.8161,0.9149,0.8306,0.8095,0.8199,0.6321,0.6323
4,0.8342,0.9159,0.8599,0.8199,0.8394,0.6682,0.669
5,0.7961,0.8753,0.8137,0.788,0.8006,0.592,0.5923
6,0.8503,0.9229,0.8987,0.8209,0.858,0.7005,0.7037
7,0.824,0.9035,0.8562,0.8062,0.8304,0.6479,0.6491
8,0.8076,0.9045,0.8562,0.7821,0.8175,0.6149,0.6177
9,0.8109,0.8903,0.8693,0.7801,0.8223,0.6214,0.6256


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8079,0.9007,0.8436,0.7896,0.8157,0.6155,0.617
1,0.7865,0.88,0.8111,0.7757,0.793,0.5729,0.5735
2,0.7882,0.8889,0.8046,0.7816,0.7929,0.5762,0.5765
3,0.821,0.9158,0.8274,0.8194,0.8233,0.642,0.642
4,0.8358,0.9168,0.8664,0.8185,0.8418,0.6714,0.6726
5,0.7928,0.8768,0.7941,0.7941,0.7941,0.5855,0.5855
6,0.852,0.924,0.8954,0.8253,0.8589,0.7038,0.7064
7,0.8224,0.9074,0.8562,0.8037,0.8291,0.6446,0.646
8,0.8174,0.9074,0.8497,0.8,0.8241,0.6347,0.6359
9,0.7961,0.8916,0.8399,0.7741,0.8056,0.5919,0.594


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [58]:
final_model = finalize_model(tuned_gbc)

In [59]:
final_model

Pipeline(memory=FastMemory(location=C:\Users\kyw97\AppData\Local\Temp\joblib),
         steps=[('numerical_imputer',
                 TransformerWrapper(exclude=None,
                                    include=['Cabin_number', 'Age',
                                             'RoomService', 'FoodCourt',
                                             'ShoppingMall', 'Spa', 'VRDeck',
                                             'HomePlanet_Europa',
                                             'HomePlanet_Mars',
                                             'CryoSleep_True',
                                             'Destination_PSO J318.5-22',
                                             'Destination_TRAPPIST-1e',
                                             'Cabin_deck_B', 'Ca...
                                            criterion='friedman_mse', init=None,
                                            learning_rate=0.05, loss='deviance',
                                            m

In [61]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(init=None,
                                 learning_rate=0.05, loss='deviance',
                                 max_depth=5, max_features='sqrt',
                                 max_leaf_nodes=None,
                                 min_impurity_decrease=0.002,
                                 min_samples_leaf=1,
                                 min_samples_split=7,
                                 min_weight_fraction_leaf=0.0,
                                 n_estimators=150,
                                 n_iter_no_change=None,
                                 random_state=8919, subsample=0.75,
                                 tol=0.0001, validation_fraction=0.1,
                                 verbose=0, warm_start=False)

In [62]:
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y)

gbc.fit(X_train , y_train)

y_pred = gbc.predict(X_test)

accuracy_score(y_test, y_pred)

0.7999080036798528

In [None]:
test_df = pd.read_csv("Age_test.csv")

test['RoomService'].fillna(test['RoomService'].mode()[0], inplace = True)
test['FoodCourt'].fillna(test['FoodCourt'].mode()[0], inplace = True)
test['ShoppingMall'].fillna(test['ShoppingMall'].mode()[0], inplace = True)
test['Spa'].fillna(test['Spa'].mode()[0], inplace = True)
test['VRDeck'].fillna(test['VRDeck'].mode()[0], inplace = True)
test['Cabin_number'].fillna(test['Cabin_number'].mode()[0], inplace = True)

In [64]:
transported_pred = pd.DataFrame(gbc.predict(test))

submission_df = pd.concat([passengerId, transported_pred], axis = 1)
submission_df.rename(columns={0:'Transported'}, inplace = True)
submission_df['Transported'] = submission_df['Transported'].astype(bool)
submission_df

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,True
2,0019_01,True
3,0021_01,False
4,0023_01,True
...,...,...
4272,9266_02,False
4273,9269_01,False
4274,9271_01,True
4275,9273_01,False


In [65]:
submission_df.to_csv("C:/Users/kyw97/Downloads/spaceship-titanic/gbc_submission_csv.csv", index = False)