In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Load the raw training data and preview its first rows
raw_csv_path = 'train.csv'
train = pd.read_csv(raw_csv_path)
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


# Convert everything to numbers and deal with categorical fields

In [3]:
# unwrap Cabin into its three components: deck, num, side
def unwrap_cabin(cabin):
    """Split a cabin string of the form 'deck/num/side' into its parts.

    Parameters
    ----------
    cabin : str or missing value
        Cabin identifier such as 'B/0/P'. Any value that `pd.isna` reports
        as missing produces an all-None result.

    Returns
    -------
    list
        `[deck, num, side]`, where `num` is converted to int. A component
        that is absent from the string is returned as None.

    Raises
    ------
    ValueError
        If the num component is present but cannot be parsed by `int`.
    """
    if pd.isna(cabin):
        return [None, None, None]

    parts = cabin.split('/')
    deck = parts[0]
    # Guard each index with its own length check: parts[1] needs at least two
    # pieces and parts[2] needs at least three.
    num = int(parts[1]) if len(parts) > 1 else None
    side = parts[2] if len(parts) > 2 else None
    return [deck, num, side]
    
# Expand every Cabin value into Deck / Num / Side columns, keep an explicit
# flag for rows that had no cabin, then discard the raw Cabin column.
cabin_parts = train['Cabin'].map(unwrap_cabin)
train['Deck'], train['Num'], train['Side'] = zip(*cabin_parts)
train['Missing_Cabin'] = train['Cabin'].isna().astype(int)
train = train.drop(columns=['Cabin'])

In [4]:
# Preview the three columns extracted from Cabin
cabin_columns = ['Deck', 'Num', 'Side']
train[cabin_columns].head(10)

Unnamed: 0,Deck,Num,Side
0,B,0.0,P
1,F,0.0,S
2,A,0.0,S
3,A,0.0,S
4,F,1.0,S
5,F,0.0,P
6,F,2.0,S
7,G,0.0,S
8,F,3.0,S
9,B,1.0,P


In [23]:
# apply one hot encoding to HomePlanet, Cabin deck and Destination
def one_hot_encode(df, columns):
    """One-hot encode `columns` of `df` and return the result with the fitted encoder.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns : list of str
        Names of the categorical columns to encode.

    Returns
    -------
    tuple
        `(encoded_df, encoder)`: a new frame in which `columns` are replaced by
        their one-hot indicator columns, and the fitted `OneHotEncoder`.
    """
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoded_columns = encoder.fit_transform(df[columns])

    # Reuse df's index: `join` aligns on index labels, so a default RangeIndex
    # here would misalign (or produce NaN for) any frame whose index is not 0..n-1.
    encoded_df = pd.DataFrame(
        encoded_columns,
        columns=encoder.get_feature_names_out(columns),
        index=df.index,
    )
    df = df.join(encoded_df)
    return df.drop(columns, axis=1), encoder

# Encode the categorical columns of the training frame and keep the fitted encoder
categorical_columns = ['HomePlanet', 'Deck', 'Destination']
one_hot_encoded_train, encoder = one_hot_encode(train, categorical_columns)
one_hot_encoded_train.head()

Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,...,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_None,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_nan
0,0001_01,False,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0002_01,False,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0003_01,False,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0003_02,False,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0004_01,False,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [24]:
# Inspect the full column list after one-hot encoding
one_hot_encoded_train.columns

Index(['PassengerId', 'CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt',
       'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Transported', 'Num', 'Side',
       'Missing_Cabin', 'HomePlanet_Earth', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'HomePlanet_nan', 'Deck_A', 'Deck_B', 'Deck_C',
       'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T', 'Deck_None',
       'Destination_55 Cancri e', 'Destination_PSO J318.5-22',
       'Destination_TRAPPIST-1e', 'Destination_nan'],
      dtype='object')

In [25]:
# CryoSleep and VIP are boolean flags: cast them to the nullable Int64 dtype.
# Side is a two-valued label: encode it as P -> 1, S -> 0.
bool_columns = ['CryoSleep', 'VIP']
side_codes = {'P': 1, 'S': 0}
one_hot_encoded_train[bool_columns] = one_hot_encoded_train[bool_columns].astype('Int64')
one_hot_encoded_train['Side'] = one_hot_encoded_train['Side'].map(side_codes)
one_hot_encoded_train.head()

Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,...,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_None,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_nan
0,0001_01,0,39.0,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0002_01,0,24.0,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0003_01,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0003_02,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0004_01,0,16.0,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [26]:
# Replace nan with mean of Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck
def replace_nan_with_mean(df, columns):
    """Fill missing values in each of `columns` with that column's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. It is modified in place.
    columns : iterable of str
        Names of the numeric columns to impute.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, returned for convenience.
    """
    for col in columns:
        mean_value = df[col].mean()
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on `df[col]`: the chained in-place form acts on a temporary and does
        # not update `df` under pandas Copy-on-Write.
        df[col] = df[col].fillna(mean_value)
    return df

# Numeric columns whose missing values are replaced by the column mean
nan_columns = [
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]
final_train = replace_nan_with_mean(one_hot_encoded_train, nan_columns)
final_train.head()

Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,...,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_None,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Destination_nan
0,0001_01,0,39.0,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0002_01,0,24.0,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0003_01,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0003_02,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0004_01,0,16.0,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [27]:
# Drop the identifier columns and the indicator columns created for missing categories
useless_columns = [
    'PassengerId',
    'Name',
    'Deck_None',
    'Destination_nan',
    'HomePlanet_nan',
]
final_train.drop(columns=useless_columns, inplace=True)
final_train.head(), final_train.columns

(   CryoSleep   Age  VIP  RoomService  FoodCourt  ShoppingMall     Spa  VRDeck  \
 0          0  39.0    0          0.0        0.0           0.0     0.0     0.0   
 1          0  24.0    0        109.0        9.0          25.0   549.0    44.0   
 2          0  58.0    1         43.0     3576.0           0.0  6715.0    49.0   
 3          0  33.0    0          0.0     1283.0         371.0  3329.0   193.0   
 4          0  16.0    0        303.0       70.0         151.0   565.0     2.0   
 
    Transported  Num  ...  Deck_B  Deck_C  Deck_D  Deck_E  Deck_F  Deck_G  \
 0        False  0.0  ...     1.0     0.0     0.0     0.0     0.0     0.0   
 1         True  0.0  ...     0.0     0.0     0.0     0.0     1.0     0.0   
 2        False  0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
 3        False  0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
 4         True  1.0  ...     0.0     0.0     0.0     0.0     1.0     0.0   
 
    Deck_T  Destination_55 Cancri e  Desti

In [10]:
# Count the NaN values still left in each column to see what remains to be filled
final_train.isna().sum()

CryoSleep                    217
Age                            0
VIP                          203
RoomService                    0
FoodCourt                      0
ShoppingMall                   0
Spa                            0
VRDeck                         0
Transported                    0
Num                          199
Side                         199
Missing_Cabin                  0
HomePlanet_Earth               0
HomePlanet_Europa              0
HomePlanet_Mars                0
Deck_A                         0
Deck_B                         0
Deck_C                         0
Deck_D                         0
Deck_E                         0
Deck_F                         0
Deck_G                         0
Deck_T                         0
Destination_55 Cancri e        0
Destination_PSO J318.5-22      0
Destination_TRAPPIST-1e        0
dtype: int64

In [28]:
final_train = final_train.astype('float32')

# Finish resolving nan values

# CryoSleep and VIP become their column averages. Num and Side are set to 0:
# the Missing_Cabin column already records which rows had no cabin.
# A single DataFrame-level fillna with a per-column mapping replaces the
# chained `final_train[col].fillna(..., inplace=True)` calls, which act on a
# temporary and do not update the frame under pandas Copy-on-Write.
final_train = final_train.fillna({
    'CryoSleep': final_train['CryoSleep'].mean(),
    'VIP': final_train['VIP'].mean(),
    'Num': 0,
    'Side': 0,
})

final_train.isna().sum()

CryoSleep                    0
Age                          0
VIP                          0
RoomService                  0
FoodCourt                    0
ShoppingMall                 0
Spa                          0
VRDeck                       0
Transported                  0
Num                          0
Side                         0
Missing_Cabin                0
HomePlanet_Earth             0
HomePlanet_Europa            0
HomePlanet_Mars              0
Deck_A                       0
Deck_B                       0
Deck_C                       0
Deck_D                       0
Deck_E                       0
Deck_F                       0
Deck_G                       0
Deck_T                       0
Destination_55 Cancri e      0
Destination_PSO J318.5-22    0
Destination_TRAPPIST-1e      0
dtype: int64

# Normalize columns

In [29]:
# Normalize fields RoomService, FoodCourt, ShoppingMall, Spa, VRDeck, Num and Age
def normalize_columns(df, columns):
    """Standardize `columns` of `df` with a freshly fitted `StandardScaler`.

    The selected columns of `df` are overwritten with the scaled values.

    Returns
    -------
    tuple
        `(df, scaler)`: the same frame object and the fitted scaler.
    """
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df[columns])
    df[columns] = scaled_values
    return df, scaler

# Standardize the continuous columns and keep the fitted scaler for later reuse
columns_to_scale = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Num', 'Age']
final_train, scaler = normalize_columns(final_train, columns_to_scale)
final_train.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Num,...,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0.0,0.709437,0.0,-0.34059,-0.287314,-0.290817,-0.276663,-0.269023,0.0,-1.141624,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,-0.336717,0.0,-0.175364,-0.281669,-0.248968,0.211505,-0.230194,1.0,-1.141624,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,2.034566,1.0,-0.275409,1.955616,-0.290817,5.694289,-0.225782,0.0,-1.141624,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.290975,0.0,-0.34059,0.517406,0.330225,2.683471,-0.098708,0.0,-1.141624,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,-0.894666,0.0,0.118709,-0.243409,-0.038048,0.225732,-0.267258,1.0,-1.139678,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


# Train Test Split

In [30]:
# Separate the feature matrix from the Transported target column
x = final_train.drop(columns=["Transported"])
y = final_train["Transported"]

In [31]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

tuple(part.shape for part in (X_train, X_test, Y_train, Y_test))

((6954, 25), (1739, 25), (6954,), (1739,))

In [None]:
# Create the output directory first: to_csv does not create missing parent folders.
output_dir = Path("ProcessedData")
output_dir.mkdir(parents=True, exist_ok=True)

# Feature files keep the row index as their first CSV column (unchanged from
# before); target files are written without it.
X_train.to_csv(output_dir / "X_train.csv")
X_test.to_csv(output_dir / "X_test.csv")
Y_train.to_csv(output_dir / "Y_train.csv", index=False)
Y_test.to_csv(output_dir / "Y_test.csv", index=False)

NameError: name 'X_train' is not defined

In [33]:
# Save the fitted scaler and encoder.
# `dump` is imported in the notebook's top import cell.
dump(scaler, 'scaler.joblib')
dump(encoder, 'encoder.joblib')

['encoder.joblib']