In [69]:
import pandas as pd
from sklearn.feature_selection import RFECV
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    RobustScaler,
    LabelEncoder, 
    OneHotEncoder,
)

In [70]:
# fixed seed so the train/test split further down is reproducible
random_state = 0

In [71]:
# load the training data (path is relative to the notebook's working directory)
df = pd.read_csv('./data/train.csv')

Missing values before imputation:

In [72]:
# number of missing values per column
df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

Split 'PassengerId' into 'GroupId' and 'PersonId', and split 'Name' into 'FirstName' and 'LastName'.

In [73]:
# 'PassengerId' -> 'GroupId' and 'PersonId' (the pieces on either side of the underscore)
passenger_id_parts = df['PassengerId'].str.split('_', expand=True)
df[['GroupId', 'PersonId']] = passenger_id_parts

# 'Name' -> 'FirstName' and 'LastName' (split on the first space only, so the
# remainder of the name stays together in 'LastName')
name_parts = df['Name'].str.split(' ', n=1, expand=True)
df[['FirstName', 'LastName']] = name_parts

Fill missing 'LastName' values according to the following insights:

- Passengers sharing a group Id often share a common last name.

- Passengers sharing a cabin often share a last name.

In [74]:
# fill missing 'LastName'...

# The same rule is applied twice, first by 'GroupId' and then by 'Cabin':
# a passenger with a missing last name who shares the key with other passengers
# receives the most common known last name among them (ties are resolved by
# taking the max of the modes). The passes run in order, so names filled by
# 'GroupId' are available as sources for the 'Cabin' pass.
for key in ('GroupId', 'Cabin'):
    # passengers whose key value occurs more than once; missing keys are excluded
    # explicitly because duplicated() treats all missing values as equal to each other
    shares_key = df[key].notna() & df[key].duplicated(keep=False)
    name_known = df['LastName'].notna()

    # one lookup per pass: key value -> most common known last name
    last_name_by_key = (
        df.loc[shares_key & name_known]
        .groupby(key)['LastName']
        .agg(lambda names: names.mode().max())
    )

    # fill missing; key values without any known last name are absent from the
    # lookup, so those passengers stay missing
    needs_name = shares_key & ~name_known
    df.loc[needs_name, 'LastName'] = df.loc[needs_name, key].map(last_name_by_key)

Fill missing 'Cabin' values according to the following insight:

- Passengers sharing a group Id often share the same cabin.

In [75]:
# fill missing 'Cabin'...

# by 'GroupId'

# passengers traveling in a group (their 'GroupId' occurs more than once)
travels_in_group = df['GroupId'].duplicated(keep=False)
cabin_known = df['Cabin'].notna()

# lookup: group -> most common known cabin within that group
# (ties are resolved by taking the max of the modes)
cabin_by_group = (
    df.loc[travels_in_group & cabin_known]
    .groupby('GroupId')['Cabin']
    .agg(lambda cabins: cabins.mode().max())
)

# fill missing; groups without any known cabin are absent from the lookup,
# so their passengers stay missing
needs_cabin = travels_in_group & ~cabin_known
df.loc[needs_cabin, 'Cabin'] = df.loc[needs_cabin, 'GroupId'].map(cabin_by_group)

Split 'Cabin' into 'CabinDeck', 'CabinNumber', and 'CabinSide'.

In [76]:
# split 'Cabin' on '/' into its three parts: 'CabinDeck', 'CabinNumber', and 'CabinSide'
df[['CabinDeck', 'CabinNumber', 'CabinSide']] = df['Cabin'].str.split('/', expand=True)

Fill missing 'HomePlanet' values according to the following insights:

- Passengers with a cabin on decks "A", "B", "C" and "T" are from Europa. Passengers with a cabin on deck "G" are from Earth.

- Passengers sharing a group Id always come from the same home planet.

- Passengers sharing a last name always come from the same home planet.

- Passengers sharing a cabin always come from the same home planet.

- Passengers traveling to "PSO J318.5-22" almost always come from Earth.

In [77]:
# fill missing 'HomePlanet'...

# by 'CabinDeck'

# decks 'A', 'B', 'C' and 'T' imply 'Europa'; deck 'G' implies 'Earth'
planet_by_deck = {'A': 'Europa', 'B': 'Europa', 'C': 'Europa', 'T': 'Europa', 'G': 'Earth'}

# passengers with missing 'HomePlanet' on one of the decks listed above
needs_planet = df['HomePlanet'].isna() & df['CabinDeck'].isin(list(planet_by_deck))

# fill missing
df.loc[needs_planet, 'HomePlanet'] = df.loc[needs_planet, 'CabinDeck'].map(planet_by_deck)

# by 'GroupId', then 'LastName', then 'Cabin'

# The same rule is applied once per key, in this order: a passenger with a
# missing home planet who shares the key with other passengers receives the
# most common known home planet among them (ties are resolved by taking the
# max of the modes). Planets filled by an earlier pass serve as sources for
# the later ones.
for key in ('GroupId', 'LastName', 'Cabin'):
    # passengers whose (non-missing) key value occurs more than once
    shares_key = df[key].notna() & df[key].duplicated(keep=False)
    planet_known = df['HomePlanet'].notna()

    # one lookup per pass: key value -> most common known home planet
    planet_by_key = (
        df.loc[shares_key & planet_known]
        .groupby(key)['HomePlanet']
        .agg(lambda planets: planets.mode().max())
    )

    # fill missing; key values without any known home planet are absent from
    # the lookup, so those passengers stay missing
    needs_planet = shares_key & ~planet_known
    df.loc[needs_planet, 'HomePlanet'] = df.loc[needs_planet, key].map(planet_by_key)

# by 'Destination'

# passengers with missing 'HomePlanet' traveling to 'PSO J318.5-22'
needs_planet = df['HomePlanet'].isna() & (df['Destination'] == 'PSO J318.5-22')

# fill missing
df.loc[needs_planet, 'HomePlanet'] = 'Earth'

Fill missing 'CabinDeck' values according to the following insight:

- ...for missing 'CabinDeck' values, we can impute the mode by 'HomePlanet'. It's not perfect, but it's far better than imputing the mode for all cabin decks...

In [78]:
# fill missing 'CabinDeck'

# by 'HomePlanet'

# lookup: home planet -> most common known deck for that planet
# (mode() returns its values sorted, so [0] is the first of any tied modes).
# Every group here has at least one known deck, so the [0] lookup cannot fail;
# a planet with no known deck is simply absent from the lookup.
deck_mode_by_planet = (
    df.loc[df['CabinDeck'].notna()]
    .groupby('HomePlanet')['CabinDeck']
    .agg(lambda decks: decks.mode()[0])
)

# impute the per-planet mode for every passenger with a missing deck and a known
# home planet; passengers whose planet is missing from the lookup stay missing
needs_deck = df['CabinDeck'].isna() & df['HomePlanet'].notna()
df.loc[needs_deck, 'CabinDeck'] = df.loc[needs_deck, 'HomePlanet'].map(deck_mode_by_planet)

Fill missing amenities values according to the following insights:

- Passengers in cryosleep did not purchase amenities.

- Passengers roughly 12 years old and younger did not purchase amenities.

In [79]:
# amenity columns; they are imputed together below and later used as numeric features
amenities = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

In [80]:
# fill missing 'amenities'...

# passengers for whom a missing amenity value is set to zero:
# those in cryosleep (by 'CryoSleep') and those aged 12 or younger (by 'Age')
no_purchases = (df['CryoSleep'] == True) | (df['Age'] <= 12)

# only missing values are replaced; recorded amounts are left untouched
df.loc[no_purchases, amenities] = df.loc[no_purchases, amenities].fillna(0.0)

Fill missing 'CryoSleep' values according to the following insights:

- Passengers who purchased amenities were not in cryosleep.

- Adult passengers (age 18 and older) who did not purchase amenities were in cryosleep. 

In [81]:
# fill missing 'CryoSleep'...

# total recorded amenity amount per passenger. Missing amounts are skipped, and
# min_count=1 makes the total missing (instead of 0) when *all* amenity values
# are missing, so a passenger with no amenity data at all is never treated as
# "purchased nothing".
amenity_total = df[amenities].sum(axis=1, min_count=1)

# by 'amenities': any recorded purchase rules out cryosleep
purchased = df['CryoSleep'].isna() & (amenity_total > 0)
df.loc[purchased, 'CryoSleep'] = False

# by 'Age': adults (18 and older) whose recorded amenities sum to zero are
# taken to be in cryosleep
adult_no_purchases = df['CryoSleep'].isna() & (df['Age'] >= 18) & (amenity_total == 0)
df.loc[adult_no_purchases, 'CryoSleep'] = True

Create 'InGroup' feature to indicate whether a passenger belongs to a group. This feature proxies for 'PersonId'.

In [82]:
# True when a passenger's 'GroupId' occurs more than once (keep=False flags every occurrence)
df['InGroup'] = df['GroupId'].duplicated(keep=False)

Create 'CabinLocation' feature to indicate the location of a cabin on a given deck. This feature proxies for 'CabinNumber'.

In [83]:
def _to_int_or_missing(value):
    """Return `value` cast to int, or `value` unchanged when it is missing."""
    if pd.isna(value):
        return value
    return int(value)


def _cabin_location(number):
    """Map a cabin number to its location bin; missing numbers are returned unchanged.

    Bins: up to 375 -> 'Fore', up to 750 -> 'Mid/Fore', up to 1125 -> 'Mid/Aft',
    anything larger -> 'Aft'.
    """
    if pd.isna(number):
        return number
    if number <= 375:
        return 'Fore'
    if number <= 750:
        return 'Mid/Fore'
    if number <= 1125:
        return 'Mid/Aft'
    return 'Aft'


# 'CabinNumber' is cast to an integer to make binning easier...
df['CabinNumber'] = df['CabinNumber'].map(_to_int_or_missing)

# binned according to EDA insight...
df['CabinLocation'] = df['CabinNumber'].map(_cabin_location)

Missing values after imputation:

In [84]:
# number of missing values per column, including the newly derived columns
df.isna().sum()

PassengerId        0
HomePlanet         8
CryoSleep         33
Cabin             99
Destination      182
Age              179
VIP              203
RoomService      107
FoodCourt        106
ShoppingMall     103
Spa              114
VRDeck           107
Name             200
Transported        0
GroupId            0
PersonId           0
FirstName        200
LastName         104
CabinDeck          0
CabinNumber       99
CabinSide         99
InGroup            0
CabinLocation     99
dtype: int64

Now that missing values have been imputed according to EDA insights to the extent possible, input features and the target can be transformed and the data split into test and training sets...

In [85]:
# input columns, grouped by how they are preprocessed
cat_features = ['HomePlanet', 'CabinDeck', 'CabinSide', 'CryoSleep', 'Destination', 'CabinLocation', 'InGroup']
num_features = ['Age'] + amenities
features = cat_features + num_features

# categorical columns: fill remaining gaps with the most frequent value, then one-hot encode
# (binary columns are reduced to a single indicator column via drop='if_binary')
cat_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='if_binary', handle_unknown='ignore', sparse_output=False)),
])

# numeric columns: fill remaining gaps with a KNN imputer, then scale with RobustScaler
num_transformer = Pipeline([
    ('imputer', KNNImputer()),
    ('scaler', RobustScaler()),
])

transformers = [
    ('cat_transformer', cat_transformer, cat_features),
    ('num_transformer', num_transformer, num_features),
]

# set_output returns the transformer itself, so the preprocessor is configured
# to emit pandas DataFrames in the same statement
preprocessor = ColumnTransformer(
    transformers,
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform='pandas')

# inputs and target
X = df[features]
y = df['Transported']

# hold out 30% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=random_state,
)

# encode the target; the encoder is fitted on the training labels only
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# fit the preprocessing on the training split only, then apply it to both splits
X_train = preprocessor.fit_transform(X_train, y_train)
X_test = preprocessor.transform(X_test)

At this stage the data has been transformed and split into training and test sets. To review model training steps, see individual model notebooks (e.g. `logisticregression.ipynb`).