In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from spaceship_titanic import feature_enginnering as fe

In [2]:
# Read the train and test CSVs (in that order), both indexed by PassengerId,
# then preview the first two training rows.
df, df_test = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in ("data/train.csv", "data/test.csv")
)
df.head(2)

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True


# Data Treatment

In [3]:

def treat_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Run the full treatment pipeline on a passenger frame and return the result.

    Steps, in order:
      1. drop the ``Name`` column;
      2. apply ``fe.cabin_inputer``, ``fe.feature_inputer`` and ``fe.vip_knn_input``;
      3. add the boolean ``0_bills`` column, True where the five bill columns
         sum to zero (``sum`` skips missing values);
      4. apply ``fe.outliers_to_log`` and ``fe.dtype_memory_reducer``.

    The ``fe`` helpers live in ``spaceship_titanic.feature_enginnering``; what
    each one does is not visible here.
    """
    bill_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    treated = df.drop(columns=['Name'])
    for fill_step in (fe.cabin_inputer, fe.feature_inputer, fe.vip_knn_input):
        treated = fill_step(treated)

    # The flag is computed after the fill steps and before the remaining transforms.
    treated['0_bills'] = treated[bill_columns].sum(axis=1).eq(0)

    for final_step in (fe.outliers_to_log, fe.dtype_memory_reducer):
        treated = final_step(treated)
    return treated
def feature_enginnering(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode ``HomePlanet``, ``Destination`` and ``Deck``.

    The first level of each encoded column is dropped (``drop_first=True``);
    all other columns are passed through unchanged. Returns a new frame.
    """
    categorical_columns = ['HomePlanet', 'Destination', 'Deck']
    encoded = pd.get_dummies(data=df, columns=categorical_columns, drop_first=True)
    return encoded

# Name

In [4]:
# How often each passenger name occurs; value_counts lists the most frequent first.
df['Name'].value_counts()

Gollux Reedall        2
Elaney Webstephrey    2
Grake Porki           2
Sus Coolez            2
Apix Wala             2
                     ..
Jamela Griffy         1
Hardy Griffy          1
Salley Mckinn         1
Mall Frasp            1
Propsh Hontichre      1
Name: Name, Length: 8473, dtype: int64

In [51]:
# Display the test set; the saved rendering is truncated to the leading and trailing rows.
df_test

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez
...,...,...,...,...,...,...,...,...,...,...,...,...
9266_02,Earth,True,G/1496/S,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,Jeron Peter
9269_01,Earth,False,,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,Matty Scheron
9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore
9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale


In [52]:
# Prototype of the group-size lookup: count, over train and test ids together,
# how many passengers share the numeric prefix of each PassengerId, then show
# the counts for the test passengers.
df_index = df_test.index
full_index = pd.concat([df.index.to_series(), df_test.index.to_series()]).to_frame()
full_index = (
    full_index['PassengerId']
    .str.split("_", expand=True)
    .astype(int)[0]
    .to_frame('GroupID')
)
full_index['GroupSize'] = full_index.groupby(['GroupID'])['GroupID'].transform('count')
full_index.loc[df_index, 'GroupSize']

PassengerId
0013_01    1
0018_01    1
0019_01    1
0021_01    1
0023_01    1
          ..
9266_02    2
9269_01    1
9271_01    1
9273_01    1
9277_01    1
Name: GroupSize, Length: 4277, dtype: int64

In [61]:
def calculate_groupsize(df_index: pd.Index, full_index: "pd.Index | pd.Series") -> pd.Series:
    """Return the group size for every passenger id in ``df_index``.

    A passenger's group is the integer before the underscore of the id
    (``'0003_02'`` -> group ``3``). The size of a group is the number of ids in
    ``full_index`` that share that prefix.

    Parameters
    ----------
    df_index
        Ids to report on. Each must be present in ``full_index``; otherwise the
        final ``.loc`` lookup raises ``KeyError``.
    full_index
        Every id to count over. Either an Index of ids, or a Series whose
        values are the ids and whose index labels are used for the lookup
        (which is what ``index.to_series()`` produces).

    Returns
    -------
    pd.Series
        Integer series named ``'GroupSize'``, one entry per label in ``df_index``.
    """
    # Works for any index/series name: the ids are never looked up by a
    # hard-coded 'PassengerId' column label.
    ids = full_index if isinstance(full_index, pd.Series) else full_index.to_series()

    # Select the prefix column before casting so the suffix is not converted needlessly.
    group_id = ids.str.split("_", expand=True)[0].astype(int)

    # value_counts gives members per group; map broadcasts that back to each id.
    group_size = group_id.map(group_id.value_counts()).rename('GroupSize')
    return group_size.loc[df_index]

def calculate_seat_id(df_index: pd.Index) -> pd.Series:
    """Return the integer after the underscore of each passenger id.

    For example ``'0003_02'`` -> ``2``. The result is indexed by ``df_index``
    so it can be assigned straight onto a frame that uses the same index.

    Parameters
    ----------
    df_index
        Index of ids of the form ``'<prefix>_<suffix>'`` with a numeric suffix.
        The index may have any name (or none).

    Returns
    -------
    pd.Series
        Integer series with one entry per id in ``df_index``.
    """
    # to_series() avoids depending on the index being named 'PassengerId';
    # only the suffix column is cast to int.
    suffix = df_index.to_series().str.split("_", expand=True)[1]
    return suffix.astype(int)
    
# is_alone: True when the passenger's group, counted over train and test ids
# together, has exactly one member. SeatID: numeric suffix of the PassengerId.
df['is_alone'] = calculate_groupsize(
    df.index,
    pd.concat([df.index.to_series(), df_test.index.to_series()]),
).eq(1)
df['SeatID'] = calculate_seat_id(df.index)

In [62]:
# Mean of Transported (i.e. the transported rate) for each value of is_alone.
df.groupby('is_alone')['Transported'].mean()

is_alone
False    0.566872
True     0.452445
Name: Transported, dtype: float64

In [7]:
# Inspect the rows behind one of the names that occurs twice in the training set.
df.query("Name == 'Grake Porki'")

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0901_01,Mars,False,F/167/S,TRAPPIST-1e,32.0,False,47.0,0.0,2552.0,0.0,0.0,Grake Porki,True
3535_02,Mars,True,F/668/S,TRAPPIST-1e,25.0,False,0.0,0.0,0.0,0.0,0.0,Grake Porki,True


# Numeric Features

In [13]:
# Pairwise correlation matrix of Age and the five bill columns.
df[['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].corr()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
Age,1.0,0.068723,0.130421,0.033133,0.12397,0.101007
RoomService,0.068723,1.0,-0.015889,0.05448,0.01008,-0.019581
FoodCourt,0.130421,-0.015889,1.0,-0.014228,0.221891,0.227995
ShoppingMall,0.033133,0.05448,-0.014228,1.0,0.013879,-0.007322
Spa,0.12397,0.01008,0.221891,0.013879,1.0,0.153821
VRDeck,0.101007,-0.019581,0.227995,-0.007322,0.153821,1.0


In [10]:
# Restrict to rows where VRDeck is missing, then count the missing values in every column.
df[df['VRDeck'].isna()].isna().sum()

HomePlanet        1
CryoSleep         7
Cabin             4
Destination       2
Age               4
VIP               1
RoomService       2
FoodCourt         6
ShoppingMall      6
Spa               3
VRDeck          188
Name              4
Transported       0
dtype: int64

# Cabin Features
`Cabin` is the cabin number where the passenger is staying. It takes the form `deck/num/side`, where `side` is either `P` (Port) or `S` (Starboard).

In [76]:
# Preview the deck labels with "T" replaced by "G"; the result is only displayed, not stored.
# NOTE(review): cabin_features is not defined in any cell shown in this notebook — confirm
# where it comes from, otherwise this cell fails on a fresh kernel.
cabin_features['deck'].replace("T","G")

PassengerId
0001_01    B
0002_01    F
0003_01    A
0003_02    A
0004_01    F
          ..
9276_01    A
9278_01    G
9279_01    G
9280_01    E
9280_02    E
Name: deck, Length: 8693, dtype: object

In [77]:
# Rebind df to a copy of df_train, replacing the frame used by the cells above.
# NOTE(review): df_train is not defined in any cell shown in this notebook — confirm
# which cell creates it, otherwise this cell fails on a fresh kernel.
df = df_train.copy()

In [85]:
# Transported rate for rows with a Destination (False) versus rows where it is missing (True).
df.groupby(df['Destination'].isna())['Transported'].mean()

Destination
False    0.503584
True     0.505495
Name: Transported, dtype: float64

In [91]:
# Number of non-null Transported values and transported rate for each value of 'side'.
# NOTE(review): no visible cell creates the 'side' column — presumably it comes with df_train; confirm.
df.groupby(['side'])['Transported'].agg(['count','mean'])

Unnamed: 0_level_0,count,mean
side,Unnamed: 1_level_1,Unnamed: 2_level_1
P,4206,0.45126
S,4288,0.555037


# Bills

In [23]:
# Flag rows whose five bill columns sum to zero (sum skips missing values),
# then show the VIP count and VIP rate for flagged and unflagged rows.
df['0_bills'] = (
    df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']]
    .sum(axis=1)
    .eq(0)
)
df.groupby('0_bills')['VIP'].agg(['count','mean'])

Unnamed: 0_level_0,count,mean
0_bills,Unnamed: 1_level_1,Unnamed: 2_level_1
False,4930,0.034686
True,3560,0.007865


In [57]:
# Correlation of every numeric/boolean column with the 0_bills flag, sorted ascending.
# numeric_only=True is passed explicitly: relying on the implicit default emits a
# FutureWarning in pandas 1.5 and raises on non-numeric columns from pandas 2.0.
df_train.corr(numeric_only=True)['0_bills'].sort_values()

  df_train.corr()['0_bills'].sort_values()


RoomService    -0.286793
ShoppingMall   -0.243392
FoodCourt      -0.241778
Spa            -0.233378
Age            -0.230221
VRDeck         -0.226161
Transported     0.481628
0_bills         1.000000
Name: 0_bills, dtype: float64