In [159]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [224]:
train = pd.read_csv('./train.csv')

train[['group', 'pid']] = train['PassengerId'].str.split('_').tolist()
train[['deck', 'num', 'side']] = train['Cabin'].fillna('//').str.split('/').tolist()

train_grouped = train.groupby('group')
train = train.set_index('group')
cols_to_impute = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
train[cols_to_impute] = train[cols_to_impute].fillna(train_grouped[cols_to_impute].agg(lambda x: x.sum() / x.shape[0]))

cols_to_impute = ['deck', 'num', 'side']
train[cols_to_impute] = train[cols_to_impute].replace({'': np.nan}).fillna(train_grouped[cols_to_impute].agg(pd.Series.mode))

cols_to_impute = ['HomePlanet', 'Destination']
modes = train[cols_to_impute].mode().T.to_dict()[0]
print(modes)
train[cols_to_impute] = train[cols_to_impute].fillna(modes)

train = train.reset_index()
train['expenditure'] = train[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1, min_count=1)
train['group_size'] = train.groupby('group')['pid'].transform('size')

train['high_expenditure'] = train['expenditure'] >= train['expenditure'].quantile(train['VIP'].value_counts(normalize=True)[0])
train['VIP'] = train['VIP'].fillna(train['high_expenditure'])

train['zero_expenditure'] = train['expenditure'] == 0
train['CryoSleep'] = train['CryoSleep'].fillna(train['zero_expenditure'])

train.isna().sum()

{'HomePlanet': 'Earth', 'Destination': 'TRAPPIST-1e'}


group                 0
PassengerId           0
HomePlanet            0
CryoSleep             0
Cabin               199
Destination           0
Age                 179
VIP                   0
RoomService           0
FoodCourt             0
ShoppingMall          0
Spa                   0
VRDeck                0
Name                200
Transported           0
pid                   0
deck                  0
num                   0
side                  0
expenditure           0
group_size            0
high_expenditure      0
zero_expenditure      0
dtype: int64

In [225]:
from sklearn.pipeline import Pipeline 
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer

In [226]:
from seaborn import pairplot

In [227]:
from sklearn.preprocessing import OneHotEncoder

In [228]:
print(train.select_dtypes(object).columns)
print(train.select_dtypes(np.number).columns)

Index(['group', 'PassengerId', 'HomePlanet', 'Cabin', 'Destination', 'Name',
       'pid', 'deck', 'num', 'side'],
      dtype='object')
Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'expenditure', 'group_size'],
      dtype='object')


In [251]:
CategoricalLabels = ['HomePlanet', 'CryoSleep', 'Destination', 'deck', 'side', 'VIP']
NumericalLabels = ['expenditure', 'group_size']
target = 'Transported'
Columns = CategoricalLabels + NumericalLabels
train[train.select_dtypes(object).columns] = train.select_dtypes(object).astype(str)
features = train[Columns]
labels = train[target]
features

Unnamed: 0,HomePlanet,CryoSleep,Destination,deck,side,VIP,expenditure,group_size
0,Europa,False,TRAPPIST-1e,B,P,False,0.0,1
1,Earth,False,TRAPPIST-1e,F,S,False,736.0,1
2,Europa,False,TRAPPIST-1e,A,S,True,10383.0,2
3,Europa,False,TRAPPIST-1e,A,S,False,5176.0,2
4,Earth,False,TRAPPIST-1e,F,S,False,1091.0,1
...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,A,P,True,8536.0,1
8689,Earth,True,PSO J318.5-22,G,S,False,0.0,1
8690,Earth,False,TRAPPIST-1e,G,S,False,1873.0,1
8691,Europa,False,55 Cancri e,E,S,False,4637.0,2


In [252]:
CategoricalPipeline = Pipeline([
    ('selector', ColumnTransformer([('select', 'passthrough', CategoricalLabels)], remainder='drop')),
    ('one_hot', OneHotEncoder(drop='first', handle_unknown='ignore')) 
])

NumericalPipeline = Pipeline([
    ('selector', ColumnTransformer([('select', 'passthrough', NumericalLabels)], remainder='drop')),
    ('scaler', StandardScaler())
])

from sklearn.pipeline import make_pipeline
PreprocessPipeline = make_pipeline(*CategoricalPipeline, *NumericalPipeline)


In [253]:
temp = ColumnTransformer([('select', 'passthrough', CategoricalLabels)], remainder='drop')
temp

ColumnTransformer(transformers=[('select', 'passthrough',
                                 ['HomePlanet', 'CryoSleep', 'Destination',
                                  'deck', 'side', 'VIP'])])

In [257]:
PreprocessPipeline.fit_transform(train)

ValueError: Specifying the columns using strings is only supported for pandas DataFrames

In [240]:
for label in CategoricalLabels:
    print(label)
    train[label].value_counts()

HomePlanet
CryoSleep
Destination
deck
side
VIP


In [248]:
train[CategoricalLabels].astype(str)

Unnamed: 0,HomePlanet,CryoSleep,Destination,deck,side,VIP
0,Europa,False,TRAPPIST-1e,B,P,False
1,Earth,False,TRAPPIST-1e,F,S,False
2,Europa,False,TRAPPIST-1e,A,S,True
3,Europa,False,TRAPPIST-1e,A,S,False
4,Earth,False,TRAPPIST-1e,F,S,False
...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,A,P,True
8689,Earth,True,PSO J318.5-22,G,S,False
8690,Earth,False,TRAPPIST-1e,G,S,False
8691,Europa,False,55 Cancri e,E,S,False
