In [86]:
import pandas as pd
# Read the unlabelled and labelled splits from the working directory
# (test.csv is read first, then train.csv).
test, train = pd.read_csv('test.csv'), pd.read_csv('train.csv')

In [87]:
# Peek at the raw Cabin strings: three parts separated by '/'.
train['Cabin'].head()

0    B/0/P
1    F/0/S
2    A/0/S
3    A/0/S
4    F/1/S
Name: Cabin, dtype: object

In [88]:
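# NOTE(review): left disabled. As written these lambdas would fail on rows
# where Cabin is missing (train.info() reports 8494 non-null of 8693), because
# a NaN has no .split method. A NaN-safe alternative is
# train.Cabin.str.split('/', expand=True).
# train['Cabin_zone'] = train.Cabin.apply(lambda x: x.split('/')[0])
# train['Cabin_level'] = train.Cabin.apply(lambda x: x.split('/')[1])
# train['Cabin_side'] = train.Cabin.apply(lambda x: x.split('/')[2])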

In [89]:
# Summarise the training frame: per-column dtype and non-null count,
# which shows which columns contain missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [90]:
# Partition columns by dtype: float64 columns are the numeric features,
# object columns are the candidate categorical features.
numerical_col = train.select_dtypes(include='float64').columns
categoric_col = train.select_dtypes(include='object').columns

In [91]:
# Keep only the low-cardinality object columns: those with fewer than 10
# distinct non-null values.
req_cat_col = [name for name in categoric_col if train[name].nunique() < 10]
        

In [92]:
# From here on, categoric_col refers to the filtered low-cardinality list
# rather than every object-dtype column.
categoric_col = req_cat_col

In [93]:
# Model inputs: numeric columns first, then the selected categorical ones.
# Copies are taken so the feature frames are independent of the raw frames.
req_col = [*numerical_col, *categoric_col]
x_train = train[req_col].copy()
x_test = test[req_col].copy()

In [94]:
# Display the categorical columns that survived the cardinality filter.
categoric_col

['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

In [95]:
# Target vector: the boolean Transported column cast to integers (False -> 0, True -> 1).
y = train['Transported'].astype(int)

In [96]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
# Numeric features: impute with a constant, then standardise.
# With strategy='constant' and no fill_value given, SimpleImputer fills
# numeric data with 0 -- this applies to every numeric column, Age included.
numerical_transformation = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('scaler', StandardScaler()),
])

# Categorical features: impute with the most frequent value, then one-hot
# encode. Categories not seen during fit are ignored at transform time
# instead of raising an error.
categoric_transformation = Pipeline([
    ('imputing', SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder(handle_unknown='ignore')),
])

In [97]:
from sklearn.compose import ColumnTransformer
# Route each column group through its own preprocessing pipeline and
# concatenate the results. No remainder is specified, so ColumnTransformer's
# default applies to columns outside the two groups.
preprocessing = ColumnTransformer([
    ('num', numerical_transformation, numerical_col),
    ('cat', categoric_transformation, categoric_col),
])

In [98]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 1000 depth-limited trees; the fixed random_state makes
# the fitted model repeatable across runs.
clf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=10,
    random_state=42,
)

In [99]:
# End-to-end estimator: preprocessing and classifier are fitted together,
# so the same transformations are applied at predict time.
model = Pipeline([
    ('preprocessing', preprocessing),
    ('classifier', clf),
])

In [100]:
# Fit the preprocessing steps and the classifier on the full training set.
# NOTE(review): no hold-out or cross-validation score is computed in this notebook.
model.fit(x_train, y)

In [101]:
# Predict labels for the test features; the labels are the integer classes
# the model was fitted on (y was cast to int).
y_pred = model.predict(x_test)
# Preview the raw test frame (not the predictions).
test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [102]:
# Build the submission frame: one row per test passenger, with the integer
# predictions converted back to booleans, then write it without the index.
sub = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Transported': y_pred.astype(bool),
})
sub.to_csv('sub.csv', index=False)