In [12]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer,KNNImputer

In [13]:
df_train = pd.read_csv("data/train.csv", index_col='PassengerId')
df_test = pd.read_csv("data/test.csv", index_col='PassengerId')
df_train.head(2)

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True


# Data Treatment

In [14]:
def feature_inputer(
    df:pd.DataFrame,
    to_mode = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP'],
    to_median = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
) -> pd.DataFrame:
    for col in to_mode:
        mode_inputer = SimpleImputer(strategy='most_frequent')
        df[col] = mode_inputer.fit_transform(df[[col]])

    
    for col in to_median:
        inputer = SimpleImputer(strategy='median')
        df[col] = inputer.fit_transform(df[[col]])
    return df

def dtype_memory_reducer(df: pd.DataFrame) -> pd.DataFrame:
    df['CryoSleep'] = df['CryoSleep'].astype(bool)
    df['VIP'] = df['VIP'].astype(bool)

    df['HomePlanet'] = df['HomePlanet'].astype('category')
    df['Destination'] = df['Destination'].astype('category')
    return df

def outliers_to_log(df:pd.DataFrame) -> pd.DataFrame:
    to_log = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    for col in to_log:
        df[col] = np.log(df[col] + 1)
    return df

def cabin_inputer(df:pd.DataFrame) -> pd.DataFrame:
    cabin_features = df['Cabin'].str.split("/",expand=True)[[0,2]].rename(columns={0:'Deck',2:'side'})
    df = pd.concat([df,cabin_features],axis=1)
    df['Deck'] = df['Deck'].fillna('G').replace("T","G")
    df = df.drop(['side','Cabin'],axis=1)
    return df

def vip_knn_input(df: pd.DataFrame) -> pd.DataFrame:
    inputer = KNNImputer(n_neighbors=5)
    df['VIP'] = inputer.fit_transform(df[['VIP','RoomService']])[:,0]
    return df

def treat_dataset(df:pd.DataFrame) -> pd.DataFrame:
    df = df.drop(['Name'],axis=1)
    df = cabin_inputer(df)
    df = feature_inputer(df)
    df = vip_knn_input(df)
    df['0_bills'] = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1) == 0
    df = outliers_to_log(df)
    df = dtype_memory_reducer(df)
    return df

def feature_enginnering(df: pd.DataFrame) -> pd.DataFrame:
    df = pd.get_dummies(df,columns=['HomePlanet','Destination','Deck'],drop_first=True)
    return df

In [15]:
df_train = treat_dataset(df_train.copy())
df_train = feature_enginnering(df_train)

In [16]:
df_train.head(2)

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,0_bills,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0001_01,False,39.0,False,0.0,0.0,0.0,0.0,0.0,False,True,1,0,0,1,1,0,0,0,0,0
0002_01,False,24.0,False,4.70048,2.302585,3.258097,6.309918,3.806662,True,False,0,0,0,1,0,0,0,0,1,0


In [17]:
X_train,y_train = df_train.drop('Transported',axis=1), df_train['Transported']

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV

## Tuning

In [21]:
params = {'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50],
 'max_features': [None, 'sqrt'],
 'min_samples_leaf': [5, 10],
 'n_estimators': [10, 20, 50, 200, 400]
}

In [22]:
grid = GridSearchCV(
    RandomForestClassifier(),
    param_grid=params,
    scoring='accuracy',
    cv=KFold(5, shuffle=True, random_state=42),
    verbose=3,
    return_train_score=True
)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[CV 1/5] END bootstrap=True, max_depth=10, max_features=None, min_samples_leaf=5, n_estimators=10;, score=(train=0.843, test=0.787) total time=   0.0s
[CV 2/5] END bootstrap=True, max_depth=10, max_features=None, min_samples_leaf=5, n_estimators=10;, score=(train=0.839, test=0.794) total time=   0.0s
[CV 3/5] END bootstrap=True, max_depth=10, max_features=None, min_samples_leaf=5, n_estimators=10;, score=(train=0.840, test=0.793) total time=   0.0s
[CV 4/5] END bootstrap=True, max_depth=10, max_features=None, min_samples_leaf=5, n_estimators=10;, score=(train=0.839, test=0.793) total time=   0.0s
[CV 5/5] END bootstrap=True, max_depth=10, max_features=None, min_samples_leaf=5, n_estimators=10;, score=(train=0.837, test=0.799) total time=   0.0s
[CV 1/5] END bootstrap=True, max_depth=10, max_features=None, min_samples_leaf=5, n_estimators=20;, score=(train=0.839, test=0.787) total time=   0.2s
[CV 2/5] END bootstrap=True, m

[CV 1/5] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=5, n_estimators=20;, score=(train=0.827, test=0.797) total time=   0.1s
[CV 2/5] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=5, n_estimators=20;, score=(train=0.830, test=0.798) total time=   0.1s
[CV 3/5] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=5, n_estimators=20;, score=(train=0.829, test=0.798) total time=   0.1s
[CV 4/5] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=5, n_estimators=20;, score=(train=0.827, test=0.796) total time=   0.0s
[CV 5/5] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=5, n_estimators=20;, score=(train=0.828, test=0.801) total time=   0.0s
[CV 1/5] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=5, n_estimators=50;, score=(train=0.832, test=0.796) total time=   0.2s
[CV 2/5] END bootstrap=True, max_depth=10, max_features=sqrt, min_samples_leaf=5, n_estimators

[CV 5/5] END bootstrap=True, max_depth=20, max_features=None, min_samples_leaf=5, n_estimators=20;, score=(train=0.867, test=0.787) total time=   0.2s
[CV 1/5] END bootstrap=True, max_depth=20, max_features=None, min_samples_leaf=5, n_estimators=50;, score=(train=0.870, test=0.786) total time=   0.7s
[CV 2/5] END bootstrap=True, max_depth=20, max_features=None, min_samples_leaf=5, n_estimators=50;, score=(train=0.869, test=0.794) total time=   0.7s
[CV 3/5] END bootstrap=True, max_depth=20, max_features=None, min_samples_leaf=5, n_estimators=50;, score=(train=0.871, test=0.794) total time=   0.7s
[CV 4/5] END bootstrap=True, max_depth=20, max_features=None, min_samples_leaf=5, n_estimators=50;, score=(train=0.870, test=0.791) total time=   0.7s
[CV 5/5] END bootstrap=True, max_depth=20, max_features=None, min_samples_leaf=5, n_estimators=50;, score=(train=0.867, test=0.785) total time=   0.7s
[CV 1/5] END bootstrap=True, max_depth=20, max_features=None, min_samples_leaf=5, n_estimators

[CV 4/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=5, n_estimators=50;, score=(train=0.851, test=0.790) total time=   0.2s
[CV 5/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=5, n_estimators=50;, score=(train=0.850, test=0.802) total time=   0.2s
[CV 1/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=5, n_estimators=200;, score=(train=0.851, test=0.794) total time=   1.1s
[CV 2/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=5, n_estimators=200;, score=(train=0.851, test=0.801) total time=   1.1s
[CV 3/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=5, n_estimators=200;, score=(train=0.854, test=0.803) total time=   1.0s
[CV 4/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=5, n_estimators=200;, score=(train=0.851, test=0.797) total time=   1.1s
[CV 5/5] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=5, n_estima

[CV 3/5] END bootstrap=True, max_depth=30, max_features=None, min_samples_leaf=5, n_estimators=200;, score=(train=0.871, test=0.788) total time=   3.2s
[CV 4/5] END bootstrap=True, max_depth=30, max_features=None, min_samples_leaf=5, n_estimators=200;, score=(train=0.871, test=0.792) total time=   3.5s
[CV 5/5] END bootstrap=True, max_depth=30, max_features=None, min_samples_leaf=5, n_estimators=200;, score=(train=0.870, test=0.788) total time=   3.5s
[CV 1/5] END bootstrap=True, max_depth=30, max_features=None, min_samples_leaf=5, n_estimators=400;, score=(train=0.869, test=0.786) total time=   6.6s
[CV 2/5] END bootstrap=True, max_depth=30, max_features=None, min_samples_leaf=5, n_estimators=400;, score=(train=0.870, test=0.794) total time=   7.2s
[CV 3/5] END bootstrap=True, max_depth=30, max_features=None, min_samples_leaf=5, n_estimators=400;, score=(train=0.872, test=0.791) total time=   6.6s
[CV 4/5] END bootstrap=True, max_depth=30, max_features=None, min_samples_leaf=5, n_esti

[CV 2/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=5, n_estimators=400;, score=(train=0.852, test=0.800) total time=   2.0s
[CV 3/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=5, n_estimators=400;, score=(train=0.854, test=0.800) total time=   2.4s
[CV 4/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=5, n_estimators=400;, score=(train=0.852, test=0.796) total time=   2.2s
[CV 5/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=5, n_estimators=400;, score=(train=0.851, test=0.799) total time=   2.0s
[CV 1/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=10, n_estimators=10;, score=(train=0.825, test=0.792) total time=   0.0s
[CV 2/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=10, n_estimators=10;, score=(train=0.829, test=0.794) total time=   0.0s
[CV 3/5] END bootstrap=True, max_depth=30, max_features=sqrt, min_samples_leaf=10, n_est

[CV 2/5] END bootstrap=True, max_depth=40, max_features=None, min_samples_leaf=10, n_estimators=10;, score=(train=0.838, test=0.783) total time=   0.1s
[CV 3/5] END bootstrap=True, max_depth=40, max_features=None, min_samples_leaf=10, n_estimators=10;, score=(train=0.839, test=0.802) total time=   0.1s
[CV 4/5] END bootstrap=True, max_depth=40, max_features=None, min_samples_leaf=10, n_estimators=10;, score=(train=0.840, test=0.790) total time=   0.0s
[CV 5/5] END bootstrap=True, max_depth=40, max_features=None, min_samples_leaf=10, n_estimators=10;, score=(train=0.837, test=0.798) total time=   0.1s
[CV 1/5] END bootstrap=True, max_depth=40, max_features=None, min_samples_leaf=10, n_estimators=20;, score=(train=0.838, test=0.786) total time=   0.2s
[CV 2/5] END bootstrap=True, max_depth=40, max_features=None, min_samples_leaf=10, n_estimators=20;, score=(train=0.840, test=0.795) total time=   0.2s
[CV 3/5] END bootstrap=True, max_depth=40, max_features=None, min_samples_leaf=10, n_est

[CV 2/5] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=10, n_estimators=20;, score=(train=0.828, test=0.792) total time=   0.0s
[CV 3/5] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=10, n_estimators=20;, score=(train=0.825, test=0.801) total time=   0.0s
[CV 4/5] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=10, n_estimators=20;, score=(train=0.826, test=0.796) total time=   0.0s
[CV 5/5] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=10, n_estimators=20;, score=(train=0.821, test=0.801) total time=   0.0s
[CV 1/5] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=10, n_estimators=50;, score=(train=0.827, test=0.794) total time=   0.2s
[CV 2/5] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=10, n_estimators=50;, score=(train=0.826, test=0.795) total time=   0.2s
[CV 3/5] END bootstrap=True, max_depth=40, max_features=sqrt, min_samples_leaf=10, n_est

[CV 1/5] END bootstrap=True, max_depth=50, max_features=None, min_samples_leaf=10, n_estimators=50;, score=(train=0.839, test=0.795) total time=   0.8s
[CV 2/5] END bootstrap=True, max_depth=50, max_features=None, min_samples_leaf=10, n_estimators=50;, score=(train=0.842, test=0.794) total time=   0.8s
[CV 3/5] END bootstrap=True, max_depth=50, max_features=None, min_samples_leaf=10, n_estimators=50;, score=(train=0.843, test=0.798) total time=   0.7s
[CV 4/5] END bootstrap=True, max_depth=50, max_features=None, min_samples_leaf=10, n_estimators=50;, score=(train=0.844, test=0.799) total time=   0.9s
[CV 5/5] END bootstrap=True, max_depth=50, max_features=None, min_samples_leaf=10, n_estimators=50;, score=(train=0.838, test=0.796) total time=   0.8s
[CV 1/5] END bootstrap=True, max_depth=50, max_features=None, min_samples_leaf=10, n_estimators=200;, score=(train=0.841, test=0.794) total time=   3.1s
[CV 2/5] END bootstrap=True, max_depth=50, max_features=None, min_samples_leaf=10, n_es

[CV 5/5] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=10, n_estimators=50;, score=(train=0.824, test=0.807) total time=   0.2s
[CV 1/5] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=10, n_estimators=200;, score=(train=0.826, test=0.792) total time=   1.1s
[CV 2/5] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=10, n_estimators=200;, score=(train=0.830, test=0.799) total time=   1.1s
[CV 3/5] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=10, n_estimators=200;, score=(train=0.825, test=0.802) total time=   1.0s
[CV 4/5] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=10, n_estimators=200;, score=(train=0.825, test=0.796) total time=   1.0s
[CV 5/5] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=10, n_estimators=200;, score=(train=0.828, test=0.806) total time=   0.8s
[CV 1/5] END bootstrap=True, max_depth=50, max_features=sqrt, min_samples_leaf=10, 

[CV 4/5] END bootstrap=False, max_depth=10, max_features=None, min_samples_leaf=10, n_estimators=200;, score=(train=0.825, test=0.789) total time=   4.0s
[CV 5/5] END bootstrap=False, max_depth=10, max_features=None, min_samples_leaf=10, n_estimators=200;, score=(train=0.825, test=0.791) total time=   4.0s
[CV 1/5] END bootstrap=False, max_depth=10, max_features=None, min_samples_leaf=10, n_estimators=400;, score=(train=0.829, test=0.773) total time=   8.8s
[CV 2/5] END bootstrap=False, max_depth=10, max_features=None, min_samples_leaf=10, n_estimators=400;, score=(train=0.830, test=0.788) total time=   8.8s
[CV 3/5] END bootstrap=False, max_depth=10, max_features=None, min_samples_leaf=10, n_estimators=400;, score=(train=0.827, test=0.791) total time=   7.9s
[CV 4/5] END bootstrap=False, max_depth=10, max_features=None, min_samples_leaf=10, n_estimators=400;, score=(train=0.825, test=0.789) total time=   8.0s
[CV 5/5] END bootstrap=False, max_depth=10, max_features=None, min_samples_l

[CV 3/5] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=10, n_estimators=400;, score=(train=0.830, test=0.800) total time=   2.3s
[CV 4/5] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=10, n_estimators=400;, score=(train=0.828, test=0.795) total time=   2.6s
[CV 5/5] END bootstrap=False, max_depth=10, max_features=sqrt, min_samples_leaf=10, n_estimators=400;, score=(train=0.832, test=0.807) total time=   2.3s
[CV 1/5] END bootstrap=False, max_depth=20, max_features=None, min_samples_leaf=5, n_estimators=10;, score=(train=0.867, test=0.744) total time=   0.2s
[CV 2/5] END bootstrap=False, max_depth=20, max_features=None, min_samples_leaf=5, n_estimators=10;, score=(train=0.867, test=0.770) total time=   0.2s
[CV 3/5] END bootstrap=False, max_depth=20, max_features=None, min_samples_leaf=5, n_estimators=10;, score=(train=0.868, test=0.764) total time=   0.2s
[CV 4/5] END bootstrap=False, max_depth=20, max_features=None, min_samples_leaf=5,

[CV 2/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=5, n_estimators=10;, score=(train=0.875, test=0.799) total time=   0.0s
[CV 3/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=5, n_estimators=10;, score=(train=0.873, test=0.799) total time=   0.0s
[CV 4/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=5, n_estimators=10;, score=(train=0.869, test=0.802) total time=   0.0s
[CV 5/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=5, n_estimators=10;, score=(train=0.870, test=0.798) total time=   0.0s
[CV 1/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=5, n_estimators=20;, score=(train=0.874, test=0.795) total time=   0.1s
[CV 2/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=5, n_estimators=20;, score=(train=0.872, test=0.795) total time=   0.1s
[CV 3/5] END bootstrap=False, max_depth=20, max_features=sqrt, min_samples_leaf=5, n_est

[CV 1/5] END bootstrap=False, max_depth=30, max_features=None, min_samples_leaf=5, n_estimators=20;, score=(train=0.867, test=0.744) total time=   0.4s
[CV 2/5] END bootstrap=False, max_depth=30, max_features=None, min_samples_leaf=5, n_estimators=20;, score=(train=0.867, test=0.773) total time=   0.4s
[CV 3/5] END bootstrap=False, max_depth=30, max_features=None, min_samples_leaf=5, n_estimators=20;, score=(train=0.868, test=0.765) total time=   0.5s
[CV 4/5] END bootstrap=False, max_depth=30, max_features=None, min_samples_leaf=5, n_estimators=20;, score=(train=0.867, test=0.758) total time=   0.4s
[CV 5/5] END bootstrap=False, max_depth=30, max_features=None, min_samples_leaf=5, n_estimators=20;, score=(train=0.868, test=0.759) total time=   0.5s
[CV 1/5] END bootstrap=False, max_depth=30, max_features=None, min_samples_leaf=5, n_estimators=50;, score=(train=0.867, test=0.745) total time=   1.2s
[CV 2/5] END bootstrap=False, max_depth=30, max_features=None, min_samples_leaf=5, n_est

[CV 5/5] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=5, n_estimators=20;, score=(train=0.873, test=0.799) total time=   0.1s
[CV 1/5] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=5, n_estimators=50;, score=(train=0.873, test=0.790) total time=   0.3s
[CV 2/5] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=5, n_estimators=50;, score=(train=0.875, test=0.800) total time=   0.3s
[CV 3/5] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=5, n_estimators=50;, score=(train=0.875, test=0.805) total time=   0.3s
[CV 4/5] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=5, n_estimators=50;, score=(train=0.877, test=0.798) total time=   0.3s
[CV 5/5] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=5, n_estimators=50;, score=(train=0.873, test=0.800) total time=   0.3s
[CV 1/5] END bootstrap=False, max_depth=30, max_features=sqrt, min_samples_leaf=5, n_est

[CV 4/5] END bootstrap=False, max_depth=40, max_features=None, min_samples_leaf=5, n_estimators=50;, score=(train=0.867, test=0.758) total time=   1.3s
[CV 5/5] END bootstrap=False, max_depth=40, max_features=None, min_samples_leaf=5, n_estimators=50;, score=(train=0.867, test=0.758) total time=   1.0s
[CV 1/5] END bootstrap=False, max_depth=40, max_features=None, min_samples_leaf=5, n_estimators=200;, score=(train=0.867, test=0.745) total time=   4.9s
[CV 2/5] END bootstrap=False, max_depth=40, max_features=None, min_samples_leaf=5, n_estimators=200;, score=(train=0.867, test=0.771) total time=   4.4s
[CV 3/5] END bootstrap=False, max_depth=40, max_features=None, min_samples_leaf=5, n_estimators=200;, score=(train=0.869, test=0.765) total time=   4.5s
[CV 4/5] END bootstrap=False, max_depth=40, max_features=None, min_samples_leaf=5, n_estimators=200;, score=(train=0.867, test=0.758) total time=   4.7s
[CV 5/5] END bootstrap=False, max_depth=40, max_features=None, min_samples_leaf=5, n

[CV 3/5] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=5, n_estimators=200;, score=(train=0.876, test=0.803) total time=   1.6s
[CV 4/5] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=5, n_estimators=200;, score=(train=0.875, test=0.800) total time=   1.4s
[CV 5/5] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=5, n_estimators=200;, score=(train=0.872, test=0.799) total time=   1.4s
[CV 1/5] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=5, n_estimators=400;, score=(train=0.875, test=0.791) total time=   3.1s
[CV 2/5] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=5, n_estimators=400;, score=(train=0.875, test=0.800) total time=   3.3s
[CV 3/5] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=5, n_estimators=400;, score=(train=0.877, test=0.800) total time=   3.0s
[CV 4/5] END bootstrap=False, max_depth=40, max_features=sqrt, min_samples_leaf=5,

[CV 2/5] END bootstrap=False, max_depth=50, max_features=None, min_samples_leaf=5, n_estimators=400;, score=(train=0.867, test=0.771) total time=   8.4s
[CV 3/5] END bootstrap=False, max_depth=50, max_features=None, min_samples_leaf=5, n_estimators=400;, score=(train=0.869, test=0.764) total time=   8.9s
[CV 4/5] END bootstrap=False, max_depth=50, max_features=None, min_samples_leaf=5, n_estimators=400;, score=(train=0.868, test=0.759) total time=   8.7s
[CV 5/5] END bootstrap=False, max_depth=50, max_features=None, min_samples_leaf=5, n_estimators=400;, score=(train=0.867, test=0.758) total time=   9.4s
[CV 1/5] END bootstrap=False, max_depth=50, max_features=None, min_samples_leaf=10, n_estimators=10;, score=(train=0.839, test=0.765) total time=   0.2s
[CV 2/5] END bootstrap=False, max_depth=50, max_features=None, min_samples_leaf=10, n_estimators=10;, score=(train=0.838, test=0.777) total time=   0.2s
[CV 3/5] END bootstrap=False, max_depth=50, max_features=None, min_samples_leaf=10

[CV 3/5] END bootstrap=False, max_depth=50, max_features=sqrt, min_samples_leaf=10, n_estimators=10;, score=(train=0.842, test=0.798) total time=   0.0s
[CV 4/5] END bootstrap=False, max_depth=50, max_features=sqrt, min_samples_leaf=10, n_estimators=10;, score=(train=0.842, test=0.796) total time=   0.0s
[CV 5/5] END bootstrap=False, max_depth=50, max_features=sqrt, min_samples_leaf=10, n_estimators=10;, score=(train=0.842, test=0.800) total time=   0.0s
[CV 1/5] END bootstrap=False, max_depth=50, max_features=sqrt, min_samples_leaf=10, n_estimators=20;, score=(train=0.844, test=0.798) total time=   0.0s
[CV 2/5] END bootstrap=False, max_depth=50, max_features=sqrt, min_samples_leaf=10, n_estimators=20;, score=(train=0.845, test=0.795) total time=   0.0s
[CV 3/5] END bootstrap=False, max_depth=50, max_features=sqrt, min_samples_leaf=10, n_estimators=20;, score=(train=0.845, test=0.800) total time=   0.1s
[CV 4/5] END bootstrap=False, max_depth=50, max_features=sqrt, min_samples_leaf=10

In [42]:
grid.best_params_

{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 5,
 'n_estimators': 200}

In [50]:
pd.DataFrame(grid.cv_results_)[['mean_train_score','std_train_score','mean_test_score','std_test_score']].loc[grid.best_index_]

mean_train_score    0.831445
std_train_score     0.001142
mean_test_score     0.800414
std_test_score      0.003660
Name: 13, dtype: float64

In [44]:
model = grid.best_estimator_
model.fit(X_train, y_train)

## Cross Validation

# Submission

In [51]:
X_test = treat_dataset(df_test)
X_test = feature_enginnering(X_test)
X_test.head(2)

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,0_bills,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0013_01,True,27.0,False,0.0,0.0,0.0,0.0,0.0,True,0,0,0,1,0,0,0,0,0,1
0018_01,False,19.0,False,0.0,2.302585,0.0,7.94591,0.0,False,0,0,0,1,0,0,0,0,1,0


In [52]:
y_test = model.predict(X_test)

In [53]:
submission = pd.Series(y_test, index=[X_test.index]).to_frame('Transported')
submission.to_csv('data/submission_rnforest_2022-11-02.csv')