# Xgboost

Ref: https://towardsdatascience.com/xgboost-fine-tune-and-optimize-your-model-23d996fab663

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, GridSearchCV, cross_validate
from spaceship_titanic import feature_enginnering as fe
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2

In [2]:
df_train = pd.read_csv("data/train.csv", index_col='PassengerId')
df_test = pd.read_csv("data/test.csv", index_col='PassengerId')
df_train.head(2)

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True


# Data Treatment

In [3]:
def treat_dataset(df:pd.DataFrame) -> pd.DataFrame:
    df = df.drop(['Name'],axis=1)
    df = fe.fillna_homeplanet_and_destination(df)
    df = fe.fill_with_0_people_with_no_other_wastes(df)
    df = fe.fill_0_wastes_people_cryosleep(df)
    df = fe.fill_with_non_0_median(df)
    df = fe.fill_cryosleep(df)
    df = fe.feature_inputer(df, to_mode=[], to_median=['Age'])
    df = fe.cabin_inputer(df)
    df = fe.vip_knn_input(df)
    df['total_bills'] = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
    df = fe.outliers_to_log(df)
    df = fe.dtype_memory_reducer(df)
    return df
def feature_enginnering(df: pd.DataFrame) -> pd.DataFrame:
    df['Deck'] = df['Deck'].map(dict(zip(['A', 'B', 'C', 'D', 'E', 'F', 'G'],[1,2,3,4,5,6,7])))
    df = pd.get_dummies(df,columns=['HomePlanet','Destination','Side'],drop_first=True)
    return df

def train_test_treatments(df, full_df):
    df['GroupSize'] = fe.calculate_groupsize(df.index, full_df.index)
    df['GroupLastNameSize'] = fe.calculate_group_lastname_size(df.index, full_df)
    df.drop(['Side_U','VIP'],axis=1, inplace=True)
    return df

In [4]:
df_train = treat_dataset(df_train.copy())
df_train = feature_enginnering(df_train)
df_train = train_test_treatments(df_train, pd.concat([df_train,df_test]))

In [5]:
X, y = df_train.drop('Transported',axis=1), df_train['Transported']
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

In [21]:
pipeline = Pipeline([
    #('scaler', MinMaxScaler()),
    ('skb', SelectKBest(chi2, k = 10)),
    ('model', GradientBoostingClassifier(
        min_samples_leaf=15, random_state=42,
        n_estimators=10000, n_iter_no_change=20, tol=1e-4, validation_fraction=0.1
        )
    )
])

## Tuning

In [22]:
params = {
 'skb__k':[5, 7, 9, 11, 13, 15],
 'model__max_depth': [3, 5, 7, 9],
 'model__learning_rate': [0.1,0.001],
 'model__subsample': [0.4,0.7,0.9],
 'model__max_features':['log2', None],

}

In [None]:
grid = GridSearchCV(
    pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=KFold(5, shuffle=True, random_state=42),
    verbose=3,
    return_train_score=True,
    n_jobs=1
)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits
[CV 1/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=log2, model__subsample=0.4, skb__k=5;, score=(train=0.809, test=0.805) total time=   0.2s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=log2, model__subsample=0.4, skb__k=5;, score=(train=0.810, test=0.805) total time=   0.3s
[CV 3/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=log2, model__subsample=0.4, skb__k=5;, score=(train=0.812, test=0.792) total time=   0.2s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=log2, model__subsample=0.4, skb__k=5;, score=(train=0.806, test=0.812) total time=   0.1s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=log2, model__subsample=0.4, skb__k=5;, score=(train=0.808, test=0.787) total time=   0.1s
[CV 1/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=log2, model_

[CV 4/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=log2, model__subsample=0.7, skb__k=11;, score=(train=0.819, test=0.820) total time=   0.4s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=log2, model__subsample=0.7, skb__k=11;, score=(train=0.813, test=0.788) total time=   0.2s
[CV 1/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=log2, model__subsample=0.7, skb__k=13;, score=(train=0.829, test=0.803) total time=   0.5s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=log2, model__subsample=0.7, skb__k=13;, score=(train=0.844, test=0.810) total time=   0.8s
[CV 3/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=log2, model__subsample=0.7, skb__k=13;, score=(train=0.830, test=0.799) total time=   0.5s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=log2, model__subsample=0.7, skb__k=13;, score=(train=0.821, test=0.820)

[CV 3/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=None, model__subsample=0.4, skb__k=7;, score=(train=0.812, test=0.786) total time=   0.2s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=None, model__subsample=0.4, skb__k=7;, score=(train=0.813, test=0.809) total time=   0.3s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=None, model__subsample=0.4, skb__k=7;, score=(train=0.813, test=0.788) total time=   0.3s
[CV 1/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=None, model__subsample=0.4, skb__k=9;, score=(train=0.809, test=0.803) total time=   0.3s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=None, model__subsample=0.4, skb__k=9;, score=(train=0.827, test=0.794) total time=   0.7s
[CV 3/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=None, model__subsample=0.4, skb__k=9;, score=(train=0.815, test=0.786) total

[CV 2/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=None, model__subsample=0.7, skb__k=15;, score=(train=0.851, test=0.810) total time=   1.1s
[CV 3/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=None, model__subsample=0.7, skb__k=15;, score=(train=0.840, test=0.799) total time=   1.0s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=None, model__subsample=0.7, skb__k=15;, score=(train=0.828, test=0.821) total time=   0.7s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=None, model__subsample=0.7, skb__k=15;, score=(train=0.843, test=0.801) total time=   0.9s
[CV 1/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=None, model__subsample=0.9, skb__k=5;, score=(train=0.810, test=0.804) total time=   0.5s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=None, model__subsample=0.9, skb__k=5;, score=(train=0.823, test=0.795) t

[CV 1/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=log2, model__subsample=0.4, skb__k=11;, score=(train=0.836, test=0.800) total time=   0.3s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=log2, model__subsample=0.4, skb__k=11;, score=(train=0.845, test=0.803) total time=   0.4s
[CV 3/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=log2, model__subsample=0.4, skb__k=11;, score=(train=0.841, test=0.787) total time=   0.3s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=log2, model__subsample=0.4, skb__k=11;, score=(train=0.825, test=0.820) total time=   0.1s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=log2, model__subsample=0.4, skb__k=11;, score=(train=0.830, test=0.799) total time=   0.2s
[CV 1/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=log2, model__subsample=0.4, skb__k=13;, score=(train=0.848, test=0.810)

[CV 5/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=log2, model__subsample=0.9, skb__k=5;, score=(train=0.825, test=0.788) total time=   0.3s
[CV 1/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=log2, model__subsample=0.9, skb__k=7;, score=(train=0.821, test=0.807) total time=   0.3s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=log2, model__subsample=0.9, skb__k=7;, score=(train=0.837, test=0.791) total time=   0.6s
[CV 3/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=log2, model__subsample=0.9, skb__k=7;, score=(train=0.832, test=0.786) total time=   0.5s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=log2, model__subsample=0.9, skb__k=7;, score=(train=0.827, test=0.811) total time=   0.4s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=log2, model__subsample=0.9, skb__k=7;, score=(train=0.821, test=0.787) total

[CV 4/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=None, model__subsample=0.4, skb__k=13;, score=(train=0.852, test=0.824) total time=   0.5s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=None, model__subsample=0.4, skb__k=13;, score=(train=0.845, test=0.801) total time=   0.4s
[CV 1/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=None, model__subsample=0.4, skb__k=15;, score=(train=0.850, test=0.797) total time=   0.5s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=None, model__subsample=0.4, skb__k=15;, score=(train=0.862, test=0.815) total time=   0.6s
[CV 3/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=None, model__subsample=0.4, skb__k=15;, score=(train=0.848, test=0.797) total time=   0.4s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=None, model__subsample=0.4, skb__k=15;, score=(train=0.853, test=0.824)

[CV 3/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=None, model__subsample=0.9, skb__k=9;, score=(train=0.835, test=0.787) total time=   0.5s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=None, model__subsample=0.9, skb__k=9;, score=(train=0.836, test=0.815) total time=   0.6s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=None, model__subsample=0.9, skb__k=9;, score=(train=0.832, test=0.794) total time=   0.4s
[CV 1/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=None, model__subsample=0.9, skb__k=11;, score=(train=0.851, test=0.794) total time=   0.7s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=None, model__subsample=0.9, skb__k=11;, score=(train=0.859, test=0.797) total time=   0.8s
[CV 3/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=None, model__subsample=0.9, skb__k=11;, score=(train=0.835, test=0.792) to

[CV 2/5] END model__learning_rate=0.1, model__max_depth=7, model__max_features=log2, model__subsample=0.7, skb__k=5;, score=(train=0.837, test=0.789) total time=   0.2s
[CV 3/5] END model__learning_rate=0.1, model__max_depth=7, model__max_features=log2, model__subsample=0.7, skb__k=5;, score=(train=0.829, test=0.789) total time=   0.2s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=7, model__max_features=log2, model__subsample=0.7, skb__k=5;, score=(train=0.844, test=0.807) total time=   0.4s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=7, model__max_features=log2, model__subsample=0.7, skb__k=5;, score=(train=0.834, test=0.792) total time=   0.2s
[CV 1/5] END model__learning_rate=0.1, model__max_depth=7, model__max_features=log2, model__subsample=0.7, skb__k=7;, score=(train=0.835, test=0.800) total time=   0.2s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=7, model__max_features=log2, model__subsample=0.7, skb__k=7;, score=(train=0.857, test=0.792) total

[CV 1/5] END model__learning_rate=0.1, model__max_depth=7, model__max_features=log2, model__subsample=0.9, skb__k=13;, score=(train=0.868, test=0.803) total time=   0.3s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=7, model__max_features=log2, model__subsample=0.9, skb__k=13;, score=(train=0.902, test=0.810) total time=   0.7s
[CV 3/5] END model__learning_rate=0.1, model__max_depth=7, model__max_features=log2, model__subsample=0.9, skb__k=13;, score=(train=0.883, test=0.802) total time=   0.5s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=7, model__max_features=log2, model__subsample=0.9, skb__k=13;, score=(train=0.886, test=0.829) total time=   0.5s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=7, model__max_features=log2, model__subsample=0.9, skb__k=13;, score=(train=0.879, test=0.797) total time=   0.5s
[CV 1/5] END model__learning_rate=0.1, model__max_depth=7, model__max_features=log2, model__subsample=0.9, skb__k=15;, score=(train=0.887, test=0.807)

[CV 5/5] END model__learning_rate=0.1, model__max_depth=7, model__max_features=None, model__subsample=0.7, skb__k=7;, score=(train=0.831, test=0.795) total time=   0.3s
[CV 1/5] END model__learning_rate=0.1, model__max_depth=7, model__max_features=None, model__subsample=0.7, skb__k=9;, score=(train=0.848, test=0.802) total time=   0.6s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=7, model__max_features=None, model__subsample=0.7, skb__k=9;, score=(train=0.861, test=0.797) total time=   0.6s
[CV 3/5] END model__learning_rate=0.1, model__max_depth=7, model__max_features=None, model__subsample=0.7, skb__k=9;, score=(train=0.850, test=0.788) total time=   0.4s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=7, model__max_features=None, model__subsample=0.7, skb__k=9;, score=(train=0.855, test=0.807) total time=   0.5s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=7, model__max_features=None, model__subsample=0.7, skb__k=9;, score=(train=0.847, test=0.799) total

[CV 4/5] END model__learning_rate=0.1, model__max_depth=7, model__max_features=None, model__subsample=0.9, skb__k=15;, score=(train=0.894, test=0.825) total time=   0.9s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=7, model__max_features=None, model__subsample=0.9, skb__k=15;, score=(train=0.886, test=0.807) total time=   0.8s
[CV 1/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=log2, model__subsample=0.4, skb__k=5;, score=(train=0.834, test=0.802) total time=   0.3s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=log2, model__subsample=0.4, skb__k=5;, score=(train=0.833, test=0.795) total time=   0.3s
[CV 3/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=log2, model__subsample=0.4, skb__k=5;, score=(train=0.831, test=0.786) total time=   0.2s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=log2, model__subsample=0.4, skb__k=5;, score=(train=0.834, test=0.807) tot

[CV 3/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=log2, model__subsample=0.7, skb__k=11;, score=(train=0.877, test=0.791) total time=   0.4s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=log2, model__subsample=0.7, skb__k=11;, score=(train=0.883, test=0.817) total time=   0.5s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=log2, model__subsample=0.7, skb__k=11;, score=(train=0.856, test=0.783) total time=   0.3s
[CV 1/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=log2, model__subsample=0.7, skb__k=13;, score=(train=0.887, test=0.805) total time=   0.4s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=log2, model__subsample=0.7, skb__k=13;, score=(train=0.930, test=0.809) total time=   1.0s
[CV 3/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=log2, model__subsample=0.7, skb__k=13;, score=(train=0.909, test=0.794)

[CV 2/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=None, model__subsample=0.4, skb__k=7;, score=(train=0.844, test=0.790) total time=   0.4s
[CV 3/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=None, model__subsample=0.4, skb__k=7;, score=(train=0.834, test=0.778) total time=   0.2s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=None, model__subsample=0.4, skb__k=7;, score=(train=0.845, test=0.809) total time=   0.4s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=None, model__subsample=0.4, skb__k=7;, score=(train=0.833, test=0.785) total time=   0.3s
[CV 1/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=None, model__subsample=0.4, skb__k=9;, score=(train=0.841, test=0.802) total time=   0.3s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=None, model__subsample=0.4, skb__k=9;, score=(train=0.866, test=0.799) total

[CV 1/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=None, model__subsample=0.7, skb__k=15;, score=(train=0.916, test=0.798) total time=   1.0s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=None, model__subsample=0.7, skb__k=15;, score=(train=0.958, test=0.807) total time=   2.0s
[CV 3/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=None, model__subsample=0.7, skb__k=15;, score=(train=0.912, test=0.807) total time=   0.8s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=None, model__subsample=0.7, skb__k=15;, score=(train=0.907, test=0.822) total time=   0.8s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=None, model__subsample=0.7, skb__k=15;, score=(train=0.905, test=0.794) total time=   0.9s
[CV 1/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=None, model__subsample=0.9, skb__k=5;, score=(train=0.862, test=0.795) 

[CV 5/5] END model__learning_rate=0.001, model__max_depth=3, model__max_features=log2, model__subsample=0.4, skb__k=9;, score=(train=0.804, test=0.783) total time=  11.3s
[CV 1/5] END model__learning_rate=0.001, model__max_depth=3, model__max_features=log2, model__subsample=0.4, skb__k=11;, score=(train=0.808, test=0.799) total time=  15.6s
[CV 2/5] END model__learning_rate=0.001, model__max_depth=3, model__max_features=log2, model__subsample=0.4, skb__k=11;, score=(train=0.812, test=0.804) total time=  21.3s
[CV 3/5] END model__learning_rate=0.001, model__max_depth=3, model__max_features=log2, model__subsample=0.4, skb__k=11;, score=(train=0.807, test=0.790) total time=  14.4s
[CV 4/5] END model__learning_rate=0.001, model__max_depth=3, model__max_features=log2, model__subsample=0.4, skb__k=11;, score=(train=0.804, test=0.811) total time=  16.5s
[CV 5/5] END model__learning_rate=0.001, model__max_depth=3, model__max_features=log2, model__subsample=0.4, skb__k=11;, score=(train=0.806, 

[CV 3/5] END model__learning_rate=0.001, model__max_depth=3, model__max_features=log2, model__subsample=0.9, skb__k=5;, score=(train=0.807, test=0.786) total time=  19.6s
[CV 4/5] END model__learning_rate=0.001, model__max_depth=3, model__max_features=log2, model__subsample=0.9, skb__k=5;, score=(train=0.803, test=0.807) total time=  17.9s
[CV 5/5] END model__learning_rate=0.001, model__max_depth=3, model__max_features=log2, model__subsample=0.9, skb__k=5;, score=(train=0.805, test=0.783) total time=  14.9s
[CV 1/5] END model__learning_rate=0.001, model__max_depth=3, model__max_features=log2, model__subsample=0.9, skb__k=7;, score=(train=0.797, test=0.799) total time=  16.2s
[CV 2/5] END model__learning_rate=0.001, model__max_depth=3, model__max_features=log2, model__subsample=0.9, skb__k=7;, score=(train=0.807, test=0.796) total time=  25.2s
[CV 3/5] END model__learning_rate=0.001, model__max_depth=3, model__max_features=log2, model__subsample=0.9, skb__k=7;, score=(train=0.806, test=

In [9]:
grid.best_params_

{'learning_rate': 0.001,
 'max_depth': 13,
 'max_features': 'log2',
 'min_samples_leaf': 15,
 'n_estimators': 10000,
 'n_iter_no_change': 20,
 'random_state': 42,
 'subsample': 0.9,
 'tol': 0.0001,
 'validation_fraction': 0.1}

In [10]:
pd.DataFrame(grid.cv_results_).loc[grid.best_index_]

mean_fit_time                                                        42.308752
std_fit_time                                                          5.875032
mean_score_time                                                       0.409357
std_score_time                                                        0.050338
param_learning_rate                                                      0.001
param_max_depth                                                             13
param_max_features                                                        log2
param_min_samples_leaf                                                      15
param_n_estimators                                                       10000
param_n_iter_no_change                                                      20
param_random_state                                                          42
param_subsample                                                            0.9
param_tol                                           

# Validation

In [12]:
model = grid.best_estimator_
len(model.estimators_)

3741

In [13]:
pd.Series(model.feature_importances_, X.columns).sort_values()

GroupLastNameSize            0.000000
Destination_PSO J318.5-22    0.002699
Destination_TRAPPIST-1e      0.008479
HomePlanet_Mars              0.014008
GroupSize                    0.016783
HomePlanet_Europa            0.020920
Side_S                       0.023692
Deck                         0.057015
Age                          0.061226
ShoppingMall                 0.069836
RoomService                  0.084982
VRDeck                       0.090323
FoodCourt                    0.091149
CryoSleep                    0.095252
Num                          0.096299
Spa                          0.107951
total_bills                  0.159385
dtype: float64

In [14]:
print(classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

       False       0.81      0.78      0.79       861
        True       0.79      0.82      0.81       878

    accuracy                           0.80      1739
   macro avg       0.80      0.80      0.80      1739
weighted avg       0.80      0.80      0.80      1739



# Submission

In [15]:
X_test = treat_dataset(df_test)
X_test = feature_enginnering(X_test)
X_test = train_test_treatments(X_test, pd.concat([df_train,df_test]))
X_test.head(2)

Unnamed: 0_level_0,CryoSleep,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Num,total_bills,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Side_S,GroupSize,GroupLastNameSize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0013_01,True,27.0,0.0,0.0,0.0,0.0,0.0,7,3,0.0,0,0,0,1,1,1,1
0018_01,False,19.0,0.0,2.302585,0.0,7.94591,0.0,6,4,2832.0,0,0,0,1,1,1,1


In [16]:
y_test = model.predict(X_test)
submission = pd.Series(y_test, index=[X_test.index]).astype(bool).to_frame('Transported')
submission.head()

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,True


In [17]:
submission.to_csv('data/submission_gbm_2022-11-22.csv')

# To Ensemble

In [None]:
df_train = pd.read_csv("data/train.csv", index_col='PassengerId')
df_test = pd.read_csv("data/test.csv", index_col='PassengerId')

df = pd.concat([df_train,df_test]).drop(['Transported'],axis=1)
df = treat_dataset(df)
df = feature_enginnering(df)
df = train_test_treatments(df, pd.concat([df_train,df_test]))
ensemble = model.predict_proba(df)
ensemble = pd.Series(ensemble[:,1], index=[df.index]).to_frame('GBM')
ensemble.to_csv('data/ensemble/gbm.csv')