# XGBoost

Ref: https://towardsdatascience.com/xgboost-fine-tune-and-optimize-your-model-23d996fab663

In [4]:
import pandas as pd
import numpy as np
from spaceship_titanic import feature_enginnering as fe

In [5]:
# Read the train and test CSVs, using PassengerId as the row index for both.
df_train, df_test = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in ("data/train.csv", "data/test.csv")
)
df_train.head(2)

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True


# Data Treatment

In [6]:
def treat_dataset(df:pd.DataFrame) -> pd.DataFrame:
    """Clean a raw passenger frame and return the treated result.

    The steps run in a fixed order:

    1. Drop the ``Name`` column (this produces a new frame, so the caller's
       ``df`` object is not the one being modified afterwards).
    2. Apply the missing-value helpers from ``fe`` one after another.
    3. Add ``has_bills``: True when the row-wise sum of the five spending
       columns is non-zero.
    4. Pass the frame through ``fe.outliers_to_log`` and
       ``fe.dtype_memory_reducer``.

    The exact behavior of each ``fe`` helper is defined in
    ``spaceship_titanic.feature_enginnering``; their names suggest
    imputation / log-scaling / dtype downcasting, but verify there.

    Args:
        df: Raw frame containing at least ``Name`` and the spending columns.

    Returns:
        Whatever ``fe.dtype_memory_reducer`` returns for the treated frame.
    """
    spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    treated = df.drop(columns=['Name'])

    # Helpers that take only the frame; order is significant and kept as-is.
    frame_only_steps = (
        fe.fillna_homeplanet_and_destination,
        fe.fill_with_0_people_with_no_other_wastes,
        fe.fill_0_wastes_people_cryosleep,
        fe.fill_with_non_0_median,
        fe.fill_cryosleep,
    )
    for step in frame_only_steps:
        treated = step(treated)

    treated = fe.feature_inputer(treated, to_mode=[], to_median=['Age'])
    treated = fe.cabin_inputer(treated)
    treated = fe.vip_knn_input(treated)

    # The flag is derived before fe.outliers_to_log touches the frame.
    treated['has_bills'] = treated[spend_cols].sum(axis=1).ne(0)

    treated = fe.outliers_to_log(treated)
    return fe.dtype_memory_reducer(treated)
def feature_enginnering(df: pd.DataFrame) -> pd.DataFrame:
    """Encode the categorical columns and return a new frame.

    * ``Deck`` letters A..G are replaced by the integers 1..7. Any value
      that is not one of those seven letters becomes NaN, because
      ``Series.map`` with a dict yields NaN for missing keys.
    * ``HomePlanet`` and ``Destination`` are one-hot encoded with the first
      category dropped.

    The input frame is left untouched, so re-running a cell that calls this
    function on the same object gives the same result.

    NOTE(review): the dummy columns are derived from the categories present
    in ``df`` itself, so two frames with different category sets will get
    different columns -- confirm train and test share the same categories.

    Args:
        df: Frame containing ``Deck``, ``HomePlanet`` and ``Destination``.

    Returns:
        A new DataFrame with ``Deck`` numerically encoded and the two
        categorical columns replaced by their dummy columns.
    """
    deck_codes = {letter: code for code, letter in enumerate('ABCDEFG', start=1)}
    # assign() works on a copy, so the caller's frame keeps its original Deck.
    encoded = df.assign(Deck=df['Deck'].map(deck_codes))
    return pd.get_dummies(encoded, columns=['HomePlanet', 'Destination'], drop_first=True)

def train_test_treatments(df, full_df):
    """Add the ``is_alone`` flag, which needs both train and test indices.

    ``is_alone`` is True where ``fe.calculate_groupsize(df.index,
    full_df.index)`` equals 1. Only the ``index`` of each argument is read
    here. (Presumably the group size is derived from the PassengerId
    values -- see ``fe.calculate_groupsize`` to confirm.)

    The input ``df`` is not modified; a new frame is returned.

    Args:
        df: Frame to annotate (train or test split).
        full_df: Object whose ``index`` covers every passenger in both splits.

    Returns:
        A copy of ``df`` with the extra boolean ``is_alone`` column.
    """
    group_size = fe.calculate_groupsize(df.index, full_df.index)
    # assign() copies first, so the caller's frame is never mutated.
    return df.assign(is_alone=group_size == 1)

In [7]:
# Clean and encode the training frame. The raw frame is copied before it
# enters the pipeline.
df_train = (
    df_train.copy()
    .pipe(treat_dataset)
    .pipe(feature_enginnering)
)
# is_alone is computed against the combined train + test index.
df_train = train_test_treatments(df_train, pd.concat([df_train, df_test]))

In [8]:
# Separate the target column from the feature matrix.
y_train = df_train['Transported']
X_train = df_train.drop(columns='Transported')

In [9]:
import xgboost as xgb
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.pipeline import Pipeline

## Tuning

In [28]:
# Search space for the grid: 4 * 2 * 4 * 2 * 2 = 128 candidate combinations.
params = dict(
    max_depth=[3, 4, 5, 6],
    learning_rate=[0.01, 0.1],
    n_estimators=[100, 200, 300, 400],
    subsample=[0.8, 1],
    colsample_bytree=[0.8, 1],
)

In [29]:
# Exhaustive search over `params`, scored by accuracy on 5 shuffled folds.
# The splitter is seeded so the fold assignment is the same on every run.
cv_splitter = KFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(
    estimator=xgb.XGBClassifier(),
    param_grid=params,
    scoring='accuracy',
    cv=cv_splitter,
    return_train_score=True,  # keep train scores in cv_results_ to compare against test scores
    verbose=3,
)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 128 candidates, totalling 640 fits
[CV 1/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8;, score=(train=0.773, test=0.765) total time=   0.1s
[CV 2/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8;, score=(train=0.774, test=0.757) total time=   0.1s
[CV 3/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8;, score=(train=0.780, test=0.779) total time=   0.1s
[CV 4/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8;, score=(train=0.778, test=0.776) total time=   0.1s
[CV 5/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=0.8;, score=(train=0.771, test=0.779) total time=   0.1s
[CV 1/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1;, score=(train=0.784, test=0.772) total time=   0.0s
[CV 2/5] END colsampl

[CV 5/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=200, subsample=0.8;, score=(train=0.800, test=0.808) total time=   0.4s
[CV 1/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=200, subsample=1;, score=(train=0.805, test=0.784) total time=   0.4s
[CV 2/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=200, subsample=1;, score=(train=0.806, test=0.792) total time=   0.3s
[CV 3/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=200, subsample=1;, score=(train=0.799, test=0.788) total time=   0.3s
[CV 4/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=200, subsample=1;, score=(train=0.806, test=0.795) total time=   0.3s
[CV 5/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=200, subsample=1;, score=(train=0.798, test=0.803) total time=   0.5s
[CV 1/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=4, n_estimators=300, subsam

[CV 4/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=300, subsample=1;, score=(train=0.816, test=0.796) total time=   0.6s
[CV 5/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=300, subsample=1;, score=(train=0.814, test=0.808) total time=   0.6s
[CV 1/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=400, subsample=0.8;, score=(train=0.817, test=0.789) total time=   0.9s
[CV 2/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=400, subsample=0.8;, score=(train=0.819, test=0.797) total time=   1.0s
[CV 3/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=400, subsample=0.8;, score=(train=0.818, test=0.803) total time=   1.0s
[CV 4/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=400, subsample=0.8;, score=(train=0.817, test=0.800) total time=   1.0s
[CV 5/5] END colsample_bytree=0.8, learning_rate=0.01, max_depth=5, n_estimators=400, 

[CV 4/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.8;, score=(train=0.806, test=0.794) total time=   0.1s
[CV 5/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.8;, score=(train=0.804, test=0.803) total time=   0.1s
[CV 1/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1;, score=(train=0.809, test=0.786) total time=   0.0s
[CV 2/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1;, score=(train=0.806, test=0.792) total time=   0.0s
[CV 3/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1;, score=(train=0.808, test=0.799) total time=   0.0s
[CV 4/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1;, score=(train=0.808, test=0.792) total time=   0.0s
[CV 5/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=3, n_estimators=100, subsample=1

[CV 4/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=200, subsample=1;, score=(train=0.830, test=0.803) total time=   0.2s
[CV 5/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=200, subsample=1;, score=(train=0.826, test=0.803) total time=   0.2s
[CV 1/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=300, subsample=0.8;, score=(train=0.850, test=0.786) total time=   0.5s
[CV 2/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=300, subsample=0.8;, score=(train=0.855, test=0.788) total time=   0.5s
[CV 3/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=300, subsample=0.8;, score=(train=0.852, test=0.801) total time=   0.5s
[CV 4/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=300, subsample=0.8;, score=(train=0.851, test=0.799) total time=   0.5s
[CV 5/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=4, n_estimators=300, subsamp

[CV 4/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, n_estimators=400, subsample=0.8;, score=(train=0.895, test=0.797) total time=   1.2s
[CV 5/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, n_estimators=400, subsample=0.8;, score=(train=0.898, test=0.799) total time=   1.0s
[CV 1/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, n_estimators=400, subsample=1;, score=(train=0.876, test=0.790) total time=   0.8s
[CV 2/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, n_estimators=400, subsample=1;, score=(train=0.879, test=0.793) total time=   0.7s
[CV 3/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, n_estimators=400, subsample=1;, score=(train=0.877, test=0.796) total time=   0.6s
[CV 4/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, n_estimators=400, subsample=1;, score=(train=0.878, test=0.795) total time=   1.3s
[CV 5/5] END colsample_bytree=0.8, learning_rate=0.1, max_depth=5, n_estimators=400, subsample=1

[CV 4/5] END colsample_bytree=1, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1;, score=(train=0.781, test=0.776) total time=   0.0s
[CV 5/5] END colsample_bytree=1, learning_rate=0.01, max_depth=3, n_estimators=100, subsample=1;, score=(train=0.769, test=0.778) total time=   0.1s
[CV 1/5] END colsample_bytree=1, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8;, score=(train=0.798, test=0.781) total time=   0.4s
[CV 2/5] END colsample_bytree=1, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8;, score=(train=0.797, test=0.786) total time=   0.3s
[CV 3/5] END colsample_bytree=1, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8;, score=(train=0.791, test=0.781) total time=   0.3s
[CV 4/5] END colsample_bytree=1, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8;, score=(train=0.798, test=0.789) total time=   0.3s
[CV 5/5] END colsample_bytree=1, learning_rate=0.01, max_depth=3, n_estimators=200, subsample=0.8;

[CV 4/5] END colsample_bytree=1, learning_rate=0.01, max_depth=4, n_estimators=300, subsample=0.8;, score=(train=0.805, test=0.793) total time=   0.7s
[CV 5/5] END colsample_bytree=1, learning_rate=0.01, max_depth=4, n_estimators=300, subsample=0.8;, score=(train=0.803, test=0.803) total time=   0.7s
[CV 1/5] END colsample_bytree=1, learning_rate=0.01, max_depth=4, n_estimators=300, subsample=1;, score=(train=0.808, test=0.784) total time=   0.5s
[CV 2/5] END colsample_bytree=1, learning_rate=0.01, max_depth=4, n_estimators=300, subsample=1;, score=(train=0.804, test=0.791) total time=   0.6s
[CV 3/5] END colsample_bytree=1, learning_rate=0.01, max_depth=4, n_estimators=300, subsample=1;, score=(train=0.803, test=0.800) total time=   0.7s
[CV 4/5] END colsample_bytree=1, learning_rate=0.01, max_depth=4, n_estimators=300, subsample=1;, score=(train=0.807, test=0.795) total time=   0.4s
[CV 5/5] END colsample_bytree=1, learning_rate=0.01, max_depth=4, n_estimators=300, subsample=1;, scor

[CV 4/5] END colsample_bytree=1, learning_rate=0.01, max_depth=5, n_estimators=400, subsample=1;, score=(train=0.817, test=0.799) total time=   1.2s
[CV 5/5] END colsample_bytree=1, learning_rate=0.01, max_depth=5, n_estimators=400, subsample=1;, score=(train=0.815, test=0.808) total time=   1.9s
[CV 1/5] END colsample_bytree=1, learning_rate=0.01, max_depth=6, n_estimators=100, subsample=0.8;, score=(train=0.818, test=0.784) total time=   0.5s
[CV 2/5] END colsample_bytree=1, learning_rate=0.01, max_depth=6, n_estimators=100, subsample=0.8;, score=(train=0.819, test=0.798) total time=   0.4s
[CV 3/5] END colsample_bytree=1, learning_rate=0.01, max_depth=6, n_estimators=100, subsample=0.8;, score=(train=0.815, test=0.797) total time=   0.4s
[CV 4/5] END colsample_bytree=1, learning_rate=0.01, max_depth=6, n_estimators=100, subsample=0.8;, score=(train=0.815, test=0.793) total time=   0.3s
[CV 5/5] END colsample_bytree=1, learning_rate=0.01, max_depth=6, n_estimators=100, subsample=0.8;

[CV 4/5] END colsample_bytree=1, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=0.8;, score=(train=0.819, test=0.799) total time=   0.2s
[CV 5/5] END colsample_bytree=1, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=0.8;, score=(train=0.816, test=0.810) total time=   0.2s
[CV 1/5] END colsample_bytree=1, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1;, score=(train=0.816, test=0.791) total time=   0.2s
[CV 2/5] END colsample_bytree=1, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1;, score=(train=0.813, test=0.798) total time=   0.2s
[CV 3/5] END colsample_bytree=1, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1;, score=(train=0.820, test=0.804) total time=   0.2s
[CV 4/5] END colsample_bytree=1, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1;, score=(train=0.813, test=0.793) total time=   0.2s
[CV 5/5] END colsample_bytree=1, learning_rate=0.1, max_depth=3, n_estimators=200, subsample=1;, score=(trai

[CV 4/5] END colsample_bytree=1, learning_rate=0.1, max_depth=4, n_estimators=300, subsample=1;, score=(train=0.844, test=0.806) total time=   0.4s
[CV 5/5] END colsample_bytree=1, learning_rate=0.1, max_depth=4, n_estimators=300, subsample=1;, score=(train=0.840, test=0.803) total time=   0.4s
[CV 1/5] END colsample_bytree=1, learning_rate=0.1, max_depth=4, n_estimators=400, subsample=0.8;, score=(train=0.865, test=0.789) total time=   0.7s
[CV 2/5] END colsample_bytree=1, learning_rate=0.1, max_depth=4, n_estimators=400, subsample=0.8;, score=(train=0.869, test=0.790) total time=   0.7s
[CV 3/5] END colsample_bytree=1, learning_rate=0.1, max_depth=4, n_estimators=400, subsample=0.8;, score=(train=0.868, test=0.798) total time=   0.7s
[CV 4/5] END colsample_bytree=1, learning_rate=0.1, max_depth=4, n_estimators=400, subsample=0.8;, score=(train=0.867, test=0.807) total time=   0.7s
[CV 5/5] END colsample_bytree=1, learning_rate=0.1, max_depth=4, n_estimators=400, subsample=0.8;, score

[CV 4/5] END colsample_bytree=1, learning_rate=0.1, max_depth=6, n_estimators=100, subsample=0.8;, score=(train=0.851, test=0.803) total time=   0.2s
[CV 5/5] END colsample_bytree=1, learning_rate=0.1, max_depth=6, n_estimators=100, subsample=0.8;, score=(train=0.846, test=0.808) total time=   0.3s
[CV 1/5] END colsample_bytree=1, learning_rate=0.1, max_depth=6, n_estimators=100, subsample=1;, score=(train=0.840, test=0.791) total time=   0.2s
[CV 2/5] END colsample_bytree=1, learning_rate=0.1, max_depth=6, n_estimators=100, subsample=1;, score=(train=0.842, test=0.797) total time=   0.2s
[CV 3/5] END colsample_bytree=1, learning_rate=0.1, max_depth=6, n_estimators=100, subsample=1;, score=(train=0.844, test=0.805) total time=   0.2s
[CV 4/5] END colsample_bytree=1, learning_rate=0.1, max_depth=6, n_estimators=100, subsample=1;, score=(train=0.843, test=0.795) total time=   0.2s
[CV 5/5] END colsample_bytree=1, learning_rate=0.1, max_depth=6, n_estimators=100, subsample=1;, score=(trai

In [30]:
# Parameter combination that achieved the best mean cross-validated score
# (the grid was configured with scoring='accuracy').
grid.best_params_

{'colsample_bytree': 0.8,
 'learning_rate': 0.01,
 'max_depth': 6,
 'n_estimators': 400,
 'subsample': 1}

In [31]:
# One row per candidate; then average the mean CV test score for each
# colsample_bytree value to see how much that parameter matters overall.
grid_df = pd.DataFrame(grid.cv_results_)
grid_df.groupby('param_colsample_bytree')['mean_test_score'].agg('mean')

param_colsample_bytree
0.8    0.795145
1.0    0.795413
Name: mean_test_score, dtype: float64

In [32]:
# Train vs. test score (mean and std across folds) for the winning candidate.
score_columns = ['mean_train_score', 'std_train_score', 'mean_test_score', 'std_test_score']
cv_results = pd.DataFrame(grid.cv_results_)
# Select the columns first, then the row, so the result stays a float Series.
cv_results[score_columns].loc[grid.best_index_]

mean_train_score    0.827678
std_train_score     0.000914
mean_test_score     0.801910
std_test_score      0.004559
Name: 31, dtype: float64

In [33]:
# GridSearchCV was created with the default refit=True, so best_estimator_
# has already been refitted on the whole of X_train / y_train with the best
# parameters. Calling fit() on it again would only repeat that work.
model = grid.best_estimator_
model

# Submission

In [34]:
# Run the test split through the same cleaning and encoding steps as train.
X_test = df_test.pipe(treat_dataset).pipe(feature_enginnering)
# is_alone is computed against the combined train + test index.
X_test = train_test_treatments(X_test, pd.concat([df_train, df_test]))
X_test.head(2)

Unnamed: 0_level_0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,has_bills,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,is_alone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0013_01,True,27.0,False,0.0,0.0,0.0,0.0,0.0,7,False,0,0,0,1,True
0018_01,False,19.0,False,0.0,2.302585,0.0,7.94591,0.0,6,True,0,0,0,1,True


In [35]:
# Predict on the prepared test features and build the submission frame.
y_test = model.predict(X_test)
# Pass the index itself, not a list wrapping it: a list of array-likes is
# interpreted by pandas as the levels of a MultiIndex, which would give a
# one-level MultiIndex instead of the plain PassengerId index.
submission = pd.Series(y_test, index=X_test.index).astype(bool).to_frame('Transported')
submission.head()

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,True


In [36]:
# Write the submission file; the PassengerId index is written as the first column.
submission_path = 'data/submission_xgboost_2022-11-06.csv'
submission.to_csv(submission_path)