# Gradient Boosting Classifier (scikit-learn GBM)

Ref: https://towardsdatascience.com/xgboost-fine-tune-and-optimize-your-model-23d996fab663

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, GridSearchCV, cross_validate
from spaceship_titanic import feature_enginnering as fe
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, chi2

In [9]:
# Load both competition splits, indexed by PassengerId, and preview the training frame.
df_train, df_test = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in ("data/train.csv", "data/test.csv")
)
df_train.head(2)

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True


# Data Treatment

In [10]:
def treat_dataset(df:pd.DataFrame) -> pd.DataFrame:
    """Clean a raw passenger frame and return the treated result.

    Steps, in order:
      1. drop the ``Name`` column (the input frame itself is left untouched);
      2. run the project's ``fe`` fill/impute helpers one after another
         (their exact behaviour lives in ``spaceship_titanic.feature_enginnering``
         and is not visible from this notebook);
      3. add ``total_bills`` as the row-wise sum of the five spending columns,
         computed *before* ``fe.outliers_to_log`` is applied;
      4. apply ``fe.outliers_to_log`` and ``fe.dtype_memory_reducer``.

    Parameters
    ----------
    df : pd.DataFrame
        Raw frame; must contain ``Name`` and the five spending columns.

    Returns
    -------
    pd.DataFrame
        The treated frame.
    """
    spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    # The helper order is significant, so it is kept exactly as a linear chain.
    imputed = (
        df.drop(columns=['Name'])
        .pipe(fe.fillna_homeplanet_and_destination)
        .pipe(fe.fill_with_0_people_with_no_other_wastes)
        .pipe(fe.fill_0_wastes_people_cryosleep)
        .pipe(fe.fill_with_non_0_median)
        .pipe(fe.fill_cryosleep)
        .pipe(fe.feature_inputer, to_mode=[], to_median=['Age'])
        .pipe(fe.cabin_inputer)
        .pipe(fe.vip_knn_input)
    )

    imputed['total_bills'] = imputed[spend_columns].sum(axis=1)
    return fe.dtype_memory_reducer(fe.outliers_to_log(imputed))
def feature_enginnering(df: pd.DataFrame) -> pd.DataFrame:
    """Encode the categorical cabin/route columns and return a new frame.

    * ``Deck`` letters ``A``..``G`` become the ordinals ``1``..``7``. Any other
      value (or a missing one) has no entry in the mapping and becomes ``NaN``.
    * ``HomePlanet``, ``Destination`` and ``Side`` are one-hot encoded with
      ``drop_first=True``, so the first category of each column is the implicit
      baseline and gets no dummy column.

    The input frame is not modified; earlier versions wrote the encoded ``Deck``
    back into the caller's frame.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``Deck``, ``HomePlanet``, ``Destination`` and ``Side``.

    Returns
    -------
    pd.DataFrame
        New frame with ``Deck`` ordinal-encoded (in its original position) and the
        dummy columns appended after the remaining columns.
    """
    deck_to_ordinal = {deck: rank for rank, deck in enumerate('ABCDEFG', start=1)}
    # assign() works on a copy, so the caller's frame keeps its original Deck letters.
    encoded = df.assign(Deck=df['Deck'].map(deck_to_ordinal))
    return pd.get_dummies(
        encoded,
        columns=['HomePlanet', 'Destination', 'Side'],
        drop_first=True,
    )

def train_test_treatments(df: pd.DataFrame, full_df: pd.DataFrame) -> pd.DataFrame:
    """Add the group-size features and drop the columns the model does not use.

    ``GroupSize`` comes from ``fe.calculate_groupsize(df.index, full_df.index)`` and
    ``GroupLastNameSize`` from ``fe.calculate_group_lastname_size(df.index, full_df)``;
    both helpers are defined in the project module and not visible here.
    NOTE(review): the last-name helper presumably reads a name column from
    ``full_df`` - confirm which columns ``full_df`` must carry.

    The input frame is not modified (earlier versions added the columns and dropped
    ``Side_U``/``VIP`` in place on the caller's frame).

    Parameters
    ----------
    df : pd.DataFrame
        Treated and encoded frame for one split.
    full_df : pd.DataFrame
        Frame covering every passenger (train + test) used to size the groups.

    Returns
    -------
    pd.DataFrame
        New frame with ``GroupSize`` and ``GroupLastNameSize`` appended and
        ``Side_U`` / ``VIP`` removed.
    """
    enriched = df.assign(
        GroupSize=fe.calculate_groupsize(df.index, full_df.index),
        GroupLastNameSize=fe.calculate_group_lastname_size(df.index, full_df),
    )
    # errors='ignore': a split without any 'U' side never gets a Side_U dummy, and the
    # goal is only that these columns are absent afterwards.
    return enriched.drop(columns=['Side_U', 'VIP'], errors='ignore')

In [11]:
# Clean and encode the training split, then add the group-size features.
# This cell overwrites `df_train`, so it cannot be re-run without reloading the CSVs
# (treat_dataset drops 'Name', which is already gone on a second pass).
# NOTE(review): the frame passed as full_df is built from the already-treated
# df_train (no 'Name' column any more) plus the raw df_test, whereas the
# "To Ensemble" cell passes raw train + raw test - confirm that
# fe.calculate_group_lastname_size yields the same values in both cases.
df_train = feature_enginnering(treat_dataset(df_train.copy()))
df_train = train_test_treatments(df_train, full_df=pd.concat([df_train, df_test]))

In [12]:
# Separate the target from the predictors, then hold out 30% of the rows for validation.
y = df_train['Transported']
X = df_train.drop(columns='Transported')
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

In [13]:
# Two-step pipeline: chi-squared feature selection (keep the 15 best-scoring columns)
# followed by a gradient-boosted tree classifier.
# n_estimators=10000 is only an upper bound: with n_iter_no_change=20 the model holds
# out validation_fraction=0.1 of its training data and stops adding trees once the
# validation score has not improved by at least tol for 20 consecutive iterations.
# A MinMaxScaler step ('scaler') was tried ahead of the selector and is currently disabled.
pipeline = Pipeline(steps=[
    ('skb', SelectKBest(score_func=chi2, k=15)),
    ('model', GradientBoostingClassifier(
        n_estimators=10000,
        validation_fraction=0.1,
        n_iter_no_change=20,
        tol=1e-3,
        min_samples_leaf=5,
        random_state=42,
    )),
])

## Tuning

In [14]:
# Search space for GridSearchCV; the `model__` prefix routes each entry to the
# pipeline's 'model' step. 4 * 2 * 3 * 2 = 48 candidates.
params = dict(
    # skb__k=[5, 7, 9, 11, 13, 15],  # selector size is not tuned at the moment
    model__max_depth=[3, 5, 7, 9],
    model__learning_rate=[0.1, 0.001],
    model__subsample=[0.4, 0.7, 0.9],
    model__max_features=['log2', None],
)

In [15]:
# Exhaustive search over `params`, scored by accuracy on 5 shuffled folds of the
# training split. verbose=3 prints one line per fit (train and test score, fit time);
# n_jobs=1 runs the 240 fits sequentially.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring='accuracy',
    return_train_score=True,
    n_jobs=1,
    verbose=3,
)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 1/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=log2, model__subsample=0.4;, score=(train=0.821, test=0.825) total time=   0.3s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=log2, model__subsample=0.4;, score=(train=0.839, test=0.800) total time=   0.6s
[CV 3/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=log2, model__subsample=0.4;, score=(train=0.829, test=0.797) total time=   0.4s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=log2, model__subsample=0.4;, score=(train=0.826, test=0.777) total time=   0.3s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=log2, model__subsample=0.4;, score=(train=0.828, test=0.815) total time=   0.3s
[CV 1/5] END model__learning_rate=0.1, model__max_depth=3, model__max_features=log2, model__subsample=0.7;, score=(train=0.825, test=0.813) tot

[CV 3/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=None, model__subsample=0.7;, score=(train=0.891, test=0.810) total time=   1.3s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=None, model__subsample=0.7;, score=(train=0.863, test=0.787) total time=   0.7s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=None, model__subsample=0.7;, score=(train=0.887, test=0.807) total time=   1.2s
[CV 1/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=None, model__subsample=0.9;, score=(train=0.882, test=0.827) total time=   1.3s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=None, model__subsample=0.9;, score=(train=0.866, test=0.813) total time=   1.0s
[CV 3/5] END model__learning_rate=0.1, model__max_depth=5, model__max_features=None, model__subsample=0.9;, score=(train=0.888, test=0.809) total time=   1.4s
[CV 4/5] END model__learning_rate=0.1, model__

[CV 5/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=log2, model__subsample=0.9;, score=(train=0.943, test=0.812) total time=   0.8s
[CV 1/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=None, model__subsample=0.4;, score=(train=0.903, test=0.800) total time=   0.5s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=None, model__subsample=0.4;, score=(train=0.913, test=0.808) total time=   0.6s
[CV 3/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=None, model__subsample=0.4;, score=(train=0.928, test=0.802) total time=   0.8s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=None, model__subsample=0.4;, score=(train=0.903, test=0.787) total time=   0.5s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=9, model__max_features=None, model__subsample=0.4;, score=(train=0.917, test=0.810) total time=   0.7s
[CV 1/5] END model__learning_rate=0.1, model__

[CV 2/5] END model__learning_rate=0.001, model__max_depth=5, model__max_features=log2, model__subsample=0.7;, score=(train=0.821, test=0.802) total time=   9.8s
[CV 3/5] END model__learning_rate=0.001, model__max_depth=5, model__max_features=log2, model__subsample=0.7;, score=(train=0.822, test=0.795) total time=  10.7s
[CV 4/5] END model__learning_rate=0.001, model__max_depth=5, model__max_features=log2, model__subsample=0.7;, score=(train=0.822, test=0.782) total time=   8.1s
[CV 5/5] END model__learning_rate=0.001, model__max_depth=5, model__max_features=log2, model__subsample=0.7;, score=(train=0.821, test=0.807) total time=   9.8s
[CV 1/5] END model__learning_rate=0.001, model__max_depth=5, model__max_features=log2, model__subsample=0.9;, score=(train=0.813, test=0.814) total time=  10.4s
[CV 2/5] END model__learning_rate=0.001, model__max_depth=5, model__max_features=log2, model__subsample=0.9;, score=(train=0.823, test=0.800) total time=  11.2s
[CV 3/5] END model__learning_rate=

[CV 3/5] END model__learning_rate=0.001, model__max_depth=7, model__max_features=None, model__subsample=0.9;, score=(train=0.875, test=0.793) total time=  33.2s
[CV 4/5] END model__learning_rate=0.001, model__max_depth=7, model__max_features=None, model__subsample=0.9;, score=(train=0.873, test=0.776) total time=  26.3s
[CV 5/5] END model__learning_rate=0.001, model__max_depth=7, model__max_features=None, model__subsample=0.9;, score=(train=0.877, test=0.809) total time=  30.6s
[CV 1/5] END model__learning_rate=0.001, model__max_depth=9, model__max_features=log2, model__subsample=0.4;, score=(train=0.858, test=0.831) total time=  15.4s
[CV 2/5] END model__learning_rate=0.001, model__max_depth=9, model__max_features=log2, model__subsample=0.4;, score=(train=0.863, test=0.810) total time=  16.6s
[CV 3/5] END model__learning_rate=0.001, model__max_depth=9, model__max_features=log2, model__subsample=0.4;, score=(train=0.863, test=0.799) total time=  16.4s
[CV 4/5] END model__learning_rate=

In [16]:
# Parameter combination with the best mean cross-validated accuracy.
grid.best_params_

{'model__learning_rate': 0.1,
 'model__max_depth': 7,
 'model__max_features': 'log2',
 'model__subsample': 0.4}

In [17]:
# Full cv_results_ row (fit/score timings, per-split and aggregate scores) for the winning candidate.
pd.DataFrame(grid.cv_results_).loc[grid.best_index_]

mean_fit_time                                                           0.45041
std_fit_time                                                            0.09468
mean_score_time                                                        0.008582
std_score_time                                                         0.001025
param_model__learning_rate                                                  0.1
param_model__max_depth                                                        7
param_model__max_features                                                  log2
param_model__subsample                                                      0.4
params                        {'model__learning_rate': 0.1, 'model__max_dept...
split0_test_score                                                      0.822514
split1_test_score                                                      0.811832
split2_test_score                                                      0.811011
split3_test_score                       

# Validation

In [18]:
# Pipeline configured with the best parameters; GridSearchCV's default refit=True
# (not overridden above) means it has been refit on the whole of X_train.
model = grid.best_estimator_

In [20]:
# Per-class precision/recall/F1 on the 30% hold-out split (never seen by the grid search).
# classification_report returns a plain string, hence the print().
print(classification_report(y_true=y_val, y_pred=model.predict(X_val)))

              precision    recall  f1-score   support

       False       0.80      0.79      0.79      1289
        True       0.80      0.81      0.80      1319

    accuracy                           0.80      2608
   macro avg       0.80      0.80      0.80      2608
weighted avg       0.80      0.80      0.80      2608



# Submission

In [15]:
# Apply the same treatment and encoding to the test split, then add the group-size features.
# The test frame is treated and one-hot encoded on its own, so its columns must end up
# matching the ones the model was fitted on.
# NOTE(review): `df_train` is the treated training frame at this point (no 'Name'
# column), so full_df is treated train + raw test, unlike the "To Ensemble" cell
# which passes raw train + raw test - confirm fe.calculate_group_lastname_size is
# unaffected.
X_test = feature_enginnering(treat_dataset(df_test))
X_test = train_test_treatments(X_test, full_df=pd.concat([df_train, df_test]))
X_test.head(2)

Unnamed: 0_level_0,CryoSleep,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Deck,Num,total_bills,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Side_S,GroupSize,GroupLastNameSize
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0013_01,True,27.0,0.0,0.0,0.0,0.0,0.0,7,3,0.0,0,0,0,1,1,1,1
0018_01,False,19.0,0.0,2.302585,0.0,7.94591,0.0,6,4,2832.0,0,0,0,1,1,1,1


In [16]:
# Predict the test split and shape the result as a one-column frame keyed by PassengerId.
y_test = model.predict(X_test)
# Pass the index object itself: wrapping it in a list (`index=[X_test.index]`) makes
# pandas treat it as a list of level arrays and build a one-level MultiIndex instead
# of a plain PassengerId index.
submission = pd.Series(y_test, index=X_test.index).astype(bool).to_frame('Transported')
submission.head()

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,True
0023_01,True


In [17]:
# Persist the submission; to_csv writes the index as the first column by default.
# The date in the file name is hard-coded and must be updated by hand for a new run.
submission.to_csv('data/submission_gbm_2022-11-22.csv')

# To Ensemble

In [27]:
# Export this model's class-1 probability for every passenger (train + test) so it can
# be combined with other models later.
# The raw CSVs are reloaded because `df_train` was overwritten with the treated frame
# earlier in the notebook; after this cell `df_train`/`df_test` are raw again.
df_train = pd.read_csv("data/train.csv", index_col='PassengerId')
df_test = pd.read_csv("data/test.csv", index_col='PassengerId')

# NOTE(review): here train and test are treated and encoded together, whereas the model
# was fitted on a training frame treated on its own - confirm the resulting features are
# comparable. Predictions for the training rows are in-sample.
df = pd.concat([df_train,df_test]).drop(['Transported'],axis=1)
df = treat_dataset(df)
df = feature_enginnering(df)
df = train_test_treatments(df, pd.concat([df_train,df_test]))
ensemble = model.predict_proba(df)
# Column 1 is the probability of the second entry of model.classes_ (True for this
# boolean target). Pass the index itself rather than `[df.index]`, which would create a
# one-level MultiIndex.
ensemble = pd.Series(ensemble[:,1], index=df.index).to_frame('GBM')
# The 'data/ensemble' directory must already exist.
ensemble.to_csv('data/ensemble/gbm.csv')