In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold, GridSearchCV, cross_validate
from sklearn.pipeline import Pipeline
import os

In [30]:
df_train = pd.read_csv("data/train.csv", index_col='PassengerId')
df_test = pd.read_csv("data/test.csv", index_col='PassengerId')
df_train.head(2)

Unnamed: 0_level_0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True


In [31]:
ensemble_df = pd.concat([
    pd.read_csv(f'data/ensemble/{file}',index_col='PassengerId') for file in os.listdir('data/ensemble/')
],axis=1)
ensemble_df.head(2)

Unnamed: 0_level_0,GBM,KNN,LR
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0001_01,0.645641,0.64,0.849722
0002_01,0.212694,0.08,0.084764


In [32]:
df = pd.concat([ensemble_df.loc[df_train.index],df_train[['Transported']]],axis=1)
df.head()

Unnamed: 0_level_0,GBM,KNN,LR,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0001_01,0.645641,0.64,0.849722,False
0002_01,0.212694,0.08,0.084764,True
0003_01,0.089918,0.16,0.285502,False
0003_02,0.052019,0.36,0.5676,False
0004_01,0.2796,0.08,0.206688,True


# Cross Validation

In [56]:
X, y = df.drop('Transported',axis=1), df['Transported']
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.1,
    random_state=42
)

In [57]:
pipeline = Pipeline([
    ('model', (GradientBoostingClassifier(n_estimators=10000, n_iter_no_change=20, tol=1e-3, validation_fraction=0.15)))
])

In [58]:
params = {
     'model__max_depth': [2,3,4,5],
     'model__learning_rate': [0.1,0.01],
     'model__subsample': [0.1,0.2,0.3,0.4,0.5],
}

In [59]:
grid = GridSearchCV(
    pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=KFold(5, shuffle=True, random_state=42),
    verbose=3,
    return_train_score=True
)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END model__learning_rate=0.1, model__max_depth=2, model__subsample=0.1;, score=(train=0.857, test=0.864) total time=   0.1s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=2, model__subsample=0.1;, score=(train=0.859, test=0.854) total time=   0.0s
[CV 3/5] END model__learning_rate=0.1, model__max_depth=2, model__subsample=0.1;, score=(train=0.864, test=0.849) total time=   0.0s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=2, model__subsample=0.1;, score=(train=0.863, test=0.851) total time=   0.1s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=2, model__subsample=0.1;, score=(train=0.859, test=0.854) total time=   0.1s
[CV 1/5] END model__learning_rate=0.1, model__max_depth=2, model__subsample=0.2;, score=(train=0.860, test=0.861) total time=   0.1s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=2, model__subsample=0.2;, score=(train=0.860, test=0.865) total time=   0.1s
[CV 3/5

[CV 3/5] END model__learning_rate=0.1, model__max_depth=4, model__subsample=0.3;, score=(train=0.873, test=0.846) total time=   0.1s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=4, model__subsample=0.3;, score=(train=0.876, test=0.857) total time=   0.1s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=4, model__subsample=0.3;, score=(train=0.874, test=0.857) total time=   0.1s
[CV 1/5] END model__learning_rate=0.1, model__max_depth=4, model__subsample=0.4;, score=(train=0.870, test=0.859) total time=   0.2s
[CV 2/5] END model__learning_rate=0.1, model__max_depth=4, model__subsample=0.4;, score=(train=0.877, test=0.858) total time=   0.2s
[CV 3/5] END model__learning_rate=0.1, model__max_depth=4, model__subsample=0.4;, score=(train=0.879, test=0.843) total time=   0.2s
[CV 4/5] END model__learning_rate=0.1, model__max_depth=4, model__subsample=0.4;, score=(train=0.875, test=0.855) total time=   0.2s
[CV 5/5] END model__learning_rate=0.1, model__max_depth=4, model__sub

[CV 5/5] END model__learning_rate=0.01, model__max_depth=2, model__subsample=0.5;, score=(train=0.864, test=0.859) total time=   2.2s
[CV 1/5] END model__learning_rate=0.01, model__max_depth=3, model__subsample=0.1;, score=(train=0.863, test=0.866) total time=   1.2s
[CV 2/5] END model__learning_rate=0.01, model__max_depth=3, model__subsample=0.1;, score=(train=0.864, test=0.860) total time=   1.0s
[CV 3/5] END model__learning_rate=0.01, model__max_depth=3, model__subsample=0.1;, score=(train=0.866, test=0.850) total time=   0.9s
[CV 4/5] END model__learning_rate=0.01, model__max_depth=3, model__subsample=0.1;, score=(train=0.864, test=0.854) total time=   1.1s
[CV 5/5] END model__learning_rate=0.01, model__max_depth=3, model__subsample=0.1;, score=(train=0.866, test=0.854) total time=   0.9s
[CV 1/5] END model__learning_rate=0.01, model__max_depth=3, model__subsample=0.2;, score=(train=0.864, test=0.868) total time=   1.2s
[CV 2/5] END model__learning_rate=0.01, model__max_depth=3, mo

[CV 2/5] END model__learning_rate=0.01, model__max_depth=5, model__subsample=0.3;, score=(train=0.876, test=0.862) total time=   2.3s
[CV 3/5] END model__learning_rate=0.01, model__max_depth=5, model__subsample=0.3;, score=(train=0.877, test=0.852) total time=   1.7s
[CV 4/5] END model__learning_rate=0.01, model__max_depth=5, model__subsample=0.3;, score=(train=0.876, test=0.854) total time=   1.9s
[CV 5/5] END model__learning_rate=0.01, model__max_depth=5, model__subsample=0.3;, score=(train=0.879, test=0.852) total time=   2.1s
[CV 1/5] END model__learning_rate=0.01, model__max_depth=5, model__subsample=0.4;, score=(train=0.875, test=0.867) total time=   2.2s
[CV 2/5] END model__learning_rate=0.01, model__max_depth=5, model__subsample=0.4;, score=(train=0.876, test=0.863) total time=   2.0s
[CV 3/5] END model__learning_rate=0.01, model__max_depth=5, model__subsample=0.4;, score=(train=0.883, test=0.849) total time=   2.4s
[CV 4/5] END model__learning_rate=0.01, model__max_depth=5, mo

In [63]:
grid.best_params_

{'model__learning_rate': 0.01, 'model__max_depth': 5, 'model__subsample': 0.4}

In [64]:
pd.DataFrame(grid.cv_results_)[['mean_train_score','std_train_score','mean_test_score','std_test_score']].loc[grid.best_index_]

mean_train_score    0.877605
std_train_score     0.003054
mean_test_score     0.859517
std_test_score      0.005891
Name: 38, dtype: float64

In [65]:
model = grid.best_estimator_
print(classification_report(y_val, model.predict(X_val)))

              precision    recall  f1-score   support

       False       0.79      0.78      0.78       443
        True       0.77      0.78      0.78       427

    accuracy                           0.78       870
   macro avg       0.78      0.78      0.78       870
weighted avg       0.78      0.78      0.78       870



In [43]:
model.steps[0][1].feature_importances_

array([0.91571703, 0.00835334, 0.07592964])

In [45]:
X_test = ensemble_df.loc[df_test.index]
X_test.head(2)

Unnamed: 0_level_0,GBM,KNN,LR
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0013_01,0.562865,0.8,0.800198
0018_01,0.078126,0.32,0.149556


In [46]:
y_test = model.predict(X_test)
submission = pd.Series(y_test, index=[X_test.index]).astype(bool).to_frame('Transported')
submission.head()

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,False
0018_01,False
0019_01,True
0021_01,True
0023_01,True


In [47]:
submission.to_csv('data/submission_ensemble_2022-11-23.csv')