In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [96]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from scipy import stats

import warnings
warnings.filterwarnings('ignore')

In [3]:
train_df = pd.read_csv("data/train_clean.csv")
test_df = pd.read_csv("data/test_clean.csv")
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8693 non-null   bool   
 3   Destination   8693 non-null   object 
 4   Age           8693 non-null   float64
 5   VIP           8693 non-null   bool   
 6   RoomService   8693 non-null   float64
 7   FoodCourt     8693 non-null   float64
 8   ShoppingMall  8693 non-null   float64
 9   Spa           8693 non-null   float64
 10  VRDeck        8693 non-null   float64
 11  Transported   8693 non-null   bool   
 12  Deck          8693 non-null   object 
 13  Num           8693 non-null   float64
 14  Side          8693 non-null   object 
 15  NumGroup      8693 non-null   int64  
 16  TotalExpense  8693 non-null   float64
 17  Group         8693 non-null   int64  
 18  InGroup       8693 non-null 

In [4]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4277 non-null   object 
 2   CryoSleep     4277 non-null   bool   
 3   Destination   4277 non-null   object 
 4   Age           4277 non-null   float64
 5   VIP           4277 non-null   bool   
 6   RoomService   4277 non-null   float64
 7   FoodCourt     4277 non-null   float64
 8   ShoppingMall  4277 non-null   float64
 9   Spa           4277 non-null   float64
 10  VRDeck        4277 non-null   float64
 11  Deck          4277 non-null   object 
 12  Num           4277 non-null   float64
 13  Side          4277 non-null   object 
 14  NumGroup      4277 non-null   int64  
 15  TotalExpense  4277 non-null   float64
 16  Group         4277 non-null   int64  
 17  InGroup       4277 non-null   bool   
dtypes: bool(3), float64(8), int6

## Encoding categorical features
We'll use one hot encoder for the following categorical features: HomePlanet, Destination, Deck

In [29]:
X_train = pd.get_dummies(train_df.drop("PassengerId", axis =1), columns = ["HomePlanet", "Destination", "Deck", "VIP", "CryoSleep", "Side"], drop_first=True)
X_test = pd.get_dummies(test_df.drop("PassengerId", axis =1), columns = ["HomePlanet", "Destination", "Deck", "VIP", "CryoSleep", "Side"], drop_first=True)
X_train.drop("Transported", axis = 1, inplace=True)
X_train.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Num,NumGroup,TotalExpense,Group,...,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,VIP_True,CryoSleep_True,Side_S
0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,1,...,1,0,0,0,0,0,0,0,0,0
1,24.0,109.0,9.0,25.0,549.0,44.0,0.0,0,736.0,2,...,0,0,0,0,1,0,0,0,0,1
2,58.0,43.0,3576.0,0.0,6715.0,49.0,0.0,0,10383.0,3,...,0,0,0,0,0,0,0,1,0,1
3,33.0,0.0,1283.0,371.0,3329.0,193.0,0.0,0,5176.0,3,...,0,0,0,0,0,0,0,0,0,1
4,16.0,303.0,70.0,151.0,565.0,2.0,1.0,0,1091.0,4,...,0,0,0,0,1,0,0,0,0,1


In [30]:
X_test.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Num,NumGroup,TotalExpense,Group,...,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,VIP_True,CryoSleep_True,Side_S
0,27.0,0.0,0.0,0.0,0.0,0.0,3.0,0,0.0,13,...,0,0,0,0,0,1,0,0,1,1
1,19.0,0.0,9.0,0.0,2823.0,0.0,4.0,0,2832.0,18,...,0,0,0,0,1,0,0,0,0,1
2,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,19,...,0,1,0,0,0,0,0,0,1,1
3,38.0,0.0,6652.0,0.0,181.0,585.0,1.0,0,7418.0,21,...,0,1,0,0,0,0,0,0,0,1
4,20.0,10.0,0.0,635.0,0.0,0.0,5.0,0,645.0,23,...,0,0,0,0,1,0,0,0,0,1


In [31]:
y_train = train_df[["Transported"]]
y_train["Transported"] = y_train["Transported"].astype(int)
y_train.head()

Unnamed: 0,Transported
0,0
1,1
2,0
3,0
4,1


## Models

### XGBoost

In [69]:
params_xgb = { 'max_depth': [3, 5, 6, 7, 8, 10],
           'learning_rate': [0.01, 0.1, 0.2, 0.3],
           'subsample': np.arange(0.5, 1.0, 0.1),
           'colsample_bytree': np.arange(0.4, 1.0, 0.1),
           'colsample_bylevel': np.arange(0.4, 1.0, 0.1),
           'reg_alpha': [0.05, 0.1, 1, 10, 100],
           'reg_lambda': [0.1, 1, 2,  10, 100],
           'n_estimators': [100, 500, 1000],
           }

model_xgb = xgb.XGBClassifier(use_label_encoder=False)
clf_xgb = RandomizedSearchCV(estimator=model_xgb,
                         param_distributions=params_xgb,
                         scoring="accuracy",
                         n_iter=25,
                         cv=5,
                         verbose=1)

In [80]:
clf_xgb.fit(X_train, y_train, eval_metric = "logloss")

Fitting 5 folds for each of 25 candidates, totalling 125 fits


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           enable_categorical=False, gamma=None,
                                           gpu_id=None, importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints...
                   param_distributions={'colsample_bylevel': array([0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                                        'colsample_bytree': array([0.4, 0.5, 0.6,

In [81]:
clf_xgb.best_score_

0.7656793218064428

### Random Forest


In [55]:
params_rf = {
    "n_estimators":[100, 500, 1000],
    "max_depth":[5, 8, 10],
    "criterion":["gini", "entropy"]
}

model_rf = RandomForestClassifier()

clf_rf = GridSearchCV(model_rf, param_grid=params_rf, cv = 5)

In [56]:
clf_rf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 8, 10],
                         'n_estimators': [100, 500, 1000]})

In [58]:
clf_rf.best_score_

0.7723506161696305

### LGBM

In [70]:
params_lgbm = { 
    'max_depth': [3, 5, 6, 7, 8, 10],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'subsample': np.arange(0.5, 1.0, 0.1),
    'colsample_bytree': np.arange(0.4, 1.0, 0.1),
    'reg_alpha': [0.05, 0.1, 1, 10, 100],
    'reg_lambda': [0.1, 1, 2,  10, 100],
    'n_estimators': [100, 500, 1000],
}
model_lgbm = LGBMClassifier()
clf_lgbm = RandomizedSearchCV(  model_lgbm, 
                                param_distributions=params_lgbm, 
                                n_iter = 25,
                                cv=5, 
                                scoring="accuracy",
                                verbose=1)

In [71]:
clf_lgbm.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


RandomizedSearchCV(cv=5, estimator=LGBMClassifier(), n_iter=25,
                   param_distributions={'colsample_bylevel': array([0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                                        'colsample_bytree': array([0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                                        'learning_rate': [0.01, 0.1, 0.2, 0.3],
                                        'max_depth': [3, 5, 6, 7, 8, 10],
                                        'n_estimators': [100, 500, 1000],
                                        'reg_alpha': [0.05, 0.1, 1, 10, 100],
                                        'reg_lambda': [0.1, 1, 2, 10, 100],
                                        'subsample': array([0.5, 0.6, 0.7, 0.8, 0.9])},
                   scoring='accuracy', verbose=1)

In [72]:
clf_lgbm.best_score_

0.7831625519209683

### Catboost

In [77]:
params_cat = { 
    'max_depth': [3, 5, 6, 7, 8, 10],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'subsample': np.arange(0.5, 1.0, 0.1),
    'reg_lambda': [0.1, 1, 2,  10, 100],
    'n_estimators': [100, 500, 1000],
}
model_cat = CatBoostClassifier()
clf_cat = RandomizedSearchCV(   model_cat, 
                                param_distributions=params_cat, 
                                n_iter = 25,
                                cv=5, 
                                scoring="accuracy")

In [78]:
clf_cat.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
0:	learn: 0.5776014	total: 73.2ms	remaining: 1m 13s
1:	learn: 0.5215245	total: 93.2ms	remaining: 46.5s
2:	learn: 0.4681195	total: 110ms	remaining: 36.6s
3:	learn: 0.4427042	total: 127ms	remaining: 31.6s
4:	learn: 0.4207128	total: 147ms	remaining: 29.3s
5:	learn: 0.3944679	total: 168ms	remaining: 27.8s
6:	learn: 0.3827406	total: 186ms	remaining: 26.4s
7:	learn: 0.3689991	total: 205ms	remaining: 25.4s
8:	learn: 0.3552292	total: 222ms	remaining: 24.5s
9:	learn: 0.3435090	total: 242ms	remaining: 23.9s
10:	learn: 0.3355107	total: 265ms	remaining: 23.8s
11:	learn: 0.3287204	total: 291ms	remaining: 23.9s
12:	learn: 0.3221659	total: 310ms	remaining: 23.5s
13:	learn: 0.3156201	total: 329ms	remaining: 23.2s
14:	learn: 0.3110015	total: 348ms	remaining: 22.9s
15:	learn: 0.2997453	total: 367ms	remaining: 22.6s
16:	learn: 0.2958219	total: 387ms	remaining: 22.4s
17:	learn: 0.2921825	total: 410ms	remaining: 22.4s
18:	learn: 0.2823092	total:

RandomizedSearchCV(cv=5,
                   estimator=<catboost.core.CatBoostClassifier object at 0x7fe608fef130>,
                   n_iter=25,
                   param_distributions={'learning_rate': [0.01, 0.1, 0.2, 0.3],
                                        'max_depth': [3, 5, 6, 7, 8, 10],
                                        'n_estimators': [100, 500, 1000],
                                        'reg_lambda': [0.1, 1, 2, 10, 100],
                                        'subsample': array([0.5, 0.6, 0.7, 0.8, 0.9])},
                   scoring='accuracy', verbose=1)

In [79]:
clf_cat.best_score_

0.7801724600000925

We can see that XGBoost, LGBM and Catboost models gave us the higher score. So we will take the mode of the results between these three models to give us our test predictions

In [117]:
y_test_xgb = clf_xgb.predict(X_test)
y_test_lgbm = clf_lgbm.predict(X_test)
y_test_cat = clf_cat.predict(X_test)

y_test_all = np.dstack([y_test_cat, y_test_lgbm, y_test_xgb])
y_test = stats.mode(y_test_all.T)[0]
y_test = y_test.reshape(-1)

In [118]:
test_df["Transported"] = y_test

In [121]:
test_df["Transported"] = test_df["Transported"].apply(bool)
test_df[["PassengerId", "Transported"]].to_csv("data/submissions.csv", index = False)