In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from scipy import stats

import warnings
warnings.filterwarnings('ignore')

  from pandas import MultiIndex, Int64Index


In [3]:
train_df = pd.read_csv("data/train_clean.csv")
test_df = pd.read_csv("data/test_clean.csv")
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 19 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8693 non-null   object 
 2   CryoSleep     8693 non-null   bool   
 3   Destination   8693 non-null   object 
 4   Age           8693 non-null   float64
 5   VIP           8693 non-null   bool   
 6   RoomService   8693 non-null   float64
 7   FoodCourt     8693 non-null   float64
 8   ShoppingMall  8693 non-null   float64
 9   Spa           8693 non-null   float64
 10  VRDeck        8693 non-null   float64
 11  Transported   8693 non-null   bool   
 12  Deck          8693 non-null   object 
 13  Num           8693 non-null   float64
 14  Side          8693 non-null   object 
 15  NumGroup      8693 non-null   int64  
 16  TotalExpense  8693 non-null   float64
 17  Group         8693 non-null   int64  
 18  InGroup       8693 non-null 

In [4]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4277 non-null   object 
 2   CryoSleep     4277 non-null   bool   
 3   Destination   4277 non-null   object 
 4   Age           4277 non-null   float64
 5   VIP           4277 non-null   bool   
 6   RoomService   4277 non-null   float64
 7   FoodCourt     4277 non-null   float64
 8   ShoppingMall  4277 non-null   float64
 9   Spa           4277 non-null   float64
 10  VRDeck        4277 non-null   float64
 11  Deck          4277 non-null   object 
 12  Num           4277 non-null   float64
 13  Side          4277 non-null   object 
 14  NumGroup      4277 non-null   int64  
 15  TotalExpense  4277 non-null   float64
 16  Group         4277 non-null   int64  
 17  InGroup       4277 non-null   bool   
dtypes: bool(3), float64(8), int6

## Encoding categorical features
We'll use one hot encoder for the following categorical features: HomePlanet, Destination, Deck

In [5]:
X_train = pd.get_dummies(train_df.drop("PassengerId", axis =1), columns = ["HomePlanet", "Destination", "Deck", "VIP", "CryoSleep", "Side"], drop_first=True)
X_test = pd.get_dummies(test_df.drop("PassengerId", axis =1), columns = ["HomePlanet", "Destination", "Deck", "VIP", "CryoSleep", "Side"], drop_first=True)
X_train.drop(["Transported", "Group"], axis = 1, inplace=True)
X_test.drop(["Group"], axis = 1, inplace=True)
X_train.head()
X_train.columns

Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Num', 'NumGroup', 'TotalExpense', 'InGroup', 'HomePlanet_Europa',
       'HomePlanet_Mars', 'Destination_PSO J318.5-22',
       'Destination_TRAPPIST-1e', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E',
       'Deck_F', 'Deck_G', 'Deck_T', 'VIP_True', 'CryoSleep_True', 'Side_S'],
      dtype='object')

In [6]:
X_test.head()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Num,NumGroup,TotalExpense,InGroup,...,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,VIP_True,CryoSleep_True,Side_S
0,27.0,0.0,0.0,0.0,0.0,0.0,3.0,0,0.0,False,...,0,0,0,0,0,1,0,0,1,1
1,19.0,0.0,9.0,0.0,2823.0,0.0,4.0,0,2832.0,False,...,0,0,0,0,1,0,0,0,0,1
2,31.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,False,...,0,1,0,0,0,0,0,0,1,1
3,38.0,0.0,6652.0,0.0,181.0,585.0,1.0,0,7418.0,False,...,0,1,0,0,0,0,0,0,0,1
4,20.0,10.0,0.0,635.0,0.0,0.0,5.0,0,645.0,False,...,0,0,0,0,1,0,0,0,0,1


In [7]:
y_train = train_df[["Transported"]]
y_train["Transported"] = y_train["Transported"].astype(int)
y_train.head()

Unnamed: 0,Transported
0,0
1,1
2,0
3,0
4,1


## Models

### XGBoost

In [8]:
params_xgb = { 'max_depth': [3, 5, 6, 7, 8, 10],
           'learning_rate': [0.01, 0.1, 0.2, 0.3],
           'subsample': np.arange(0.5, 1.0, 0.1),
           'colsample_bytree': np.arange(0.4, 1.0, 0.1),
           'colsample_bylevel': np.arange(0.4, 1.0, 0.1),
           'reg_alpha': [0.05, 0.1, 1, 10, 100],
           'reg_lambda': [0.1, 1, 2,  10, 100],
           'n_estimators': [100, 500, 1000],
           }

model_xgb = xgb.XGBClassifier(use_label_encoder=False)
clf_xgb = RandomizedSearchCV(estimator=model_xgb,
                         param_distributions=params_xgb,
                         scoring="accuracy",
                         n_iter=25,
                         cv=5,
                         verbose=1)

In [9]:
clf_xgb.fit(X_train, y_train, eval_metric = "logloss")

Fitting 5 folds for each of 25 candidates, totalling 125 fits


RandomizedSearchCV(cv=5,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None,
                                           enable_categorical=False, gamma=None,
                                           gpu_id=None, importance_type=None,
                                           interaction_constraints=None,
                                           learning_rate=None,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missing=nan,
                                           monotone_constraints...
                   param_distributions={'colsample_bylevel': array([0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                                        'colsample_bytree': array([0.4, 0.5, 0.6,

In [10]:
clf_xgb.best_score_

0.7953567748881512

### Random Forest


In [11]:
params_rf = {
    "n_estimators":[100, 500, 1000],
    "max_depth":[5, 8, 10],
    "criterion":["gini", "entropy"]
}

model_rf = RandomForestClassifier()

clf_rf = GridSearchCV(model_rf, param_grid=params_rf, cv = 5)

In [12]:
clf_rf.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [5, 8, 10],
                         'n_estimators': [100, 500, 1000]})

In [13]:
clf_rf.best_score_

0.7934000401008211

### LGBM

In [14]:
params_lgbm = { 
    'max_depth': [3, 5, 6, 7, 8, 10],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'subsample': np.arange(0.5, 1.0, 0.1),
    'colsample_bytree': np.arange(0.4, 1.0, 0.1),
    'reg_alpha': [0.05, 0.1, 1, 10, 100],
    'reg_lambda': [0.1, 1, 2,  10, 100],
    'n_estimators': [100, 500, 1000],
}
model_lgbm = LGBMClassifier()
clf_lgbm = RandomizedSearchCV(  model_lgbm, 
                                param_distributions=params_lgbm, 
                                n_iter = 25,
                                cv=5, 
                                scoring="accuracy",
                                verbose=1)

In [15]:
clf_lgbm.fit(X_train, y_train)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


RandomizedSearchCV(cv=5, estimator=LGBMClassifier(), n_iter=25,
                   param_distributions={'colsample_bytree': array([0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
                                        'learning_rate': [0.01, 0.1, 0.2, 0.3],
                                        'max_depth': [3, 5, 6, 7, 8, 10],
                                        'n_estimators': [100, 500, 1000],
                                        'reg_alpha': [0.05, 0.1, 1, 10, 100],
                                        'reg_lambda': [0.1, 1, 2, 10, 100],
                                        'subsample': array([0.5, 0.6, 0.7, 0.8, 0.9])},
                   scoring='accuracy', verbose=1)

In [16]:
clf_lgbm.best_score_

0.7938610010250194

### Catboost

In [17]:
params_cat = { 
    'max_depth': [3, 5, 6, 7, 8, 10],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'subsample': np.arange(0.5, 1.0, 0.1),
    'reg_lambda': [0.1, 1, 2,  10, 100],
    'n_estimators': [100, 500, 1000],
}
model_cat = CatBoostClassifier()
clf_cat = RandomizedSearchCV(   model_cat, 
                                param_distributions=params_cat, 
                                n_iter = 25,
                                cv=5, 
                                scoring="accuracy")

In [18]:
clf_cat.fit(X_train, y_train)

0:	learn: 0.6897329	total: 57.3ms	remaining: 28.6s
1:	learn: 0.6842267	total: 81.5ms	remaining: 20.3s
2:	learn: 0.6776245	total: 104ms	remaining: 17.3s
3:	learn: 0.6719320	total: 124ms	remaining: 15.4s
4:	learn: 0.6666768	total: 144ms	remaining: 14.2s
5:	learn: 0.6611428	total: 165ms	remaining: 13.6s
6:	learn: 0.6554731	total: 183ms	remaining: 12.9s
7:	learn: 0.6503922	total: 201ms	remaining: 12.4s
8:	learn: 0.6453641	total: 219ms	remaining: 12s
9:	learn: 0.6395503	total: 239ms	remaining: 11.7s
10:	learn: 0.6342992	total: 258ms	remaining: 11.5s
11:	learn: 0.6298405	total: 280ms	remaining: 11.4s
12:	learn: 0.6249266	total: 298ms	remaining: 11.2s
13:	learn: 0.6210791	total: 316ms	remaining: 11s
14:	learn: 0.6166530	total: 334ms	remaining: 10.8s
15:	learn: 0.6126591	total: 354ms	remaining: 10.7s
16:	learn: 0.6082372	total: 373ms	remaining: 10.6s
17:	learn: 0.6034563	total: 393ms	remaining: 10.5s
18:	learn: 0.5991410	total: 411ms	remaining: 10.4s
19:	learn: 0.5940928	total: 428ms	remaining

RandomizedSearchCV(cv=5,
                   estimator=<catboost.core.CatBoostClassifier object at 0x7f9d670a7910>,
                   n_iter=25,
                   param_distributions={'learning_rate': [0.01, 0.1, 0.2, 0.3],
                                        'max_depth': [3, 5, 6, 7, 8, 10],
                                        'n_estimators': [100, 500, 1000],
                                        'reg_lambda': [0.1, 1, 2, 10, 100],
                                        'subsample': array([0.5, 0.6, 0.7, 0.8, 0.9])},
                   scoring='accuracy')

In [19]:
clf_cat.best_score_

0.794090952103341

We can see that the Catboost model gave us the higher score. So we will use it to predict the test data.

In [31]:
y_test = clf_cat.predict(X_test)
test_df["Transported"] = y_test

In [32]:
test_df["Transported"] = test_df["Transported"].apply(bool)
test_df[["PassengerId", "Transported"]].to_csv("data/submissions.csv", index = False)

This model gave us 80,5 % accuracy in the test set.