In [35]:
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

In [20]:
train = pd.read_csv('train_processed.csv')
test = pd.read_csv('test_processed.csv')

In [21]:
train.drop(columns=['Age', 'Group', 'GroupSize', 'CabinNum', 'log_RoomService', 'log_FoodCourt', 'log_ShoppingMall', 'log_Spa', 'log_VRDeck', 'Surname'], inplace=True)
test.drop(columns=['Age', 'Group', 'GroupSize', 'CabinNum', 'log_RoomService',
           'log_FoodCourt', 'log_ShoppingMall', 'log_Spa', 'log_VRDeck', 'Surname'], inplace=True)


In [22]:
train = train.astype('category')
test = test.astype('category')

In [23]:
train.dtypes

HomePlanet              category
CryoSleep               category
Destination             category
VIP                     category
Transported             category
CabinDeck               category
CabinSide               category
AgeBins                 category
GroupBins               category
GroupSizeBins           category
CabinNumBins            category
log_RoomServiceBins     category
log_FoodCourtBins       category
log_ShoppingMallBins    category
log_SpaBins             category
log_VRDeckBins          category
dtype: object

In [24]:
test.dtypes

HomePlanet              category
CryoSleep               category
Destination             category
VIP                     category
CabinDeck               category
CabinSide               category
AgeBins                 category
GroupBins               category
GroupSizeBins           category
CabinNumBins            category
log_RoomServiceBins     category
log_FoodCourtBins       category
log_ShoppingMallBins    category
log_SpaBins             category
log_VRDeckBins          category
dtype: object

#### Train - Target split

In [25]:
target = train['Transported']
train.drop(columns=['Transported'], inplace = True)

#### One Hot Encoding

In [26]:
train_encoded = pd.get_dummies(train, drop_first = True)
test_encoded = pd.get_dummies(test, drop_first = True)

In [27]:
train_encoded.head()

Unnamed: 0,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,...,"GroupBins_(7500.0, 9280.0]","GroupSizeBins_(1, 8]","CabinNumBins_(1175.0, 1894.0]","CabinNumBins_(300.0, 700.0]","CabinNumBins_(700.0, 1175.0]","log_RoomServiceBins_(0.0, 9.57]","log_FoodCourtBins_(0.0, 10.303]","log_ShoppingMallBins_(0.0, 10.064]","log_SpaBins_(0.0, 10.017]","log_VRDeckBins_(0.0, 10.091]"
0,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,1,1,1,1
2,1,0,0,0,1,1,0,0,0,0,...,0,1,0,0,0,1,1,0,1,1
3,1,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,1,1,1,1
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,1,1,1,1


In [28]:
test_encoded.head()

Unnamed: 0,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_True,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_True,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,...,"GroupBins_(7500.0, 9280.0]","GroupSizeBins_(1, 8]","CabinNumBins_(1175.0, 1894.0]","CabinNumBins_(300.0, 700.0]","CabinNumBins_(700.0, 1175.0]","log_RoomServiceBins_(0.0, 9.57]","log_FoodCourtBins_(0.0, 10.303]","log_ShoppingMallBins_(0.0, 10.064]","log_SpaBins_(0.0, 10.017]","log_VRDeckBins_(0.0, 10.091]"
0,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
2,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,1,0,1,1
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0


#### Model Selection

We will choose from the following models:

* Logistic Regression
* K-Nearest Neighbours
* Naive Bayes
* Support Vector Machine
* Random Forest
* XGBoost
* CatBoost

In [36]:
RANDOM_STATE = 42

classifiers = {
    'Logistic':LogisticRegression(random_state=RANDOM_STATE, n_jobs = -1),
    'KNN':KNeighborsClassifier(n_jobs=-1),
    'NaiveBayes':BernoulliNB(),
    'SVC':SVC(),
    'RandomForest':RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1),
    'CatBoost':CatBoostClassifier(random_state=RANDOM_STATE, verbose = False)
}

param_grids = {
    'Logistic':{'penalty':['l1', 'l2'], 'max_iter':[100,200] },
    'KNN':{'n_neighbors':[3,5,7,9]},
    'NaiveBayes':{},
    'SVC':{'kernel':['linear', 'rbf'], 'C':[0.5, 1.0, 1.5]},
    'RandomForest': {'n_estimators': [50, 150, 250],
                     'max_depth': [4, 8, 12]},
    'CatBoost': {'n_estimators': [50, 150],
                 'max_depth': [4, 8, 12],
                 'learning_rate': [0.05, 0.1, 0.15]}
}

In [37]:
best_params = {}
best_score = {}

for key, classifier in classifiers.items():
    model = GridSearchCV(estimator=classifier, param_grid=param_grids[key], n_jobs=-1, cv = 5)

    model.fit(train_encoded, target)
    best_params[key] = model.best_params_
    best_score[key] = model.best_score_

best_score

10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/home/laksie98/opt/anaconda3/envs/av/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/laksie98/opt/anaconda3/envs/av/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/laksie98/opt/anaconda3/envs/av/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 61, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' o

{'Logistic': 0.718967026669693,
 'KNN': 0.7186269637656657,
 'NaiveBayes': 0.7347300241994559,
 'SVC': 0.7410578146640631,
 'RandomForest': 0.7455450700804861,
 'CatBoost': nan}