In [1]:
import pandas as pd

from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
train = pd.read_csv('train_processed.csv')
test = pd.read_csv('test_processed.csv')

In [3]:
train.drop(columns=['Surname'], inplace=True)
test.drop(columns=['Surname'], inplace=True)


In [4]:
train_cat_cols = train.select_dtypes(exclude = 'number').columns
train[train_cat_cols] = train[train_cat_cols].astype('category')

test_cat_cols = test.select_dtypes(exclude='number').columns
test[test_cat_cols] = test[test_cat_cols].astype('category')


In [5]:
train.dtypes

HomePlanet              category
CryoSleep               category
Destination             category
Age                      float64
VIP                     category
Transported             category
Group                    float64
GroupSize                  int64
CabinDeck               category
CabinNum                 float64
CabinSide               category
log_RoomService          float64
log_FoodCourt            float64
log_ShoppingMall         float64
log_Spa                  float64
log_VRDeck               float64
AgeBins                 category
GroupBins               category
GroupSizeBins           category
CabinNumBins            category
log_RoomServiceBins     category
log_FoodCourtBins       category
log_ShoppingMallBins    category
log_SpaBins             category
log_VRDeckBins          category
dtype: object

In [6]:
test.dtypes

HomePlanet              category
CryoSleep               category
Destination             category
Age                      float64
VIP                     category
Group                    float64
GroupSize                  int64
CabinDeck               category
CabinNum                 float64
CabinSide               category
log_RoomService          float64
log_FoodCourt            float64
log_ShoppingMall         float64
log_Spa                  float64
log_VRDeck               float64
AgeBins                 category
GroupBins               category
GroupSizeBins           category
CabinNumBins            category
log_RoomServiceBins     category
log_FoodCourtBins       category
log_ShoppingMallBins    category
log_SpaBins             category
log_VRDeckBins          category
dtype: object

#### Train - Target split

In [7]:
target = train['Transported']
train.drop(columns=['Transported'], inplace = True)

In [8]:
target = pd.DataFrame(target)

#### Column Transformation

In [9]:
num_train = train.select_dtypes(include = 'number')
num_test = test.select_dtypes(include='number')
cat_train = train.select_dtypes(exclude = 'number')
cat_cols = cat_train.columns
cat_test = test.select_dtypes(exclude = 'number')

In [10]:
# Standard Scaler for numerical columns

num_transformer = StandardScaler()

num_train = num_transformer.fit_transform(num_train)
num_test = num_transformer.transform(num_test)

In [11]:
num_train = pd.DataFrame(num_train)
num_test = pd.DataFrame(num_test)

These will be used directly for CatBoost Classifer as it can handle categorical columns itself

In [12]:
catboost_train = pd.concat([num_train, cat_train], axis = 1)
catboost_test = pd.concat([num_test, cat_test], axis = 1)

In [13]:
# One Hot Encoder for categorical columns

cat_transformer = OneHotEncoder(drop = 'if_binary', sparse = False)

cat_train = cat_transformer.fit_transform(cat_train)
cat_test = cat_transformer.transform(cat_test)
target = cat_transformer.fit_transform(target)

In [14]:
cat_train = pd.DataFrame(cat_train)
cat_test = pd.DataFrame(cat_test)
target = pd.DataFrame(target)

In [15]:
# Concat back

train = pd.concat([num_train, cat_train], axis = 1)
test = pd.concat([num_test, cat_test], axis = 1)

#### Model Selection

We will choose from the following models:

* Logistic Regression
* K-Nearest Neighbours
* Naive Bayes
* Support Vector Machine
* Random Forest
* XGBoost
* CatBoost

In [16]:
RANDOM_STATE = 42

# We shall do catboost separately
classifiers = {
    'Logistic':LogisticRegression(random_state=RANDOM_STATE, n_jobs = -1),
    'KNN':KNeighborsClassifier(n_jobs=-1),
    'NaiveBayes':BernoulliNB(),
    'SVC':SVC(),
    'RandomForest':RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1),
}

param_grids = {
    'Logistic':{'penalty':['l1', 'l2'], 'max_iter':[100,200] },
    'KNN':{'n_neighbors':[3,5,7,9]},
    'NaiveBayes':{},
    'SVC':{'kernel':['linear', 'rbf'], 'C':[0.5, 1.0, 1.5]},
    'RandomForest': {'n_estimators': [50, 150, 250],
                     'max_depth': [4, 8, 12]},
    #'CatBoost': 
}

In [17]:
best_params = {}
best_score = {}

for key, classifier in classifiers.items():
    model = GridSearchCV(estimator=classifier, param_grid=param_grids[key], n_jobs=-1, cv = 5)

    model.fit(train, target)
    best_params[key] = model.best_params_
    best_score[key] = model.best_score_

best_score

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_opt

{'Logistic': 0.7509487549886149,
 'KNN': 0.7592371182729384,
 'NaiveBayes': 0.7380674580512986,
 'SVC': 0.7792518616111398,
 'RandomForest': 0.7630320058814538}

In [18]:
# CatBoost

key = 'CatBoost'
classifier = CatBoostClassifier(random_state=RANDOM_STATE, verbose=False, cat_features=list(cat_cols))
param_grid = {'n_estimators': [50, 150],
                           'max_depth': [4, 8, 12],
                           'learning_rate': [0.05, 0.1, 0.15]}

model = GridSearchCV(estimator=classifier,
                         param_grid=param_grid, n_jobs=-1, cv=5)

model.fit(catboost_train, target)
best_params[key] = model.best_params_
best_score[key] = model.best_score_

best_score


  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,
  self._init_pool(data, label, cat_featu

{'Logistic': 0.7509487549886149,
 'KNN': 0.7592371182729384,
 'NaiveBayes': 0.7380674580512986,
 'SVC': 0.7792518616111398,
 'RandomForest': 0.7630320058814538,
 'CatBoost': 0.7675195259897657}

SVC has performed the best with the following params

##### Hyperparameter Tuning for final SVC model

In [19]:
best_params['SVC']

{'C': 0.5, 'kernel': 'rbf'}

In [23]:
param_grid = {
    'kernel':['rbf'],
    'C':[0.1, 0.3, 0.5, 0.7, 0.9],
    'gamma':['scale', 'auto']
}

classifier = SVC()

In [24]:
model = GridSearchCV(estimator=classifier,
                     param_grid=param_grid, n_jobs=-1, cv=5)

model.fit(train, target)
best_params = model.best_params_
best_score = model.best_score_
best_score
best_params

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

{'C': 0.5, 'gamma': 'auto', 'kernel': 'rbf'}

In [25]:
best_score

0.7848867548840616

In [26]:
best_params

{'C': 0.5, 'gamma': 'auto', 'kernel': 'rbf'}

#### Final Prediction

In [29]:
model = SVC(**best_params)
model.fit(train, target)
prediction = model.predict(test)

  y = column_or_1d(y, warn=True)


In [30]:
prediction

array([1., 0., 1., ..., 1., 1., 1.])

In [41]:
submission = pd.DataFrame()
test_original = pd.read_csv('test.csv')
submission['PassengerId'] = test_original['PassengerId']

In [42]:
submission['Transported'] = prediction

In [43]:
submission

Unnamed: 0,PassengerId,Transported
0,0013_01,1.0
1,0018_01,0.0
2,0019_01,1.0
3,0021_01,1.0
4,0023_01,1.0
...,...,...
4272,9266_02,1.0
4273,9269_01,0.0
4274,9271_01,1.0
4275,9273_01,1.0


In [45]:
submission['Transported'] = submission['Transported'].astype('bool')

In [46]:
submission.to_csv('submission.csv', index = False)

LB - 0.78887

Model does not overfit