Stacking ensemble on Titanic dataset

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [2]:
# data preparation

titanic = pd.read_excel('/content/drive/MyDrive/datasets/Titanic.xlsx')

# we dont need 'name' column so we drop it
titanic.drop(labels='Name', axis=1, inplace=True)

# get dummies
titanic['Sex'] = pd.get_dummies(titanic['Sex'], drop_first=True)


X = titanic.iloc[:, 1:].values
y = titanic.iloc[:, 0].values.reshape(-1,1)

# training and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y)

In [3]:
# scaling for kNN Classifier

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

**Creation of base classifiers**

In [4]:
#kNN classifier
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

knn = KNeighborsClassifier()


params_knn = {'n_neighbors': np.arange(1, 25)}

# GridSearch:
knn_gs = GridSearchCV(estimator = knn,
                      param_grid = params_knn,
                      cv=5)

# fitting
knn_gs.fit(X_train_scaled, y_train.ravel())


knn_best = knn_gs.best_estimator_


print('Best parameter for kNN: ', knn_gs.best_params_)

Best parameter for kNN:  {'n_neighbors': 9}


**Decision Tree**

In [5]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV


dtree = DecisionTreeClassifier()


params = {
    'criterion': ['gini', 'entropy'],
    'max_depth' : range(3,8),
    'min_samples_split': range(2,10),
    'min_samples_leaf': range(1,10)
}

# GridSearch:
dtree_gs = GridSearchCV(
    estimator = dtree,
    param_grid = params,
    cv = 5,
    scoring = 'accuracy'
)


dtree_gs.fit(X_train, y_train.ravel())

dtree_best = dtree_gs.best_estimator_

print('Best parameters for decision tree: ',dtree_gs.best_params_)

Best parameters for decision tree:  {'criterion': 'gini', 'max_depth': 4, 'min_samples_leaf': 6, 'min_samples_split': 2}


**Logistic Regression**

In [6]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

log_reg = LogisticRegression(max_iter=200)


log_reg.fit(X_train, y_train.ravel())

In [8]:
# Testing the three models and listing the accuracy values
print('k-nn: {}'.format(knn_best.score(X_test_scaled, y_test)))
print('Decision tree: {}'.format(dtree_best.score(X_test, y_test)))
print('Logistic regression: {}'.format(log_reg.score(X_test, y_test)))

k-nn: 0.8122866894197952
Decision tree: 0.8020477815699659
Logistic regression: 0.78839590443686


**Stacking classifier**

In [9]:
# ENSEMBLE: VOTING CLASSIFIER
from sklearn.ensemble import VotingClassifier

# creating a dictionary containing the models
estimators=[('k-nn', knn_best), ('Decision tree', dtree_best), ('Logistic regression', log_reg)]

# stacking classifier with base models
ensemble_VC = VotingClassifier(estimators,
                               voting='soft')

In [10]:
ensemble_VC.fit(X_train, y_train.ravel())

In [11]:
#evaluation
ensemble_VC.score(X_test, y_test)

0.7986348122866894