In [16]:
import warnings

import numpy as np
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

# Voting Classifier Using Titanic Dataset

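This notebook shows how to use scikit-learn's `VotingClassifier` to combine several models, using a pre-processed Titanic training set.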

### 1. Load Dataset

In [8]:
# Load the already pre-processed Titanic training set: features into X, labels into y.
# NOTE(review): the file names are spelled "tatanic" -- presumably this matches the
# files on disk; confirm before renaming anything.
X = np.load(file="tatanic_X_train.npy")
y = np.load(file="tatanic_y_train.npy")

In [9]:
# Display the first row of X to see what a single feature vector looks like.
X[0]

array([0.27345609, 0.01415106, 0.        , 1.        , 0.        ,
       0.125     , 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       1.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        ])

In [10]:
# Display the first ten entries of y (the target labels).
y[:10]

array([0., 1., 1., 1., 0., 0., 0., 0., 1., 1.])

### 2. Train Model

In [11]:
# Base learners for the ensemble: logistic regression, a decision tree and
# Gaussian naive Bayes. random_state=1 fixes the seed of the two estimators
# that accept one.
clf1, clf2, clf3 = (
    LogisticRegression(random_state=1),
    DecisionTreeClassifier(random_state=1),
    GaussianNB(),
)

# Combine them with hard voting: every base learner casts one vote (its
# predicted label) and the majority label becomes the ensemble's prediction.
eclf = VotingClassifier(
    estimators=[
        ('lr', clf1),
        ('dt', clf2),
        ('gnb', clf3),
    ],
    voting='hard',
)

In [18]:
# Mean score of the voting ensemble over 5 cross-validation folds.
# cross_val_score is imported in the notebook's import cell at the top, so this
# cell no longer carries its own import.
cross_val_score(eclf, X, y, cv=5).mean()

0.8009268075922046

Evaluate each base model on its own (without the ensemble) and compare the cross-validation scores.

In [19]:
# Baseline: logistic regression on its own, mean score over 5 folds.
cross_val_score(estimator=clf1, X=X, y=y, cv=5).mean()

0.8290420872214816

In [20]:
# Baseline: decision tree on its own, mean score over 5 folds.
cross_val_score(estimator=clf2, X=X, y=y, cv=5).mean()

0.7829175395162826

In [21]:
# Baseline: Gaussian naive Bayes on its own, mean score over 5 folds.
cross_val_score(estimator=clf3, X=X, y=y, cv=5).mean()

0.4600139655938551

From these results, the VotingClassifier (≈0.80) is less accurate than logistic regression alone (≈0.83). The low performance can be blamed on the Gaussian Naive Bayes model, whose cross-validation score (≈0.46) is below even 50%. In that case, let's try the VotingClassifier without Gaussian Naive Bayes.

### 3. Model Training Correction

In [22]:
# Rebuild the ensemble from only two base learners: logistic regression and a
# decision tree. random_state=1 fixes the seed of both estimators.
clf1, clf2 = (
    LogisticRegression(random_state=1),
    DecisionTreeClassifier(random_state=1),
)

# Hard voting over the two remaining models.
# NOTE(review): with exactly two hard voters every disagreement is a tie, and
# scikit-learn documents that ties are resolved in favour of the class label
# that sorts first. Consider voting='soft' if that bias is not wanted.
eclf = VotingClassifier(
    estimators=[
        ('lr', clf1),
        ('dt', clf2),
    ],
    voting='hard',
)

In [23]:
# Mean score of the rebuilt ensemble over 5 cross-validation folds.
cross_val_score(estimator=eclf, X=X, y=y, cv=5).mean()

0.8222687742017394

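This is better than the ensemble that included the GNB model (≈0.822 vs. ≈0.801), although it is still slightly below logistic regression on its own (≈0.829).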

In [25]:
# Candidate values of C to try for the logistic regression.
c_params = [0.1, 5.0, 7.0, 10.0, 15.0, 20.0, 100.0]

# Search space for the grid search. Each key is "<estimator name>__<parameter>",
# where the estimator names ('lr', 'dt') are the ones given to VotingClassifier.
params = dict(
    lr__solver=['liblinear'],
    lr__penalty=["l2"],
    lr__C=c_params,
    dt__max_depth=[10, 8, 7, 6, 5, 4, 3, 2],
    dt__min_samples_leaf=list(range(1, 10)),
)

In [28]:
# Try every combination in `params` on the voting ensemble with 5-fold
# cross-validation. GridSearchCV is imported in the notebook's import cell at
# the top, so this cell no longer carries its own import.
# fit() returns the fitted search object, so `grid` can be bound in a single
# statement instead of being assigned and then re-assigned.
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5).fit(X, y)

In [29]:
# Best mean cross-validated score found by the grid search.
grid.best_score_

0.84251968503937

In [30]:
# Parameter combination that produced the best score.
grid.best_params_

{'dt__max_depth': 10,
 'dt__min_samples_leaf': 5,
 'lr__C': 5.0,
 'lr__penalty': 'l2',
 'lr__solver': 'liblinear'}