In [1]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

In [2]:
# Load the feature array X and the target array y from the working directory.
# NOTE(review): the file names are spelled "tatanic" - presumably "titanic";
# confirm against the files on disk before renaming anything.
X, y = np.load("./tatanic_X_train.npy"), np.load("./tatanic_y_train.npy")

In [3]:
# Preview the first five rows of the feature array.
X[:5]

array([[0.27345609, 0.01415106, 0.        , 1.        , 0.        ,
        0.125     , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.473882  , 0.13913574, 0.        , 0.        , 1.        ,
        0.125     , 0.25      , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.32356257, 0.01546857, 0.        , 1.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 1.    

In [4]:
# Preview the first five target values.
y[:5]

array([0., 1., 1., 1., 0.])

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. The split is seeded (random_state=1,
# the same seed the estimators in this notebook use) so that every score
# computed from x_train / x_test is reproducible after Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [8]:
# Baseline classifiers. The estimators that accept a seed are created with
# random_state=1.
clflog = LogisticRegression(random_state=1)
clfdt = DecisionTreeClassifier(random_state=1)
clfgn = GaussianNB()

models = [clflog, clfdt, clfgn]

# Fit each baseline on the training split and print its score on the held-out
# split, followed by a separator line.
for model in models:
    model.fit(x_train, y_train)
    # score() runs predict() internally, so no separate predict() call is needed.
    score = model.score(x_test, y_test)
    print(score)
    print('-'*20)

0.8277153558052435
--------------------
0.7677902621722846
--------------------
0.7640449438202247
--------------------




In [9]:

# Hard-voting ensemble over the three baseline classifiers.
# The decision tree is registered under the key 'dt' so the key matches the
# estimator type; the key is the name used to address that sub-estimator.
eclf_h = VotingClassifier(
    estimators=[('lr', clflog), ('dt', clfdt), ('gnb', clfgn)],
    voting='hard',
)
eclf_h.fit(x_train, y_train)
# score() runs predict() internally, so no separate predict() call is needed.
score = eclf_h.score(x_test, y_test)
print(score)

0.8164794007490637




In [10]:

# Soft-voting ensemble over the three baseline classifiers.
# The decision tree is registered under the key 'dt' so the key matches the
# estimator type; the key is the name used to address that sub-estimator.
eclf_s = VotingClassifier(
    estimators=[('lr', clflog), ('dt', clfdt), ('gnb', clfgn)],
    voting='soft',
)
eclf_s.fit(x_train, y_train)
# score() runs predict() internally, so no separate predict() call is needed.
score = eclf_s.score(x_test, y_test)
print(score)

0.8164794007490637




In [11]:
# Re-create unfitted copies of the three baselines and of both voting
# ensembles so they can all be evaluated together.
clflog = LogisticRegression(random_state=1)
clfdt = DecisionTreeClassifier(random_state=1)
clfgn = GaussianNB()
# The decision tree is registered under the key 'dt' so the key matches the
# estimator type.
eclf_h = VotingClassifier(
    estimators=[('lr', clflog), ('dt', clfdt), ('gnb', clfgn)],
    voting='hard',
)
eclf_s = VotingClassifier(
    estimators=[('lr', clflog), ('dt', clfdt), ('gnb', clfgn)],
    voting='soft',
)


In [12]:
# Evaluation order: the three baselines, then the hard- and soft-voting ensembles.
models = [clflog, clfdt, clfgn, eclf_h, eclf_s]

In [13]:
# Fit every entry of `models` on the training split and print its score on the
# held-out split, followed by a separator line.
for model in models:
    model.fit(x_train, y_train)
    # score() runs predict() internally, so no separate predict() call is needed.
    score = model.score(x_test, y_test)
    print(score)
    print('-'*20)

0.8277153558052435
--------------------
0.7677902621722846
--------------------
0.7640449438202247
--------------------
0.8164794007490637
--------------------
0.8164794007490637
--------------------




In [14]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the training split for every entry of `models`.
# For each one, print the per-fold scores, their mean, and a separator line.
for estimator in models:
    fold_scores = cross_val_score(estimator, x_train, y_train, cv=5)
    print(fold_scores, fold_scores.mean(), '-'*20, sep='\n')

[0.76       0.848      0.816      0.83870968 0.82113821]
0.8167695777602937
--------------------
[0.752      0.784      0.768      0.77419355 0.78861789]
0.7733622869131918
--------------------
[0.688      0.768      0.704      0.81451613 0.76422764]
0.7477487542617361
--------------------
[0.776      0.824      0.776      0.87096774 0.80487805]
0.8103691581431944
--------------------
[0.776      0.824      0.792      0.84677419 0.80487805]
0.808730448465775
--------------------




In [15]:
# Two-estimator setup: logistic regression plus decision tree, both seeded.
clf1 = LogisticRegression(random_state=1)
clf2 = DecisionTreeClassifier(random_state=1)

# eclf1 votes 'hard', eclf2 votes 'soft'; each gets its own estimators list.
eclf1, eclf2 = (
    VotingClassifier(estimators=[('lr', clf1), ('dt', clf2)], voting=mode)
    for mode in ('hard', 'soft')
)

In [16]:
# Compare the two base estimators with their hard- and soft-voting ensembles.
models = [clf1, clf2, eclf1, eclf2]

# 5-fold cross-validation on the training split; for each entry print the
# per-fold scores, their mean, and a separator line.
# NOTE(review): in the recorded run the soft-voting scores are identical to the
# decision tree's. Presumably the fully grown tree emits 0/1 probabilities that
# dominate the average - confirm with predict_proba before relying on eclf2.
for estimator in models:
    fold_scores = cross_val_score(estimator, x_train, y_train, cv=5)
    print(fold_scores, fold_scores.mean(), '-'*20, sep='\n')

[0.76       0.848      0.816      0.83870968 0.82113821]
0.8167695777602937
--------------------
[0.752      0.784      0.768      0.77419355 0.78861789]
0.7733622869131918
--------------------
[0.8        0.808      0.8        0.7983871  0.82926829]
0.8071310778914242
--------------------
[0.752      0.784      0.768      0.77419355 0.78861789]
0.7733622869131918
--------------------




In [17]:
# Hard-voting ensemble of clf1 and clf2; the keys 'lr' and 'dt' are the
# prefixes used by the "lr__*" / "dt__*" entries of the parameter grid.
eclf = VotingClassifier(estimators=[('lr', clf1), ('dt', clf2)], voting='hard')

In [18]:
# Candidate values for the logistic regression's C parameter.
c_params = [0.1, 5.0, 7.0, 10.0, 15.0, 20.0, 100.0]

# Parameter grid for the voting ensemble. Keys follow the
# "<estimator key>__<parameter>" convention: "lr__" entries target the
# logistic regression and "dt__" entries target the decision tree.
params = {
    "lr__solver": ['liblinear'],
    "lr__penalty": ["l2"],
    "lr__C": c_params,
    "dt__criterion": ["gini", "entropy"],
    "dt__max_depth": [10, 8, 7, 6, 5, 4, 3, 2],
    "dt__min_samples_leaf": [1, 2, 3, 4, 5, 6, 7, 8, 9],
}

In [19]:
# Combine cross-validation (cv), grid search (param_grid) and the ensemble
# (estimator) in a single GridSearchCV run.
# NOTE(review): the search is fitted on the full X, y rather than on
# x_train, y_train, so x_test is not held out from tuning - confirm intended.

from sklearn.model_selection import GridSearchCV

# fit() returns the fitted search object, so construction and fitting chain.
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5).fit(X, y)

In [20]:
# Best cross-validated score found by the ensemble grid search.
grid.best_score_

0.84251968503937

In [21]:
# Parameter combination that produced the best score in the ensemble grid search.
grid.best_params_

{'dt__criterion': 'gini',
 'dt__max_depth': 10,
 'dt__min_samples_leaf': 5,
 'lr__C': 5.0,
 'lr__penalty': 'l2',
 'lr__solver': 'liblinear'}

In [22]:
# Grid search over the logistic regression (clf1) alone, using the same C
# candidates as the ensemble search. This rebinds `params` and `grid`.
# NOTE(review): fitted on the full X, y rather than the training split -
# confirm intended.
c_params = [0.1, 5.0, 7.0, 10.0, 15.0, 20.0, 100.0]

params = {
    "solver": ['liblinear'],
    "penalty": ["l2"],
    "C": c_params,
}
# fit() returns the fitted search object, so construction and fitting chain.
grid = GridSearchCV(clf1, param_grid=params, cv=5).fit(X, y)

In [23]:
# Best cross-validated score found by the logistic-regression grid search.
grid.best_score_

0.8267716535433071

In [24]:
# Parameter combination that produced the best score in the logistic-regression grid search.
grid.best_params_

{'C': 5.0, 'penalty': 'l2', 'solver': 'liblinear'}