## 2. 투표 분류기 (voting classifier)

In [6]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

In [7]:
# Load the saved Titanic training arrays from disk. They are passed to
# cross_val_score / GridSearchCV below as the feature matrix (X) and the
# target vector (y).
X = np.load("../datasets/ch13/titanic_X_train.npy")
y = np.load("../datasets/ch13/titanic_y_train.npy")

In [8]:
# Display the first row of X to see what a single sample looks like.
X[0]

array([0.27345609, 0.01415106, 0.        , 1.        , 0.        ,
       0.125     , 0.        , 0.        , 0.        , 1.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       1.        , 0.        , 0.        , 1.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        ])

In [9]:
# Display the first ten entries of y.
y[:10]

array([0., 1., 1., 1., 0., 0., 0., 0., 1., 1.])

In [10]:
# Three different base learners for the voting ensemble. random_state is fixed
# on the logistic regression and the tree so repeated runs give the same models.
clf1 = LogisticRegression(random_state=1)
clf2 = DecisionTreeClassifier(random_state=1, max_depth=4)
clf3 = GaussianNB()

# Hard voting: the ensemble predicts the class picked by the majority of the
# base learners. The tree is registered as 'dt' because it is a
# DecisionTreeClassifier (not a random forest); this also matches the name
# used by the grid-search cell further down.
eclf = VotingClassifier(
    estimators=[('lr', clf1), ('dt', clf2), ('gnb', clf3)], voting='hard')

In [11]:
from sklearn.model_selection import cross_val_score

# Score the three-model voting ensemble with 5-fold cross-validation and
# display the mean of the per-fold scores.
voting_scores = cross_val_score(eclf, X, y, cv=5)
voting_scores.mean()

0.8222941661905668

In [12]:
# Baseline: logistic regression alone, same 5-fold protocol.
lr_scores = cross_val_score(clf1, X, y, cv=5)
lr_scores.mean()

0.8290420872214816

In [13]:
# Baseline: the depth-limited decision tree alone, same 5-fold protocol.
tree_scores = cross_val_score(clf2, X, y, cv=5)
tree_scores.mean()

0.8223068621849807

In [14]:
# Baseline: Gaussian naive Bayes alone, same 5-fold protocol.
nb_scores = cross_val_score(clf3, X, y, cv=5)
nb_scores.mean()

0.4600139655938551

In [17]:
# Rebuild the hard-voting ensemble from only the logistic regression and the
# decision tree (the Gaussian naive Bayes model is left out) and re-score it
# with 5-fold cross-validation.
# The tree is named 'dt' because it is a DecisionTreeClassifier, not a forest.
eclf = VotingClassifier(
    estimators=[('lr', clf1), ('dt', clf2)], voting='hard')
cross_val_score(eclf, X, y, cv=5).mean()

0.8301783787215135

In [18]:
# Fresh base learners for hyper-parameter tuning. The step names 'lr' and 'dt'
# are the prefixes a parameter grid uses to address each model
# (e.g. "lr__C", "dt__max_depth").
clf1 = LogisticRegression(random_state=1)
clf2 = DecisionTreeClassifier(random_state=1)
eclf = VotingClassifier(
    estimators=[('lr', clf1), ('dt', clf2)],
    voting='hard',
)

In [19]:
# Candidate values for the logistic-regression C parameter.
c_params = [0.1, 5.0, 7.0, 10.0, 15.0, 20.0, 100.0]

# Parameter grid for the voting ensemble. Keys follow the
# "<estimator name>__<parameter>" convention, so "lr__*" entries configure the
# logistic regression and "dt__*" entries configure the decision tree.
params = {
    # logistic regression
    "lr__solver": ['liblinear'],
    "lr__penalty": ['l2'],
    "lr__C": c_params,
    # decision tree
    "dt__criterion": ['gini', 'entropy'],
    "dt__max_depth": [10, 8, 7, 6, 5, 4, 3, 2],
    "dt__min_samples_leaf": list(range(1, 10)),
}

In [20]:
from sklearn.model_selection import GridSearchCV

# Exhaustively evaluate every combination in `params` with 5-fold
# cross-validation; fit() returns the fitted search object itself.
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5).fit(X, y)

# Best mean cross-validated score found by the search.
grid.best_score_

0.8425569732749316

In [21]:
# Parameter combination that achieved the best cross-validated score.
grid.best_params_

{'dt__criterion': 'gini',
 'dt__max_depth': 10,
 'dt__min_samples_leaf': 5,
 'lr__C': 5.0,
 'lr__penalty': 'l2',
 'lr__solver': 'liblinear'}

## 3. 배깅과 랜덤 포레스트

In [22]:
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier

# Reload the training arrays so this section can be run on its own.
X = np.load("../datasets/ch13/titanic_X_train.npy")
y = np.load("../datasets/ch13/titanic_y_train.npy")

# Bag logistic-regression models. oob_score=True additionally computes an
# out-of-bag score when the ensemble is fitted. random_state is fixed so the
# bootstrap draws, and therefore the reported scores, are reproducible.
clf1 = LogisticRegression(random_state=1)
eclf = BaggingClassifier(clf1, oob_score=True, random_state=1)

from sklearn.model_selection import cross_val_score
cross_val_score(eclf, X, y, cv=5).mean()

  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]


0.8256586047102139

In [23]:
# Search over the ensemble size and the share of the training set drawn for
# each base model. Every max_samples value is a float on purpose:
# BaggingClassifier treats a float as a fraction of the rows but an int as an
# absolute number of rows, so an integer 1 would mean "one sample per
# estimator" rather than "100 % of the data".
params = {
    "n_estimators": [10, 20, 30, 40, 50, 55],
    "max_samples": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}

from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5)
grid = grid.fit(X, y)

# Best mean cross-validated score found by the search.
grid.best_score_

  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictions / predictions.sum(axis=1)[:, np.newaxis]
  warn(
  oob_decision_function = predictio

0.8290420872214816

In [24]:
# Parameter combination that achieved the best cross-validated score.
grid.best_params_

{'max_samples': 0.9, 'n_estimators': 50}

In [25]:
# Out-of-bag score of the refitted best estimator; available because the
# searched estimator was created with oob_score=True.
grid.best_estimator_.oob_score_

0.8267716535433071

In [26]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier

# Reload the training arrays so this cell can be run on its own.
X = np.load("../datasets/ch13/titanic_X_train.npy")
y = np.load("../datasets/ch13/titanic_y_train.npy")

# Random forest of 100 trees, each split considering 2 candidate features.
# - random_state pins the bootstrap/feature sampling so scores are reproducible.
# - n_jobs=-1 uses all available cores instead of a machine-specific count.
# - oob_score=True also computes an out-of-bag score when fitted.
eclf = RandomForestClassifier(
    n_estimators=100, max_features=2, n_jobs=-1, oob_score=True, random_state=1)

from sklearn.model_selection import cross_val_score
cross_val_score(eclf, X, y, cv=5).mean()

0.7987113565670031

In [27]:
from sklearn.model_selection import GridSearchCV

# Tune the forest size and the number of features considered per split.
# The last max_features candidate is the total number of columns in X,
# i.e. every feature is considered at each split.
n_features = X.shape[1]
params = {
    'n_estimators': [10, 20, 30, 50, 100],
    'max_features': [1, 2, 3, 4, 5, 6, 7, 10, 15, 20, 25, n_features],
}

# fit() returns the fitted search object itself.
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5).fit(X, y)

# Best mean cross-validated score found by the search.
grid.best_score_

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


0.8268075922046595

In [28]:
# Parameter combination that achieved the best cross-validated score.
grid.best_params_

{'max_features': 20, 'n_estimators': 30}

In [29]:
# Out-of-bag score of the refitted best forest; available because the
# searched estimator was created with oob_score=True.
grid.best_estimator_.oob_score_

0.813273340832396

#### ADABOOST

In [30]:
import numpy as np
# Reload the training arrays so the AdaBoost section can be run on its own.
X = np.load("../datasets/ch13/titanic_X_train.npy")
y = np.load("../datasets/ch13/titanic_y_train.npy")

In [31]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Boost 500 depth-2 decision trees. The base learner is passed positionally
# because its keyword was renamed from `base_estimator` to `estimator` in
# scikit-learn 1.2 and the old spelling was removed in 1.4; the positional
# form works on both sides of that rename.
eclf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=500)

In [32]:
from sklearn.model_selection import cross_val_score

# Score the boosted ensemble with 5-fold cross-validation and display the
# mean of the per-fold scores.
ada_scores = cross_val_score(eclf, X, y, cv=5)
ada_scores.mean()

0.784072875007935

In [33]:
from sklearn.ensemble import RandomForestClassifier

# Reference point for the boosted model: a random forest with the same number
# of trees (500), scored with the same 5-fold protocol.
eclf = RandomForestClassifier(n_estimators=500)
forest_scores = cross_val_score(eclf, X, y, cv=5)
forest_scores.mean()

0.8009395035866185

In [34]:
# The base learner is passed positionally because its keyword differs between
# scikit-learn versions (`base_estimator` before 1.2, `estimator` afterwards).
# n_estimators given here is only a starting value; the grid below overrides it.
eclf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=500)

# Nested parameters of the base tree are addressed as "<name>__<param>", where
# <name> is whatever this scikit-learn version calls the base-learner
# parameter. Detect it instead of hard-coding one spelling.
base = "estimator" if "estimator" in eclf.get_params(deep=False) else "base_estimator"

params = {base + '__criterion': ['gini', 'entropy'],
          base + '__max_features': [7, 8],
          base + '__max_depth': [1, 2],
          'n_estimators': [23, 24, 25, 26, 27],
          'learning_rate': [0.4, 0.45, 0.5, 0.55, 0.6]}

from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(estimator=eclf, param_grid=params, cv=5, n_jobs=7)
grid = grid.fit(X, y)

# Best mean cross-validated score found by the search.
grid.best_score_

0.8290547832158953

In [35]:
# Parameter combination that achieved the best cross-validated score.
grid.best_params_

{'base_estimator__criterion': 'entropy',
 'base_estimator__max_depth': 2,
 'base_estimator__max_features': 7,
 'learning_rate': 0.4,
 'n_estimators': 25}

In [36]:
# Feature importances reported by the refitted best AdaBoost model
# (presumably one value per column of X).
grid.best_estimator_.feature_importances_

array([0.28344405, 0.18024294, 0.0599647 , 0.0555602 , 0.05644651,
       0.11489068, 0.00250215, 0.00339794, 0.        , 0.01261134,
       0.        , 0.        , 0.        , 0.01336483, 0.        ,
       0.05899793, 0.06126528, 0.00815244, 0.00978454, 0.        ,
       0.0061059 , 0.02152568, 0.02192603, 0.02329949, 0.00651738,
       0.        , 0.        ])