<a href="https://colab.research.google.com/github/jungsoo2004/eqsim/blob/master/hands_on_ml_ch6_exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#문제 7번

In [1]:
import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
moon_X, moon_y = make_moons(n_samples=1000, noise=0.4)

train_X, test_X = train_test_split(moon_X, test_size=0.2, random_state=42)
train_y, test_y = train_test_split(moon_y, test_size=0.2, random_state=42)

In [10]:
param_grid = {
    'max_leaf_nodes': [2, 4, 6, 8],
    'max_depth': [2, 3, 4],
    'min_samples_leaf': [50, 100, 200, 400]
}

grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(train_X, train_y)

In [24]:
grid_search.best_params_

{'max_depth': 2, 'max_leaf_nodes': 4, 'min_samples_leaf': 50}

In [25]:
best_dt = DecisionTreeClassifier(random_state=42, max_depth=2, max_leaf_nodes=4, min_samples_leaf=50)
best_dt.fit(train_X, train_y)
prediction = best_dt.predict(test_X)

correct_answer = (prediction == test_y)
accuracy = np.sum(correct_answer) / len(test_y)
accuracy

0.82

#문제 8번

In [30]:
from sklearn.model_selection import ShuffleSplit

X_subsets = []
y_subsets = []

splitter = ShuffleSplit(n_splits=1000, test_size=100)
for train_index, test_index in splitter.split(moon_X):
  X_subset = moon_X[test_index]
  X_subsets.append(X_subset)

  y_subset = moon_y[test_index]
  y_subsets.append(y_subset)

len(X_subsets), len(y_subsets)

(1000, 1000)

In [33]:
sum = 0

for i in range(len(X_subsets)):
  dt = DecisionTreeClassifier(random_state=42, max_depth=2, max_leaf_nodes=4, min_samples_leaf=50)
  dt.fit(X_subsets[i], y_subsets[i])

  dt_pred = dt.predict(test_X)
  correct_answer = (prediction == test_y)
  accuracy = np.sum(correct_answer) / len(test_y)
  sum += accuracy

sum / len(X_subsets)

0.8200000000000152

In [45]:
pred_results = []

for i in range(len(X_subsets)):
  dt = DecisionTreeClassifier(random_state=42, max_depth=2, max_leaf_nodes=4, min_samples_leaf=50)
  dt.fit(X_subsets[i], y_subsets[i])

  dt_pred = dt.predict(test_X)
  pred_results.append(dt_pred)


integrated_result = []
for i in range(len(test_X)):
  count_sum = 0
  for j in range(len(X_subsets)):
    count_sum += pred_results[j][i]
  if count_sum / len(X_subsets) > 0.5:
    integrated_result.append(1)
  else:
    integrated_result.append(0)

pred_results = np.array(pred_results)
correct_answer = (integrated_result == test_y)
accuracy = np.sum(correct_answer) / len(test_y)
accuracy

0.72

In [47]:
correct_answer

array([False,  True,  True,  True,  True,  True, False, False,  True,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True, False,  True,
        True, False, False, False, False,  True,  True,  True, False,
        True,  True,  True, False, False, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
       False,  True, False,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True, False,  True,  True,
        True,  True, False,  True,  True, False, False, False,  True,
       False,  True,  True, False, False,  True,  True, False,  True,
       False,  True,  True,  True,  True, False,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True, False,  True,
        True, False,  True,  True,  True,  True,  True, False,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,

예상 외로 data 100개를 사용한 dt와 1000개 사용한 dt의 성능 차이가 거의 나지 않음. 그래서인지 bagging 방식으로 만든 모델이 오히려 성능이 떨어짐