<a href="https://colab.research.google.com/github/kylo-dev/gachon-machinelearning/blob/main/5%EC%A3%BC%EC%B0%A8_%EC%8B%A4%EC%8A%B5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 5주차 실습

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

wine = pd.read_csv('https://bit.ly/wine_csv_data')

data = wine[['alcohol', 'sugar', 'pH']].to_numpy()
target = wine['class'].to_numpy()

train_input, test_input, train_target, test_target = train_test_split(data, target, test_size=0.2, random_state=42)

## 결정트리

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
params = {'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
          'max_depth': range(5, 20, 1),
          'min_samples_split': range(2, 100, 10)
          }

gs = GridSearchCV(DecisionTreeClassifier(random_state=42), params, n_jobs=-1)
gs.fit(train_input, train_target)

In [3]:
print(gs.best_params_)

{'max_depth': 14, 'min_impurity_decrease': 0.0004, 'min_samples_split': 12}


In [4]:
print(np.max(gs.cv_results_['mean_test_score']))

0.8683865773302731


In [5]:
print(gs.score(test_input, test_target))

0.8615384615384616


## 랜덤포레스트

In [7]:
from sklearn.ensemble import RandomForestClassifier


# 랜덤 포레스트 모델 초기화
rf = RandomForestClassifier()

# 탐색할 하이퍼파라미터 그리드 설정
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 20],
    'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]
}

# GridSearchCV를 사용하여 최적 모델 찾기
grid_search = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1)
grid_search.fit(train_input, train_target)

In [8]:
# 최적 모델 및 최적 파라미터 출력
best_rf_model = grid_search.best_estimator_
best_rf_params = grid_search.best_params_

print("Best Random Forest Model:")
print(best_rf_model)

print("Best Parameters:")
print(best_rf_params)

Best Random Forest Model:
RandomForestClassifier(max_depth=20, min_impurity_decrease=0.0001,
                       n_estimators=300)
Best Parameters:
{'max_depth': 20, 'min_impurity_decrease': 0.0001, 'n_estimators': 300}


In [9]:
print(grid_search.score(test_input, test_target))

0.8792307692307693


In [10]:
print(best_rf_model.score(test_input, test_target))

0.8792307692307693


## 엑스트라 트리

In [11]:
from sklearn.ensemble import ExtraTreesClassifier

et = ExtraTreesClassifier(n_jobs=-1, random_state=42)

# 탐색할 하이퍼파라미터 그리드 설정
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 20],
    'min_impurity_decrease': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]
}

# GridSearchCV를 사용하여 최적 모델 찾기
grid_search = GridSearchCV(et, param_grid, cv=5, n_jobs=-1)
grid_search.fit(train_input, train_target)

In [12]:
# 최적 모델 및 최적 파라미터 출력
best_rf_model = grid_search.best_estimator_
best_rf_params = grid_search.best_params_

print("Best Random Forest Model:")
print(best_rf_model)

print("Best Parameters:")
print(best_rf_params)

Best Random Forest Model:
ExtraTreesClassifier(max_depth=5, min_impurity_decrease=0.0001, n_jobs=-1,
                     random_state=42)
Best Parameters:
{'max_depth': 5, 'min_impurity_decrease': 0.0001, 'n_estimators': 100}


In [13]:
# 최적의 모델로 score 값 구하
print(grid_search.score(test_input, test_target))

0.7384615384615385


## 그래디언트 부스팅

In [14]:
from sklearn.ensemble import GradientBoostingClassifier

param_grid = {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5]
}

gb = GradientBoostingClassifier(random_state=42)

# GridSearchCV를 사용하여 최적 모델 찾기
grid_search = GridSearchCV(gb, param_grid, cv=5, n_jobs=-1)
grid_search.fit(train_input, train_target)

In [15]:
# 최적 모델 및 최적 파라미터 출력
best_rf_model = grid_search.best_estimator_
best_rf_params = grid_search.best_params_

print("Best Random Forest Model:")
print(best_rf_model)

print("Best Parameters:")
print(best_rf_params)

Best Random Forest Model:
GradientBoostingClassifier(learning_rate=0.2, max_depth=5, n_estimators=300,
                           random_state=42)
Best Parameters:
{'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 300}


In [16]:
# 최적의 모델로 score 값 구하
print(grid_search.score(test_input, test_target))

0.8792307692307693


## 히스토그램기반 그래디언트 부스팅

In [17]:
from sklearn.ensemble import HistGradientBoostingClassifier

param_grid = {
        'learning_rate': [0.01, 0.1, 0.2],
        'max_iter': [100, 200, 300]
}

hgb = HistGradientBoostingClassifier(random_state=42)

# GridSearchCV를 사용하여 최적 모델 찾기
grid_search = GridSearchCV(hgb, param_grid, cv=5, n_jobs=-1)
grid_search.fit(train_input, train_target)

In [None]:
# 최적 모델 및 최적 파라미터 출력
best_rf_model = grid_search.best_estimator_
best_rf_params = grid_search.best_params_

print("Best Random Forest Model:")
print(best_rf_model)

print("Best Parameters:")
print(best_rf_params)

In [19]:
# 최적의 모델로 score 값 구하
print(grid_search.score(test_input, test_target))

0.8669230769230769
