In [None]:
# Font download (the runtime must be restarted afterwards).
# The shell commands are wrapped in a string literal, so they do not execute as written.
'''
!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf
'''

In [None]:
# 코랩에서 그래프 안 나오는 경우 사용
%matplotlib inline

# 데이터 처리
import pandas as pd

# 행렬 연산
import numpy as nd

# 데이터 시각화
import matplotlib
import matplotlib.pyplot as plt

# 경고 메시지 무시
import warnings

# 글꼴 설정 ("NanumBarunGothic")
plt.rc("font", family="NanumBarunGothic")

# 실행결과 경고메시지 출력 제외
warnings.filterwarnings("ignore")

**1. 반복학습 및 loss 최저점 구하기, 조기종료**

In [None]:
# 데이터 split하기 위한 도구
from sklearn.model_selection import train_test_split

# 데이터 고차원화
from sklearn.preprocessing import PolynomialFeatures

# 데이터 rescaling
from sklearn.preprocessing import StandardScaler

# 사이킷런 전처리 파이프라인 만들기 위해
from sklearn.pipeline import Pipeline

# 행렬 연산 라이브러리
import numpy as np

# 사이킷런의 linear model 중 stochastic gradient descent
from sklearn.linear_model import SGDRegressor

# 사이킷런의 loss function 중 하나인 mean squared error 모듈
from sklearn.metrics import mean_squared_error

# 시스템 접근 모듈
import os

# Seed NumPy's global RNG so the generated samples are reproducible
np.random.seed(42)

'''
# 그림을 저장할 폴더
PROJECT_ROOT_DIR = "."

# 피규어 저장
def save_fig(fig_id, tight_layout=True):
  path = os.path.join(PROJECT_ROOT_DIR, "images.png")
  if tight_layout:
    plt.tight_layout()
  plt.savefig(path, format="png", dpi=300)
'''

# Number of samples to generate
m = 100

# Inputs: m uniform draws rescaled to the range [-3, 3)
X = np.random.rand(m, 1) * 6 - 3

# Targets: quadratic function of X plus uniform noise (drawn after X, so the RNG order is unchanged)
quadratic_term = 0.5 * X ** 2
noise = np.random.rand(m, 1)
y = 2 + X + quadratic_term + noise

# Only the first 50 samples are used; ravel() flattens the targets to 1-D.
# Half of them become the validation set; random_state fixes the shuffle.
first_half_X = X[:50]
first_half_y = y[:50].ravel()
X_train, X_val, y_train, y_val = train_test_split(first_half_X, first_half_y, test_size=0.5, random_state=10)

# Report the size of each split
for subset in (X_train, X_val, y_train, y_val):
  print(len(subset))

In [None]:
# Preprocessing pipeline: polynomial feature expansion of degree 90, then standardisation
poly_scaler = Pipeline([
    ("poly_features", PolynomialFeatures(degree=90, include_bias=False)),
    ("std_scaler", StandardScaler()),
])

# Fit the pipeline on X_train and transform it in the same call
X_train_poly_scaled = poly_scaler.fit_transform(X_train)

# The pipeline is already fitted on X_train, so the validation inputs are only transformed
X_val_poly_scaled = poly_scaler.transform(X_val)

# SGDRegressor model definition
# penalty: regularisation term, one of {'l2', 'l1', 'elasticnet'}; None is passed here, i.e. no penalty
# eta0: initial learning rate (0.0005 here)
# warm_start=True: a later fit() call reuses the previous solution instead of re-initialising the weights
# learning_rate="constant": keep using the constant eta0 as the learning rate
# NOTE(review): with max_iter=1 each fit() call presumably runs a single epoch — confirm
sgd_reg = SGDRegressor(max_iter=1, penalty=None, eta0=0.0005, warm_start=True, learning_rate="constant", random_state=42)

# Number of training rounds
n_epochs = 500

# Lists collecting the training and validation errors of every round
train_errors, val_errors = [], []

# Training loop
for epoch in range(n_epochs):
  # Fit on the polynomial-expanded, scaled inputs and the targets
  sgd_reg.fit(X_train_poly_scaled, y_train)

  # Predict on the training and validation inputs with the current model
  y_train_predict = sgd_reg.predict(X_train_poly_scaled)
  y_val_predict = sgd_reg.predict(X_val_poly_scaled)

  # Compute the MSE of each set and store it
  train_errors.append(mean_squared_error(y_train, y_train_predict))
  val_errors.append(mean_squared_error(y_val, y_val_predict))

# Index of the round with the lowest validation error
best_epoch = np.argmin(val_errors)

# val_errors holds MSE values, so the square root gives the RMSE at the best round
best_val_rmse = np.sqrt(val_errors[best_epoch])

In [None]:
# Disabled early-stopping plot, kept as a string literal so none of it runs.
# It calls save_fig, which is itself only defined inside a string literal in an earlier cell,
# so both would have to be re-enabled together.
'''
# plot에서 설정할 값들
# xytext : 텍스트 위치
# xy : 화살표 위치
# ha : horizontal alignment
# arrowprops : 화살표 속성들

plt.annotate("최선의 모델",
             xy=(best_epoch, best_val_rmse),
             xytext=(best_epoch, best_val_rmse + 1),
             ha="center",
             arrowprops=dict(facecolor='black', shrink=0.05),
             fontsize=16,
)

# 그래프를 더 보기 좋게 만들기 위해
best_val_rmse -= 0.03
plt.plot([0, n_epochs], [best_val_rmse, best_val_rmse], "k:", linewidth=2)

# val_errors와 train_errors의 RMSE
plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="검증 세트")
plt.plot(np.sqrt(train_errors), "r--", linewidth=2, label="훈련 세트")

# 각 화살표 네이밍
plt.legend(loc="upper right", fontsize=14)

# xlabel
plt.xlabel("에포크", fontsize=14)

# ylabel
plt.ylabel("RMSE", fontsize=14)
save_fig("early_stopping_plot")
plt.show()
'''

In [None]:
from copy import deepcopy

from sklearn.base import clone

# Fresh regressor; warm_start=True makes every fit() call continue from the previous weights
sgd_reg = SGDRegressor(max_iter=1, warm_start=True, penalty=None, learning_rate="constant", eta0=0.0005, random_state=42)
minimum_val_error = float("inf")

best_epoch = None
best_model = None

for epoch in range(1000):
  sgd_reg.fit(X_train_poly_scaled, y_train)
  y_val_predict = sgd_reg.predict(X_val_poly_scaled)
  val_error = mean_squared_error(y_val, y_val_predict)

  # Keep a snapshot whenever the validation error improves
  if val_error < minimum_val_error:
    minimum_val_error = val_error
    best_epoch = epoch
    # deepcopy preserves the fitted coefficients; sklearn.base.clone would only copy the
    # hyper-parameters and return an unfitted estimator
    best_model = deepcopy(sgd_reg)

print(minimum_val_error)
print(best_epoch)
print(best_model)

**2. 로지스틱 회귀 실습**

In [None]:
# Evaluate the logistic (sigmoid) function on 100 points in [-10, 10]
t = np.linspace(-10, 10, 100)
sig = 1 / (1 + np.exp(-t))
plt.figure(figsize=(9, 3))

# Horizontal guides, drawn bottom-up: solid line at 0, dotted lines at 0.5 and 1
for level, line_style in ((0, "k-"), (0.5, "k:"), (1, "k:")):
  plt.plot([-10, 10], [level, level], line_style)

# Solid vertical line through t = 0
plt.plot([0, 0], [-1.1, 1.1], "k-")

# The sigmoid curve itself (blue)
plt.plot(t, sig, "b-", linewidth=2, label=r"$\sigma(t) = \frac{1}{1 + e^{-t}}$")
plt.xlabel("t")
plt.legend(loc="upper left", fontsize=20)
plt.axis([-10, 10, -0.1, 1.1])
plt.show()

In [None]:
# The iris dataset shipped with scikit-learn
# Five columns in total:
# sepal length in cm
# sepal width in cm
# petal length in cm
# petal width in cm
# label

from sklearn import datasets

iris = datasets.load_iris()

In [None]:
# Display the feature matrix of the loaded dataset
iris["data"]

In [None]:
# Keep only the last feature column (petal width); the 3: slice keeps X two-dimensional
X = iris["data"][:, 3:]

# Build the binary target: 1 if Iris-Virginica (target == 2), otherwise 0.
# The builtin int is used because the np.int alias no longer exists in current NumPy releases.
y = (iris["target"] == 2).astype(int)

In [None]:
# Boolean mask of the class-2 samples; it is not the last expression, so its value is not displayed
iris["target"] == 2
# Display the 0/1 target vector
y

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the liblinear solver, fitted on the current X and y
log_reg = LogisticRegression(solver="liblinear", random_state=42)
log_reg.fit(X, y)

In [None]:
# Build 1000 evenly spaced inputs between 0 and 3 as a single-column matrix
X_new = np.linspace(0, 3, 1000)[:, np.newaxis]

# Class probabilities predicted by the fitted model
y_proba = log_reg.predict_proba(X_new)

# Decision boundary: X_new is increasing, so the first input whose positive-class
# probability reaches 0.5 is taken as the boundary value
is_positive = y_proba[:, 1] >= 0.5
decision_boundary = X_new[is_positive][0]
decision_boundary

array([[0.98554411, 0.01445589],
       [0.98543168, 0.01456832],
       [0.98531838, 0.01468162],
       ...,
       [0.02618938, 0.97381062],
       [0.02598963, 0.97401037],
       [0.02579136, 0.97420864]])

In [None]:
# Feed two hand-picked values to check whether the model separates them as expected
log_reg.predict([[1.7], [1.5]])

array([1, 0])

**3. Support Vector Machine - 분류**

In [None]:
import numpy as np
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Support Vector machine classifier == SVC
from sklearn.svm import LinearSVC

# Same iris dataset as before; columns 2 and 3 are used as features
iris = datasets.load_iris()
X = iris["data"][:, (2, 3)]

# Binary target: 1.0 where the class label is 2, 0.0 otherwise
y = (iris["target"] == 2).astype(np.float64)

print(y)

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1.]


In [None]:
# Step 1: rescale the features
scaling_step = ("scaler", StandardScaler())
# Step 2: linear support vector classifier trained with the hinge loss
classifier_step = ("linear_svc", LinearSVC(C=1, loss="hinge", random_state=42))

svm_clf = Pipeline([scaling_step, classifier_step])

# Train the SVM pipeline
svm_clf.fit(X, y)

In [None]:
# Classify a single sample with the linear SVM pipeline
svm_clf.predict([[5.5, 1.7]])

**3. Support Vector Machine - 비선형(Polynomial Features) 분류**

In [None]:
from sklearn.datasets import make_moons
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

# SVC with kernel="linear" would also work, but LinearSVC is optimised for the linear case and faster.
# NOTE(review): make_moons is imported in this cell but never called; the pipeline is fitted on
# whatever X and y are currently in scope — confirm that this is the intended data.
polynomial_svm_clf = Pipeline([
    ("poly_features", PolynomialFeatures(degree=3)),
    ("scaler", StandardScaler()),
    # max_iter: how many iterations the optimiser may run
    ("svm_clf", LinearSVC(C=10, loss="hinge", max_iter=2000, random_state=42))
])

polynomial_svm_clf.fit(X, y)

In [None]:
# Classify a sample with the polynomial-feature SVM fitted in this section
# (calling svm_clf here would silently query the earlier linear model instead)
polynomial_svm_clf.predict([[3.5, 6.4]])

**3. Support Vector Machine - 비선형(Polynomial Features) 분류 (Simple)**

In [None]:
from sklearn.svm import SVC

# Kernelised alternative: a degree-3 polynomial kernel replaces the explicit polynomial features
poly_kernel_svc = SVC(kernel="poly", degree=3, C=5)

poly_kernel_svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", poly_kernel_svc),
])

poly_kernel_svm_clf.fit(X, y)

In [None]:
# Classify a sample with the polynomial-kernel SVM fitted in this section
# (calling svm_clf here would silently query the earlier linear model instead)
poly_kernel_svm_clf.predict([[3.5, 6.4]])

**3. Support Vector Machine - 가우시안 방사함수 kernel**

In [None]:
# SVM with a Gaussian RBF kernel (gamma=5, C=0.001) behind a standardisation step
rbf_svc = SVC(kernel="rbf", gamma=5, C=0.001)

rbf_kernel_svm_clf = Pipeline([
    ("scaler", StandardScaler()),
    ("svm_clf", rbf_svc),
])

rbf_kernel_svm_clf.fit(X, y)

**3. Support Vector Machine - 회귀**

In [None]:
# Generate 50 random samples
np.random.seed(42)
m = 50
# Inputs: uniform draws scaled to [0, 2)
X = 2 * np.random.rand(m, 1)
# Targets: linear in X plus standard-normal noise, flattened to 1-D
y = (4 + 3 * X + np.random.randn(m, 1)).ravel()

In [None]:
# Support vector regressor = SVR
from sklearn.svm import LinearSVR

# Linear SVR with epsilon=1.5, fitted on the generated data
svm_reg = LinearSVR(epsilon=1.5, random_state=42)
svm_reg.fit(X, y)

In [None]:
# Predictions of the linear SVR on its own training inputs
svm_reg.predict(X)

In [None]:
from sklearn.svm import SVR

# SVR with a degree-3 polynomial kernel, fitted on the same data
svm_poly_reg = SVR(kernel="poly", degree=3, epsilon=1.5)
svm_poly_reg.fit(X, y)

In [None]:
# Predictions of the polynomial-kernel SVR fitted in the previous cell
# (svm_reg is the linear model, whose predictions were already shown above)
svm_poly_reg.predict(X)

**4. Decision Tree - 분류**

In [None]:
from sklearn.datasets import load_iris

# Import the decision tree classifier
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()
X = iris.data[:, 2:] # petal length, width
y = iris.target

# Limit the tree to a maximum depth of 2
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X, y)

In [None]:
import graphviz
from sklearn import tree

# Export the fitted classification tree; the result is handed to graphviz.Source in the next cell
dot_data = tree.export_graphviz(
    tree_clf,
    # petal length, width
    feature_names=["꽃잎 길이 (cm)", "꽃잎 너비 (cm)"],
    class_names=iris.target_names,
    rounded=True,
    filled=True
)

In [None]:
# Render the exported tree; the bare expression displays it as the cell output
graph = graphviz.Source(dot_data)
graph

In [None]:
# Cross-validation score for the classifier
from sklearn.model_selection import cross_val_score

# Accuracy of the decision tree over 14 folds
scores = cross_val_score(tree_clf, X, y, cv=14, scoring="accuracy")

# Accuracy is already on a 0-1 scale, so report its plain mean; a square root is only
# meaningful when converting MSE values to RMSE
np.mean(scores)

In [None]:
# Predict the class of one sample given as [petal length, petal width]
tree_clf.predict([[5, 1.5]])

**4. Decision Tree - 회귀**

In [None]:
# Random dataset of 200 samples
np.random.seed(42)
m = 200
X = np.random.rand(m, 1)
# Quadratic target centred at 0.5 ...
y = 4 * (X - 0.5) ** 2
# ... plus Gaussian noise scaled down by 10
y = y + np.random.randn(m, 1) / 10

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree regression model, limited to depth 2
tree_reg = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg.fit(X, y)

In [None]:
# Export the fitted regression tree (relies on `tree` imported in an earlier cell)
dot_data = tree.export_graphviz(
    tree_reg,
    feature_names=["x1"],
    rounded=True,
    filled=True
)

In [None]:
# Render the exported regression tree
graph = graphviz.Source(dot_data)
graph

# Author's reading of the rendered tree (figures as quoted in the original notes):
# if x1 is 0.6, the sample falls into the third child node (the white one);
# that node's value, 0.111, is the mean target of its 110 training samples,
# so the prediction for that x1 is 0.111.
# The node's mse is likewise computed over those 110 samples.

In [None]:
# Cross-validation score for the regressor
from sklearn.model_selection import cross_val_score

# scikit-learn reports negated MSE for this scorer, one value per fold (14 folds)
scores = cross_val_score(tree_reg, X, y, cv=14, scoring="neg_mean_squared_error")

# Flip the sign, convert each fold's MSE to RMSE, then average over the folds
rmse_per_fold = np.sqrt(-scores)
rmse_per_fold.mean()

In [None]:
# Check the predicted value for two inputs
for x1 in (0.65, 0.6):
  print(tree_reg.predict([[x1]]))

**4. Decision Tree - 실전**

In [None]:
# Column reference for the Titanic dataset, kept as a string literal (it performs no work)
'''
1. PassengerId : 각 승객의 고유 번호
2. Survived : 생존 여부(종속 변수)
  0 = 사망
  1 = 생존
3. Pclass : 객실 등급 - 승객의 사회적, 경제적 지위
  1st = Upper
  2nd = Middle
  3rd = Lower
4. Name : 이름
5. Sex : 성별
6. Age : 나이
7. SibSp : 동반한 Sibling(형제자매)와 Spouse(배우자)의 수
8. Parch : 동반한 Parent(부모) Child(자식)의 수
9. Ticket : 티켓의 고유넘버
10. Fare : 티켓의 요금
11. Cabin : 객실 번호
12. Embarked : 승선한 항
  C = Cherbourg
  Q = Queenstown
  S = Southampton
'''

In [None]:
import pandas as pd

# Despite the variable name, this is a relative local file path
url = "titanic.csv"
titanic = pd.read_csv(url)

In [None]:
from IPython.display import display

# Column dtypes and non-null counts (info() prints its own report)
titanic.info()

# Only the last expression of a cell is rendered automatically, so show the preview explicitly
display(titanic.head())

# (rows, columns)
titanic.shape

In [None]:
# Encode sex as a number: female -> 0, male -> 1
# NOTE(review): re-running this cell maps the already-encoded 0/1 values to NaN
titanic["Sex"] = titanic["Sex"].map({"female": 0, "male": 1})

# Fill missing ages with the median age. The result is assigned back to the column because
# an in-place fillna on a column accessor is not guaranteed to update the parent DataFrame
# under pandas copy-on-write semantics.
titanic["Age"] = titanic["Age"].fillna(titanic["Age"].median())

**get_dummies 함수**

*   머신러닝을 할 때 기계가 이해할 수 있도록 모든 데이터를 수치로 변환해주는 전처리 작업이 필수이다.

In [None]:
# Fill missing embarkation ports with "S". The result is assigned back to the column because
# an in-place fillna on a column accessor is not guaranteed to update the parent DataFrame
# under pandas copy-on-write semantics.
titanic["Embarked"] = titanic["Embarked"].fillna("S")

# Build dummy columns with the "Embarked" prefix; drop_first=True drops the first category,
# so the encoding becomes (Embarked_Q, Embarked_S):
# S -> 0 1
# C -> 0 0
# Q -> 1 0
embarked_dummies = pd.get_dummies(titanic["Embarked"], prefix="Embarked", drop_first=True)
embarked_dummies

In [None]:
# Append the dummy columns to the right of the frame.
# NOTE(review): running this cell again would append the same columns a second time.
titanic = pd.concat([titanic, embarked_dummies], axis=1)

In [None]:
# Preview the frame after the dummy columns were appended
titanic.head()

In [None]:
# Features: passenger class (Pclass), sex, age, and the embarkation dummies Q and S
feature_cols = ["Pclass", "Sex", "Age", "Embarked_Q", "Embarked_S"]
X = titanic[feature_cols]

# Target: survival
y = titanic.Survived

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Depth-3 decision tree fitted on the selected Titanic features
treeclf = DecisionTreeClassifier(max_depth=3, random_state=1)
treeclf.fit(X, y)

In [None]:
import graphviz
from sklearn import tree

# Export the fitted Titanic tree as DOT source (out_file=None returns it as a string)
dot_data = tree.export_graphviz(
    treeclf, out_file=None,
    feature_names=feature_cols,
    # class_names must follow the ascending order of the class labels:
    # Survived == 0 means the passenger died, Survived == 1 means they survived
    class_names=["Die", "Survived"],
    filled=True, rounded=True,
    special_characters=True
)

graph = graphviz.Source(dot_data)
graph

In [None]:
# How much each feature contributed to the tree's decisions
pd.DataFrame({"feature":feature_cols, "importance":treeclf.feature_importances_})

**5. 앙상블 기법**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

# make_moons generates data shaped like two crescent-moon clusters
# n_samples: number of samples, default 100
# noise: magnitude of the noise; 0 yields exact half circles

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
# Hold out a test set with the default split proportion
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, recall_score, precision_score

# Random forest with 10 trees, fitted on the training split
rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)
rnd_clf.fit(X_train, y_train)

# Predicted labels for the test split
prdict_result = rnd_clf.predict(X_test)
print(prdict_result)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier, VotingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Three base classifiers; note that svm_clf is rebound here from the earlier pipeline to a plain SVC
log_clf = LogisticRegression(solver="liblinear", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)
svm_clf = SVC(gamma="auto", random_state=42)

# Voting-based classifier combining the three models with hard voting
voting_clf = VotingClassifier(
    estimators=[("lr", log_clf), ("rf", rnd_clf), ("svc", svm_clf)],
    voting="hard"
)

voting_clf.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score

# Fit each base classifier and the voting ensemble, then report the test accuracy of each
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)

  # Model name and accuracy; printed inside the loop so every model is reported,
  # not only the last one
  print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

**5. 배깅 앙상블**

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Arguments: base model, number of estimators, CPU usage (n_jobs=-1 means all cores)
bag_clf = BaggingClassifier(DecisionTreeClassifier(random_state=42), n_estimators=500, n_jobs=-1, random_state=42)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Test accuracy of the predictions currently stored in y_pred (the bagging ensemble when run in order)
print(accuracy_score(y_test, y_pred))

In [None]:
# A plain single decision tree, for comparison
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
y_pred_tree = tree_clf.predict(X_test)

print(accuracy_score(y_test, y_pred_tree))

**6. AdaBoost**

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Author's notes: AdaBoostClassifier uses SAMME and SAMME.R, variants of the original AdaBoost
# algorithm, and SAMME.R is described as faster and better performing than SAMME.
# NOTE(review): algorithm="SAMME.R" appears to be deprecated in recent scikit-learn releases —
# confirm against the installed version.
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5, random_state=42
)

ada_clf.fit(X_train, y_train)

**7. Gradient 부스팅 - 회귀**

In [None]:
# Noisy quadratic regression data: 100 inputs in [-0.5, 0.5) with small Gaussian noise on the target
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3 * X[:, 0] ** 2 + 0.05 * np.random.randn(100)

from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with three depth-2 trees
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=0.1, random_state=42)
gbrt.fit(X, y)

**7. Gradient 부스팅 - 분류 & Grid Search**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import f1_score,accuracy_score, recall_score,precision_score, mean_squared_error

# Regenerate the two-moons classification data and split it
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
model = GradientBoostingClassifier(random_state=1)

# Hyper-parameter grid; range(50, 100, 25) yields n_estimators of 50 and 75
param_test = {
    "n_estimators": range(50, 100, 25),
    "max_depth": [1, 2, 4],
    "learning_rate": [0.0001, 0.001, 0.01, 0.1],
}

# 5-fold grid search scored by accuracy, using all CPU cores
Gsearch = GridSearchCV(model, param_grid=param_test, scoring="accuracy", n_jobs=-1, cv=5)
Gsearch.fit(X_train,y_train)
print(Gsearch.best_params_)

import joblib

# Persist the whole fitted search object to disk
joblib.dump(Gsearch, "./model.pkl")

{'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 75}


['./model.pkl']

In [None]:
# Predict the test split with the fitted grid search object
test_predictions = Gsearch.predict(X_test)

In [None]:
# Test-set metrics; precision, recall and F1 are macro-averaged over the classes
print("test acc Average: ", accuracy_score(y_test, test_predictions))
print("test Precision Average: ", precision_score(y_test, test_predictions, average="macro"))
print("test Recall Average: ", recall_score(y_test, test_predictions, average="macro"))
print("test F1_score Average: ", f1_score(y_test, test_predictions, average="macro"), "\n")

**7. Gradient 부스팅 - 조기종료**

In [None]:
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# NOTE(review): X and y come from whichever earlier cell ran last; in notebook order that is
# the make_moons classification data of the grid-search section, not regression data — confirm.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=49)

# warm_start=True lets each fit() call reuse the trees built so far when n_estimators grows
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True, random_state=42)

# Best validation error so far (starts at +inf), the ensemble size that achieved it,
# and the number of consecutive rounds without improvement
min_val_error = float("inf")
best_n_estimators = None
error_going_up = 0

# Grow the ensemble one tree at a time
for n_estimators in range(1, 120):
  gbrt.n_estimators = n_estimators

  gbrt.fit(X_train, y_train)
  y_pred = gbrt.predict(X_val)
  val_error = mean_squared_error(y_val, y_pred)

  if val_error < min_val_error:
    # New best: remember both the error and the ensemble size that produced it
    min_val_error = val_error
    best_n_estimators = n_estimators
    error_going_up = 0
  else:
    error_going_up += 1
    # Five consecutive rounds without improvement -> stop early
    if error_going_up == 5:
      break

# Report the ensemble size at the minimum validation error. gbrt.n_estimators would be the
# size at the stopping point, which is past the best one whenever the loop stops early.
print("n_estimators 몇개일때 :", best_n_estimators)
print("MSE :", min_val_error)

n_estimators 몇개일때: 110
MSE: 0.06507344346271848


**8. XGBoost - 조기종료**

In [None]:
import xgboost

# Baseline XGBoost regressor without early stopping, scored on the validation split
xgb_reg = xgboost.XGBRegressor(random_state=42)
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)
val_error = mean_squared_error(y_val, y_pred)

print("검증 MSE :", val_error)

검증 MSE : 0.11588755477707718


In [None]:
# Stop training when the validation score has not improved for early_stopping_rounds rounds
# NOTE(review): newer xgboost releases appear to expect early_stopping_rounds on the estimator
# constructor rather than on fit() — confirm against the installed version.
xgb_reg.fit(X_train, y_train,eval_set=[(X_val, y_val)], early_stopping_rounds=2)
y_pred = xgb_reg.predict(X_val)
val_error = mean_squared_error(y_val, y_pred)

print("검증 MSE :", val_error)

[0]	validation_0-rmse:0.40582
[1]	validation_0-rmse:0.35474
[2]	validation_0-rmse:0.32464
[3]	validation_0-rmse:0.30981
[4]	validation_0-rmse:0.30304
[5]	validation_0-rmse:0.30246
[6]	validation_0-rmse:0.30195
[7]	validation_0-rmse:0.30074
[8]	validation_0-rmse:0.29867
[9]	validation_0-rmse:0.30140
검증 MSE : 0.08920381542915447


**9. Stacking**

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import GaussianNB

# Two-moons classification data and its train/test split
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Base models: a random forest and an AdaBoost ensemble of decision stumps.
# The weak learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` across scikit-learn releases; the positional form
# works with both spellings.
base_models = [
    ("rf_1", RandomForestClassifier(max_depth=10)),
    ("rf_2", AdaBoostClassifier(
        DecisionTreeClassifier(criterion="entropy", max_depth=1, random_state=1),
        learning_rate=0.1, n_estimators=1000)),
]

# Stacking: a gradient boosting classifier combines the base models' outputs
clf = StackingClassifier(
    estimators=base_models,
    final_estimator=GradientBoostingClassifier(max_depth=4, n_estimators=75, random_state=1)
)

clf.fit(X_train,y_train)

In [None]:
# Predict the test split with the stacking classifier
test_predictions = clf.predict(X_test)

# Test-set metrics; precision, recall and F1 use the scorers' default averaging here
print("test acc Average: ", accuracy_score(y_test, test_predictions))
print("test Precision Average: ", precision_score(y_test, test_predictions))
print("test Recall Average: ", recall_score(y_test, test_predictions))
print("test F1_score Average: ", f1_score(y_test, test_predictions))

In [None]:
 !pip install pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycaret
  Downloading pycaret-3.0.0-py3-none-any.whl (481 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.8/481.8 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sktime>=0.16.1
  Downloading sktime-0.17.1-py3-none-any.whl (16.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.1/16.1 MB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
Collecting pmdarima!=1.8.1,<3.0.0,>=1.8.0
  Downloading pmdarima-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m39.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tbats>=1.1.0
  Downloading tbats-1.1.3-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  

In [None]:
# The "juice" dataset provided by pycaret
from pycaret.datasets import get_data

data = get_data("juice")
# Column dtypes and non-null counts
data.info()

Unnamed: 0,Id,Purchase,WeekofPurchase,StoreID,PriceCH,PriceMM,DiscCH,DiscMM,SpecialCH,SpecialMM,LoyalCH,SalePriceMM,SalePriceCH,PriceDiff,Store7,PctDiscMM,PctDiscCH,ListPriceDiff,STORE
0,1,CH,237,1,1.75,1.99,0.0,0.0,0,0,0.5,1.99,1.75,0.24,No,0.0,0.0,0.24,1
1,2,CH,239,1,1.75,1.99,0.0,0.3,0,1,0.6,1.69,1.75,-0.06,No,0.150754,0.0,0.24,1
2,3,CH,245,1,1.86,2.09,0.17,0.0,0,0,0.68,2.09,1.69,0.4,No,0.0,0.091398,0.23,1
3,4,MM,227,1,1.69,1.69,0.0,0.0,0,0,0.4,1.69,1.69,0.0,No,0.0,0.0,0.0,1
4,5,CH,228,7,1.69,1.69,0.0,0.0,0,0,0.956535,1.69,1.69,0.0,Yes,0.0,0.0,0.0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Id              1070 non-null   int64  
 1   Purchase        1070 non-null   object 
 2   WeekofPurchase  1070 non-null   int64  
 3   StoreID         1070 non-null   int64  
 4   PriceCH         1070 non-null   float64
 5   PriceMM         1070 non-null   float64
 6   DiscCH          1070 non-null   float64
 7   DiscMM          1070 non-null   float64
 8   SpecialCH       1070 non-null   int64  
 9   SpecialMM       1070 non-null   int64  
 10  LoyalCH         1070 non-null   float64
 11  SalePriceMM     1070 non-null   float64
 12  SalePriceCH     1070 non-null   float64
 13  PriceDiff       1070 non-null   float64
 14  Store7          1070 non-null   object 
 15  PctDiscMM       1070 non-null   float64
 16  PctDiscCH       1070 non-null   float64
 17  ListPriceDiff   1070 non-null   f

In [None]:
from pycaret.datasets import get_data
# Import only the names the notebook uses; a wildcard import hides where setup and
# compare_models come from and is silently overridden by later wildcard imports
from pycaret.classification import setup, compare_models

# Initialise the classification experiment with "Purchase" as the target column
setup_clf = setup(data=data, target="Purchase")

Unnamed: 0,Description,Value
0,Session id,8311
1,Target,Purchase
2,Target type,Binary
3,Target mapping,"CH: 0, MM: 1"
4,Original data shape,"(1070, 19)"
5,Transformed data shape,"(1070, 19)"
6,Transformed train set shape,"(749, 19)"
7,Transformed test set shape,"(321, 19)"
8,Ordinal features,1
9,Numeric features,17


In [None]:
# Compare the performance of the ML models and display the one that is returned
top = compare_models()
top

In [None]:
# Import only the names used below. From here on setup and compare_models refer to the
# regression versions until another cell re-imports them.
from pycaret.regression import setup, compare_models
from pycaret.datasets import get_data

# Regression experiment on the "diamond" dataset with "Price" as the target
dataset = get_data("diamond")
exp = setup(dataset, target="Price")
reg_top = compare_models()
reg_top

In [None]:
# Report the installed pycaret version
import pycaret

print("PyCaret: %s" % pycaret.__version__)

In [None]:
from pandas import read_csv

# pycaret classification entry points: experiment setup and model comparison
from pycaret.classification import compare_models, setup

# Sonar dataset (the CSV has no header row)
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv"
df = read_csv(url, header=None)

# Rename the columns to their positions as strings: "0", "1", ...
n_cols = df.shape[1]
df.columns = list(map(str, range(n_cols)))

# The last column is the target
target_column = df.columns[-1]
grid = setup(data=df, target=target_column, verbose=True)

# Train and compare the candidate models, then show the one that is returned
best = compare_models()
print(best)

In [None]:
!pip3 install bayesian-optimization

In [None]:
from sklearn.metrics import r2_score, mean_squared_error
import xgboost as xgb

# MAPE
def mean_absolute_percentage_error(y_test, y_pred):
  """Return the mean absolute percentage error, in percent.

  Args:
    y_test: array-like of true target values.
    y_pred: array-like of predicted values, broadcastable against y_test.

  Returns:
    mean(|y_test - y_pred| / |y_test|) * 100 as a float.

  Targets equal to zero would make the ratio undefined (nan or inf), so the
  denominator is clamped to the float64 machine epsilon. Non-zero targets
  give the same result as the plain formula.
  """
  y_test = np.asarray(y_test, dtype=float)
  y_pred = np.asarray(y_pred, dtype=float)
  # Clamp the denominator so zero targets cannot cause a division by zero
  denominator = np.maximum(np.abs(y_test), np.finfo(np.float64).eps)
  return np.mean(np.abs(y_test - y_pred) / denominator) * 100

# Objective function for the search (XGBRegressor)
def XGB_cv(max_depth,learning_rate, n_estimators):
  """Train an XGBRegressor with the given hyper-parameters and return its R^2 score.

  max_depth and n_estimators are truncated to int before use. The model is
  fitted on the notebook-level X_train / y_train and scored on X_test / y_test.
  The returned R^2 is the value the optimiser treats as the performance figure.
  """
  regressor = xgb.XGBRegressor(
      n_estimators=int(n_estimators),
      max_depth=int(max_depth),
      learning_rate=learning_rate,
  )
  regressor.fit(X_train, y_train)

  # Score the held-out predictions; this value is what the optimiser maximises
  return r2_score(y_test, regressor.predict(X_test))

In [None]:
# Import the BayesianOptimization class from the bayesian-optimization library
from bayes_opt import BayesianOptimization
import numpy as np
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Data used by XGB_cv through the notebook-level X_train / X_test / y_train / y_test
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Search space of the hyper-parameters under test
# max_depth (int, default: 3): maximum tree depth of the base learners
# learning_rate (float, default: 0.1): boosting learning rate
# n_estimators (int, default: 100): number of boosted trees to fit
pbounds = {
    "max_depth": (3, 7),
    "learning_rate": (0.001, 0.2),
    "n_estimators": (5000, 10000)
}

# Create the Bayesian optimisation object
# f: function to search over, pbounds: hyper-parameter bounds
# verbose = 2 always prints, verbose = 1 prints only on a new maximum, verbose = 0 prints nothing
bo=BayesianOptimization(f=XGB_cv, pbounds=pbounds, verbose=2, random_state=1)

# Run the maximisation
# init_points: number of initial random-search points
# n_iter: number of iterations (how many input/value points are evaluated; more gives a better estimate)
# acq: acquisition function; Expected Improvement (EI) is requested here
# xi: exploration strength (default 0.0)
# NOTE(review): newer bayesian-optimization releases appear to no longer accept acq / xi on
# maximize() — confirm against the installed version, since the package is installed unpinned.
bo.maximize(init_points=2, n_iter=10, acq="ei", xi=0.01)

# In the printed table, iter is the iteration number, target is the objective value,
# and the remaining columns are the inputs.
# When an iteration reaches a new maximum compared with all earlier ones,
# the library highlights that row in a different text colour.

# Show the best parameters found
print(bo.max)