# In [None]:  (notebook cell marker, kept as a comment so the file parses as Python)
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
import optuna
import joblib
import time



# Seed shared by the data splits, the Optuna sampler and the XGBoost models.
# Defined before its first use so the splits do not need a hard-coded 42.
RANDOM_SEED = 42

# Load the iris dataset.
iris = load_iris()

# Separate the feature matrix from the target vector.
X, y = iris.data, iris.target

# Shapes noted by the original author: X is (150, 4) and y is (150,).

# Hold out 20% of the data, then cut that hold-out in half, which yields an
# 8:1:1 train / validation / test split. The validation part is used for
# hyperparameter search; the test part is kept for the final evaluation.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)
X_validation, X_test, y_validation, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=RANDOM_SEED
)




def objective(trial):
    """Score one Optuna trial by validation accuracy.

    Draws an XGBoost configuration from ``trial``, fits a
    ``StandardScaler`` -> ``XGBClassifier`` pipeline on the module-level
    ``X_train`` / ``y_train`` and evaluates it on ``X_validation`` /
    ``y_validation``.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        Accuracy of the fitted pipeline on the validation split.
    """
    # Settings that are the same for every trial (not part of the search).
    fixed_settings = dict(
        objective="multi:softprob",
        eval_metric="mlogloss",
        booster="gbtree",
        tree_method="hist",
        random_state=RANDOM_SEED,
    )

    # Searched hyperparameters. The suggest calls are kept in their original
    # order; presumably this matters for reproducing results with the seeded
    # sampler, so do not reorder without checking.
    search_space = dict(
        max_depth=trial.suggest_int("max_depth", 4, 10),
        learning_rate=trial.suggest_float("learning_rate", 0.0001, 0.99),
        n_estimators=trial.suggest_int("n_estimators", 1000, 10000, step=100),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
        colsample_bylevel=trial.suggest_float("colsample_bylevel", 0.5, 1.0),
        colsample_bynode=trial.suggest_float("colsample_bynode", 0.5, 1.0),
        reg_lambda=trial.suggest_float("reg_lambda", 1e-2, 1.0),
        reg_alpha=trial.suggest_float("reg_alpha", 1e-2, 1.0),
        subsample=trial.suggest_float("subsample", 0.6, 1.0),
        min_child_weight=trial.suggest_int("min_child_weight", 2, 15),
        gamma=trial.suggest_float("gamma", 0.1, 1.0),
    )

    # Assemble the scaling + classification pipeline for this trial.
    steps = [
        ("scaler", StandardScaler()),
        ("model", XGBClassifier(**fixed_settings, **search_space)),
    ]
    candidate = Pipeline(steps)
    candidate.fit(X_train, y_train)

    # The validation accuracy is the value Optuna optimizes.
    return accuracy_score(y_validation, candidate.predict(X_validation))


# Run the Optuna hyperparameter search and measure how long it takes.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time, unlike time.time() which follows the adjustable wall clock.
start_time = time.perf_counter()
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED))
study.optimize(objective, n_trials=15)
end_time = time.perf_counter()

print(f'실행시간 = {end_time - start_time:.2f} 초')
print(f'Best accuracy: {study.best_value:.4f}')
print('Best hyperparameters:', study.best_params)


# Refit the final pipeline with the best hyperparameters found by Optuna.
# study.best_params only contains the values drawn through trial.suggest_*,
# so the fixed settings hard-coded in objective() are supplied again here;
# otherwise the final model would silently fall back to library defaults and
# would not be the configuration that was actually tuned.
# Keep these fixed entries in sync with the ones in objective().
final_params = {
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "booster": "gbtree",
    "tree_method": "hist",
    "random_state": RANDOM_SEED,
    **study.best_params,
}

best_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', XGBClassifier(**final_params))
])

best_pipeline.fit(X_train, y_train)


# Persist the fitted pipeline (scaler + model) to disk.
joblib.dump(best_pipeline, 'best_pipeline.pkl')

print("파이프라인이 'best_pipeline.pkl'로 저장되었습니다.")



# Load the saved pipeline back to check that the persisted artifact works.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
loaded_pipeline = joblib.load('best_pipeline.pkl')
print("'best_pipeline.pkl'에서 파이프 라인을 로드했습니다.")


# Predict on the held-out test data with the reloaded pipeline.
y_pred = loaded_pipeline.predict(X_test)


# Compute and report the test accuracy.
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {accuracy:.4f}')