# Import

In [None]:
# Mount Google Drive under /content/drive so the training CSV stored there
# can be read by the data-loading cell.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install pandas scikit-learn catboost
!pip install scikit-optimize

import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from google.colab import files
from scipy.stats import uniform, randint
from catboost import CatBoostClassifier
from sklearn.svm import SVC

# Location of the training table on the mounted Drive.
TRAIN_CSV_PATH = "/content/drive/MyDrive/패턴인식/latest/train_real_final.csv"
# Load the whole training table; the target column "y" is split off later.
train = pd.read_csv(TRAIN_CSV_PATH)

#validation set 분리

In [None]:

# Separate the target column "y" from the feature columns.
y = train["y"]
X = train.drop("y", axis=1)

# Hold out 20% of the rows as a validation set (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Cross-validation setup: 5 shuffled, stratified folds reused by every search.
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=42)

#model

In [None]:

# Baseline boosting classifiers keyed by display name; the same names are used
# as keys of the per-model search spaces in the tuning cells.
models = {
    "GradientBoost": GradientBoostingClassifier(n_estimators=100, random_state=42),
    # `use_label_encoder` is no longer passed: the installed XGBoost reports
    # 'Parameters: { "use_label_encoder" } are not used.' on every fit.
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42),
    "CatBoost" : CatBoostClassifier(verbose=0, random_state=42),
    #"SVM" : SVC(kernel='rbf', probability=True, random_state=42)
}

# Random Search

In [None]:


# Sampling distributions for the first-pass randomized search, keyed by model
# name and then by hyper-parameter name.
# scipy conventions: uniform(loc, scale) covers [loc, loc + scale] and
# randint(low, high) covers the integers in [low, high).
param_dists = dict(
    GradientBoost=dict(
        n_estimators=randint(low=50, high=300),
        learning_rate=uniform(loc=0.01, scale=0.5),
        max_depth=randint(low=2, high=8),
        subsample=uniform(loc=0.6, scale=0.4),
    ),
    XGBoost=dict(
        n_estimators=randint(low=50, high=300),
        learning_rate=uniform(loc=0.01, scale=0.5),
        max_depth=randint(low=2, high=8),
        colsample_bytree=uniform(loc=0.5, scale=0.5),
        subsample=uniform(loc=0.6, scale=0.4),
    ),
    CatBoost=dict(
        iterations=randint(low=100, high=1000),
        learning_rate=uniform(loc=0.01, scale=0.5),
        max_depth=randint(low=2, high=8),
        l2_leaf_reg=uniform(loc=1, scale=10),
        border_count=randint(low=32, high=255),
        bagging_temperature=uniform(loc=0, scale=1),
    ),
)

def multi_score(estimator, X, y):
    """Score a fitted classifier as the mean of accuracy, F1 and ROC-AUC.

    The ``(estimator, X, y)`` signature lets this function be passed directly
    as the ``scoring=`` argument of the search objects in this notebook.

    Parameters
    ----------
    estimator : fitted classifier exposing ``predict`` and either
        ``predict_proba`` or ``decision_function``.
    X : feature rows to score on.
    y : true labels for ``X``.

    Returns
    -------
    The unweighted mean of the three metric values.
    """
    # Imported locally: this cell runs before the notebook's first
    # `import numpy as np`, so on a fresh kernel `np` would be undefined here.
    import numpy as np

    y_pred = estimator.predict(X)
    # ROC-AUC needs a continuous score: use column 1 of predict_proba when it
    # is available, otherwise fall back to the decision function.
    if hasattr(estimator, "predict_proba"):
        y_proba = estimator.predict_proba(X)[:, 1]
    else:
        y_proba = estimator.decision_function(X)
    return np.mean([
        accuracy_score(y, y_pred),
        f1_score(y, y_pred),
        roc_auc_score(y, y_proba)
    ])

# 3) First pass: randomized search over each model's sampling distributions.
best_estimators, best_params = {}, {}

# Settings shared by every model's search.
rs_settings = dict(
    n_iter=30,
    scoring=multi_score,
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=1,
    refit=True,
)

for name, model in models.items():
    print(f"=== RandomizedSearchCV: {name} ===")
    rs = RandomizedSearchCV(model, param_dists[name], **rs_settings)
    rs.fit(X_train, y_train)
    # Keep both the refitted winner and its parameters for the later stages.
    best_estimators[name], best_params[name] = rs.best_estimator_, rs.best_params_
    print(f"Best params ({name}): {best_params[name]}")
    print(f"Best CV score ({name}): {rs.best_score_:.4f}\n")

# Report each randomized-search winner on the held-out validation split.
# The searches ran with refit=True, so every best_estimator_ is already
# trained on the full (X_train, y_train); training it a second time here was
# redundant work and has been removed.
for name, model in best_estimators.items():
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1]
    print(f"--- {name} (RS Tuned) on Validation ---")
    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_proba)
    print("Accuracy :", acc)
    print("F1-score :", f1)
    print("AUC      :", auc)
    # Same aggregate as multi_score: unweighted mean of the three metrics.
    print("MeanScore:", (acc+f1+auc)/3)

print(best_estimators)
print(best_params)


=== RandomizedSearchCV: GradientBoost ===
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best params (GradientBoost): {'learning_rate': np.float64(0.033332831606807715), 'max_depth': 5, 'n_estimators': 257, 'subsample': np.float64(0.6931085361721216)}
Best CV score (GradientBoost): 0.6754

=== RandomizedSearchCV: XGBoost ===
Fitting 5 folds for each of 30 candidates, totalling 150 fits


Parameters: { "use_label_encoder" } are not used.



Best params (XGBoost): {'colsample_bytree': np.float64(0.6999304858576277), 'learning_rate': np.float64(0.033332831606807715), 'max_depth': 5, 'n_estimators': 257, 'subsample': np.float64(0.6931085361721216)}
Best CV score (XGBoost): 0.6810

=== RandomizedSearchCV: CatBoost ===
Fitting 5 folds for each of 30 candidates, totalling 150 fits
Best params (CatBoost): {'bagging_temperature': np.float64(0.7712703466859457), 'border_count': 36, 'iterations': 589, 'l2_leaf_reg': np.float64(4.584657285442726), 'learning_rate': np.float64(0.06793452976256485), 'max_depth': 2}
Best CV score (CatBoost): 0.6761

--- GradientBoost (RS Tuned) on Validation ---
Accuracy : 0.6594384843643004
F1-score : 0.6576411197797155
AUC      : 0.7253877209034767
MeanScore: 0.6808224416824974


Parameters: { "use_label_encoder" } are not used.



--- XGBoost (RS Tuned) on Validation ---
Accuracy : 0.6637753937457201
F1-score : 0.6620784583620096
AUC      : 0.7301691778295132
MeanScore: 0.6853410099790809
--- CatBoost (RS Tuned) on Validation ---
Accuracy : 0.6669710111846611
F1-score : 0.6642117376294592
AUC      : 0.7280876464960785
MeanScore: 0.6864234651033995
{'GradientBoost': GradientBoostingClassifier(learning_rate=np.float64(0.033332831606807715),
                           max_depth=5, n_estimators=257, random_state=42,
                           subsample=np.float64(0.6931085361721216)), 'XGBoost': XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=np.float64(0.6999304858576277), device=None,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='logloss', feature_types=None, gamma=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=N

#Grid Search

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

def multi_score(estimator, X, y):
    """Return the mean of accuracy, F1 and ROC-AUC for a fitted classifier.

    Usable as a ``scoring=`` callable: takes the fitted ``estimator``, the
    feature rows ``X`` and the true labels ``y``.
    """
    labels = estimator.predict(X)
    # Continuous score for ROC-AUC: column 1 of predict_proba if the estimator
    # has it, otherwise the decision function.
    scores = (
        estimator.predict_proba(X)[:, 1]
        if hasattr(estimator, "predict_proba")
        else estimator.decision_function(X)
    )
    metrics = (
        accuracy_score(y, labels),
        f1_score(y, labels),
        roc_auc_score(y, scores),
    )
    return np.mean(metrics)

# ── 2) Best parameters obtained from the RandomizedSearchCV stage
best_params = {
    # XGBoost is currently excluded from the grid refinement. It is disabled
    # with real comments: a triple-quoted string in this position is a string
    # literal that Python concatenates with the following 'CatBoost' literal,
    # which silently corrupts that dictionary key.
    # 'XGBoost': {
    #     'n_estimators': 257,
    #     'learning_rate': 0.033332831606807715,
    #     'max_depth': 5,
    #     'colsample_bytree': 0.6999304858576277,
    #     'subsample': 0.6931085361721216,
    # },
    'CatBoost': {
        'iterations': 306,
        'learning_rate': 0.022709563372047596,
        'max_depth': 6,
        'l2_leaf_reg': 5.275410183585496,
        'border_count': 168,
        'bagging_temperature': 0.49379559636439074,
    },
    'GradientBoost': {
        'n_estimators': 257,
        'learning_rate': 0.033332831606807715,
        'max_depth': 5,
        'subsample': 0.6931085361721216,
    },
}

# ── 3) Fine-grained search with GridSearchCV around each tuned value
# Parameters whose grid values must be integers.
INTEGER_PARAMS = ('n_estimators', 'iterations', 'max_depth', 'border_count')
# Sampling fractions: capped at 1.0 so the +20% grid point stays a valid value.
FRACTION_PARAMS = ('subsample', 'colsample_bytree')

refined_estimators = {}
for name, params in best_params.items():
    print(f"### Grid search refinement: {name} ###")
    # Create a fresh model instance for this entry.
    if name == 'GradientBoost':
        model = GradientBoostingClassifier(random_state=42)
    elif name == 'XGBoost':
        # Real branch instead of a string-literal "comment" inside the if/else;
        # it only runs when best_params actually contains an 'XGBoost' entry.
        model = XGBClassifier(eval_metric='logloss', random_state=42)
    else:  # CatBoost
        model = CatBoostClassifier(verbose=0, random_state=42)
    # Build the grid: 80%, 100% and 120% of every tuned value.
    grid = {}
    for p, val in params.items():
        if p in INTEGER_PARAMS:
            candidates = [max(1, int(val*0.8)), int(val), int(val*1.2)]
        else:
            candidates = [val*0.8, val, val*1.2]
            if p in FRACTION_PARAMS:
                candidates = [min(c, 1.0) for c in candidates]
        # Drop duplicates (small integers can collapse after truncation, and
        # capped fractions can coincide) so no candidate is fitted twice.
        grid[p] = sorted(set(candidates))
    gs = GridSearchCV(
        estimator=model,
        param_grid=grid,
        scoring=multi_score,
        cv=cv,
        n_jobs=-1,
        verbose=1,
        refit=True
    )
    gs.fit(X_train, y_train)
    refined_estimators[name] = gs.best_estimator_
    print("Refined params:", gs.best_params_)
    print("Refined CV score:", gs.best_score_, "\n")

# Report each grid-search winner on the held-out validation split.
# GridSearchCV ran with refit=True, so every best_estimator_ is already trained
# on the full (X_train, y_train); the extra fit that used to be here only
# repeated that training and has been removed.
for name, model in refined_estimators.items():
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1]
    print(f"--- {name} (GS Tuned) on Validation ---")
    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_proba)
    print("Accuracy :", acc)
    print("F1-score :", f1)
    print("AUC      :", auc)
    # Same aggregate as multi_score: unweighted mean of the three metrics.
    print("MeanScore:", (acc+f1+auc)/3)


### Grid search refinement: 
    'XGBoost': {
        'n_estimators': 257,
        'learning_rate': 0.033332831606807715,
        'max_depth': 5,
        'colsample_bytree': 0.6999304858576277,
        'subsample': 0.6931085361721216,
    },
    CatBoost ###
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
Refined params: {'bagging_temperature': 0.39503647709151263, 'border_count': 201, 'iterations': 367, 'l2_leaf_reg': 5.275410183585496, 'learning_rate': 0.027251476046457116, 'max_depth': 7}
Refined CV score: 0.6788609652232847 

### Grid search refinement: GradientBoost ###
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Refined params: {'learning_rate': 0.026666265285446175, 'max_depth': 5, 'n_estimators': 257, 'subsample': 0.831730243406546}
Refined CV score: 0.6797117939569308 

--- 
    'XGBoost': {
        'n_estimators': 257,
        'learning_rate': 0.033332831606807715,
        'max_depth': 5,
        'colsample_bytree': 0.6999304858576277,
      

#Bayes Search

In [None]:
from skopt import BayesSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, f1_score
import numpy as np

def multi_score(estimator, X, y):
    """Average accuracy, F1 and ROC-AUC of a fitted classifier on (X, y).

    Written with the ``(estimator, X, y)`` signature so it can be handed to a
    search object as its ``scoring=`` callable.
    """
    predicted = estimator.predict(X)
    # ROC-AUC is computed from a continuous score: decision_function is only
    # used when the estimator offers no predict_proba.
    if not hasattr(estimator, "predict_proba"):
        ranking = estimator.decision_function(X)
    else:
        ranking = estimator.predict_proba(X)[:, 1]
    acc = accuracy_score(y, predicted)
    f1 = f1_score(y, predicted)
    auc = roc_auc_score(y, ranking)
    return np.mean([acc, f1, auc])

# verbose=0 keeps CatBoost's per-iteration training log out of the cell output,
# matching how the other CatBoost models in this notebook are constructed.
cat = CatBoostClassifier(verbose=0, random_state=42)

# Bayesian search ranges: narrow windows around the grid-search result (the
# grid-search value is noted after each range).
# skopt infers the dimension type from the bounds: integer bounds give an
# integer dimension, float bounds a real-valued one.
param_space = {
        'iterations': (250, 450),  # 367
        'learning_rate': (0.01, 0.04),  # 0.02725
        'max_depth': (5, 10),  # 7
        # Float bounds on purpose: with (4, 6) only the integers 4, 5 and 6
        # were searched, although the value being refined is 5.2754.
        'l2_leaf_reg': (4.0, 6.0),  # 5.2754
        'border_count': (150, 250),  # 201
        'bagging_temperature': (0.3, 0.5),  # 0.3950
}

# Bayesian optimisation setup
bayes_search = BayesSearchCV(
    estimator=cat,
    search_spaces=param_space,
    n_iter=30,  # number of parameter settings to try
    cv=cv,
    scoring=multi_score,
    n_jobs=-1,
    verbose=2,
    random_state=42,
    error_score='raise'
)

# Run the Bayesian search
bayes_search.fit(X_train, y_train)

# Print the search result
print("Best cat Parameters (Bayesian):", bayes_search.best_params_)
print("Best Mean CV Score:", bayes_search.best_score_)

# 1. Take the model with the best parameters.
# BayesSearchCV refits the best configuration on the full (X_train, y_train)
# by default, so best_estimator_ is already trained; the second fit that used
# to follow only repeated that training and has been removed.
best_cat = bayes_search.best_estimator_

y_pred = best_cat.predict(X_val)
y_proba = best_cat.predict_proba(X_val)[:, 1]
print("--- cat (bayes Tuned) on Validation ---")
acc = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
auc = roc_auc_score(y_val, y_proba)
print("Accuracy :", acc)
print("F1-score :", f1)
print("AUC      :", auc)
print("MeanScore:", (acc+f1+auc)/3)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [None]:
import joblib

# Persist the tuned CatBoost model next to the notebook.
joblib.dump(value=best_cat, filename="./CatBoost.pkl")

['./CatBoost.pkl']

In [None]:
from skopt import BayesSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, f1_score
import numpy as np

# `use_label_encoder` is not passed: the installed XGBoost reports it as an
# unused parameter on every fit.
xgb = XGBClassifier(eval_metric="logloss", random_state=42)

# Bayesian search ranges: narrow windows around the grid-search result (the
# grid-search value is noted after each range).
param_space = {
    'n_estimators': (100, 350),  # 257
    # The previous range (0.1, 0.4) did not contain the 0.026 optimum it is
    # meant to refine around, which pinned the search to its lower bound.
    'learning_rate': (0.01, 0.05),  # 0.026
    'max_depth': (3, 8),  # 5
    'subsample': (0.6, 1.0),  # 0.8317
    'colsample_bytree': (0.6, 1.0),  # 0.8399
}

# Bayesian optimisation setup
bayes_search = BayesSearchCV(
    estimator=xgb,
    search_spaces=param_space,
    n_iter=30,  # number of parameter settings to try
    cv=cv,
    scoring=multi_score,
    n_jobs=-1,
    verbose=2,
    random_state=42,
    error_score='raise'
)

# Run the Bayesian search
bayes_search.fit(X_train, y_train)

# Print the search result
print("Best xgb Parameters (Bayesian):", bayes_search.best_params_)
print("Best Mean CV Score:", bayes_search.best_score_)

# 1. Take the model with the best parameters.
# BayesSearchCV refits the best configuration on the full (X_train, y_train)
# by default, so best_estimator_ is already trained; the second fit that used
# to follow only repeated that training and has been removed.
best_xgb = bayes_search.best_estimator_

y_pred = best_xgb.predict(X_val)
y_proba = best_xgb.predict_proba(X_val)[:, 1]
print("--- xgb (bayes Tuned) on Validation ---")
acc = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
auc = roc_auc_score(y_val, y_proba)
print("Accuracy :", acc)
print("F1-score :", f1)
print("AUC      :", auc)
print("MeanScore:", (acc+f1+auc)/3)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

Parameters: { "use_label_encoder" } are not used.



Best xgb Parameters (Bayesian): OrderedDict([('colsample_bytree', 0.7372828076512649), ('learning_rate', 0.1046906412570721), ('max_depth', 3), ('n_estimators', 105), ('subsample', 0.7947757687796999)])
Best Mean CV Score: 0.6778094168807


Parameters: { "use_label_encoder" } are not used.



--- xgb (bayes Tuned) on Validation ---
Accuracy : 0.6656014608536863
F1-score : 0.6640678743407475
AUC      : 0.7271949661415074
MeanScore: 0.685621433778647


In [None]:
from skopt import BayesSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, f1_score
import numpy as np

gra = GradientBoostingClassifier(random_state=42)

# Bayesian search ranges: narrow windows around the grid-search result (the
# grid-search value is noted after each range).
# 'colsample_bytree' was removed: it is an XGBoost parameter that
# GradientBoostingClassifier does not accept, so the search failed as soon as
# a candidate was applied to the estimator.
param_space = {
    'n_estimators': (100, 400),  # 257
    # The previous range (0.1, 0.4) did not contain the 0.026 optimum it is
    # meant to refine around.
    'learning_rate': (0.01, 0.05),  # 0.026
    'max_depth': (3, 8),  # 5
    'subsample': (0.6, 1.0),  # 0.8317
}

# Bayesian optimisation setup
bayes_search = BayesSearchCV(
    estimator=gra,
    search_spaces=param_space,
    n_iter=30,  # number of parameter settings to try
    cv=cv,
    scoring=multi_score,
    n_jobs=-1,
    verbose=2,
    random_state=42,
    error_score='raise'
)

# Run the Bayesian search
bayes_search.fit(X_train, y_train)

# Print the search result
print("Best grad Parameters (Bayesian):", bayes_search.best_params_)
print("Best Mean CV Score:", bayes_search.best_score_)

# 1. Take the model with the best parameters.
# BayesSearchCV refits the best configuration on the full (X_train, y_train)
# by default, so best_estimator_ is already trained; the second fit that used
# to follow only repeated that training and has been removed.
best_gra = bayes_search.best_estimator_

y_pred = best_gra.predict(X_val)
y_proba = best_gra.predict_proba(X_val)[:, 1]
print("--- gra (bayes Tuned) on Validation ---")
acc = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
auc = roc_auc_score(y_val, y_proba)
print("Accuracy :", acc)
print("F1-score :", f1)
print("AUC      :", auc)
print("MeanScore:", (acc+f1+auc)/3)

In [None]:
from skopt import BayesSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, f1_score
import numpy as np

# GS 결과 기반
def baye(trial):
  """Build the per-model parameter dictionaries for one tuning trial.

  Parameters
  ----------
  trial : object exposing ``suggest_int(name, low, high)`` and
      ``suggest_float(name, low, high)``. This looks like an Optuna-style
      trial; TODO confirm against the intended caller.

  Returns
  -------
  dict mapping model name to a parameter dict. Only the 'XGBoost' entry is
  drawn from ``trial``; the 'CatBoost' and 'GradientBoost' entries are fixed
  values. Previously the dictionary was built and then discarded, so the
  function always returned ``None``.
  """
  param_space = {
      'XGBoost': {
          'n_estimators': trial.suggest_int('n_estimators', 100, 400), #257
          # NOTE(review): this range does not contain the 0.026 value noted
          # beside it; confirm whether (0.1, 0.4) is intended.
          'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.4), #0.026
          'max_depth': trial.suggest_int('max_depth', 3, 8),#5
          'subsample': trial.suggest_float('subsample', 0.6, 1.0), #0.8317
          'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),#0.8399
          'use_label_encoder': False,
          'eval_metric': 'logloss',
          'random_state': 42,
      },
      'CatBoost': {
          'iterations': 306,
          'learning_rate': 0.022709563372047596,
          'max_depth': 6,
          'l2_leaf_reg': 5.275410183585496,
          'border_count': 168,
          'bagging_temperature': 0.49379559636439074,
      },
      'GradientBoost': {
          'n_estimators': 257,
          'learning_rate': 0.033332831606807715,
          'max_depth': 5,
          'subsample': 0.6931085361721216,
      },
  }
  return param_space

# Per-model skopt search spaces, keyed exactly like `models`.
# The loop used to hand every model the global `param_space` left behind by
# whichever cell ran last, so each estimator was searched with another
# model's parameter names. Each window brackets the grid-search value noted
# after it; float bounds give a real-valued dimension, int bounds an integer one.
bayes_spaces = {
    'GradientBoost': {
        'n_estimators': (100, 400),  # 257
        'learning_rate': (0.01, 0.05),  # 0.026
        'max_depth': (3, 8),  # 5
        'subsample': (0.6, 1.0),  # 0.8317
    },
    'XGBoost': {
        'n_estimators': (100, 350),  # 257
        'learning_rate': (0.01, 0.05),  # 0.026
        'max_depth': (3, 8),  # 5
        'subsample': (0.6, 1.0),  # 0.8317
        'colsample_bytree': (0.6, 1.0),  # 0.8399
    },
    'CatBoost': {
        'iterations': (250, 450),  # 367
        'learning_rate': (0.01, 0.04),  # 0.02725
        'max_depth': (5, 10),  # 7
        'l2_leaf_reg': (4.0, 6.0),  # 5.2754
        'border_count': (150, 250),  # 201
        'bagging_temperature': (0.3, 0.5),  # 0.3950
    },
}

bayes_estimators = {}
for name, model in models.items():
    print(f"=== BayesSearchCV: {name} ===")
    bs = BayesSearchCV(
                  estimator=model,
                  search_spaces=bayes_spaces[name],
                  n_iter=30,  # number of parameter settings to try
                  cv=cv,
                  scoring=multi_score,
                  n_jobs=-1,
                  verbose=2,
                  random_state=42,
                  error_score='raise'
                  )
    bs.fit(X_train, y_train)
    bayes_estimators[name] = bs.best_estimator_
    print("Bayes params:", bs.best_params_)
    print("Bayes CV score:", bs.best_score_, "\n")

# Report each Bayesian-search winner on the held-out validation split.
# BayesSearchCV refits the best configuration on the full (X_train, y_train)
# by default, so every best_estimator_ is already trained; the extra fit that
# used to be here only repeated that training and has been removed.
for name, model in bayes_estimators.items():
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)[:, 1]
    print(f"--- {name} (BS Tuned) on Validation ---")
    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_proba)
    print("Accuracy :", acc)
    print("F1-score :", f1)
    print("AUC      :", auc)
    # Same aggregate as multi_score: unweighted mean of the three metrics.
    print("MeanScore:", (acc+f1+auc)/3)


### Grid search refinement: 
    'XGBoost': {
        'n_estimators': 257,
        'learning_rate': 0.033332831606807715,
        'max_depth': 5,
        'colsample_bytree': 0.6999304858576277,
        'subsample': 0.6931085361721216,
    },
    CatBoost ###
Fitting 5 folds for each of 729 candidates, totalling 3645 fits


KeyboardInterrupt: 