### 동일한 GridSearchCV 파라미터 탐색 범위 적용

In [None]:
# 공통 라이브러리
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# 1️⃣ 데이터 불러오기 (이미 df_regre가 있다면 이 부분은 생략)
df_tree = pd.read_csv('../../ML_data/dataset/tree_model_preprocessed.csv')

# 2️⃣ 타깃 / 피처 분리
target = 'churn'
X = df_tree.drop(columns=[target])
y = df_tree[target]

# 3️⃣ 학습 / 테스트 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# 4️⃣ 공통 하이퍼파라미터 그리드
param_grid = {
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [300, 500, 800]
}


## LightGBM

In [4]:
# LightGBM (LGBMClassifier)
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

# 1️⃣ 기본 모델 정의
lgbm_model = LGBMClassifier(
    objective='binary',
    random_state=42
)

# 2️⃣ GridSearchCV 설정
lgbm_grid = GridSearchCV(
    estimator=lgbm_model,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1
)

# 3️⃣ 학습
lgbm_grid.fit(X_train, y_train)

# 4️⃣ 최적 모델로 예측
best_lgbm = lgbm_grid.best_estimator_
y_pred_lgbm = best_lgbm.predict(X_test)
y_proba_lgbm = best_lgbm.predict_proba(X_test)[:, 1]

# 5️⃣ 성능 지표 계산
lgbm_acc = accuracy_score(y_test, y_pred_lgbm)
lgbm_f1 = f1_score(y_test, y_pred_lgbm)
lgbm_roc = roc_auc_score(y_test, y_proba_lgbm)

print("=== LightGBM 결과 ===")
print(f"Accuracy : {lgbm_acc:.4f}")
print(f"F1-score : {lgbm_f1:.4f}")
print(f"ROC-AUC  : {lgbm_roc:.4f}")
print("Best Params :", lgbm_grid.best_params_)
print(classification_report(y_test, y_pred_lgbm))

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[LightGBM] [Info] Number of positive: 25631, number of negative: 20379
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001031 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 664
[LightGBM] [Info] Number of data points in the train set: 46010, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.557075 -> initscore=0.229298
[LightGBM] [Info] Start training from score 0.229298
[LightGBM] [Info] Number of positive: 25631, number of negative: 20380
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001611 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 664
[LightGBM] [Info] Number of data points in the train s

## CatBoost

In [5]:
# !pip install catboost  # 설치 안 돼 있으면 주석 풀고 한 번 실행

from catboost import CatBoostClassifier

# 1️⃣ 기본 모델 정의
cat_model = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='AUC',      # 내부 평가지표
    random_state=42,
    verbose=0               # 학습 로그 안 보이게
)

# 2️⃣ GridSearchCV 설정 (ROC-AUC 기준으로 최적 파라미터 탐색)
cat_grid = GridSearchCV(
    estimator=cat_model,
    param_grid=param_grid,
    scoring='f1',   # ROC-AUC 기준으로 최고 모델 선택
    cv=5,
    n_jobs=-1,
    verbose=1
)

# 3️⃣ 학습
cat_grid.fit(X_train, y_train)

# 4️⃣ 최적 모델로 예측
best_cat = cat_grid.best_estimator_
y_pred_cat = best_cat.predict(X_test)
y_proba_cat = best_cat.predict_proba(X_test)[:, 1]

# 5️⃣ 성능 지표 계산
cat_acc = accuracy_score(y_test, y_pred_cat)
cat_f1 = f1_score(y_test, y_pred_cat)
cat_roc = roc_auc_score(y_test, y_proba_cat)

print("=== CatBoost 결과 ===")
print(f"Accuracy : {cat_acc:.4f}")
print(f"F1-score : {cat_f1:.4f}")
print(f"ROC-AUC  : {cat_roc:.4f}")
print("Best Params :", cat_grid.best_params_)
print(classification_report(y_test, y_pred_cat))

Fitting 5 folds for each of 27 candidates, totalling 135 fits
=== CatBoost 결과 ===
Accuracy : 0.9381
F1-score : 0.9434
ROC-AUC  : 0.9720
Best Params : {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 500}
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      6369
           1       0.96      0.93      0.94      8010

    accuracy                           0.94     14379
   macro avg       0.94      0.94      0.94     14379
weighted avg       0.94      0.94      0.94     14379



## XGboost

In [6]:
# XGboost
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# 1️⃣ 기본 모델 정의
xgb_model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',   # 경고 메시지 방지용
    random_state=42,
    tree_method='hist'       # 속도 향상용(옵션)
)

# 2️⃣ GridSearchCV 설정
xgb_grid = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1
)

# 3️⃣ 학습
xgb_grid.fit(X_train, y_train)

# 4️⃣ 최적 모델로 예측
best_xgb = xgb_grid.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)
y_proba_xgb = best_xgb.predict_proba(X_test)[:, 1]

# 5️⃣ 성능 지표 계산
xgb_acc = accuracy_score(y_test, y_pred_xgb)
xgb_f1 = f1_score(y_test, y_pred_xgb)
xgb_roc = roc_auc_score(y_test, y_proba_xgb)

print("=== XGBoost 결과 ===")
print(f"Accuracy : {xgb_acc:.4f}")
print(f"F1-score : {xgb_f1:.4f}")
print(f"ROC-AUC  : {xgb_roc:.4f}")
print("Best Params :", xgb_grid.best_params_)
print(classification_report(y_test, y_pred_xgb))

Fitting 5 folds for each of 27 candidates, totalling 135 fits
=== XGBoost 결과 ===
Accuracy : 0.9380
F1-score : 0.9434
ROC-AUC  : 0.9716
Best Params : {'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 300}
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      6369
           1       0.96      0.93      0.94      8010

    accuracy                           0.94     14379
   macro avg       0.94      0.94      0.94     14379
weighted avg       0.94      0.94      0.94     14379



## RandomForest

In [7]:
# RandomForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import pandas as pd

# 데이터 준비

# 1️⃣ RandomForest 전용 하이퍼파라미터
param_grid_rf = {
    'max_depth': [4, 6, 8],
    'n_estimators': [300, 500, 800],
    'max_features': ['sqrt', 'log2']
}

# 2️⃣ 모델 정의
rf_model = RandomForestClassifier(random_state=42)

# 3️⃣ GridSearchCV 설정
rf_grid = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid_rf,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1
)

# 4️⃣ 학습
rf_grid.fit(X_train, y_train)

# 5️⃣ 최적 모델로 예측
best_rf = rf_grid.best_estimator_
y_pred_rf = best_rf.predict(X_test)
y_proba_rf = best_rf.predict_proba(X_test)[:, 1]

# 6️⃣ 성능지표 계산
rf_acc = accuracy_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)
rf_roc = roc_auc_score(y_test, y_proba_rf)

print("=== RandomForest 결과 ===")
print(f"Accuracy : {rf_acc:.4f}")
print(f"F1-score : {rf_f1:.4f}")
print(f"ROC-AUC  : {rf_roc:.4f}")
print("Best Params :", rf_grid.best_params_)
print(classification_report(y_test, y_pred_rf))



Fitting 5 folds for each of 18 candidates, totalling 90 fits
=== RandomForest 결과 ===
Accuracy : 0.9372
F1-score : 0.9429
ROC-AUC  : 0.9673
Best Params : {'max_depth': 8, 'max_features': 'log2', 'n_estimators': 800}
              precision    recall  f1-score   support

           0       0.91      0.95      0.93      6369
           1       0.96      0.93      0.94      8010

    accuracy                           0.94     14379
   macro avg       0.94      0.94      0.94     14379
weighted avg       0.94      0.94      0.94     14379



### Regression 데이터 사용

# LogisticRegression

In [8]:
# 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report
import pandas as pd

# ===========================
# 1️⃣ 데이터 불러오기 및 분리
# ===========================
df_regre = pd.read_csv('../../ML_data/dataset/re_log_model_preprocessed.csv')
target = 'churn'
X = df_regre.drop(columns=[target])
y = df_regre[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ===========================
# 2️⃣ 하이퍼파라미터 후보
# ===========================
param_grid = {
    'C': [5, 10, 20, 50],
    'penalty': ['l2'],
    'solver': ['lbfgs']
}

# ===========================
# 3️⃣ GridSearchCV 학습
# ===========================
log_model = LogisticRegression(max_iter=1000, random_state=42)
log_grid = GridSearchCV(
    estimator=log_model,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1
)
log_grid.fit(X_train, y_train)

# ===========================
# 4️⃣ 최적 모델 예측
# ===========================
best_log = log_grid.best_estimator_
y_pred_log = best_log.predict(X_test)
y_proba_log = best_log.predict_proba(X_test)[:, 1]

# ===========================
# 5️⃣ 성능 지표 계산
# ===========================
log_acc = accuracy_score(y_test, y_pred_log)
log_f1 = f1_score(y_test, y_pred_log)
log_roc = roc_auc_score(y_test, y_proba_log)

# ===========================
# 6️⃣ DataFrame 생성 (한 줄)
# ===========================
log_param_str = ", ".join(f"{k}={v}" for k, v in log_grid.best_params_.items())

df_results_log = pd.DataFrame([{
    "모델": "LogisticRegression",
    "F1-score": f'{log_f1:.4f}',
    "Accuracy": f'{log_acc:.4f}',
    "ROC AUC": f'{log_roc:.4f}',
    "하이퍼파라미터": log_param_str
}])

# ===========================
# 7️⃣ 출력
# ===========================
print(df_results_log)
print(classification_report(y_test, y_pred_log))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
                   모델 F1-score Accuracy ROC AUC                        하이퍼파라미터
0  LogisticRegression   0.9385   0.9325  0.9579  C=5, penalty=l2, solver=lbfgs
              precision    recall  f1-score   support

           0       0.91      0.94      0.93      6369
           1       0.95      0.92      0.94      8010

    accuracy                           0.93     14379
   macro avg       0.93      0.93      0.93     14379
weighted avg       0.93      0.93      0.93     14379



In [9]:
# lgbm_param_str = ", ".join(
#     f"{k}={v}" for k, v in lgbm_grid.best_params_.items()
# )

results = [
    {
        "모델": "LightGBM",
        "F1-score": f'{lgbm_f1:.4f}',
        "Accuracy": f'{lgbm_acc:.4f}',
        "ROC": f'{lgbm_roc:.4f}',
        "하이퍼파라미터": f'{lgbm_grid.best_params_}'
    },
    {
        "모델": "CatBoost",
        "F1-score": f'{cat_f1:.4f}',
        "Accuracy": f'{cat_acc:.4f}',
        "ROC": f'{cat_roc:.4f}',
        "하이퍼파라미터": f'{cat_grid.best_params_}'
    },
    {
        "모델": "XGBoost",
        "F1-score": f'{xgb_f1:.4f}',
        "Accuracy": f'{xgb_acc:.4f}',
        "ROC": f'{xgb_roc:.4f}',
        "하이퍼파라미터": f'{xgb_grid.best_params_}'
    },
    {
        "모델": "RandomForest",
        "F1-score": f'{rf_f1:.4f}',
        "Accuracy": f'{rf_acc:.4f}',
        "ROC": f'{rf_roc:.4f}',
        "하이퍼파라미터": f'{rf_grid.best_params_}'
    },
    {
        "모델": "LogisticRegression",
        "F1-score": f'{log_f1:.4f}',
        "Accuracy": f'{log_acc:.4f}',
        "ROC": f'{log_roc:.4f}',
        "하이퍼파라미터": f'{log_param_str}'
    }
]
df_results = pd.DataFrame(results)
df_results


Unnamed: 0,모델,F1-score,Accuracy,ROC,하이퍼파라미터
0,LightGBM,0.9437,0.9384,0.9724,"{'learning_rate': 0.05, 'max_depth': 8, 'n_est..."
1,CatBoost,0.9434,0.9381,0.972,"{'learning_rate': 0.1, 'max_depth': 6, 'n_esti..."
2,XGBoost,0.9434,0.938,0.9716,"{'learning_rate': 0.05, 'max_depth': 6, 'n_est..."
3,RandomForest,0.9429,0.9372,0.9673,"{'max_depth': 8, 'max_features': 'log2', 'n_es..."
4,LogisticRegression,0.9385,0.9325,0.9579,"C=5, penalty=l2, solver=lbfgs"


In [12]:
! pip install tabulate

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0


In [13]:
print(df_results.to_markdown(index=False))


| 모델               |   F1-score |   Accuracy |    ROC | 하이퍼파라미터                                                |
|:-------------------|-----------:|-----------:|-------:|:--------------------------------------------------------------|
| LightGBM           |     0.9437 |     0.9384 | 0.9724 | {'learning_rate': 0.05, 'max_depth': 8, 'n_estimators': 300}  |
| CatBoost           |     0.9434 |     0.9381 | 0.972  | {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 500}   |
| XGBoost            |     0.9434 |     0.938  | 0.9716 | {'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 300}  |
| RandomForest       |     0.9429 |     0.9372 | 0.9673 | {'max_depth': 8, 'max_features': 'log2', 'n_estimators': 800} |
| LogisticRegression |     0.9385 |     0.9325 | 0.9579 | C=5, penalty=l2, solver=lbfgs                                 |
