In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [12]:
df = pd.read_csv('./data/online_retail_customer_churn.csv')
df

Unnamed: 0,Customer_ID,Age,Gender,Annual_Income,Total_Spend,Years_as_Customer,Num_of_Purchases,Average_Transaction_Amount,Num_of_Returns,Num_of_Support_Contacts,Satisfaction_Score,Last_Purchase_Days_Ago,Email_Opt_In,Promotion_Response,Target_Churn
0,1,62,Other,45.15,5892.58,5,22,453.80,2,0,3,129,True,Responded,True
1,2,65,Male,79.51,9025.47,13,77,22.90,2,2,3,227,False,Responded,False
2,3,18,Male,29.19,618.83,13,71,50.53,5,2,2,283,False,Responded,True
3,4,21,Other,79.63,9110.30,3,33,411.83,5,3,5,226,True,Ignored,True
4,5,21,Other,77.66,5390.88,15,43,101.19,3,0,5,242,False,Unsubscribed,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,54,Male,143.72,1089.09,2,29,77.75,0,3,2,88,True,Ignored,False
996,997,19,Male,164.19,3700.24,9,90,34.45,6,4,4,352,False,Responded,True
997,998,47,Female,113.31,705.85,17,69,187.37,7,3,1,172,True,Unsubscribed,False
998,999,23,Male,72.98,3891.60,7,31,483.80,1,2,5,55,False,Responded,True


In [13]:
def drop_feature(df):
    return df.drop('Customer_ID', axis=1)

def remove_gender_other(df):
    """Gender가 'Other'인 행 제거"""
    df_cleaned = df[df['Gender'] != 'Other'].copy()
    return df_cleaned

# def remove_gender_other(df):
#     """Gender가 'Other'인 행 제거"""
#     df_cleaned = df[df['Gender'] != 'Other'].copy()
#     return df_cleaned

def encode_feature(df):
    # 원핫 인코딩 대상 컬럼
    category = ['Gender', 'Email_Opt_In', 'Promotion_Response']
    
    # 원핫 인코딩 적용 (drop_first=True는 다중공선성 회피용, 상황에 따라 False로 해도 OK)
    df_encoded = pd.get_dummies(df, columns=category, drop_first=True)
    
    return df_encoded

def scale_feature(train_data, test_data):

    # 스케일러 적용
    sc = StandardScaler()
    # sc = MinMaxScaler()

    train_scaled = sc.fit_transform(train_data)
    test_scaled = sc.transform(test_data)


    return train_scaled, test_scaled

def preprocess_data(df):
    df = drop_feature(df)
    df = remove_gender_other(df)
    df = encode_feature(df)

    return df

In [14]:
rt_df = preprocess_data(df)
rt_df

Unnamed: 0,Age,Annual_Income,Total_Spend,Years_as_Customer,Num_of_Purchases,Average_Transaction_Amount,Num_of_Returns,Num_of_Support_Contacts,Satisfaction_Score,Last_Purchase_Days_Ago,Target_Churn,Gender_Male,Email_Opt_In_True,Promotion_Response_Responded,Promotion_Response_Unsubscribed
1,65,79.51,9025.47,13,77,22.90,2,2,3,227,False,True,False,True,False
2,18,29.19,618.83,13,71,50.53,5,2,2,283,True,True,False,True,False
5,57,190.43,255.19,19,85,417.78,5,1,4,130,False,True,False,False,True
6,27,172.13,3512.55,3,77,316.18,0,3,1,61,False,True,True,False,True
10,54,138.90,4283.84,15,33,96.55,9,1,2,92,False,True,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,50,167.73,7380.55,4,10,46.12,1,2,2,173,True,True,True,False,False
995,54,143.72,1089.09,2,29,77.75,0,3,2,88,False,True,True,False,False
996,19,164.19,3700.24,9,90,34.45,6,4,4,352,True,True,False,True,False
997,47,113.31,705.85,17,69,187.37,7,3,1,172,False,False,True,False,True


In [15]:
from sklearn.model_selection import train_test_split
# 1. 전처리
rt_df = preprocess_data(df)
rt_df
X = rt_df.drop('Target_Churn', axis=1)
y = rt_df['Target_Churn'].astype(int)

# 2. train/test 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# 3. 수치형 컬럼만 스케일링
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

X_scaled_train = X_train.copy()
X_scaled_test = X_test.copy()

X_scaled_train[numeric_cols], X_scaled_test[numeric_cols] = scale_feature(
    X_train[numeric_cols], X_test[numeric_cols]
)
X_scaled_train.head()

Unnamed: 0,Age,Annual_Income,Total_Spend,Years_as_Customer,Num_of_Purchases,Average_Transaction_Amount,Num_of_Returns,Num_of_Support_Contacts,Satisfaction_Score,Last_Purchase_Days_Ago,Gender_Male,Email_Opt_In_True,Promotion_Response_Responded,Promotion_Response_Unsubscribed
466,1.627684,0.598178,-0.139465,-0.45115,-1.20072,-1.665218,0.462431,-1.413169,-0.029341,-1.427582,True,True,False,True
155,-0.676678,-1.336659,-0.057553,0.269622,-1.512246,-0.077712,-1.607012,0.03075,-0.029341,0.421292,True,True,False,False
371,-1.005872,-0.969095,-1.035971,-1.171922,-0.127687,-0.11326,-0.917198,1.474669,-0.749536,-1.513799,False,True,True,False
291,-1.203389,0.722831,-1.193458,-1.352115,-1.581473,1.294777,-0.227383,-0.691209,-0.749536,-0.977338,False,False,False,True
231,0.771778,0.890914,-0.001688,1.530973,0.079997,0.905904,-0.57229,0.752709,-0.029341,0.46919,False,False,False,False


In [16]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

# 4. 모델 정의 및 학습
xgb = XGBClassifier(
    random_state=0,
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    use_label_encoder=False,
    eval_metric='logloss'
)

xgb.fit(X_scaled_train, y_train)

# 5. 예측
y_pred = xgb.predict(X_scaled_test)

# 6. 평가
print("📊 Classification Report (Test):")
print(classification_report(y_test, y_pred))

print(f'훈련 데이터 정확도: {xgb.score(X_scaled_train, y_train):.2f}')
print(f'테스트 데이터 정확도: {xgb.score(X_scaled_test, y_test):.2f}')


📊 Classification Report (Test):
              precision    recall  f1-score   support

           0       0.53      0.35      0.42        75
           1       0.44      0.62      0.51        61

    accuracy                           0.47       136
   macro avg       0.48      0.48      0.47       136
weighted avg       0.49      0.47      0.46       136

훈련 데이터 정확도: 0.91
테스트 데이터 정확도: 0.47


Parameters: { "use_label_encoder" } are not used.



In [17]:
import optuna
from lightgbm import LGBMClassifier
from sklearn.metrics import f1_score, classification_report

# Optuna objective 함수 정의
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 2, 5),
        'num_leaves': trial.suggest_int('num_leaves', 20, 50),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'random_state': 42
    }

    model = LGBMClassifier(**params)
    model.fit(X_scaled_train, y_train)
    y_pred = model.predict(X_scaled_test)
    return f1_score(y_test, y_pred)

# Optuna 스터디 생성 및 실행
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

# 최적 파라미터 출력
print("최적의 파라미터:", study.best_trial.params)
print("최적의 점수:", study.best_value)

# 최적 모델 학습 및 예측
best_params = study.best_trial.params
best_lgbm = LGBMClassifier(**best_params, random_state=42)
best_lgbm.fit(X_scaled_train, y_train)

y_pred = best_lgbm.predict(X_scaled_test)
print("📊 Classification Report:")
print(classification_report(y_test, y_pred))

print(f'훈련 데이터: {best_lgbm.score(X_scaled_train, y_train):.2f}')
print(f'테스트 데이터: {best_lgbm.score(X_scaled_test, y_test):.2f}')


[I 2025-04-01 12:40:54,794] A new study created in memory with name: no-name-5bf07544-9029-4c3c-965d-2defa1c9e734
[I 2025-04-01 12:40:54,887] Trial 0 finished with value: 0.5068493150684932 and parameters: {'n_estimators': 185, 'learning_rate': 0.14374670658412622, 'max_depth': 5, 'num_leaves': 39, 'subsample': 0.9989790203600141, 'colsample_bytree': 0.6251051765983555}. Best is trial 0 with value: 0.5068493150684932.
[I 2025-04-01 12:40:54,929] Trial 1 finished with value: 0.45714285714285713 and parameters: {'n_estimators': 279, 'learning_rate': 0.1738455468585245, 'max_depth': 3, 'num_leaves': 37, 'subsample': 0.622667092426595, 'colsample_bytree': 0.7963714202132676}. Best is trial 0 with value: 0.5068493150684932.


[LightGBM] [Info] Number of positive: 287, number of negative: 253
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000199 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 897
[LightGBM] [Info] Number of data points in the train set: 540, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.531481 -> initscore=0.126093
[LightGBM] [Info] Start training from score 0.126093
[LightGBM] [Info] Number of positive: 287, number of negative: 253
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000126 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 897
[LightGBM] [Info] Number of data points in the train set: 540, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.531481 -> initscore=0.126093
[LightGBM] [Info] Start training from score 0.126093
[LightGBM] [Info] Number of po

[I 2025-04-01 12:40:54,985] Trial 2 finished with value: 0.5170068027210885 and parameters: {'n_estimators': 295, 'learning_rate': 0.039881300567579235, 'max_depth': 3, 'num_leaves': 31, 'subsample': 0.7795210589644462, 'colsample_bytree': 0.9210058248966564}. Best is trial 2 with value: 0.5170068027210885.
[I 2025-04-01 12:40:55,032] Trial 3 finished with value: 0.5135135135135135 and parameters: {'n_estimators': 144, 'learning_rate': 0.02694853353403552, 'max_depth': 5, 'num_leaves': 42, 'subsample': 0.8558814982268339, 'colsample_bytree': 0.7435499645328162}. Best is trial 2 with value: 0.5170068027210885.
[I 2025-04-01 12:40:55,068] Trial 4 finished with value: 0.4931506849315068 and parameters: {'n_estimators': 208, 'learning_rate': 0.0973906845549595, 'max_depth': 3, 'num_leaves': 46, 'subsample': 0.9910969043498714, 'colsample_bytree': 0.9196325109619777}. Best is trial 2 with value: 0.5170068027210885.
[I 2025-04-01 12:40:55,122] Trial 5 finished with value: 0.4755244755244755 

[LightGBM] [Info] Number of positive: 287, number of negative: 253
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000127 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 897
[LightGBM] [Info] Number of data points in the train set: 540, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.531481 -> initscore=0.126093
[LightGBM] [Info] Start training from score 0.126093
[LightGBM] [Info] Number of positive: 287, number of negative: 253
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 897
[LightGBM] [Info] Number of data points in the train set: 540, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.531481 -> initscore=0.126093
[LightGBM] [Info] Start training from score 0.126093
[LightGBM] [Info] Number of po

[I 2025-04-01 12:40:55,211] Trial 7 finished with value: 0.496551724137931 and parameters: {'n_estimators': 197, 'learning_rate': 0.15528929367876765, 'max_depth': 4, 'num_leaves': 21, 'subsample': 0.9258352460235919, 'colsample_bytree': 0.7767379729453254}. Best is trial 2 with value: 0.5170068027210885.
[I 2025-04-01 12:40:55,233] Trial 8 finished with value: 0.5170068027210885 and parameters: {'n_estimators': 103, 'learning_rate': 0.19205601626281935, 'max_depth': 2, 'num_leaves': 26, 'subsample': 0.6625614287011121, 'colsample_bytree': 0.8483878604974155}. Best is trial 2 with value: 0.5170068027210885.
[I 2025-04-01 12:40:55,276] Trial 9 finished with value: 0.48226950354609927 and parameters: {'n_estimators': 221, 'learning_rate': 0.07940004727668176, 'max_depth': 4, 'num_leaves': 50, 'subsample': 0.804685781340821, 'colsample_bytree': 0.618276913500952}. Best is trial 2 with value: 0.5170068027210885.
[I 2025-04-01 12:40:55,332] Trial 10 finished with value: 0.5949367088607594 a

[LightGBM] [Info] Number of positive: 287, number of negative: 253
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000128 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 897
[LightGBM] [Info] Number of data points in the train set: 540, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.531481 -> initscore=0.126093
[LightGBM] [Info] Start training from score 0.126093
[LightGBM] [Info] Number of positive: 287, number of negative: 253
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000119 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 897
[LightGBM] [Info] Number of data points in the train set: 540, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.531481 -> initscore=0.126093
[LightGBM] [Info] Start training from score 0.126093
[LightGBM] [Info] Number of po

[I 2025-04-01 12:40:55,394] Trial 11 finished with value: 0.5921052631578947 and parameters: {'n_estimators': 295, 'learning_rate': 0.015716594602484976, 'max_depth': 2, 'num_leaves': 30, 'subsample': 0.7414297887113894, 'colsample_bytree': 0.8892857967948754}. Best is trial 10 with value: 0.5949367088607594.
[I 2025-04-01 12:40:55,448] Trial 12 finished with value: 0.6181818181818182 and parameters: {'n_estimators': 260, 'learning_rate': 0.011110260460198125, 'max_depth': 2, 'num_leaves': 29, 'subsample': 0.7428344568330044, 'colsample_bytree': 0.8647851650440116}. Best is trial 12 with value: 0.6181818181818182.
[I 2025-04-01 12:40:55,511] Trial 13 finished with value: 0.5070422535211268 and parameters: {'n_estimators': 259, 'learning_rate': 0.05282727976634986, 'max_depth': 2, 'num_leaves': 27, 'subsample': 0.8584341373817953, 'colsample_bytree': 0.9921484841205153}. Best is trial 12 with value: 0.6181818181818182.
[I 2025-04-01 12:40:55,562] Trial 14 finished with value: 0.48275862

[LightGBM] [Info] Number of positive: 287, number of negative: 253
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000120 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 897
[LightGBM] [Info] Number of data points in the train set: 540, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.531481 -> initscore=0.126093
[LightGBM] [Info] Start training from score 0.126093
[LightGBM] [Info] Number of positive: 287, number of negative: 253
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 897
[LightGBM] [Info] Number of data points in the train set: 540, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.531481 -> initscore=0.126093
[LightGBM] [Info] Start training from score 0.126093
[LightGBM] [Info] Number of po

[I 2025-04-01 12:40:55,620] Trial 15 finished with value: 0.5641025641025641 and parameters: {'n_estimators': 242, 'learning_rate': 0.01188339281083446, 'max_depth': 3, 'num_leaves': 25, 'subsample': 0.7936512643912859, 'colsample_bytree': 0.7111771899918726}. Best is trial 12 with value: 0.6181818181818182.
[I 2025-04-01 12:40:55,674] Trial 16 finished with value: 0.4794520547945205 and parameters: {'n_estimators': 267, 'learning_rate': 0.08073175658325445, 'max_depth': 2, 'num_leaves': 28, 'subsample': 0.6709073004183134, 'colsample_bytree': 0.8423552254405201}. Best is trial 12 with value: 0.6181818181818182.
[I 2025-04-01 12:40:55,736] Trial 17 finished with value: 0.5138888888888888 and parameters: {'n_estimators': 233, 'learning_rate': 0.0414668572048318, 'max_depth': 2, 'num_leaves': 34, 'subsample': 0.84011754594674, 'colsample_bytree': 0.9393799790585261}. Best is trial 12 with value: 0.6181818181818182.
[I 2025-04-01 12:40:55,811] Trial 18 finished with value: 0.5 and paramet

[LightGBM] [Info] Number of positive: 287, number of negative: 253
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000130 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 897
[LightGBM] [Info] Number of data points in the train set: 540, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.531481 -> initscore=0.126093
[LightGBM] [Info] Start training from score 0.126093
[LightGBM] [Info] Number of positive: 287, number of negative: 253
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000134 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 897
[LightGBM] [Info] Number of data points in the train set: 540, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.531481 -> initscore=0.126093
[LightGBM] [Info] Start training from score 0.126093
[LightGBM] [Info] Number of po

[I 2025-04-01 12:40:55,889] Trial 19 finished with value: 0.46153846153846156 and parameters: {'n_estimators': 272, 'learning_rate': 0.06594104690862519, 'max_depth': 3, 'num_leaves': 40, 'subsample': 0.9107604947912418, 'colsample_bytree': 0.6851840495482542}. Best is trial 12 with value: 0.6181818181818182.
[I 2025-04-01 12:40:55,942] Trial 20 finished with value: 0.547945205479452 and parameters: {'n_estimators': 250, 'learning_rate': 0.03198595505687897, 'max_depth': 2, 'num_leaves': 23, 'subsample': 0.6812774817708527, 'colsample_bytree': 0.8233191733213393}. Best is trial 12 with value: 0.6181818181818182.
[I 2025-04-01 12:40:56,001] Trial 21 finished with value: 0.6012269938650306 and parameters: {'n_estimators': 286, 'learning_rate': 0.013135761927338825, 'max_depth': 2, 'num_leaves': 29, 'subsample': 0.7446038126715915, 'colsample_bytree': 0.885416397950003}. Best is trial 12 with value: 0.6181818181818182.


[LightGBM] [Info] Number of positive: 287, number of negative: 253
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000135 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 897
[LightGBM] [Info] Number of data points in the train set: 540, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.531481 -> initscore=0.126093
[LightGBM] [Info] Start training from score 0.126093
[LightGBM] [Info] Number of positive: 287, number of negative: 253
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000121 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 897
[LightGBM] [Info] Number of data points in the train set: 540, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.531481 -> initscore=0.126093
[LightGBM] [Info] Start training from score 0.126093
[LightGBM] [Info] Number of po

[I 2025-04-01 12:40:56,056] Trial 22 finished with value: 0.5975609756097561 and parameters: {'n_estimators': 279, 'learning_rate': 0.011270352803789463, 'max_depth': 2, 'num_leaves': 29, 'subsample': 0.7678624662842148, 'colsample_bytree': 0.8838994938067128}. Best is trial 12 with value: 0.6181818181818182.
[I 2025-04-01 12:40:56,128] Trial 23 finished with value: 0.5174825174825175 and parameters: {'n_estimators': 282, 'learning_rate': 0.03180747408641776, 'max_depth': 3, 'num_leaves': 34, 'subsample': 0.7136810360406074, 'colsample_bytree': 0.9577476729404679}. Best is trial 12 with value: 0.6181818181818182.
[I 2025-04-01 12:40:56,180] Trial 24 finished with value: 0.5174825174825175 and parameters: {'n_estimators': 224, 'learning_rate': 0.049291291481633366, 'max_depth': 2, 'num_leaves': 32, 'subsample': 0.8224570435064631, 'colsample_bytree': 0.8966084918982218}. Best is trial 12 with value: 0.6181818181818182.
[I 2025-04-01 12:40:56,235] Trial 25 finished with value: 0.56 and p

[LightGBM] [Info] Number of positive: 287, number of negative: 253
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000126 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 897
[LightGBM] [Info] Number of data points in the train set: 540, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.531481 -> initscore=0.126093
[LightGBM] [Info] Start training from score 0.126093
[LightGBM] [Info] Number of positive: 287, number of negative: 253
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000115 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 897
[LightGBM] [Info] Number of data points in the train set: 540, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.531481 -> initscore=0.126093
[LightGBM] [Info] Start training from score 0.126093
[LightGBM] [Info] Number of po

[I 2025-04-01 12:40:56,298] Trial 26 finished with value: 0.5641025641025641 and parameters: {'n_estimators': 260, 'learning_rate': 0.01064380609014769, 'max_depth': 3, 'num_leaves': 36, 'subsample': 0.6420178975335414, 'colsample_bytree': 0.8643581599181575}. Best is trial 12 with value: 0.6181818181818182.
[I 2025-04-01 12:40:56,355] Trial 27 finished with value: 0.4722222222222222 and parameters: {'n_estimators': 286, 'learning_rate': 0.06900956027174084, 'max_depth': 2, 'num_leaves': 23, 'subsample': 0.7237345905126249, 'colsample_bytree': 0.7666822846176715}. Best is trial 12 with value: 0.6181818181818182.
[I 2025-04-01 12:40:56,414] Trial 28 finished with value: 0.5369127516778524 and parameters: {'n_estimators': 248, 'learning_rate': 0.04203934502863129, 'max_depth': 3, 'num_leaves': 32, 'subsample': 0.8117358899346178, 'colsample_bytree': 0.9251510235723746}. Best is trial 12 with value: 0.6181818181818182.
[I 2025-04-01 12:40:56,459] Trial 29 finished with value: 0.4895104895

[LightGBM] [Info] Number of positive: 287, number of negative: 253
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000131 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 897
[LightGBM] [Info] Number of data points in the train set: 540, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.531481 -> initscore=0.126093
[LightGBM] [Info] Start training from score 0.126093
[LightGBM] [Info] Number of positive: 287, number of negative: 253
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000132 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 897
[LightGBM] [Info] Number of data points in the train set: 540, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.531481 -> initscore=0.126093
[LightGBM] [Info] Start training from score 0.126093
[LightGBM] [Info] Number of po

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# 8. 로지스틱 회귀 모델 정의 및 학습
logreg = LogisticRegression(max_iter=1000, random_state=0)
logreg.fit(X_scaled_train, y_train)

# 9. 예측 및 평가
y_pred_logreg = logreg.predict(X_scaled_test)

accuracy = accuracy_score(y_test, y_pred_logreg)
conf_matrix = confusion_matrix(y_test, y_pred_logreg)
report = classification_report(y_test, y_pred_logreg)

print(f'\n훈련 데이터: {logreg.score(X_scaled_train, y_train):.2f}')
print(f'테스트 데이터: {logreg.score(X_scaled_test, y_test):.2f}')

# 10. 출력
print("\n[로지스틱 회귀 모델 성능]")
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", report)



훈련 데이터: 0.54
테스트 데이터: 0.54

[로지스틱 회귀 모델 성능]
Accuracy: 0.5441176470588235
Confusion Matrix:
 [[25 50]
 [12 49]]

Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.33      0.45        75
           1       0.49      0.80      0.61        61

    accuracy                           0.54       136
   macro avg       0.59      0.57      0.53       136
weighted avg       0.59      0.54      0.52       136



In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import optuna

# 이미 전처리되어 스케일링된 데이터를 사용한다고 가정합니다.
# X_scaled_train, X_scaled_test, y_train, y_test 가 존재하는 상태입니다.

# 원래의 로지스틱 회귀 모델 학습 (기존 코드)
logreg = LogisticRegression(max_iter=1000, random_state=0)
logreg.fit(X_scaled_train, y_train)

y_pred_logreg = logreg.predict(X_scaled_test)
accuracy = accuracy_score(y_test, y_pred_logreg)
conf_matrix = confusion_matrix(y_test, y_pred_logreg)
report = classification_report(y_test, y_pred_logreg)

print(f'훈련 데이터 (기존 모델): {logreg.score(X_scaled_train, y_train):.2f}')
print(f'테스트 데이터 (기존 모델): {logreg.score(X_scaled_test, y_test):.2f}')
print("[로지스틱 회귀 모델 성능 (기존)]")
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", report)

# ---------------------------------------------
# Optuna를 사용한 하이퍼파라미터 튜닝
# ---------------------------------------------

def objective(trial):
    # 하이퍼파라미터 C 값을 로그 스케일로 탐색
    C = trial.suggest_float("C", 1e-4, 1e2, log=True)
    # 로지스틱 회귀 모델 생성
    model = LogisticRegression(max_iter=500, random_state=0, C=C)
    # 학습
    model.fit(X_scaled_train, y_train)
    # 검증: 테스트 셋에 대한 정확도를 반환 (최대화)
    score = model.score(X_scaled_test, y_test)
    return score

# Optuna 스터디 생성 (정확도 최대화를 목표)
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

print("\n[Optuna 최적화 결과]")
print("Best Accuracy:", study.best_trial.value)
print("Best Hyperparameters:")
for key, value in study.best_trial.params.items():
    print(f"  {key}: {value}")

# 최적 하이퍼파라미터로 모델 재학습
best_C = study.best_trial.params["C"]
best_logreg = LogisticRegression(max_iter=1000, random_state=0, C=best_C)
best_logreg.fit(X_scaled_train, y_train)

# 최적 모델 평가
y_pred_best = best_logreg.predict(X_scaled_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
conf_matrix_best = confusion_matrix(y_test, y_pred_best)
report_best = classification_report(y_test, y_pred_best)

print(f'\n훈련 데이터 (최적 모델): {best_logreg.score(X_scaled_train, y_train):.2f}')
print(f'테스트 데이터 (최적 모델): {best_logreg.score(X_scaled_test, y_test):.2f}')

print("\n[최적화된 로지스틱 회귀 모델 성능]")
print("Accuracy:", accuracy_best)
print("Confusion Matrix:\n", conf_matrix_best)
print("\nClassification Report:\n", report_best)


[I 2025-04-01 12:47:06,694] A new study created in memory with name: no-name-5741ff11-e28d-4568-85ad-69449067e573
[I 2025-04-01 12:47:06,696] Trial 0 finished with value: 0.5 and parameters: {'C': 0.019454438426710567}. Best is trial 0 with value: 0.5.
[I 2025-04-01 12:47:06,696] Trial 1 finished with value: 0.4485294117647059 and parameters: {'C': 0.00031116675773981337}. Best is trial 0 with value: 0.5.
[I 2025-04-01 12:47:06,696] Trial 2 finished with value: 0.4485294117647059 and parameters: {'C': 0.0008304352282761126}. Best is trial 0 with value: 0.5.
[I 2025-04-01 12:47:06,718] Trial 3 finished with value: 0.45588235294117646 and parameters: {'C': 0.0024445490268017456}. Best is trial 0 with value: 0.5.
[I 2025-04-01 12:47:06,723] Trial 4 finished with value: 0.4485294117647059 and parameters: {'C': 0.0008183651988787729}. Best is trial 0 with value: 0.5.
[I 2025-04-01 12:47:06,728] Trial 5 finished with value: 0.47058823529411764 and parameters: {'C': 0.0048441024041268035}. Be

훈련 데이터 (기존 모델): 0.54
테스트 데이터 (기존 모델): 0.54
[로지스틱 회귀 모델 성능 (기존)]
Accuracy: 0.5441176470588235
Confusion Matrix:
 [[25 50]
 [12 49]]

Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.33      0.45        75
           1       0.49      0.80      0.61        61

    accuracy                           0.54       136
   macro avg       0.59      0.57      0.53       136
weighted avg       0.59      0.54      0.52       136


[Optuna 최적화 결과]
Best Accuracy: 0.5220588235294118
Best Hyperparameters:
  C: 0.10712018275463657

훈련 데이터 (최적 모델): 0.54
테스트 데이터 (최적 모델): 0.52

[최적화된 로지스틱 회귀 모델 성능]
Accuracy: 0.5220588235294118
Confusion Matrix:
 [[22 53]
 [12 49]]

Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.29      0.40        75
           1       0.48      0.80      0.60        61

    accuracy                           0.52       136
   macro avg       0.56      0.55      0.50