- 비선형 지도학습 분류
1. Random Forest
2. KNN (k-최근접 이웃)
3. SVC (RBF 커널)
4. XGBoost / LightGBM 
5. MLP (다층 퍼셉트론)

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
df = pd.read_csv('./data/online_retail_customer_churn.csv')

In [None]:
def drop_feature(df):
    return df.drop('Customer_ID', axis=1)

def gender_processed(df):
    df_gender_processed = df.copy()

# 'Other'를 정확히 반반으로 나누어 'Male' / 'Female'로 분산
    other_idx = df_gender_processed[df_gender_processed['Gender'] == 'Other'].index
    half = len(other_idx) // 2
    df_gender_processed.loc[other_idx[:half], 'Gender'] = 'Male'
    df_gender_processed.loc[other_idx[half:], 'Gender'] = 'Female'

    return df_gender_processed


def encode_feature(df):
    # 원핫 인코딩 대상 컬럼
    category = ['Gender', 'Email_Opt_In', 'Promotion_Response']
    
    # 원핫 인코딩 적용 (drop_first=True는 다중공선성 회피용, 상황에 따라 False로 해도 OK)
    df_encoded = pd.get_dummies(df, columns=category, drop_first=True)
    
    return df_encoded

def scale_feature(train_data, test_data):

    sc = StandardScaler()

    train_scaled = sc.fit_transform(train_data)
    test_scaled = sc.transform(test_data)

    return train_scaled, test_scaled

def preprocess_data(df):
    df = drop_feature(df)
    df = gender_processed(df)
    df = encode_feature(df)

    return df

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import optuna

# 데이터 로드
df = pd.read_csv("./data/online_retail_customer_churn.csv")

# 전처리
df = df.drop(columns=["Customer_ID"])

# Gender 처리
def process_gender(df):
    df_gender = df.copy()
    other_idx = df_gender[df_gender['Gender'] == 'Other'].index
    half = len(other_idx) // 2
    df_gender.loc[other_idx[:half], 'Gender'] = 'Male'
    df_gender.loc[other_idx[half:], 'Gender'] = 'Female'
    return df_gender

df = process_gender(df)

# 원핫 인코딩
df = pd.get_dummies(df, columns=["Gender", "Email_Opt_In", "Promotion_Response"], drop_first=True)

# 입력과 타겟 분리
X = df.drop("Target_Churn", axis=1)
y = df["Target_Churn"].astype(int)

# 데이터 분할 및 스케일링
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Optuna 튜닝
def objective(trial):
    C = trial.suggest_float("C", 0.001, 10.0, log=True)
    penalty = trial.suggest_categorical("penalty", ["l1", "l2"])
    solver = "liblinear" if penalty == "l1" else "lbfgs"

    model = LogisticRegression(C=C, penalty=penalty, solver=solver, max_iter=1000)
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    return accuracy_score(y_test, y_pred)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30)

# 최적 모델로 평가
best_params = study.best_params
final_model = LogisticRegression(
    C=best_params["C"],
    penalty=best_params["penalty"],
    solver="liblinear" if best_params["penalty"] == "l1" else "lbfgs",
    max_iter=1000
)
final_model.fit(X_train_scaled, y_train)
y_pred_final = final_model.predict(X_test_scaled)

# 성능 지표
accuracy = accuracy_score(y_test, y_pred_final)
conf_matrix = confusion_matrix(y_test, y_pred_final)
report = classification_report(y_test, y_pred_final, output_dict=True)

# 결과 반환
{
    "Best_Params": best_params,
    "Accuracy": accuracy,
    "Confusion_Matrix": conf_matrix,
    "Classification_Report": report
}


[I 2025-03-31 14:13:01,239] A new study created in memory with name: no-name-c368593f-f88e-4a55-90a0-08f04da421c6
[I 2025-03-31 14:13:01,247] Trial 0 finished with value: 0.44333333333333336 and parameters: {'C': 0.715094898540468, 'penalty': 'l1'}. Best is trial 0 with value: 0.44333333333333336.
[I 2025-03-31 14:13:01,268] Trial 1 finished with value: 0.5366666666666666 and parameters: {'C': 0.0013374433769788518, 'penalty': 'l2'}. Best is trial 1 with value: 0.5366666666666666.
[I 2025-03-31 14:13:01,282] Trial 2 finished with value: 0.4666666666666667 and parameters: {'C': 0.0033887662963578077, 'penalty': 'l1'}. Best is trial 1 with value: 0.5366666666666666.
[I 2025-03-31 14:13:01,312] Trial 3 finished with value: 0.44666666666666666 and parameters: {'C': 0.20521016679155038, 'penalty': 'l2'}. Best is trial 1 with value: 0.5366666666666666.
[I 2025-03-31 14:13:01,318] Trial 4 finished with value: 0.4666666666666667 and parameters: {'C': 0.003895701749873101, 'penalty': 'l1'}. Bes

{'Best_Params': {'C': 0.00106151502727881, 'penalty': 'l2'},
 'Accuracy': 0.5433333333333333,
 'Confusion_Matrix': array([[  4, 136],
        [  1, 159]], dtype=int64),
 'Classification_Report': {'0': {'precision': 0.8,
   'recall': 0.02857142857142857,
   'f1-score': 0.05517241379310345,
   'support': 140.0},
  '1': {'precision': 0.5389830508474577,
   'recall': 0.99375,
   'f1-score': 0.6989010989010989,
   'support': 160.0},
  'accuracy': 0.5433333333333333,
  'macro avg': {'precision': 0.6694915254237288,
   'recall': 0.5111607142857143,
   'f1-score': 0.37703675634710115,
   'support': 300.0},
  'weighted avg': {'precision': 0.6607909604519775,
   'recall': 0.5433333333333333,
   'f1-score': 0.39849437918403435,
   'support': 300.0}}}