- 비선형 지도학습 분류
1. Random Forest
2. KNN (k-최근접 이웃)
3. SVC (RBF 커널)
4. XGBoost / LightGBM 
5. MLP (다층 퍼셉트론)

In [12]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report

In [13]:
df = pd.read_csv('./data/online_retail_customer_churn.csv')

In [14]:
def drop_feature(df):
    return df.drop('Customer_ID', axis=1)

def gender_processed(df):
    df_gender_processed = df.copy()

# 'Other'를 정확히 반반으로 나누어 'Male' / 'Female'로 분산
    other_idx = df_gender_processed[df_gender_processed['Gender'] == 'Other'].index
    half = len(other_idx) // 2
    df_gender_processed.loc[other_idx[:half], 'Gender'] = 'Male'
    df_gender_processed.loc[other_idx[half:], 'Gender'] = 'Female'

    return df_gender_processed

def remove_gender_other(df):
    """Gender가 'Other'인 행 제거"""
    df_cleaned = df[df['Gender'] != 'Other'].copy()
    return df_cleaned



def encode_feature(df):
    # 원핫 인코딩 대상 컬럼
    category = ['Gender', 'Email_Opt_In', 'Promotion_Response']
    
    # 원핫 인코딩 적용 (drop_first=True는 다중공선성 회피용, 상황에 따라 False로 해도 OK)
    df_encoded = pd.get_dummies(df, columns=category, drop_first=True)
    
    return df_encoded


def scale_feature(train_data, test_data):

    sc = StandardScaler()
    # sc = MinMaxScaler()

    train_scaled = sc.fit_transform(train_data)
    test_scaled = sc.transform(test_data)

    return train_scaled, test_scaled

def preprocess_data(df):
    df = drop_feature(df)
    # df = gender_processed(df)
    df = remove_gender_other(df)
    df = encode_feature(df)

    return df

In [15]:
rt_df = preprocess_data(df)

In [16]:
rt_df.head()

Unnamed: 0,Age,Annual_Income,Total_Spend,Years_as_Customer,Num_of_Purchases,Average_Transaction_Amount,Num_of_Returns,Num_of_Support_Contacts,Satisfaction_Score,Last_Purchase_Days_Ago,Target_Churn,Gender_Male,Email_Opt_In_True,Promotion_Response_Responded,Promotion_Response_Unsubscribed
1,65,79.51,9025.47,13,77,22.9,2,2,3,227,False,True,False,True,False
2,18,29.19,618.83,13,71,50.53,5,2,2,283,True,True,False,True,False
5,57,190.43,255.19,19,85,417.78,5,1,4,130,False,True,False,False,True
6,27,172.13,3512.55,3,77,316.18,0,3,1,61,False,True,True,False,True
10,54,138.9,4283.84,15,33,96.55,9,1,2,92,False,True,True,True,False


In [17]:
rt_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 676 entries, 1 to 998
Data columns (total 15 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Age                              676 non-null    int64  
 1   Annual_Income                    676 non-null    float64
 2   Total_Spend                      676 non-null    float64
 3   Years_as_Customer                676 non-null    int64  
 4   Num_of_Purchases                 676 non-null    int64  
 5   Average_Transaction_Amount       676 non-null    float64
 6   Num_of_Returns                   676 non-null    int64  
 7   Num_of_Support_Contacts          676 non-null    int64  
 8   Satisfaction_Score               676 non-null    int64  
 9   Last_Purchase_Days_Ago           676 non-null    int64  
 10  Target_Churn                     676 non-null    bool   
 11  Gender_Male                      676 non-null    bool   
 12  Email_Opt_In_True          

In [18]:
rt_df['Target_Churn'].value_counts()

Target_Churn
True     348
False    328
Name: count, dtype: int64

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.combine import SMOTETomek

# 1. X, y 분리
X = rt_df.drop('Target_Churn', axis=1)
y = rt_df['Target_Churn'].astype(int)

# 2. train/test 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

# 3. 수치형 변수만 스케일링
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

sc = StandardScaler()
X_scaled_train = X_train.copy()
X_scaled_test = X_test.copy()

X_scaled_train[numeric_cols] = sc.fit_transform(X_train[numeric_cols])
X_scaled_test[numeric_cols] = sc.transform(X_test[numeric_cols])

# ✅ 4. SMOTETomek 적용
from imblearn.combine import SMOTETomek

smt = SMOTETomek(random_state=42)
X_train_resampled, y_train_resampled = smt.fit_resample(X_scaled_train, y_train)

print("SMOTETomek 전:", X_scaled_train.shape, y_train.shape)
print("SMOTETomek 후:", X_train_resampled.shape, y_train_resampled.shape)


SMOTETomek 전: (473, 14) (473,)
SMOTETomek 후: (386, 14) (386,)


In [33]:
# !pip install imblearn

- RandomForest

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# 최종 모델 학습 (SMOTETomek 적용된 데이터 사용)
rf = RandomForestClassifier(
    n_estimators=80,
    max_depth=3,
    random_state=42,
    class_weight='balanced'  # 클래스 불균형 보정
)
rf.fit(X_train_resampled, y_train_resampled)

# 평가
print(f'훈련 데이터 정확도: {rf.score(X_train_resampled, y_train_resampled):.3f}')
print(f'테스트 데이터 정확도: {rf.score(X_scaled_test, y_test):.3f}')

y_pred_rf = rf.predict(X_scaled_test)
print("\n[RandomForest 분류 리포트]")
print(classification_report(y_test, y_pred_rf))


훈련 데이터 정확도: 0.764
테스트 데이터 정확도: 0.522

[RandomForest 분류 리포트]
              precision    recall  f1-score   support

           0       0.50      0.56      0.53        98
           1       0.54      0.49      0.51       105

    accuracy                           0.52       203
   macro avg       0.52      0.52      0.52       203
weighted avg       0.52      0.52      0.52       203



In [37]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report
import numpy as np

# 목적 함수 정의
def objective(trial):
    model = RandomForestClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 150),
        max_depth=trial.suggest_int("max_depth", 2, 7),
        min_samples_split=trial.suggest_int("min_samples_split", 10, 40),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 5, 30),
        max_leaf_nodes=trial.suggest_int("max_leaf_nodes", 10, 50),
        max_features=trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        max_samples=trial.suggest_float("max_samples", 0.5, 0.9),
        bootstrap=True,
        random_state=42,
        n_jobs=-1
    )

    # StratifiedKFold로 클래스 비율 유지
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # score = cross_val_score(model, X_train_oversampled, y_train_oversampled, scoring="f1", cv=cv)
    score = cross_val_score(
    model,
    X_train_resampled, y_train_resampled,
    scoring="balanced_accuracy",  # 또는 'recall'
    cv=cv
)

    return score.mean()



# Optuna 스터디 실행
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=200)

# 최적 파라미터로 모델 학습
print("Best trial parameters:")
print(study.best_trial.params)

best_params = study.best_trial.params
# best_rf = RandomForestClassifier(**best_params, class_weight='balanced', random_state=42)
# best_rf = RandomForestClassifier(**best_params, random_state=42)
best_rf = RandomForestClassifier(
    **study.best_trial.params,
    class_weight='balanced',
    random_state=42
)
best_rf.fit(X_train_resampled, y_train_resampled)

# 테스트셋 평가
y_pred = best_rf.predict(X_scaled_test)
print("Classification Report (F1 기반 튜닝 + balanced):")
print(classification_report(y_test, y_pred))


[I 2025-03-31 18:41:00,880] A new study created in memory with name: no-name-c2b96015-cb28-47ab-9ba5-5e9e7188c8d5
[I 2025-03-31 18:41:01,436] Trial 0 finished with value: 0.507962213225371 and parameters: {'n_estimators': 73, 'max_depth': 7, 'min_samples_split': 11, 'min_samples_leaf': 13, 'max_leaf_nodes': 40, 'max_features': 'sqrt', 'max_samples': 0.6593472622246394}. Best is trial 0 with value: 0.507962213225371.
[I 2025-03-31 18:41:02,343] Trial 1 finished with value: 0.5549932523616734 and parameters: {'n_estimators': 135, 'max_depth': 6, 'min_samples_split': 36, 'min_samples_leaf': 5, 'max_leaf_nodes': 24, 'max_features': 'log2', 'max_samples': 0.8994466121667325}. Best is trial 1 with value: 0.5549932523616734.
[I 2025-03-31 18:41:02,937] Trial 2 finished with value: 0.4791497975708502 and parameters: {'n_estimators': 90, 'max_depth': 7, 'min_samples_split': 33, 'min_samples_leaf': 27, 'max_leaf_nodes': 23, 'max_features': 'log2', 'max_samples': 0.7300467385684007}. Best is tria

Best trial parameters:
{'n_estimators': 119, 'max_depth': 7, 'min_samples_split': 14, 'min_samples_leaf': 9, 'max_leaf_nodes': 42, 'max_features': 'log2', 'max_samples': 0.7739542798761803}
Classification Report (F1 기반 튜닝 + balanced):
              precision    recall  f1-score   support

           0       0.47      0.55      0.50        98
           1       0.49      0.41      0.45       105

    accuracy                           0.48       203
   macro avg       0.48      0.48      0.48       203
weighted avg       0.48      0.48      0.48       203



In [38]:
print(f'훈련 데이터: {best_rf.score(X_train_resampled, y_train_resampled)}')
print(f'테스트 데이터: {best_rf.score(X_scaled_test, y_test)}')
print(classification_report(y_test, best_rf.predict(X_scaled_test)))

훈련 데이터: 0.8652849740932642
테스트 데이터: 0.47783251231527096
              precision    recall  f1-score   support

           0       0.47      0.55      0.50        98
           1       0.49      0.41      0.45       105

    accuracy                           0.48       203
   macro avg       0.48      0.48      0.48       203
weighted avg       0.48      0.48      0.48       203

