In [3]:
import pandas as pd
import numpy as np
import pandas

In [4]:
# 1. 데이터 가져오기
def load_dataset(path="data/credit_card_churn.csv"):
    """Load the credit-card churn CSV and normalise its columns.

    Parameters:
    path (str or path-like): location of the CSV file. Defaults to the
        path the notebook has always used, so ``load_dataset()`` behaves
        exactly as before.

    Returns:
    pd.DataFrame: the loaded table with
        - the literal string 'Unknown' parsed as a missing value (NaN),
        - all column names lower-cased,
        - 'attrition_flag' renamed to 'churn',
        - 'clientnum' and the two pre-computed naive-Bayes columns removed.

    Raises:
    KeyError: if any of the columns to be removed is absent from the file.
    """
    # 'Unknown' marks a missing category in this file; read it as NaN so the
    # imputation step can treat it like any other missing value.
    data = pd.read_csv(path, na_values='Unknown')
    # Lower-case every column name so later cells can use one spelling.
    data.columns = data.columns.str.lower()

    # Drop the columns that are not used for modelling.
    unused_columns = [
        'clientnum',
        'naive_bayes_classifier_attrition_flag_card_category_contacts_count_12_mon_dependent_count_education_level_months_inactive_12_mon_1',
        'naive_bayes_classifier_attrition_flag_card_category_contacts_count_12_mon_dependent_count_education_level_months_inactive_12_mon_2',
    ]
    return (
        data
        .rename(columns={'attrition_flag': 'churn'})
        .drop(columns=unused_columns)
    )

# Load the raw table once; the preprocessing cells below rebind `df` at each step.
df = load_dataset()

In [5]:

# 2. 데이터 전처리 (이상치, 결측치, Feature Engineering)

# 2-1. 데이터 전처리 > 이상치(Outlier) 
# - IQR 식별 -> 극단치 제거
#  2-1-1. Outlier 식별: IQR(Interquartile Range)을 이용해 Outlier 식별
def find_outliers(df, column_name, whis=1.5):
    """
    Find the rows whose value in one column is an IQR outlier.

    A value is an outlier when it lies outside the inclusive range
    [Q1 - whis * IQR, Q3 + whis * IQR], where IQR = Q3 - Q1.

    Parameters:
    df (pd.DataFrame): the data frame to inspect
    column_name (str): name of the numeric column to test
    whis (float): multiplier applied to the IQR to set the fence distance

    Returns:
    pd.DataFrame: the rows of ``df`` (all columns, original index labels)
        whose ``column_name`` value falls outside the fences. Rows where the
        value is missing are NOT returned: a missing value is not an outlier.
    """
    values = df[column_name]
    q1, q3 = values.quantile(q=[0.25, 0.75])
    fence = (q3 - q1) * whis
    # between() is False for NaN, so a bare negation would report every
    # missing value as an outlier; require the value to be present as well.
    is_outlier = values.notna() & ~values.between(q1 - fence, q3 + fence)
    return df.loc[is_outlier]

# ==> ["customer_age", "total_trans_ct"] 두 칼럼의 이상치를 삭제하기로 결정

#  2-1-2. Outlier 제거: 삭제할 이상치 index를 찾아서 drop
def delete_outliers(df, columns, whis=1.5):
    """
    Drop every row that find_outliers() flags in any of the given columns.

    Parameters:
    df (pd.DataFrame): the data frame to filter (left unmodified)
    columns (list[str]): columns to check for outliers
    whis (float): IQR multiplier forwarded to find_outliers()

    Returns:
    pd.DataFrame: a new frame without the flagged rows, re-indexed 0..n-1
    """
    # Outliers are always located on the unfiltered input, so the fences of
    # one column are not affected by rows removed because of another column.
    outlier_labels = [
        label
        for col in columns
        for label in find_outliers(df, col, whis=whis).index
    ]
    return df.drop(index=outlier_labels).reset_index(drop=True)

# Columns whose IQR outliers are removed.
outlier_columns = ["customer_age", "total_trans_ct"]
# NOTE: `df` is rebound here, so re-running this cell filters the already-filtered frame again.
df = delete_outliers(df, outlier_columns)

In [6]:
# 2-2. 데이터 전처리 > 결측치 대체(imputation) 
# - SimpleImputer: 최빈값으로 대치(column 2개) + 사용자 정의 imputer: 비율에 따른 대치(column 1개)
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin

#  2-2-1. SimpleImputer (strategy = most frequent value)
# NOTE(review): this instance is not referenced again in the visible cells;
# simple_impute_most_frequent() constructs its own SimpleImputer.
simple_imputer = SimpleImputer(strategy='most_frequent')

#  2-2-2. ProportionalImputer(사용자 정의)
class ProportionalImputer(BaseEstimator, TransformerMixin):
    """Fill missing values by sampling from each column's observed distribution.

    fit() records, per column, the observed categories and their relative
    frequencies (missing values excluded). transform() replaces every missing
    value with a category drawn at random using those frequencies as
    probabilities.

    Parameters:
    columns (list[str]): columns to impute
    random_state (None, int, or numpy random generator): controls the draws.
        None (default) uses NumPy's global random state, which is the
        behaviour this class always had. An int seeds a fresh generator on
        every transform() call, so repeated calls give identical fills. An
        object exposing ``choice`` (RandomState / Generator) is used as is.
    """

    def __init__(self, columns, random_state=None):
        self.columns = columns
        self.random_state = random_state
        # Maps column name -> (categories, probabilities); populated by fit().
        self.fill_values = {}

    def _rng(self):
        """Return the object whose ``choice`` method performs the sampling."""
        if self.random_state is None:
            return np.random
        if hasattr(self.random_state, "choice"):
            return self.random_state
        return np.random.RandomState(self.random_state)

    def fit(self, X, y=None):
        """Learn the category frequencies of every configured column of X."""
        fill_values = {}
        for column in self.columns:
            # normalize=True yields proportions; NaN is excluded by default.
            value_counts = X[column].value_counts(normalize=True)
            fill_values[column] = (value_counts.index, value_counts.values)
        self.fill_values = fill_values
        return self

    def transform(self, X):
        """Return a copy of X with missing values in the configured columns filled.

        Raises:
        KeyError: if a configured column was not seen by fit().
        ValueError: if a column needs filling but fit() saw no observed value.
        """
        X = X.copy()
        rng = self._rng()
        for column in self.columns:
            missing = X[column].isna()
            nan_count = int(missing.sum())
            if nan_count == 0:
                continue
            categories, probabilities = self.fill_values[column]
            if len(categories) == 0:
                raise ValueError(
                    f"Cannot impute column {column!r}: no observed values were seen during fit."
                )
            X.loc[missing, column] = rng.choice(
                categories, size=nan_count, p=probabilities
            )
        return X


def simple_impute_most_frequent(df, columns):
    """Fill missing values in ``columns`` with each column's most frequent value.

    Returns a new DataFrame; the input frame is left unmodified, matching
    proportional_impute().
    """
    imputed = df.copy()
    imputer = SimpleImputer(strategy='most_frequent')
    imputed[columns] = imputer.fit_transform(imputed[columns])
    return imputed


def proportional_impute(df, columns, random_state=None):
    """Fill missing values in ``columns`` by sampling observed categories.

    Categories are drawn in proportion to their observed frequencies (see
    ProportionalImputer). ``random_state`` is forwarded to the imputer; pass
    an int to make the result reproducible. Returns a new DataFrame.
    """
    imputer = ProportionalImputer(columns=columns, random_state=random_state)
    return imputer.fit(df).transform(df)


df = simple_impute_most_frequent(df, ['education_level', 'marital_status'])
# Seeded so that a fresh "Restart & Run All" reproduces the same imputed
# values, and therefore the same downstream split and model metrics.
df = proportional_impute(df, ['income_category'], random_state=0)

In [7]:
# 2-3. 데이터 전처리 > Feature Engineering
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# 2-3-1. Label encoding - 'gender'
# Rationale: for a binary variable the choice of encoding makes little
# difference to model performance, so plain label encoding is used.
label_encoder = LabelEncoder()

df['gender'] = label_encoder.fit_transform(df['gender'])

# 2-3-2. Ordinal encoding - 'education_level'
# Note: Series.map() turns any value that is not a key of this dict into NaN.
education_order = {"Uneducated": 0, "High School": 1, "College": 2, "Graduate": 3, "Post-Graduate": 4, "Doctorate": 5}
df['education_level'] = df['education_level'].map(education_order)

# 2-3-3. Mapping - 'churn' (the target): 1 marks an attrited customer
df['churn'] = df['churn'].map({"Existing Customer": 0, "Attrited Customer": 1})

# 2-3-4. One-hot encoding - 'marital_status', 'card_category', 'income_category'
# Rationale: these categories are treated as unordered and independent, so
# each level becomes its own indicator column instead of a rank.
# drop='first' omits one level per feature.
ohe_encoder = OneHotEncoder(drop='first', sparse_output=False)

columns_to_ohe_encode = [ 'marital_status', 'card_category', 'income_category']
encoded_data = ohe_encoder.fit_transform(df[columns_to_ohe_encode])

# Wrap the encoded array in a DataFrame. Reusing df.index is essential:
# pd.concat(axis=1) aligns rows by index label, so a fresh 0..n-1 index would
# silently misalign (and NaN-pad) rows whenever df is not indexed 0..n-1.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=ohe_encoder.get_feature_names_out(columns_to_ohe_encode),
    index=df.index,
)
# Replace the original categorical columns with their one-hot indicators.
df = pd.concat([df.drop(columns=columns_to_ohe_encode), encoded_df], axis=1)

In [16]:
# Inspect the fully encoded frame before it is split into features and target.
df

Unnamed: 0,churn,customer_age,gender,dependent_count,education_level,months_on_book,total_relationship_count,months_inactive_12_mon,contacts_count_12_mon,credit_limit,...,avg_utilization_ratio,marital_status_Married,marital_status_Single,card_category_Gold,card_category_Platinum,card_category_Silver,income_category_$40K - $60K,income_category_$60K - $80K,income_category_$80K - $120K,income_category_Less than $40K
0,0,45,1,3,1,39,5,1,3,12691.0,...,0.061,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0,49,0,5,3,44,6,1,2,8256.0,...,0.105,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0,51,1,3,3,36,4,1,0,3418.0,...,0.000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0,40,0,4,1,34,3,4,1,3313.0,...,0.760,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0,40,1,3,0,21,5,1,0,4716.0,...,0.000,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10118,0,50,1,2,3,40,3,2,3,4003.0,...,0.462,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
10119,1,41,1,2,3,25,4,2,3,4277.0,...,0.511,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
10120,1,44,0,1,1,36,5,3,4,5409.0,...,0.000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10121,1,30,1,2,3,36,4,3,3,5281.0,...,0.000,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [17]:
## 데이터 스플릿

from sklearn.model_selection import train_test_split
# Target vector: the 'churn' column.
y = df["churn"]
# Feature matrix: every remaining column.
X = df.drop(columns=["churn"])
# Hold out 25% of the rows for testing; stratify on the target so both splits
# keep the same churn ratio, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=0,
)

In [10]:
### XGBoost

In [None]:
# 1. 개념



In [None]:
# 2. 주요 파라미터 (5개~)

In [None]:
# 3. 학습 및 예측
# 4. 모델평가

In [18]:
from xgboost import XGBClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
import numpy as np


# Baseline XGBoost classifier with default hyperparameters.
# The seed is stated explicitly so this model is configured consistently with
# the estimator used for the grid search (which also uses random_state=0).
xgb = XGBClassifier(random_state=0)

xgb.fit(X_train, y_train)

# Hard class predictions, and the predicted probability of the positive class
# (column 1), which is what ROC AUC needs.
y_pred_xgb = xgb.predict(X_test)
y_pred_proba_xgb = xgb.predict_proba(X_test)[:, 1]

# Evaluation metrics on the held-out test split
accuracy = accuracy_score(y_test, y_pred_xgb)
auc = roc_auc_score(y_test, y_pred_proba_xgb)
precision = precision_score(y_test, y_pred_xgb)
recall = recall_score(y_test, y_pred_xgb)
f1 = f1_score(y_test, y_pred_xgb)

print(f"XGBoost - Accuracy: {accuracy:.4f}, recall: {recall:.4f} ,AUC: {auc:.4f}, Precision: {precision:.4f}, F1 Score: {f1:.4f}")

XGBoost - Accuracy: 0.9664, recall: 0.8526 ,AUC: 0.9928, Precision: 0.9328, F1 Score: 0.8909


In [None]:
# 5. 최적의 매개변수 구하기 - GridSearchCV

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Estimator whose hyperparameters are tuned
model = XGBClassifier(random_state=0)

# Candidate values per hyperparameter: dict[name, list of candidates]
params = {
    'max_depth': list(range(1, 6)),          # maximum tree depth: 1..5
    'learning_rate': [0.1],                  # learning rate
    'n_estimators': [100, 200, 300],         # number of boosting rounds
    'subsample': [0.5, 0.7],                 # fraction of rows sampled per tree
    'colsample_bytree': [0.5, 0.7, 1.0],     # fraction of features sampled per tree
    'gamma': [0, 0.1],                       # minimum loss reduction required to split a node
    'reg_alpha': [0],                        # L1 regularisation
    'reg_lambda': [0.1],                     # L2 regularisation
}

gs = GridSearchCV(
    estimator=model,       # model to tune
    param_grid=params,     # hyperparameter candidates
    scoring='accuracy',    # metric to maximise when ranking the candidates
    cv=4,                  # number of cross-validation folds
    n_jobs=-1,             # run in parallel on all available processors
)

In [20]:
# Run the exhaustive search on the training split only. The grid defined above
# has 5 * 1 * 3 * 2 * 3 * 2 * 1 * 1 = 180 combinations, each scored with cv=4
# folds, i.e. 720 model fits, so this cell is slow.
gs.fit(X_train, y_train)

In [22]:
#### Best-scoring hyperparameter combination found by the grid search
gs.best_params_

{'colsample_bytree': 0.5,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_depth': 4,
 'n_estimators': 300,
 'reg_alpha': 0,
 'reg_lambda': 0.1,
 'subsample': 0.5}

In [None]:
# 6. 테스트

In [None]:
# 7. 특성중요도

In [None]:
# 8. 개인 review