In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report
from statsmodels.stats.proportion import proportions_ztest

In [None]:
# Load the raw bank-customer churn table; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='Bank Customer Churn Prediction.csv')

In [None]:
# Keep only German customers. .copy() makes the subset an independent frame so
# the column assignments in later cells do not trigger SettingWithCopyWarning.
df = df.loc[df['country'] == 'Germany'].copy()

# Feature 조정

## 이상치 처리 (isolation forest + Z-score)



In [None]:
# Encode gender as 1 ('Male') / 0 (anything else) and drop the identifier plus
# the country column (constant after the Germany filter).
# Written as one chained expression so df is only rebound when every step
# succeeds: re-running the cell raises KeyError on the drop instead of first
# overwriting the already-encoded gender column with zeros.
df = (
    df
    .assign(gender=df['gender'].eq('Male').astype('int64'))
    .drop(columns=['customer_id', 'country'])
)
df.shape

(2509, 10)

In [None]:
from sklearn.ensemble import IsolationForest
# Multivariate check: Isolation Forest.
# contamination=0.1 is the expected share of outliers; random_state pins the
# random tree construction so the same rows are flagged on every run.
model = IsolationForest(contamination=0.1, random_state=42)
# Fit on the features only, so the churn target cannot influence which rows
# are removed from the data that is later split into train and test sets.
iso_outlier = pd.Series(
    model.fit_predict(df.drop(columns='churn')) == -1,
    index=df.index,
)

# Univariate check: z-score of credit_score.
credit_z = (df['credit_score'] - df['credit_score'].mean()) / df['credit_score'].std()

# A row is discarded only when both checks agree (forest label -1 AND |z| > 3).
is_outlier = iso_outlier & (credit_z.abs() > 3)

# Keep the inliers as an independent frame; no helper columns are added to df,
# so nothing has to be dropped from a slice afterwards.
df = df.loc[~is_outlier].copy()

# Outlier removal done.
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns = ['outlier', 'z', 'is_outlier'], inplace = True)


Unnamed: 0,credit_score,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
7,376,0,29,4,115046.74,4,1,0,119346.88,1
15,616,1,45,3,143129.41,2,0,1,64327.26,0
16,653,1,58,1,132602.88,1,1,0,5097.67,1
26,756,1,36,2,136815.64,1,1,1,170041.95,0
28,574,0,43,3,141349.43,1,1,1,100187.43,0
...,...,...,...,...,...,...,...,...,...,...
9982,655,0,46,7,137145.12,1,1,0,115146.40,1
9984,602,1,35,7,90602.42,2,1,1,51695.41,0
9986,673,1,47,1,183579.54,2,0,1,34047.54,0
9990,714,1,33,3,35016.60,1,1,0,53667.08,0


### df : 이상치 처리만

In [None]:
# Preview the first five rows of the outlier-filtered frame.
df.head(n=5)

Unnamed: 0,credit_score,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
7,376,0,29,4,115046.74,4,1,0,119346.88,1
15,616,1,45,3,143129.41,2,0,1,64327.26,0
16,653,1,58,1,132602.88,1,1,0,5097.67,1
26,756,1,36,2,136815.64,1,1,1,170041.95,0
28,574,0,43,3,141349.43,1,1,1,100187.43,0


## 데이터 그룹 생성 및 데이터 전처리

In [None]:
def make_group(df, col, bins, labels, include_lowest=True):
  """Bin a numeric column into labelled groups.

  Parameters
  ----------
  df : DataFrame holding the column to bin.
  col : name of the numeric column.
  bins : ascending bin edges passed to ``pd.cut``; intervals are right-closed.
  labels : one label per interval (``len(bins) - 1`` entries).
  include_lowest : when True (default) a value equal to ``bins[0]`` falls in
    the first group. With pandas' own default (False) such a value becomes
    NaN, which silently loses rows sitting exactly on the lowest edge
    (the bins used in this notebook start at 18, 11.58 and 350).

  Returns
  -------
  Categorical Series aligned with ``df``; values outside the bin range are NaN.
  """
  return pd.cut(df[col], bins=bins, labels=labels, include_lowest=include_lowest)

In [None]:
# Work on a copy so the outlier-filtered df stays untouched.
new_df = df.copy()

# --- age_group -------------------------------------------------------------
# Bands are split around marriage and children leaving home.
age_edges = [18, 32, 56, 84]
age_names = ['before marriage', 'after marriage', 'senior']
new_df['age_group'] = make_group(new_df, 'age', age_edges, age_names)

# Alternative (unused) split by generation:
#   baby boomers: born 1946-1964, Gen X: 1965-1980,
#   millennials (Gen Y): 1981-1995, Gen Z: 1996-2012
# new_df['age_group'] = make_group(new_df, 'age', [18, 27, 42, 58, 84], ['Z', 'millennial', 'X', 'baby boom'])

# --- salary_group ----------------------------------------------------------
# Edges taken from summary statistics of estimated_salary
# (presumably min / quartiles / max -- TODO confirm).
salary_edges = [11.58, 51113.14, 102184.66, 151167.94, 199992.48]
salary_names = ['75', '50', '25', '0']
new_df['salary_group'] = make_group(new_df, 'estimated_salary', salary_edges, salary_names)

# --- balance ---------------------------------------------------------------
# balance is used in scaled form rather than binned; the binned variant is kept for reference.
# new_df['balance_group'] = make_group(new_df, 'balance', [27288.43, 102773.2, 119714.25, 137648.41, 214346.96], ['75', '50', '25', '0'])
# new_df.drop(columns = 'balance', inplace = True)

# --- credit score + credit_card group --------------------------------------
# Bands are a modified version of the FICO standard [350, 579, 669, 739, 799, 850].
credit_band = make_group(new_df, 'credit_score', [350, 579, 669, 739, 850], ['D', 'C', 'B', 'A']).astype(str)
new_df['credit_group'] = credit_band
# Concatenate the band letter with the 0/1 credit_card flag, e.g. 'D1'.
new_df['combined_group'] = credit_band + new_df['credit_card'].astype(str)

new_df.shape

(2507, 14)

In [None]:
# Preview the first five rows with the engineered group columns.
new_df.head(n=5)

Unnamed: 0,credit_score,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn,age_group,salary_group,credit_group,combined_group
7,376,0,29,4,115046.74,4,1,0,119346.88,1,before marriage,25,D,D1
15,616,1,45,3,143129.41,2,0,1,64327.26,0,after marriage,50,C,C0
16,653,1,58,1,132602.88,1,1,0,5097.67,1,senior,75,C,C1
26,756,1,36,2,136815.64,1,1,1,170041.95,0,after marriage,0,A,A1
28,574,0,43,3,141349.43,1,1,1,100187.43,0,after marriage,50,D,D1


In [None]:
# Numeric columns to standardise and categorical columns to one-hot encode.
columns_to_scale = ['balance','tenure']
columns_to_encode = ['products_number', 'age_group', 'salary_group', 'combined_group']

# Standardise the selected numeric columns.
# NOTE(review): the scaler is fitted on all rows, before any train/test split
# performed in later cells -- confirm this is acceptable for the evaluation.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(new_df[columns_to_scale])
new_df_scaled = new_df.copy()
new_df_scaled[columns_to_scale] = scaled_values

# One-hot encode the categorical columns.
new_df_encoded = pd.get_dummies(new_df_scaled, columns=columns_to_encode)

# Inspect the resulting column names.
new_df_encoded.columns

Index(['credit_score', 'gender', 'age', 'tenure', 'balance', 'credit_card',
       'active_member', 'estimated_salary', 'churn', 'credit_group',
       'products_number_1', 'products_number_2', 'products_number_3',
       'products_number_4', 'age_group_before marriage',
       'age_group_after marriage', 'age_group_senior', 'salary_group_75',
       'salary_group_50', 'salary_group_25', 'salary_group_0',
       'combined_group_A0', 'combined_group_A1', 'combined_group_B0',
       'combined_group_B1', 'combined_group_C0', 'combined_group_C1',
       'combined_group_D0', 'combined_group_D1'],
      dtype='object')

### new_df1 : age_group만 생성

In [None]:
# Variant 1: age_group dummies are the only engineered feature kept.
# Raw 'age', the salary/combined dummies and the helper 'credit_group' are removed.
df1_dropped_columns = [
    'age',
    'salary_group_75', 'salary_group_50', 'salary_group_25', 'salary_group_0',
    'combined_group_A0', 'combined_group_A1',
    'combined_group_B0', 'combined_group_B1',
    'combined_group_C0', 'combined_group_C1',
    'combined_group_D0', 'combined_group_D1',
    'credit_group',
]
# drop() returns a new frame, so new_df_encoded itself is left unchanged.
new_df1 = new_df_encoded.drop(columns=df1_dropped_columns)
new_df1.head()

Unnamed: 0,credit_score,gender,tenure,balance,credit_card,active_member,estimated_salary,churn,products_number_1,products_number_2,products_number_3,products_number_4,age_group_before marriage,age_group_after marriage,age_group_senior
7,376,0,-0.34505,-0.172883,1,0,119346.88,1,0,0,0,1,1,0,0
15,616,1,-0.685886,0.866725,0,1,64327.26,0,0,1,0,0,0,1,0
16,653,1,-1.367558,0.477038,1,0,5097.67,1,1,0,0,0,0,0,1
26,756,1,-1.026722,0.632992,1,1,170041.95,0,1,0,0,0,0,1,0
28,574,0,-0.685886,0.800831,1,1,100187.43,0,1,0,0,0,0,1,0


### new_df2 : salary_group만 생성

In [None]:
# Variant 2: salary_group dummies are the only engineered feature kept.
# Raw 'estimated_salary', the age/combined dummies and 'credit_group' are removed.
df2_dropped_columns = [
    'credit_group',
    'age_group_before marriage', 'age_group_after marriage', 'age_group_senior',
    'combined_group_A0', 'combined_group_A1',
    'combined_group_B0', 'combined_group_B1',
    'combined_group_C0', 'combined_group_C1',
    'combined_group_D0', 'combined_group_D1',
    'estimated_salary',
]
# drop() returns a new frame, so new_df_encoded itself is left unchanged.
new_df2 = new_df_encoded.drop(columns=df2_dropped_columns)
new_df2.head()

Unnamed: 0,credit_score,gender,age,tenure,balance,credit_card,active_member,churn,products_number_1,products_number_2,products_number_3,products_number_4,salary_group_75,salary_group_50,salary_group_25,salary_group_0
7,376,0,29,-0.34505,-0.172883,1,0,1,0,0,0,1,0,0,1,0
15,616,1,45,-0.685886,0.866725,0,1,0,0,1,0,0,0,1,0,0
16,653,1,58,-1.367558,0.477038,1,0,1,1,0,0,0,1,0,0,0
26,756,1,36,-1.026722,0.632992,1,1,0,1,0,0,0,0,0,0,1
28,574,0,43,-0.685886,0.800831,1,1,0,1,0,0,0,0,1,0,0


### new_df3 : credit score + credit_card 그룹만 생성

In [None]:
# Variant 3: combined (credit band + credit_card) dummies are the only engineered feature kept.
# Raw 'credit_score' and 'credit_card', the age/salary dummies and 'credit_group' are removed.
df3_dropped_columns = [
    'credit_group',
    'age_group_before marriage', 'age_group_after marriage', 'age_group_senior',
    'credit_score',
    'salary_group_75', 'salary_group_50', 'salary_group_25', 'salary_group_0',
    'credit_card',
]
# drop() returns a new frame, so new_df_encoded itself is left unchanged.
new_df3 = new_df_encoded.drop(columns=df3_dropped_columns)
new_df3.head()

Unnamed: 0,gender,age,tenure,balance,active_member,estimated_salary,churn,products_number_1,products_number_2,products_number_3,products_number_4,combined_group_A0,combined_group_A1,combined_group_B0,combined_group_B1,combined_group_C0,combined_group_C1,combined_group_D0,combined_group_D1
7,0,29,-0.34505,-0.172883,0,119346.88,1,0,0,0,1,0,0,0,0,0,0,0,1
15,1,45,-0.685886,0.866725,1,64327.26,0,0,1,0,0,0,0,0,0,1,0,0,0
16,1,58,-1.367558,0.477038,0,5097.67,1,1,0,0,0,0,0,0,0,0,1,0,0
26,1,36,-1.026722,0.632992,1,170041.95,0,1,0,0,0,0,1,0,0,0,0,0,0
28,0,43,-0.685886,0.800831,1,100187.43,0,1,0,0,0,0,0,0,0,0,0,0,1


### new_df4 : salary_group만 생성 + gender와 credit_card 열을 제거

In [None]:
# Variant 4: the salary_group variant (new_df2) without the gender and credit_card columns.
# drop() returns a new frame, so new_df2 itself is left unchanged.
new_df4 = new_df2.drop(columns=['gender', 'credit_card'])
new_df4.head()

Unnamed: 0,credit_score,age,tenure,balance,active_member,churn,products_number_1,products_number_2,products_number_3,products_number_4,salary_group_75,salary_group_50,salary_group_25,salary_group_0
7,376,29,-0.34505,-0.172883,0,1,0,0,0,1,0,0,1,0
15,616,45,-0.685886,0.866725,1,0,0,1,0,0,0,1,0,0
16,653,58,-1.367558,0.477038,0,1,1,0,0,0,1,0,0,0
26,756,36,-1.026722,0.632992,1,0,1,0,0,0,0,0,0,1
28,574,43,-0.685886,0.800831,1,0,1,0,0,0,0,1,0,0


# Gradient Boosting

## train-test split(X_train_val, X_test, y_train_val, y_test)

### 이상치만 제거

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Separate the feature matrix from the churn target.
X = df.drop('churn', axis=1)
y = df['churn']

# Hold out 20% as the test set (deliberately not stratified).
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Gradient boosting with fixed hyperparameters.
# random_state is set because max_features='sqrt' draws a random feature subset
# at every split; without a seed the reported metrics change on each run.
gb_model = GradientBoostingClassifier(learning_rate=0.3,
                                      max_depth=4,
                                      max_features='sqrt',
                                      min_samples_leaf=2,
                                      min_samples_split=3,
                                      n_estimators=60,
                                      loss='exponential',
                                      random_state=42)
# Train on the full train+validation portion.
gb_model.fit(X_train_val, y_train_val)

# Predict the held-out test set.
y_pred = gb_model.predict(X_test)

# Accuracy.
accuracy = accuracy_score(y_test, y_pred)
print("정확도:", accuracy)

# Per-class precision / recall / F1.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix (rows: true class, columns: predicted class).
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)


정확도: 0.8087649402390438
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.88      0.86       341
           1       0.72      0.66      0.69       161

    accuracy                           0.81       502
   macro avg       0.78      0.77      0.78       502
weighted avg       0.81      0.81      0.81       502

Confusion Matrix:
[[300  41]
 [ 55 106]]


### age_group만 생성

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Separate the feature matrix from the churn target (age_group variant).
X = new_df1.drop('churn', axis=1)
y = new_df1['churn']

# Hold out 20% as the test set (deliberately not stratified).
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Gradient boosting with fixed hyperparameters.
# random_state is set because max_features='sqrt' draws a random feature subset
# at every split; without a seed the reported metrics change on each run.
gb_model = GradientBoostingClassifier(learning_rate=0.3,
                                      max_depth=4,
                                      max_features='sqrt',
                                      min_samples_leaf=2,
                                      min_samples_split=3,
                                      n_estimators=60,
                                      loss='exponential',
                                      random_state=42)
# Train on the full train+validation portion.
gb_model.fit(X_train_val, y_train_val)

# Predict the held-out test set.
y_pred = gb_model.predict(X_test)

# Accuracy.
accuracy = accuracy_score(y_test, y_pred)
print("정확도:", accuracy)

# Per-class precision / recall / F1.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix (rows: true class, columns: predicted class).
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

정확도: 0.7589641434262948
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.82      0.82       341
           1       0.62      0.62      0.62       161

    accuracy                           0.76       502
   macro avg       0.72      0.72      0.72       502
weighted avg       0.76      0.76      0.76       502

Confusion Matrix:
[[281  60]
 [ 61 100]]


### salary_group만 생성

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Separate the feature matrix from the churn target (salary_group variant).
X = new_df2.drop('churn', axis=1)
y = new_df2['churn']

# Hold out 20% as the test set (deliberately not stratified).
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Gradient boosting with fixed hyperparameters.
# random_state is set because max_features='sqrt' draws a random feature subset
# at every split; without a seed the reported metrics change on each run.
gb_model = GradientBoostingClassifier(learning_rate=0.3,
                                      max_depth=3,
                                      max_features='sqrt',
                                      min_samples_leaf=2,
                                      min_samples_split=2,
                                      n_estimators=55,
                                      loss='exponential',
                                      random_state=42)

# Train on the full train+validation portion.
gb_model.fit(X_train_val, y_train_val)

# Predict the held-out test set.
y_pred = gb_model.predict(X_test)

# Accuracy.
accuracy = accuracy_score(y_test, y_pred)
print("정확도:", accuracy)

# Per-class precision / recall / F1.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix (rows: true class, columns: predicted class).
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

정확도: 0.8067729083665338
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.86      0.86       341
           1       0.70      0.69      0.70       161

    accuracy                           0.81       502
   macro avg       0.78      0.78      0.78       502
weighted avg       0.81      0.81      0.81       502

Confusion Matrix:
[[294  47]
 [ 50 111]]


### credit score + credit_card 그룹만 생성

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Separate the feature matrix from the churn target (credit score + credit_card variant).
X = new_df3.drop('churn', axis=1)
y = new_df3['churn']

# Hold out 20% as the test set (deliberately not stratified).
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Gradient boosting with fixed hyperparameters.
# random_state is set because max_features='sqrt' draws a random feature subset
# at every split; without a seed the reported metrics change on each run.
gb_model = GradientBoostingClassifier(learning_rate=0.3,
                                      max_depth=3,
                                      max_features='sqrt',
                                      min_samples_leaf=2,
                                      min_samples_split=2,
                                      n_estimators=55,
                                      loss='exponential',
                                      random_state=42)
# Train on the full train+validation portion.
gb_model.fit(X_train_val, y_train_val)

# Predict the held-out test set.
y_pred = gb_model.predict(X_test)

# Accuracy.
accuracy = accuracy_score(y_test, y_pred)
print("정확도:", accuracy)

# Per-class precision / recall / F1.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix (rows: true class, columns: predicted class).
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

정확도: 0.8107569721115537
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.89      0.86       341
           1       0.73      0.65      0.69       161

    accuracy                           0.81       502
   macro avg       0.79      0.77      0.78       502
weighted avg       0.81      0.81      0.81       502

Confusion Matrix:
[[302  39]
 [ 56 105]]


### salary_group만 생성 + gender와 credit_card 열을 제거

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Separate the feature matrix from the churn target
# (salary_group variant without gender and credit_card).
X = new_df4.drop('churn', axis=1)
y = new_df4['churn']

# Hold out 20% as the test set (deliberately not stratified).
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Gradient boosting with fixed hyperparameters.
# random_state is set because max_features='sqrt' draws a random feature subset
# at every split; without a seed the reported metrics change on each run.
gb_model = GradientBoostingClassifier(learning_rate=0.3,
                                      max_depth=3,
                                      max_features='sqrt',
                                      min_samples_leaf=2,
                                      min_samples_split=2,
                                      n_estimators=55,
                                      loss='exponential',
                                      random_state=42)
# Train on the full train+validation portion.
gb_model.fit(X_train_val, y_train_val)

# Predict the held-out test set.
y_pred = gb_model.predict(X_test)

# Accuracy.
accuracy = accuracy_score(y_test, y_pred)
print("정확도:", accuracy)

# Per-class precision / recall / F1.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix (rows: true class, columns: predicted class).
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

정확도: 0.8127490039840638
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.86      0.86       341
           1       0.71      0.71      0.71       161

    accuracy                           0.81       502
   macro avg       0.79      0.79      0.79       502
weighted avg       0.81      0.81      0.81       502

Confusion Matrix:
[[294  47]
 [ 47 114]]


### 최적의 하이퍼파라미터 찾기 (이탈자 재현율 기준)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Separate the feature matrix from the churn target (outlier-filtered data only).
X = df.drop('churn', axis=1)
y = df['churn']

# Hold out 20% as the test set (deliberately not stratified).
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Base estimator for the search. random_state is fixed because subsample < 1
# makes each boosting stage draw a random row sample; without a seed the
# selected hyperparameters and the scores differ between runs.
gb_model = GradientBoostingClassifier(random_state=42)

# Search space: 5 * 4 * 4 * 4 = 320 combinations.
param_dist = {
    'max_depth': [3, 4, 5, 6, 7],  # maximum depth of each tree
    'subsample': [0.7, 0.8, 0.9, 1.0],  # fraction of rows used to fit each tree
    'min_samples_split': [2, 3, 4, 5],  # minimum samples required to split a node
    'min_samples_leaf': [1, 2, 3, 4],  # minimum samples required in a leaf
}

# Exhaustive 5-fold search scored on recall of the positive (churn) class.
grid_search = GridSearchCV(estimator=gb_model, param_grid=param_dist, cv=5, scoring='recall', n_jobs=-1, verbose=2)

# Fit every candidate on the train+validation portion.
grid_search.fit(X_train_val, y_train_val)

# Best hyperparameter combination found.
print("최적의 하이퍼파라미터:", grid_search.best_params_)

# Estimator refitted on all of X_train_val with the best parameters.
best_gb_model = grid_search.best_estimator_

# Predict the held-out test set with the best model.
y_pred = best_gb_model.predict(X_test)

# Accuracy.
accuracy = accuracy_score(y_test, y_pred)
print("정확도:", accuracy)

# Per-class precision / recall / F1.
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix (rows: true class, columns: predicted class).
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

Fitting 5 folds for each of 320 candidates, totalling 1600 fits
최적의 하이퍼파라미터: {'max_depth': 4, 'min_samples_leaf': 2, 'min_samples_split': 3, 'subsample': 0.7}
정확도: 0.7948207171314741
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.87      0.85       341
           1       0.69      0.65      0.67       161

    accuracy                           0.79       502
   macro avg       0.77      0.76      0.76       502
weighted avg       0.79      0.79      0.79       502

Confusion Matrix:
[[295  46]
 [ 57 104]]


## train-validation split(X_train, X_val, y_train, y_val)

### 이상치만 제거

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the churn target (outlier-filtered data only).
X = df.drop('churn', axis=1)
y = df['churn']

# Rebuild the 80/20 hold-out split from this cell's X and y so the cell does not
# depend on whichever X_train_val an earlier cell happened to leave behind.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train/validation split, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val)  # 0.25 x 0.8 = 0.2

# Gradient boosting with fixed hyperparameters; random_state makes the
# max_features='sqrt' feature sampling reproducible.
gb_model = GradientBoostingClassifier(learning_rate=0.3,
                                      max_depth=3,
                                      max_features='sqrt',
                                      min_samples_leaf=2,
                                      min_samples_split=2,
                                      n_estimators=60,
                                      random_state=42)
# Fit the boosting classifier itself (the global `model` is the IsolationForest
# from the outlier step and predicts -1/1, not churn labels).
gb_model.fit(X_train, y_train)

# Predict the validation set.
y_pred = gb_model.predict(X_val)

# Accuracy.
accuracy = accuracy_score(y_val, y_pred)
print("정확도:", accuracy)

# Per-class precision / recall / F1.
print("Classification Report:")
print(classification_report(y_val, y_pred))

# Confusion matrix against the validation labels that y_pred belongs to.
conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)



정확도: 0.2649402390438247
Classification Report:
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           0       0.00      0.00      0.00       339
           1       0.29      0.82      0.43       163

    accuracy                           0.26       502
   macro avg       0.10      0.27      0.14       502
weighted avg       0.09      0.26      0.14       502

Confusion Matrix:
[[  0   0   0]
 [ 29   0 312]
 [ 14   0 147]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### age_group만 생성

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the churn target (age_group variant).
X = new_df1.drop('churn', axis=1)
y = new_df1['churn']

# Rebuild the 80/20 hold-out split from this cell's X and y so the cell does not
# depend on whichever X_train_val an earlier cell happened to leave behind.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train/validation split, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val)  # 0.25 x 0.8 = 0.2

# Gradient boosting with fixed hyperparameters; random_state makes the
# max_features='sqrt' feature sampling reproducible.
gb_model = GradientBoostingClassifier(learning_rate=0.3,
                                      max_depth=3,
                                      max_features='sqrt',
                                      min_samples_leaf=2,
                                      min_samples_split=2,
                                      n_estimators=60,
                                      random_state=42)
# Fit the boosting classifier itself (the global `model` is the IsolationForest
# from the outlier step and predicts -1/1, not churn labels).
gb_model.fit(X_train, y_train)

# Predict the validation set.
y_pred = gb_model.predict(X_val)

# Accuracy.
accuracy = accuracy_score(y_val, y_pred)
print("정확도:", accuracy)

# Per-class precision / recall / F1.
print("Classification Report:")
print(classification_report(y_val, y_pred))

# Confusion matrix against the validation labels that y_pred belongs to.
conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

정확도: 0.8027888446215139
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       339
           1       0.74      0.61      0.67       163

    accuracy                           0.80       502
   macro avg       0.78      0.75      0.76       502
weighted avg       0.80      0.80      0.80       502

Confusion Matrix:
[[244  97]
 [122  39]]


### salary_group만 생성

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the churn target (salary_group variant).
X = new_df2.drop('churn', axis=1)
y = new_df2['churn']

# Rebuild the 80/20 hold-out split from this cell's X and y so the cell does not
# depend on whichever X_train_val an earlier cell happened to leave behind.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train/validation split, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val)  # 0.25 x 0.8 = 0.2

# Gradient boosting with fixed hyperparameters; random_state makes the
# max_features='sqrt' feature sampling reproducible.
gb_model = GradientBoostingClassifier(learning_rate=0.3,
                                      max_depth=3,
                                      max_features='sqrt',
                                      min_samples_leaf=2,
                                      min_samples_split=2,
                                      n_estimators=60,
                                      random_state=42)
# Fit the boosting classifier itself (the global `model` is the IsolationForest
# from the outlier step and predicts -1/1, not churn labels).
gb_model.fit(X_train, y_train)

# Predict the validation set.
y_pred = gb_model.predict(X_val)

# Accuracy.
accuracy = accuracy_score(y_val, y_pred)
print("정확도:", accuracy)

# Per-class precision / recall / F1.
print("Classification Report:")
print(classification_report(y_val, y_pred))

# Confusion matrix against the validation labels that y_pred belongs to.
conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

정확도: 0.796812749003984
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.89      0.86       339
           1       0.73      0.60      0.66       163

    accuracy                           0.80       502
   macro avg       0.77      0.75      0.76       502
weighted avg       0.79      0.80      0.79       502

Confusion Matrix:
[[245  96]
 [122  39]]


### credit score + credit_card 그룹만 생성

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the churn target (credit score + credit_card variant).
X = new_df3.drop('churn', axis=1)
y = new_df3['churn']

# Rebuild the 80/20 hold-out split from this cell's X and y so the cell does not
# depend on whichever X_train_val an earlier cell happened to leave behind.
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train/validation split, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val)  # 0.25 x 0.8 = 0.2

# Gradient boosting with fixed hyperparameters; random_state makes the
# max_features='sqrt' feature sampling reproducible.
gb_model = GradientBoostingClassifier(learning_rate=0.3,
                                      max_depth=3,
                                      max_features='sqrt',
                                      min_samples_leaf=2,
                                      min_samples_split=2,
                                      n_estimators=60,
                                      random_state=42)
# Fit the boosting classifier itself (the global `model` is the IsolationForest
# from the outlier step and predicts -1/1, not churn labels).
gb_model.fit(X_train, y_train)

# Predict the validation set.
y_pred = gb_model.predict(X_val)

# Accuracy.
accuracy = accuracy_score(y_val, y_pred)
print("정확도:", accuracy)

# Per-class precision / recall / F1.
print("Classification Report:")
print(classification_report(y_val, y_pred))

# Confusion matrix against the validation labels that y_pred belongs to.
conf_matrix = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

정확도: 0.8007968127490039
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.88      0.86       339
           1       0.72      0.63      0.67       163

    accuracy                           0.80       502
   macro avg       0.78      0.76      0.76       502
weighted avg       0.80      0.80      0.80       502

Confusion Matrix:
[[238 103]
 [123  38]]
