<a href="https://colab.research.google.com/github/i-am-U-hyUn/data-science/blob/main/%EB%AF%B8%EC%85%984_%EC%9D%80%ED%96%89%EA%B3%A0%EA%B0%9D%EB%8D%B0%EC%9D%B4%ED%84%B0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 데이터 구성

https://www.kaggle.com/datasets/shubhammeshram579/bank-customer-churn-prediction

1. Customer ID: 각 고객의 고유 식별자
2. Surname: 고객의 성(이름)
3. Credit Score: 고객의 신용점수를 나타내는 수치
4. Geography: 고객이 거주하는 국가(프랑스, 스페인, 독일)
5. Gender: 고객의 성별
6. Age: 고객의 나이
7. Tenure: 고객이 은행에 속한 연수
8. Balance: 고객의 계좌 잔액
9. NumOfProducts: 고객이 이용하는 은행 상품 수(예: 적금, 신용카드)
10. HasCrCard: 고객의 신용카드 보유 여부(1 = 예, 0 = 아니요)
11. IsActiveMember: 고객이 활성 회원인지 여부(1 = 예, 0 = 아니요)
12. EstimatedSalary: 고객의 예상 급여
13. Exited: 고객이 이탈했는지 여부(1 = 예, 0 = 아니요)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

%matplotlib inline

# Mount Google Drive into the Colab filesystem at /content/drive (Colab-only import).
# NOTE(review): the read_csv call later in this notebook uses a bare relative
# filename rather than a path under this mount — confirm the mount is required.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the churn dataset from the current working directory, then print the
# per-column summary (non-null counts and dtypes) so missing values are visible.
df = pd.read_csv(filepath_or_buffer="Churn_Modelling.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10002 entries, 0 to 10001
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10002 non-null  int64  
 1   CustomerId       10002 non-null  int64  
 2   Surname          10002 non-null  object 
 3   CreditScore      10002 non-null  int64  
 4   Geography        10001 non-null  object 
 5   Gender           10002 non-null  object 
 6   Age              10001 non-null  float64
 7   Tenure           10002 non-null  int64  
 8   Balance          10002 non-null  float64
 9   NumOfProducts    10002 non-null  int64  
 10  HasCrCard        10001 non-null  float64
 11  IsActiveMember   10001 non-null  float64
 12  EstimatedSalary  10002 non-null  float64
 13  Exited           10002 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 1.1+ MB


In [None]:
# The row index, the customer ID and the surname are judged irrelevant to churn,
# so they are removed before modelling.
# They are dropped by name rather than by position (`iloc[:, 3:]`) so that:
#   * re-running this cell is a no-op instead of silently stripping three more
#     feature columns (errors="ignore" skips names that are already gone), and
#   * the result does not depend on the column order of the CSV.
df = df.drop(columns=["RowNumber", "CustomerId", "Surname"], errors="ignore")

# 범주형 One-hot 전처리 / NA 삭제

In [None]:
# Count the missing values in each column (displayed as the cell output).
df.isna().sum()

CreditScore        0
Geography          1
Gender             0
Age                1
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          1
IsActiveMember     1
EstimatedSalary    0
Exited             0
dtype: int64

In [None]:
# Remove every row that contains at least one missing value. The null counts
# above show one NaN each in Geography, Age, HasCrCard and IsActiveMember.
df = df.dropna(axis=0, how="any")

In [None]:
# Target: the "Exited" churn flag.
y = df["Exited"]
# Features: every other column, with the object-typed columns (Geography and
# Gender according to the info() output) expanded into indicator columns.
X = pd.get_dummies(df.drop(columns="Exited"))

# 모델 - Random Forest

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint

# Hold out 20% of the rows as a test set. Stratifying on y keeps the churn ratio
# the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

### 그리드 서치를 통해 하이퍼파라미터 찾기

In [None]:
# Base estimator for the search. The seed makes the forests, and therefore the
# selected hyperparameters, repeatable; it matches the seed used for the split.
model = RandomForestClassifier(random_state=42)

# Candidate values: 3 x 3 x 3 x 3 = 81 combinations, each scored with 5-fold CV.
param_grid = {
    'n_estimators': [150, 300, 500],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
# Fit on the training split only. Fitting on the full X, y would let the search
# (and the final refit of the best estimator) see the rows reserved in X_test,
# so any score computed on X_test afterwards would be optimistically biased.
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 150}


In [None]:
# Take the estimator exposed by the finished grid search as `best_estimator_`.
best_model = grid_search.best_estimator_

In [None]:
# Predict the "Exited" label for each row of the held-out test features.
y_pred = best_model.predict(X_test)

# 모델 성능

In [None]:
# Score the test-set predictions with four classification metrics, applying
# each scorer to the same (y_test, y_pred) pair in a fixed order.
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report every metric on its own line, rounded to four decimals.
print(
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)

Accuracy: 0.9220
Precision: 0.9406
Recall: 0.6593
F1 Score: 0.7752
