In [2]:
import pandas as pd

# Load the training features, training labels and test features, in that order.
x_train, y_train, x_test = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_train.csv',
        'https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/y_train.csv',
        'https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/X_test.csv',
    )
)

In [3]:
# Remove features that are not fed to the model (customer id and surname).
drop_columns = ['CustomerId', 'Surname']
x_train_drop = x_train.drop(drop_columns, axis='columns')
x_test_drop = x_test.drop(drop_columns, axis='columns')

In [14]:
# Pull out the dependent variable (target column).
y = y_train['Exited']
# Convert categorical values into one-hot (binary indicator) columns.
x_train_dummies = pd.get_dummies(data=x_train_drop)

In [15]:
# One-hot encode the test features the same way as the training features, then
# align them to the training columns (same set, same order).
# reindex() is used instead of plain [] selection: a category that appears in
# training but not in the test set would make [] raise KeyError, whereas
# reindex adds the missing indicator column filled with 0. Columns that exist
# only in the test set are dropped, since the model never saw them.
x_test_dummies = pd.get_dummies(x_test_drop).reindex(
    columns=x_train_dummies.columns,
    fill_value=0,
)

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold out 30% of the labelled rows for validation, stratified on the target.
# NOTE: this rebinds `y_train` (the label DataFrame read from y_train.csv) to the
# training-split Series; the data must be reloaded before re-running an earlier
# cell that reads y_train['Exited'].
X_train, X_validation, y_train, y_validation = train_test_split(
    x_train_dummies,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Random forest with default hyperparameters and a fixed seed.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

In [34]:
# Hard class labels for the training and validation splits.
predict_train_label, predict_validation_label = (
    rfc.predict(split) for split in (X_train, X_validation)
)

# Predicted probability of the class in column index 1 of predict_proba's
# output, for the same two splits.
predict_train_proba, predict_validation_proba = (
    rfc.predict_proba(split)[:, 1] for split in (X_train, X_validation)
)

In [37]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score

# Report every metric for the training and validation splits side by side.
# Each row is (printed name, scoring function, train predictions, validation
# predictions). Label-based metrics score the hard predictions; ROC AUC scores
# the predicted probabilities. Driving the prints from one table replaces five
# copy-pasted print pairs, so adding a metric is a one-line change. The printed
# text is identical to the unrolled version.
metric_table = [
    ('accuracy', accuracy_score, predict_train_label, predict_validation_label),
    ('f1_score', f1_score, predict_train_label, predict_validation_label),
    ('recall', recall_score, predict_train_label, predict_validation_label),
    ('precision', precision_score, predict_train_label, predict_validation_label),
    ('roc_auc_score', roc_auc_score, predict_train_proba, predict_validation_proba),
]

for metric_name, metric_fn, train_pred, validation_pred in metric_table:
    print(f'train {metric_name} : {metric_fn(y_train, train_pred)}')
    # The trailing "\n" leaves a blank line between metric groups.
    print(f'validation {metric_name} : {metric_fn(y_validation, validation_pred)}\n')

train accuracy : 1.0
validation accuracy : 0.8630769230769231

train f1_score : 1.0
validation f1_score : 0.5572139303482587

train recall : 1.0
validation recall : 0.42317380352644834

train precision : 1.0
validation precision : 0.8155339805825242

train roc_auc_score : 1.0
validation roc_auc_score : 0.844464520607713



In [41]:
# Hard labels and column-1 probabilities for the aligned test features.
predict_test_label = rfc.predict(x_test_dummies)
predict_test_proba = rfc.predict_proba(x_test_dummies)[:, 1]

# Ground-truth test labels: read the CSV and keep only the target column.
y_test = pd.read_csv(
    'https://raw.githubusercontent.com/Datamanim/datarepo/main/churnk/y_test.csv'
)['Exited']

In [43]:
# Report every metric on the test set. Each row is (printed name, scoring
# function, predictions to score): label-based metrics use the hard
# predictions, ROC AUC uses the predicted probabilities. One table-driven loop
# replaces five copy-pasted prints; the printed text is unchanged.
test_metric_table = [
    ('accuracy', accuracy_score, predict_test_label),
    ('f1_score', f1_score, predict_test_label),
    ('recall', recall_score, predict_test_label),
    ('precision', precision_score, predict_test_label),
    ('roc_auc_score', roc_auc_score, predict_test_proba),
]

for metric_name, metric_fn, test_pred in test_metric_table:
    print(f'test {metric_name} : {metric_fn(y_test, test_pred)}')

test accuracy : 0.8631819480148529
test f1_score : 0.5625570776255707
test recall : 0.4319775596072931
test precision : 0.806282722513089
test roc_auc_score : 0.844979032559899


In [44]:
# Pair each test customer's id with its predicted `Exited` label.
x_test[['CustomerId']].assign(Exited=predict_test_label)

Unnamed: 0,CustomerId,Exited
0,15601012,1
1,15734762,1
2,15586757,0
3,15590888,0
4,15726087,0
...,...,...
3496,15733966,0
3497,15669994,0
3498,15712403,1
3499,15643819,0
