In [38]:
# 필요한 패키지 임포트
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import warnings 
warnings.filterwarnings('ignore')

In [39]:
# 데이터 로딩 => dataframe
cust_df = pd.read_csv('../data/santander-customer-satisfaction/train.csv')
test_df = pd.read_csv('../data/santander-customer-satisfaction/test.csv')
# 데이터 구조 확인
print(cust_df.shape)
print(test_df.shape)

(76020, 371)
(75818, 370)


In [40]:
# 1. ID feature drop
cust_df.drop('ID', axis=1, inplace=True)
test_df.drop('ID', axis=1, inplace=True)

In [41]:
X_features = cust_df.iloc[:, :-1] # 데이터 추출
y_labels = cust_df.loc[:, 'TARGET']
print(f'데이터 구조 : {X_features.shape}')

데이터 구조 : (76020, 369)


In [42]:
# 의미 없는 feature 찾기 위해 sklearn의 variancethreshold 사용
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=0.01)
sel.fit(X_features)

In [43]:
# 분산도 높은 feature 수
print(sum(sel.get_support()))

272


In [44]:
# 분산도 낮은 feature 컬럼 을 X_drop에 저장
X_drop = [x for x in X_features.columns if x not in X_features.columns[sel.get_support()]]

In [45]:
# 테스트 데이터와 학습데이터에서 분산도 낮은 feature제거 
X_features.drop(X_drop, axis=1, inplace=True)
test_df.drop(X_drop, axis=1, inplace=True)

In [46]:
# 데이터 구조 확인
X_features.shape, test_df.shape

((76020, 272), (75818, 272))

In [47]:
# 상관관계 제거
X_features_copy = X_features.copy()  # 학습 데이터
test_df_copy = test_df.copy() # 테스트 데이터

# 상관계수 행렬 계산
corr_matrix = X_features_copy.corr()

# 상관관계가 높은 피처 찾기
high_corr_var = np.where((corr_matrix > 1) | (corr_matrix < -1))

# 상관관계가 높은 피처 삭제
to_drop = set()  # 삭제할 피처를 저장할 집합

for i in range(len(high_corr_var[0])):
    if high_corr_var[0][i] != high_corr_var[1][i]:  # 같은 피처 쌍은 제외
        # 두 피처 중 하나만 삭제
        to_drop.add(X_features_copy.columns[high_corr_var[1][i]])

# 피처 삭제
X_features_copy.drop(columns=to_drop, inplace=True)
test_df_copy.drop(columns=to_drop, inplace=True)

# 결과 데이터프레임 확인
X_features_drop = X_features_copy
# 테스트 데이터
test_df_drop = test_df_copy


In [48]:
# 데이터 구조 확인
X_features_drop.shape, test_df_drop.shape

((76020, 263), (75818, 263))

In [49]:
# QuantileTransformer
from sklearn.preprocessing import QuantileTransformer

# 학습 데이터
X_features_QT = pd.DataFrame(X_features_drop)
# 테스트 데이터
test_df_QT = pd.DataFrame(test_df_drop)

# 퀀틸트랜스포머 초기화
quantile_transformer = QuantileTransformer(output_distribution='normal')

# 데이터 변환
# 학습 데이터
transformed_data = quantile_transformer.fit_transform(X_features_QT)
# 테스트 데이터
transformed_data_test = quantile_transformer.fit_transform(test_df_QT)

# 학습 데이터
X_features_Q = transformed_data
# 테스트 데이터
test_df_Q = transformed_data_test


In [50]:
# 데이터 구조 확인
X_features_Q.shape, test_df_Q.shape

((76020, 263), (75818, 263))

In [51]:
# 학습/테스트 데이터 분리
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
      X_features_Q
    , y_labels
    , test_size=0.2
    , random_state=0
    , stratify=y_labels
)

In [58]:
# 조기 종료 => 검증 데이터 분리
X_tr, X_val, y_tr, y_val = train_test_split(
      X_train
    , y_train
    , test_size=0.3
    , random_state=0
    , stratify=y_train
)

In [54]:
# 학습, 평가 : AUC score
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [None]:
xgb_clf = XGBClassifier(
      n_estimators=800
    , max_depth=5
    , learning_rate=0.01
    , early_stopping_rounds=100
    , eval_metric='auc'
    , random_state=156
)

In [23]:
# 학습 진행 : fit()
xgb_clf.fit(
      X_tr # 학습할 데이터
    , y_tr # 학습할 답
    , eval_set=[(X_val, y_val)]
)
# 점수 표시
xgb_roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:, 1])
print(f'ROC AUC : {xgb_roc_score:.4f}')

[0]	validation_0-auc:0.82195
[1]	validation_0-auc:0.82572
[2]	validation_0-auc:0.82483
[3]	validation_0-auc:0.82729
[4]	validation_0-auc:0.82983
[5]	validation_0-auc:0.83120
[6]	validation_0-auc:0.83308
[7]	validation_0-auc:0.83240
[8]	validation_0-auc:0.83038
[9]	validation_0-auc:0.83468
[10]	validation_0-auc:0.83691
[11]	validation_0-auc:0.83714
[12]	validation_0-auc:0.83829
[13]	validation_0-auc:0.83889
[14]	validation_0-auc:0.83915
[15]	validation_0-auc:0.83986
[16]	validation_0-auc:0.84045
[17]	validation_0-auc:0.84124
[18]	validation_0-auc:0.84178
[19]	validation_0-auc:0.84275
[20]	validation_0-auc:0.84319
[21]	validation_0-auc:0.84349
[22]	validation_0-auc:0.84441
[23]	validation_0-auc:0.84485
[24]	validation_0-auc:0.84476
[25]	validation_0-auc:0.84473
[26]	validation_0-auc:0.84451
[27]	validation_0-auc:0.84490
[28]	validation_0-auc:0.84435
[29]	validation_0-auc:0.84436
[30]	validation_0-auc:0.84452
[31]	validation_0-auc:0.84484
[32]	validation_0-auc:0.84499
[33]	validation_0-au

In [37]:
# (테스트데이터의 답, 테스트데이터의 예측확률)
xgb_roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:, 1])
print(f'ROC AUC : {xgb_roc_score:.4f}')

ROC AUC : 0.8230


In [131]:
# hyperopt, KFold 사용한 튜닝
from hyperopt import hp

# max_depth는 1에서 10까지 2간격으로, min_child_weight는 1에서 6까지 1간격으로
# colsample_bytree는 0.5에서 0.95사이, learning_rate는 0.01에서 0.1사이 정규 분포된 값으로 검색. 

xgb_search_space = {'max_depth': hp.quniform('max_depth', 1, 10, 2), 
                    'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
                    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 0.95),
                    'learning_rate': hp.uniform('learning_rate', 0.01, 0.1)
}

In [143]:
# 데이터 구조 변경
X_train_df = pd.DataFrame(X_train)
y_train_sr = pd.Series(y_train)

In [144]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

# 목적 함수 설정. 
# 추후 fmin()에서 입력된 search_space값으로 XGBClassifier 교차 검증 학습 후 -1* roc_auc 평균 값을 반환.  
def objective_func(search_space):
    xgb_clf = XGBClassifier(n_estimators=100, max_depth=int(search_space['max_depth']),
                            min_child_weight=int(search_space['min_child_weight']),
                            colsample_bytree=search_space['colsample_bytree'],
                            learning_rate=search_space['learning_rate'],
                            early_stopping_rounds=30,
                            eval_metric='auc'
                           )
    # 2개 k-fold 방식으로 평가된 roc_auc 지표를 담는 list
    roc_auc_list= []
    
    # 2개 k-fold방식 적용 
    kf = KFold(n_splits=3)
    # X_train_over_df을 다시 학습과 검증용 데이터로 분리
    for tr_index, val_index in kf.split(X_train_df):
        # kf.split(X_train_over_df)으로 추출된 학습과 검증 index값으로 학습과 검증 데이터 세트 분리 
        X_tr, y_tr = X_train_df.iloc[tr_index], y_train_sr.iloc[tr_index]
        X_val, y_val = X_train_df.iloc[val_index], y_train_sr.iloc[val_index]
        # early stopping은 30회로 설정하고 추출된 학습과 검증 데이터로 XGBClassifier 학습 수행. 
        xgb_clf.fit(X_tr, y_tr,
                   eval_set=[(X_tr, y_tr), (X_val, y_val)])
    
        # 1로 예측한 확률값 추출후 roc auc 계산하고 평균 roc auc 계산을 위해 list에 결과값 담음. 
        score = roc_auc_score(y_val, xgb_clf.predict_proba(X_val)[:, 1])
        roc_auc_list.append(score)
        
    # 3개 k-fold로 계산된 roc_auc값의 평균값을 반환하되, 
    # HyperOpt는 목적함수의 최소값을 위한 입력값을 찾으므로 -1을 곱한 뒤 반환. 
    return -1 * np.mean(roc_auc_list)

In [145]:
from hyperopt import fmin, tpe, Trials

trials = Trials()

# fmin()함수를 호출. max_evals지정된 횟수만큼 반복 후 목적함수의 최소값을 가지는 최적 입력값 추출. 
best = fmin(fn=objective_func, space=xgb_search_space, algo=tpe.suggest,
            max_evals=50, # 최대 반복 횟수를 지정합니다.
            trials=trials, rstate=np.random.default_rng(seed=156))

print('best:', best)

[0]	validation_0-auc:0.77925	validation_1-auc:0.75694 
[1]	validation_0-auc:0.83920	validation_1-auc:0.81382 
[2]	validation_0-auc:0.83388	validation_1-auc:0.81077 
[3]	validation_0-auc:0.82972	validation_1-auc:0.80592 
[4]	validation_0-auc:0.82722	validation_1-auc:0.80273 
[5]	validation_0-auc:0.82353	validation_1-auc:0.79582 
[6]	validation_0-auc:0.83619	validation_1-auc:0.80975 
[7]	validation_0-auc:0.83465	validation_1-auc:0.80767 
[8]	validation_0-auc:0.83394	validation_1-auc:0.80585 
[9]	validation_0-auc:0.83884	validation_1-auc:0.81192 
[10]	validation_0-auc:0.84160	validation_1-auc:0.81541
[11]	validation_0-auc:0.84664	validation_1-auc:0.82118
[12]	validation_0-auc:0.84885	validation_1-auc:0.82378
[13]	validation_0-auc:0.85106	validation_1-auc:0.82627
[14]	validation_0-auc:0.84921	validation_1-auc:0.82450
[15]	validation_0-auc:0.85224	validation_1-auc:0.82719
[16]	validation_0-auc:0.85102	validation_1-auc:0.82565
[17]	validation_0-auc:0.85003	validation_1-auc:0.82504
[18]	valid

In [26]:
best= {'colsample_bytree': np.float64(0.8242619585971493)
       , 'learning_rate': np.float64(0.08431814420718546)
       , 'max_depth': np.float64(4.0)
       , 'min_child_weight': np.float64(3.0)}

In [59]:
xgb_clf =  XGBClassifier(
                        n_estimators=500,
                        max_depth=int(best['max_depth']),
                        min_child_weight=int(best['min_child_weight']), 
                        colsample_bytree=best['colsample_bytree'],
                        learning_rate=best['learning_rate'],
                        early_stopping_rounds=100, 
                        eval_metric="auc"
                        )

# evaluation metric을 auc로, early stopping은 100 으로 설정하고 학습 수행. 
xgb_clf.fit(X_tr, y_tr,eval_set=[(X_tr, y_tr), (X_val, y_val)])

xgb_roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:,1])
print('ROC AUC: {0:.4f}'.format(xgb_roc_score))


[0]	validation_0-auc:0.82056	validation_1-auc:0.82084
[1]	validation_0-auc:0.82483	validation_1-auc:0.82538
[2]	validation_0-auc:0.82599	validation_1-auc:0.82339
[3]	validation_0-auc:0.83005	validation_1-auc:0.82681
[4]	validation_0-auc:0.83371	validation_1-auc:0.82924
[5]	validation_0-auc:0.83456	validation_1-auc:0.83078
[6]	validation_0-auc:0.83541	validation_1-auc:0.83270
[7]	validation_0-auc:0.83778	validation_1-auc:0.83227
[8]	validation_0-auc:0.83479	validation_1-auc:0.83027
[9]	validation_0-auc:0.83907	validation_1-auc:0.83446
[10]	validation_0-auc:0.84173	validation_1-auc:0.83675
[11]	validation_0-auc:0.84280	validation_1-auc:0.83702
[12]	validation_0-auc:0.84456	validation_1-auc:0.83817
[13]	validation_0-auc:0.84543	validation_1-auc:0.83886
[14]	validation_0-auc:0.84516	validation_1-auc:0.83891
[15]	validation_0-auc:0.84632	validation_1-auc:0.83967
[16]	validation_0-auc:0.84689	validation_1-auc:0.84043
[17]	validation_0-auc:0.84700	validation_1-auc:0.84105
[18]	validation_0-au

In [60]:
xgb_roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:,1])
print('ROC AUC: {0:.4f}'.format(xgb_roc_score))

ROC AUC: 0.8238


In [31]:
# 전체 데이터 학습
xgb_clf =  XGBClassifier(n_estimators=1000,
                        max_depth=int(best['max_depth']),
                        min_child_weight=int(best['min_child_weight']), 
                        colsample_bytree=best['colsample_bytree'],
                        learning_rate=best['learning_rate'],
                        early_stopping_rounds=100, 
                        eval_metric="auc"
                        )

# evaluation metric을 auc로, early stopping은 100 으로 설정하고 학습 수행. 
xgb_clf.fit(X_features_Q, y_labels, eval_set=[(X_val, y_val)])

xgb_roc_score = roc_auc_score(y_labels, xgb_clf.predict_proba(X_features_Q)[:,1])
print('ROC AUC: {0:.4f}'.format(xgb_roc_score))


[0]	validation_0-auc:0.82646
[1]	validation_0-auc:0.82841
[2]	validation_0-auc:0.82578
[3]	validation_0-auc:0.83230
[4]	validation_0-auc:0.83486
[5]	validation_0-auc:0.83733
[6]	validation_0-auc:0.83767
[7]	validation_0-auc:0.83906
[8]	validation_0-auc:0.83701
[9]	validation_0-auc:0.84225
[10]	validation_0-auc:0.84436
[11]	validation_0-auc:0.84579
[12]	validation_0-auc:0.84699
[13]	validation_0-auc:0.84750
[14]	validation_0-auc:0.84800
[15]	validation_0-auc:0.84879
[16]	validation_0-auc:0.84926
[17]	validation_0-auc:0.84966
[18]	validation_0-auc:0.84998
[19]	validation_0-auc:0.85054
[20]	validation_0-auc:0.85146
[21]	validation_0-auc:0.85194
[22]	validation_0-auc:0.85251
[23]	validation_0-auc:0.85314
[24]	validation_0-auc:0.85353
[25]	validation_0-auc:0.85428
[26]	validation_0-auc:0.85469
[27]	validation_0-auc:0.85498
[28]	validation_0-auc:0.85532
[29]	validation_0-auc:0.85575
[30]	validation_0-auc:0.85616
[31]	validation_0-auc:0.85685
[32]	validation_0-auc:0.85724
[33]	validation_0-au

In [86]:
test_result = xgb_clf.predict_proba(test_df_Q)[:,1]

In [87]:
sample = pd.read_csv('./santander-customer-satisfaction/sample_submission.csv')
sample['TARGET'] = test_result

In [88]:
sample.to_csv('./result.csv', index=False)

In [32]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score

def get_clf_eval(y_test, pred=None, pred_proba=None):
    confusion = confusion_matrix( y_test, pred)
    accuracy = accuracy_score(y_test , pred)
    precision = precision_score(y_test , pred)
    recall = recall_score(y_test , pred)
    f1 = f1_score(y_test,pred)
    # ROC-AUC 추가 
    roc_auc = roc_auc_score(y_test, pred_proba)
    print('오차 행렬')
    print(confusion)
    # ROC-AUC print 추가
    print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f},\
    F1: {3:.4f}, AUC:{4:.4f}'.format(accuracy, precision, recall, f1, roc_auc))


In [57]:
get_clf_eval(y_test, xgb_clf.predict(X_test), xgb_clf.predict_proba(X_test)[:,1])

오차 행렬
[[13290  1312]
 [  349   253]]
정확도: 0.8908, 정밀도: 0.1617, 재현율: 0.4203,    F1: 0.2335, AUC:0.7765


In [33]:
get_clf_eval(y_test, xgb_clf.predict(X_test), xgb_clf.predict_proba(X_test)[:,1])

오차 행렬
[[14601     1]
 [  533    69]]
정확도: 0.9649, 정밀도: 0.9857, 재현율: 0.1146,    F1: 0.2054, AUC:0.9086
