In [10]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.model_selection import PredefinedSplit
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, recall_score, precision_score, accuracy_score, make_scorer
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.feature_selection import SelectFromModel, RFE
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import matplotlib.pyplot as plt
import numpy as np
import random
from itertools import product
import time
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss, TomekLinks, ClusterCentroids
from catboost import CatBoostClassifier

### 배경 설명

신용카드 회사에서 제공한 데이터. 개인정보보호를 위해 각 변수들은 PCA Transformation을 통해 V1~V30으로 변환되었다. 28만개의 데이터 중 약 500개가 신용카드 사기를 당한 사람들의 데이터. 

### 목표

양성 클래스 (신용카드 사기를 당한 케이스)의 예측을 AUPRC을 평가기준으로 삼아 모델링 학습

### 사용 모델

XGBoosting, CatBoosting, LightGBM 사용

### 학습 방법

1. 데이터셋 분리

gpu 적용 가능한 XGB, CatBoosting, LightGBM을 원본 데이터셋 그대로 사용

2. Feature Selection

SFS, SelectFromModel, RFE 라이브러리를 사용하여 Feature Selection 시행, 원본 데이터셋과 학습속도와 학습 성능 비교

3. GridSearch

cv = 5으로 교차검증하여 평균 점수를 비교. 여러 파라미터들을 학습시켜가며 최적의 파라미터 찾기


### 결과

XGB의 성능이 AUPRC값 0.88 정도로 가장 높았으며 LightGBM, CatBoosting은 0.86 정도로 비슷하나 조금 낮은 수준.

In [2]:
df = pd.read_csv('/kaggle/input/creditcardfraud/creditcard.csv')
print('데이터 갯수: ', len(df))
df.head(5)

데이터 갯수:  284807


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


<span style = 'font-size: 18px;'>

데이터 갯수가 28만개나 되다보니 학습속도가 지나치게 느릴 수 있다. 사용할 전략은

1. GPU 적용가능한 XGBoost 이용
2. Fraud가 아닌 데이터를 약 5만개로 줄여서 학습

GridSearch를 이용할 때에는
1. 한 파라미터 당 3개 정도의 종류로 학습
2. 파라미터 중간값의 성능이 좋다면 그대로 고정, 최저나 최고 값이 성능이 좋다면 값의 범위를 조정
3. 1, 2단계 반복
4. 수렴하는 경우 중단 

In [3]:
df = pd.read_csv('/kaggle/input/creditcardfraud/creditcard.csv')
X = df.drop(columns = ['Class', 'Time'])
y = df['Class']

xgb_model = XGBClassifier(random_state = 42, tree_method = 'hist', device = 'cuda')

param_grid = {
    'n_estimators' : [250],
    'max_depth' : [4],
    'min_child_weight' : [6],
    'learning_rate' : [ 0.1],
    'colsample_bytree' : [1.0],
    'reg_lambda' : [1],
    'gamma' : [0],
    'scale_pos_weight' : [1, 100, 200, 400]
}

stratifiedkfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state =42)

grid_search = GridSearchCV(
    xgb_model,
    param_grid = param_grid,
    cv = stratifiedkfold,
    scoring = 'average_precision',
    verbose = 1,
    n_jobs = -1
)

grid_search.fit(X, y)


Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [4]:
cv_results = pd.DataFrame(grid_search.cv_results_)
sorted_results = cv_results[['mean_test_score', 'params']].sort_values(by = 'mean_test_score', ascending = False)
pd.set_option('display.max_colwidth', None)
sorted_results.head(15)

Unnamed: 0,mean_test_score,params
0,0.861961,"{'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 6, 'n_estimators': 250, 'reg_lambda': 1, 'scale_pos_weight': 1}"
2,0.853861,"{'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 6, 'n_estimators': 250, 'reg_lambda': 1, 'scale_pos_weight': 200}"
1,0.852496,"{'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 6, 'n_estimators': 250, 'reg_lambda': 1, 'scale_pos_weight': 100}"
3,0.846976,"{'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 6, 'n_estimators': 250, 'reg_lambda': 1, 'scale_pos_weight': 400}"


<span style = 'font-size: 17px;'>

#### XGB 파라미터 조정 결과

1. max_depth : 4 
2. n_estimators : 250
3. gamma : 0
4. learning_rate : 0.1
5. min_child_weidght : 6 
6. reg_lambda : 1
7. colsample_bytree : 1.0
8. pos_scale_weight : 1

에서 가장 높은 score 값을 보인다

최종 mean_test_score는 0.86으로 높게 나온 편. <br>


----


#### Feature Selection

<span style = 'font-size: 18px;'>

이제 feature_selection을 해보자. Feature Selection은

1. RFE 
2. SelectFromModel
3. Stepwise Selection 

을 이용하여 최적의 피처엔지니어링을 수행, 이후 모델 성능이 올라갔는지 평가


In [15]:
X = df.drop(columns = ['Class', 'Time'])
y = df['Class']

auprc_scorer = make_scorer(average_precision_score, needs_proba = True)


best_model = XGBClassifier(
    max_depth = 4,
    n_estimators = 250,
    gamma = 0,
    learning_rate = 0.1,
    min_child_weight = 6,
    reg_lambda = 1,
    colsample_bytree = 1.0,
    scale_pos_weight = 1,
    use_label_encoder = False,   
    tree_method = 'hist',
    device = 'cuda'
)


stratifiedkfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

start_time_1 = time.time()
cv_scores = cross_val_score(
    estimator = best_model,
    X = X,
    y = y,
    cv = stratifiedkfold,
    scoring = auprc_scorer,
    n_jobs = -1   
)
end_time_1 = time.time()
print(f'{end_time_1 - start_time_1:.2f}')



print(cv_scores)
print(cv_scores.mean())

11.30
[0.8529059  0.89532714 0.87258875 0.85560161 0.83338182]
0.8619610431880524


In [16]:
rfe = RFE(estimator = best_model, n_features_to_select = 10, step =3 )
rfe.fit(X, y)
feature_selected = X.columns[rfe.support_]
X_selected = X[feature_selected]



reduced_best_model = XGBClassifier(
    max_depth = 4,
    n_estimators = 250,
    gamma = 0,
    learning_rate = 0.1,
    min_child_weidght = 6,
    reg_lambda = 1,
    colsample_bytree = 1.0,
    pos_scale_weight = 1,
    use_label_encoder= False,
    tree_method  = 'hist',
    device = 'cuda'
)

stratifiedkfold = StratifiedKFold(n_splits = 5, shuffle =True, random_state = 42)


start_time_2  = time.time()
reduced_cv_scores = cross_val_score(
    estimator = reduced_best_model,
    X = X_selected,
    y = y,
    cv = stratifiedkfold,
    scoring  = auprc_scorer,
    n_jobs = -1
)
end_time_2 = time.time()
print(f"{end_time_2 - start_time_2:.2f}")


print(reduced_cv_scores)
print(reduced_cv_scores.mean())

5.99
[0.83935848 0.88115387 0.86876874 0.84645632 0.82222585]
0.8515926510066911


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, 
                                                    stratify = y, random_state = 42)


best_model = XGBClassifier(
    max_depth=4,
    n_estimators=250,
    gamma=0,
    learning_rate=0.1,
    min_child_weight=6,
    reg_lambda=1,
    colsample_bytree=1.0,
    scale_pos_weight=1,
    use_label_encoder=False,


)

best_model.fit(X_train, y_train)


selector = SelectFromModel(estimator = best_model, threshold = "median", prefit = True)
selected_features = X.columns[selector.get_support()]
X_selected = X[selected_features]


stratifiedkfold = StratifiedKFold(n_splits = 5, shuffle =True, random_state = 42)

best_model = XGBClassifier(
    max_depth=4,
    n_estimators=250,
    gamma=0,
    learning_rate=0.1,
    min_child_weight=6,
    reg_lambda=1,
    colsample_bytree=1.0,
    scale_pos_weight=1,
    use_label_encoder=False,
    tree_method = 'hist',
    device = 'cuda'

)


start_time_2  = time.time()
reduced_cv_scores = cross_val_score(
    estimator = best_model,
    X = X_selected,
    y = y,
    cv = stratifiedkfold,
    scoring  = auprc_scorer,
    n_jobs = -1
)
end_time_2 = time.time()
print(f"{end_time_2 - start_time_2:.2f}")


print(reduced_cv_scores)
print(reduced_cv_scores.mean())

8.54
[0.85021613 0.89269761 0.8700275  0.8409479  0.83658298]
0.8580944254047725


In [22]:
best_model = XGBClassifier(
    max_depth=4,
    n_estimators=250,
    gamma=0,
    learning_rate=0.1,
    min_child_weight=6,
    reg_lambda=1,
    colsample_bytree=1.0,
    scale_pos_weight=1,
    use_label_encoder=False,
    tree_method  = 'hist',
    device = 'cuda'
)


strafiedkfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42 )

sfs = SFS(
    estimator = best_model,
    k_features = 15,
    forward = True,
    floating = False,
    scoring = 'average_precision',
    cv = stratifiedkfold,
    n_jobs = -1
)


start_time_2  = time.time()
sfs.fit(X, y)
end_time_2 = time.time()




print(f"{end_time_2 - start_time_2:.2f}")

# 최종 선택된 특성 단계 정보 가져오기
final_step = len(sfs.subsets_)  # 마지막 단계 번호
final_subset = sfs.subsets_[final_step]

# 최종 선택된 특성 및 점수 출력
print(f"Final Selected Features: {final_subset['feature_idx']}")
print(f"Final CV Scores: {final_subset['cv_scores']}")
print(f"Final Average Score: {final_subset['avg_score']:.4f}")


1225.68
Final Selected Features: (1, 3, 6, 7, 8, 9, 13, 15, 16, 27, 28)
Final CV Scores: [0.84392862 0.89369211 0.8767866  0.86390733 0.83517604]
Final Average Score: 0.8627



STOPPING EARLY DUE TO KEYBOARD INTERRUPT...

Feature_Selection은 일반적으로 학습시간을 감소시키지만 동시에 성능도 소폭 감소시킴을 알 수 있다. <br>
또한 SFS는 시간이 굉장히 오래 걸리는 편...  

In [6]:
from lightgbm import LGBMClassifier


df = pd.read_csv('/kaggle/input/creditcardfraud/creditcard.csv')
X = df.drop(columns = ['Class', 'Time'])
y = df['Class']


lgbm_model = LGBMClassifier(random_state = 42, device = 'gpu', boosting_type = 'gbdt', verbose = -1)

param_grid = {
    'n_estimators' : [400],
    'max_depth' : [10],
    'learning_rate' : [0.1],
    'min_child_weight' : [3],
    'min_data_in_leaf' : [1, 2, 3],
    'scale_pos_weight' : [1],
    'reg_lambda' : [1]
}

stratifiedkfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42 )
auprc_scorer = make_scorer(average_precision_score, needs_proba = True)


grid_search = GridSearchCV(
    estimator = lgbm_model,
    param_grid = param_grid,
    cv = stratifiedkfold,
    scoring = auprc_scorer,
    n_jobs = -1
)

grid_search.fit(X, y)
cv_results = pd.DataFrame(grid_search.cv_results_)
sorted_results = cv_results[['mean_test_score', 'params']].sort_values(by = 'mean_test_score', ascending = False)
pd.set_option('display.max_colwidth', None)
sorted_results.head(10)

Unnamed: 0,mean_test_score,params
0,0.86022,"{'learning_rate': 0.1, 'max_depth': 10, 'min_child_weight': 3, 'n_estimators': 400, 'reg_lambda': 1, 'scale_pos_weight': 1}"
2,0.845732,"{'learning_rate': 0.1, 'max_depth': 10, 'min_child_weight': 3, 'n_estimators': 400, 'reg_lambda': 1, 'scale_pos_weight': 10}"
1,0.841026,"{'learning_rate': 0.1, 'max_depth': 10, 'min_child_weight': 3, 'n_estimators': 400, 'reg_lambda': 1, 'scale_pos_weight': 5}"


In [18]:
df = pd.read_csv('/kaggle/input/creditcardfraud/creditcard.csv')
X = df.drop(columns = ['Class', 'Time'])
y = df['Class']


catboost_model = CatBoostClassifier(
    random_state=42,
    task_type = 'GPU',
    verbose = 0   
)


param_grid = {
    'iterations': [ 600, 700],
    'depth': [10],
    'learning_rate': [0.1],
    'l2_leaf_reg': [3],
    'scale_pos_weight': [1],
    'min_data_in_leaf' : [ 5],
    'border_count' : [128, 256, 512]
}


stratifiedkfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auprc_scorer = make_scorer(average_precision_score, needs_proba=True)

grid_search = GridSearchCV(
    estimator=catboost_model,
    param_grid=param_grid,
    cv=stratifiedkfold,
    scoring=auprc_scorer,
    n_jobs=1
)


grid_search.fit(X, y)

cv_results = pd.DataFrame(grid_search.cv_results_)
sorted_results = cv_results[['mean_test_score', 'params']].sort_values(by='mean_test_score', ascending=False)
print(sorted_results.head(10))

   mean_test_score  \
1         0.862292   
5         0.862115   
0         0.861881   
4         0.861837   
3         0.858577   
2         0.858448   

                                                                                                                                        params  
1  {'border_count': 128, 'depth': 10, 'iterations': 700, 'l2_leaf_reg': 3, 'learning_rate': 0.1, 'min_data_in_leaf': 5, 'scale_pos_weight': 1}  
5  {'border_count': 512, 'depth': 10, 'iterations': 700, 'l2_leaf_reg': 3, 'learning_rate': 0.1, 'min_data_in_leaf': 5, 'scale_pos_weight': 1}  
0  {'border_count': 128, 'depth': 10, 'iterations': 600, 'l2_leaf_reg': 3, 'learning_rate': 0.1, 'min_data_in_leaf': 5, 'scale_pos_weight': 1}  
4  {'border_count': 512, 'depth': 10, 'iterations': 600, 'l2_leaf_reg': 3, 'learning_rate': 0.1, 'min_data_in_leaf': 5, 'scale_pos_weight': 1}  
3  {'border_count': 256, 'depth': 10, 'iterations': 700, 'l2_leaf_reg': 3, 'learning_rate': 0.1, 'min_data_in_leaf': 5, 