In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, ParameterGrid, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay
from xgboost import XGBClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings("ignore")

train_scaled = pd.read_csv("data/train_scaled.csv")
train_removed = pd.read_csv("data/train_removed.csv")

Train Data와 Test Data 구분

In [49]:
X_scaled = train_scaled.drop(['Class'], axis=1)
y_scaled = train_scaled['Class']
X_removed = train_removed.drop(['Class'], axis=1)
y_removed = train_removed['Class']

In [50]:
X_scaled_train, X_scaled_test, y_scaled_train, y_scaled_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=0)
X_removed_train, X_removed_test, y_removed_train, y_removed_test = train_test_split(X_removed, y_removed, test_size=0.2, random_state=0)
print("X_scaled_train: {0}행, X_scaled_test: {1}행, y_scaled_train: {2}행, y_scaled_test: {3}행".\
    format(X_scaled_train.shape[0], X_scaled_test.shape[0], y_scaled_train.shape[0], y_scaled_test.shape[0]))
print("X_removed_train: {0}행, X_removed_test: {1}행, y_removed_train: {2}행, y_removed_test: {3}행".\
    format(X_removed_train.shape[0], X_removed_test.shape[0], y_removed_train.shape[0], y_removed_test.shape[0]))

X_scaled_train: 399360행, X_scaled_test: 99841행, y_scaled_train: 399360행, y_scaled_test: 99841행
X_removed_train: 163503행, X_removed_test: 40876행, y_removed_train: 163503행, y_removed_test: 40876행


In [51]:
scaled_train = pd.merge(X_scaled_train, y_scaled_train, how='inner', left_index=True, right_index=True)

print("Scaled Train Data의 Genuine Transaction(클래스 0): {}건".format(scaled_train['Class'].value_counts()[0]))
print("Scaled Train Data의 Fraudulent Transaction(클래스 1): {}건".format(scaled_train['Class'].value_counts()[1]))
print("Fraudulent / Genuine: {:4f}%".format(scaled_train['Class'].value_counts()[1]/scaled_train['Class'].value_counts()[0]))

Scaled Train Data의 Genuine Transaction(클래스 0): 398659건
Scaled Train Data의 Fraudulent Transaction(클래스 1): 701건
Fraudulent / Genuine: 0.001758%


In [52]:
removed_train = pd.merge(X_removed_train, y_removed_train, how='inner', left_index=True, right_index=True)

print("Removed Train Data의 Genuine Transaction(클래스 0): {}건".format(removed_train['Class'].value_counts()[0]))
print("Removed Train Data의 Fraudulent Transaction(클래스 1): {}건".format(removed_train['Class'].value_counts()[1]))
print("Fraudulent / Genuine: {:4f}%".format(removed_train['Class'].value_counts()[1]/removed_train['Class'].value_counts()[0]))

Removed Train Data의 Genuine Transaction(클래스 0): 163426건
Removed Train Data의 Fraudulent Transaction(클래스 1): 77건
Fraudulent / Genuine: 0.000471%


In [53]:
scaled_test = pd.merge(X_scaled_test, y_scaled_test, how='inner', left_index=True, right_index=True)
removed_test = pd.merge(X_removed_test, y_removed_test, how='inner', left_index=True, right_index=True)

In [54]:
def convert_dtype(dataframe):
    dataframe_converted = dataframe.copy()
    float64_columns = []
    int64_columns = []

    for column in dataframe_converted.columns:
        if dataframe_converted[column].dtype == 'float64':
            float64_columns.append(column)
        elif dataframe_converted[column].dtype == 'int64':
            int64_columns.append(column)
    
    for column in float64_columns:
        dataframe_converted[column] = dataframe_converted[column].astype('float16')
    
    for column in int64_columns:
        dataframe_converted[column] = dataframe_converted[column].astype('int8')

    return dataframe_converted

In [55]:
scaled = convert_dtype(scaled_train)
removed = convert_dtype(removed_train)
test_scaled = convert_dtype(scaled_test)
test_removed = convert_dtype(removed_test) 

데이터: scaled와 removed

언더샘플링

In [56]:
def undersampling(data):
    class_0 = data[data['Class']==0]
    class_1 = data[data['Class']==1]
    wanted_ratio = class_1.shape[0] * 10/ class_0.shape[0]
    sample_size = int((class_1.shape[0] * (1-wanted_ratio)) / wanted_ratio)
    class_0_sampled_df = class_0.sample(n=sample_size, random_state=0)
    train_sampled = pd.concat([class_0_sampled_df, class_1], axis=0)
    return train_sampled

데이터와 모델 선택   
DecisionTree, LogisticRegression, KNeighborsClassifier, RandomForestClassifier, XGBClassifier, LGBMClassifier, CatboostClassifier

In [17]:
def classification(datas, classifiers):
    result = []
    for data_name, data in datas.items():
        X = data.drop(['Class'], axis=1)
        y = data['Class']
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
        sampled_train = undersampling(pd.concat([X_train, y_train], axis=1))
        sampled_X_train = sampled_train.drop(['Class'], axis=1)
        sampled_y_train = sampled_train['Class']
        for classifier in classifiers:
            classifier.fit(sampled_X_train, sampled_y_train)
            y_pred_proba = classifier.predict_proba(X_val)[:, 1]                    
            score = roc_auc_score(y_val, y_pred_proba)
            result.append([data_name, classifier.__class__.__name__, score])
    df = pd.DataFrame(result, columns=['data', 'model', 'roc_auc']).sort_values('roc_auc', ascending=False)
    return df

In [18]:
dt_clf = DecisionTreeClassifier()
lr_clf = LogisticRegression()
knn_clf = KNeighborsClassifier()
rf_clf = RandomForestClassifier()
xgb_clf = XGBClassifier()
lgbm_clf = LGBMClassifier()
cat_clf = CatBoostClassifier(verbose=False)

classifiers = [dt_clf, lr_clf, knn_clf, rf_clf, xgb_clf, lgbm_clf, cat_clf]
datas = {"outlier_scaled": scaled,  "outlier_removed": removed}


In [19]:
classification(datas, classifiers)

Unnamed: 0,data,model,roc_auc
5,outlier_scaled,LGBMClassifier,0.889629
6,outlier_scaled,CatBoostClassifier,0.888952
3,outlier_scaled,RandomForestClassifier,0.880137
4,outlier_scaled,XGBClassifier,0.879137
1,outlier_scaled,LogisticRegression,0.870068
2,outlier_scaled,KNeighborsClassifier,0.742622
0,outlier_scaled,DecisionTreeClassifier,0.717009
13,outlier_removed,CatBoostClassifier,0.664308
8,outlier_removed,LogisticRegression,0.621654
11,outlier_removed,XGBClassifier,0.615642


데이터: outlier_scaled 선택   
모델: CatBoostClassifier와 XGBoostClassifier, LGBMClassifier 3개 모델 선택

하이퍼 파라미터 튜닝

In [60]:
def hyper_parameter_search(data, classifier, classifiers_parameters, n_splits=5):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    result = pd.DataFrame()

    X = data.drop(['Class'], axis=1)
    y = data['Class']

    for clf, params in classifiers_parameters.items():
        model = classifier[clf]
        param_grid = ParameterGrid(params)

        for param in param_grid:
            scores = []

            for train_index, val_index in skf.split(X, y):
                X_train, X_val = X.iloc[train_index], X.iloc[val_index]
                y_train, y_val = y.iloc[train_index], y.iloc[val_index]

                # Undersampling
                sampled_train = undersampling(pd.concat([X_train, y_train], axis=1))
                sampled_X_train = sampled_train.drop(['Class'], axis=1)
                sampled_y_train = sampled_train['Class']

                if clf == 'cat_clf':
                    new_model = CatBoostClassifier(**param)
                    new_model.fit(sampled_X_train, sampled_y_train)
                    y_pred_proba = new_model.predict_proba(X_val)[:, 1]
                else:
                    model.set_params(**param)
                    model.fit(sampled_X_train, sampled_y_train)
                    y_pred_proba = model.predict_proba(X_val)[:, 1]

                score = roc_auc_score(y_val, y_pred_proba)
                scores.append(score)

            avg_score = np.mean(scores)
            temp = pd.DataFrame({'model': clf, 'parameter': [param], 'roc_auc': avg_score})
            result = pd.concat([result, temp], ignore_index=True)

    return result



In [61]:
xgb_clf = XGBClassifier()
lgbm_clf = LGBMClassifier()
cat_clf = CatBoostClassifier(verbose=True)

classifier = {'xgb_clf': xgb_clf,
              'lgbm_clf': lgbm_clf,
              'cat_clf': cat_clf}

classifiers_parameters = {'xgb_clf': {'n_estimators': [200],
                                     'max_depth': [3, 4, 5],
                                     'learning_rate': [0.03, 0.04],
                                     'objective': ['binary:logistic']},

                         'cat_clf': {'n_estimators': [200],
                                     'learning_rate': [0.03, 0.045], 
                                     'depth': [6, 8],
                                     'min_data_in_leaf': [12, 14],
                                     'boosting_type': ['Plain'],
                                     'bootstrap_type': ['Bernoulli'],
                                     'verbose': [False]},

                         'lgbm_clf' : {'n_estimators': [200],
                                       'learning_rate': [0.03, 0.04],
                                       'max_depth': [4, 6],
                                       'num_leaves': [50, 70]}}

In [63]:
model3_result = hyper_parameter_search(scaled, classifier, classifiers_parameters).sort_values('roc_auc', ascending=False)
model3_result

Unnamed: 0,model,parameter,roc_auc
11,cat_clf,"{'boosting_type': 'Plain', 'bootstrap_type': '...",0.897945
10,cat_clf,"{'boosting_type': 'Plain', 'bootstrap_type': '...",0.897945
6,cat_clf,"{'boosting_type': 'Plain', 'bootstrap_type': '...",0.895474
7,cat_clf,"{'boosting_type': 'Plain', 'bootstrap_type': '...",0.895474
8,cat_clf,"{'boosting_type': 'Plain', 'bootstrap_type': '...",0.893638
9,cat_clf,"{'boosting_type': 'Plain', 'bootstrap_type': '...",0.893638
12,cat_clf,"{'boosting_type': 'Plain', 'bootstrap_type': '...",0.893617
13,cat_clf,"{'boosting_type': 'Plain', 'bootstrap_type': '...",0.893617
3,xgb_clf,"{'learning_rate': 0.04, 'max_depth': 3, 'n_est...",0.891253
19,lgbm_clf,"{'learning_rate': 0.04, 'max_depth': 4, 'n_est...",0.890479


In [64]:
top10 = pd.json_normalize(model3_result[:10]['parameter'])
top10

Unnamed: 0,boosting_type,bootstrap_type,depth,learning_rate,min_data_in_leaf,n_estimators,verbose,max_depth,objective,num_leaves
0,Plain,Bernoulli,8.0,0.03,14.0,200,False,,,
1,Plain,Bernoulli,8.0,0.03,12.0,200,False,,,
2,Plain,Bernoulli,6.0,0.03,12.0,200,False,,,
3,Plain,Bernoulli,6.0,0.03,14.0,200,False,,,
4,Plain,Bernoulli,6.0,0.045,12.0,200,False,,,
5,Plain,Bernoulli,6.0,0.045,14.0,200,False,,,
6,Plain,Bernoulli,8.0,0.045,12.0,200,False,,,
7,Plain,Bernoulli,8.0,0.045,14.0,200,False,,,
8,,,,0.04,,200,,3.0,binary:logistic,
9,,,,0.04,,200,,4.0,,70.0


CatBoostClassifier와 XGBClassifier 기반 보팅

In [65]:
xgb_clf = XGBClassifier(n_estimators=200,
                        max_depth=4,
                        learning_rate=0.04,
                        objective='binary:logistic')

cat_clf = CatBoostClassifier(n_estimators=200,
                             learning_rate=0.03,
                             depth=8,
                             min_data_in_leaf=14,
                             boosting_type='Plain',
                             bootstrap_type='Bernoulli',
                             verbose=False)


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scores = []
X = scaled.drop(['Class'], axis=1)
y = scaled['Class']

for train_index, val_index in skf.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]
    
    sampled_train = undersampling(pd.concat([X_train, y_train], axis=1))
    sampled_X_train = sampled_train.drop(['Class'], axis=1)
    sampled_y_train = sampled_train['Class']
    
    vo_clf = VotingClassifier(estimators=[('XGB', xgb_clf), ('CAT', cat_clf)], voting='soft')
    vo_clf.fit(sampled_X_train, sampled_y_train)
    y_pred_proba = vo_clf.predict_proba(X_val)[:, 1]
    score = roc_auc_score(y_val, y_pred_proba)
    
    scores.append(score)

average_score = np.mean(scores)
print("Average ROC-AUC: {}".format(average_score))


Average ROC-AUC: 0.8964238383372811


최종 모델인 CatBoostClassifier로 Test Data 예측

In [67]:
cat_clf = CatBoostClassifier(n_estimators=200,
                             learning_rate=0.03,
                             depth=8,
                             min_data_in_leaf=14,
                             boosting_type='Plain',
                             bootstrap_type='Bernoulli',
                             verbose=False)
train_data = undersampling(scaled)
X_train = train_data.drop(['Class'], axis=1)
y_train = train_data['Class']
X_test = scaled_test.drop(['Class'], axis=1)
y_test = scaled_test['Class']

cat_clf.fit(X_train, y_train)
y_pred_proba = cat_clf.predict_proba(X_test)[:,1]
score = roc_auc_score(y_test, y_pred_proba)

print(score)

0.8928892763082932
