In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
#!pip install lightgbm
import lightgbm as lgb
import calendar

# 100 개 열 보이기
pd.set_option('display.max_row', 100)
pd.set_option('display.max_columns', 100)

from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import  RFECV

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load Data
X_train = pd.read_csv("X_train.csv")
X_test = pd.read_csv("X_test.csv")

## Existing Column transform

In [3]:
# Log transform for distance
#---------------------------------------------------------------------------------------------------

log_transform = [ '결제장소와주소정보1사이의거리', '결제장소와주소정보2사이의거리']

for col in log_transform:
    X_train[col] = X_train[col].apply(lambda x: np.log1p(x))
    X_test[col] = X_test[col].apply(lambda x: np.log1p(x))

In [4]:
# P_info consistency
#---------------------------------------------------------------------------------------------------
# T:+1, F:-2, Nan:0 Encoding
p_info_ag_cols = []
for col in X_train.columns:
    if '개인정보일치여부' in col:
        p_info_ag_cols.append(col)
    else:
        pass
p_info_ag_cols.remove('개인정보일치여부_4')


for col in p_info_ag_cols:
    X_train[col].fillna(-1, inplace=True)
    X_test[col].fillna(-1, inplace=True)

for col in p_info_ag_cols:
    X_train[col] = X_train[col].apply(lambda x: 1 if x == 'T' else (-3 if x == 'F' else -1))
    X_test[col] = X_test[col].apply(lambda x: 1 if x == 'T' else (-3 if x == 'F' else -1))

X_train['agree_score'] = X_train[p_info_ag_cols].sum(axis=1)
X_test['agree_score'] = X_test[p_info_ag_cols].sum(axis=1)

for col in p_info_ag_cols:
    X_train[col] = X_train[col].astype('category')
    X_test[col] = X_test[col].astype('category')

In [5]:
# NA ratio over th -> 0,1,2 Encoding
#---------------------------------------------------------------------------------------------------
# 0.3: 55cols, 0.4: 55cols / 0.45: 54 / 0.47: 51 cols / 0.48: 48 cols
intg_X = pd.concat([X_train.iloc[:,1:], X_test.iloc[:,1:]], axis=0)

over_na = []

for col in intg_X.columns:
    na_ratio = intg_X[col].isnull().sum() / len(intg_X[col])
    if na_ratio > 0.48:
        over_na.append(col)

for col in over_na:
    if intg_X[col].dtype != 'object':
        # na: 0, small: 1, big: 2 for categorical data
        train_mean = intg_X[col].mean()
        test_mean = intg_X[col].mean()
        X_train[col] = X_train[col].apply(lambda x: 0 if (pd.isnull(x)) else (1 if x < train_mean else 2))
        X_test[col] = X_test[col].apply(lambda x: 0 if (pd.isnull(x)) else (1 if x < test_mean else 2))
    else:
        # na: 0, else: 1
        X_train[col] = X_train[col].apply(lambda x: 0 if (pd.isnull(x)) else 1)
        X_test[col] = X_test[col].apply(lambda x: 0 if (pd.isnull(x)) else 1)

len(over_na)

44

## New column Add

In [6]:
# Transform date, day AND drop original date column
#---------------------------------------------------------------------------------------------------
def transform_day(dt):
    days = ['mon', 'tue', 'wed', 'thur', 'fri', "sat", 'sun']
    date = str(dt).split(" ")[0]
    yyyy = int(date.split('-')[0])
    mm = int(date.split('-')[1])
    dd = int(date.split('-')[2])
    day = days[calendar.weekday(yyyy, mm, dd)]
    return day

def transform_hour(dt):
    time = str(dt).split(' ')[1]
    hour = int(time.split(':')[0])
    return hour

def transform_month(dt):
    date = str(dt).split(" ")[0]
    mm = int(date.split('-')[1])
    return mm
    
X_train["day"] = X_train["날짜"].apply(transform_day)
X_test['day'] = X_test['날짜'].apply(transform_day)

X_train["month"] = X_train["날짜"].apply(transform_month)
X_test['month'] = X_test['날짜'].apply(transform_month)

X_train["hour"] = X_train["날짜"].apply(transform_hour)
X_test['hour'] = X_test['날짜'].apply(transform_hour)

X_train = X_train.drop(['날짜'], axis = 1)
X_test = X_test.drop(['날짜'], axis = 1)

In [7]:
X_train = pd.get_dummies(X_train, columns=['day'], drop_first= True)
X_test = pd.get_dummies(X_test, columns=['day'], drop_first= True)

In [8]:
# Payment information NA count
#---------------------------------------------------------------------------------------------------

pay_info_count_cols = []
for col in X_train.columns:
    if '결제정보' in col:
        pay_info_count_cols.append(col)
    else:
        pass

X_train['결제정보_na_count'] = X_train[pay_info_count_cols].isnull().sum(1)
X_test['결제정보_na_count'] = X_test[pay_info_count_cols].isnull().sum(1)

In [9]:
# Event gap day
#---------------------------------------------------------------------------------------------------

event_gap_cols = []
for col in X_train.columns:
    if '직전결제일간격혹은이벤트' in col:
        event_gap_cols.append(col)
    else:
        pass

for col in event_gap_cols:
    X_train[col].fillna(0, inplace=True)
    X_test[col].fillna(0, inplace=True)

X_train['avg_event_gap'] = X_train[event_gap_cols].mean(axis=1) 
X_test['avg_event_gap'] = X_test[event_gap_cols].mean(axis=1) 

In [10]:
# Change T/F
#---------------------------------------------------------------------------------------------------

X_train["잔돈"] = X_train["결제금액"].apply(lambda x : x - x//100*100)
X_train["잔돈여부"] = X_train["결제금액"].apply(lambda x : x == x//100*100)
X_test["잔돈"] = X_test["결제금액"].apply(lambda x : x - x//100*100)
X_test["잔돈여부"] = X_test["결제금액"].apply(lambda x : x == x//100*100)

In [11]:
# Label Weight
#---------------------------------------------------------------------------------------------------

cols = ['주소정보1','주소정보2','카드정보_1','카드정보_2',"카드정보_3","카드정보_4",
        '카드소유주주소',"카드사용자주소", "결제정보_7", "결제정보_20",
        "개인정보갯수_1", "개인정보갯수_2"]
        
# '결제정보_7', '결제정보_20': kdeplot 에서 fraud에 따라 차이가 많이나서 추가함
for col in cols :
    sum_df = pd.concat([X_train[col],X_test[col]])
    count = sum_df.value_counts(dropna = True, normalize = True).to_dict()
    count[-1] = -1
    
    new_col = col+"의분포"
    
    X_train[new_col] = X_train[col].map(count)
    X_test[new_col] = X_test[col].map(count)
    
    # print(f"{new_col} 생성완료")

In [14]:
# Fill NA value
#---------------------------------------------------------------------------------------------------

X_train = X_train.fillna(-1)
X_test = X_test.fillna(-1)

In [15]:
# Label Encoding
#---------------------------------------------------------------------------------------------------

for col in (X_train.columns):
    if X_train[col].dtype == 'object':
        enc = LabelEncoder()
        enc.fit(list(X_train[col].values) + list(X_test[col].values))
        X_train[col] = enc.transform(list(X_train[col].values))
        X_test[col] = enc.transform(list(X_test[col].values))

## Bayesian Optimization

In [17]:
from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score
# from sklearn.model_selection import KFold as kf
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from sklearn.metrics import roc_auc_score
import xgboost as xgb
from sklearn.model_selection import cross_val_score

In [18]:
# BAYESAIN OPT
def lgbm_eval(feature_fraction,bagging_fraction, output = 'score'):
    trn_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val)
    params = {
        "boosting_type": "gbdt",
        "learning_rate":0.05,
        'objective': 'binary',
        "metric": 'auc',
        "verbosity": -1,
        'random_state': 102,
        # 'device':"gpu",
        'max_depth':12,
        'feature_fraction':feature_fraction,
        'bagging_fraction':bagging_fraction
        }

    lgbm_model = lgb.train(params, trn_data,
                                num_boost_round=15000,
                                valid_sets=[trn_data, val_data],
                                early_stopping_rounds=300,
                                verbose_eval=1000, 
                                # num_boost_round=18000
                                )

    # clf = lgb.train(params, trn_data, num_boost_round=1000,valid_sets =[trn_data, val_data],verbose_eval=1000,early_stopping_rounds=50)

    best_iter = lgbm_model.best_iteration
    print(best_iter)
    lgbm_model = lgb.LGBMClassifier(**params, num_boost_round=best_iter)
    lgbm_model.fit(X_train, y_train)
    pred = lgbm_model.predict_proba(X_val)[:, 1]
    roc_score = roc_auc_score(y_val, pred)
    return roc_score

# 주어진 범위 사이에서 적절한 값을 찾는다.
pbounds = {'feature_fraction': (0.70, 0.9),
           'bagging_fraction': (0.6, 0.9),
          }

lgbmBO = BayesianOptimization(f = lgbm_eval, pbounds = pbounds, verbose = 2, random_state = 1)
# 메소드를 이용해 최대화!
lgbmBO.maximize(init_points=10, n_iter = 10, acq='ei', xi=0.01)

lgbmBO.max # 찾은 파라미터 값 확인

## Light GBM model

In [23]:
params = {
          'learning_rate': 0.009,  # original: 0.005
          "boosting_type": "gbdt",
          'objective': 'binary',
          "metric": 'auc',
          "verbosity": -1,
          'random_state': 1040,
          'feature_fraction': 0.7297,
          'bagging_fraction': 0.7704
         }

In [24]:
trn_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val)
clf = lgb.train(params, trn_data, 18000, valid_sets = [trn_data, val_data], verbose_eval=1500, early_stopping_rounds=500) # 10000, 1000 500

Training until validation scores don't improve for 500 rounds
[1500]	training's auc: 0.95868	valid_1's auc: 0.94551
[3000]	training's auc: 0.974994	valid_1's auc: 0.955663
[4500]	training's auc: 0.984117	valid_1's auc: 0.961655
[6000]	training's auc: 0.989197	valid_1's auc: 0.964979
[7500]	training's auc: 0.992607	valid_1's auc: 0.967419
[9000]	training's auc: 0.994884	valid_1's auc: 0.969178
[10500]	training's auc: 0.99639	valid_1's auc: 0.970408
[12000]	training's auc: 0.997693	valid_1's auc: 0.971491
[13500]	training's auc: 0.998402	valid_1's auc: 0.971918
[15000]	training's auc: 0.998935	valid_1's auc: 0.972514
[16500]	training's auc: 0.99924	valid_1's auc: 0.972771
Early stopping, best iteration is:
[17182]	training's auc: 0.999354	valid_1's auc: 0.972825


In [25]:
best_iter = clf.best_iteration

In [26]:
clf = lgb.LGBMClassifier(**params, num_boost_round=best_iter)
clf.fit(X, y)

LGBMClassifier(bagging_fraction=0.7704, feature_fraction=0.7297,
               learning_rate=0.009, metric='auc', num_boost_round=17182,
               objective='binary', random_state=1040, verbosity=-1)

In [27]:
sub = pd.read_csv("sample_submission.csv")
# sub['Fraud'] = clf.predict_proba(X_test.iloc[:,1:])[:, 1]
sub['Fraud'] = clf.predict_proba(X_test.iloc[:,1:])[:, 1]
sub.to_csv('result_1210_02.csv', index=False)