In [1]:
import pandas as pd

data_path = '../data/porto-seguro-safe-driver-prediction/'

train = pd.read_csv(data_path + 'train.csv', index_col = 'id')
test = pd.read_csv(data_path + 'test.csv', index_col = 'id')
submission = pd.read_csv(data_path + 'sample_submission.csv', index_col= 'id')

In [2]:
all_data = pd.concat([train, test], ignore_index=True)
all_data = all_data.drop('target', axis = 1)

In [3]:
all_features = all_data.columns
all_features

Index(['ps_ind_01', 'ps_ind_02_cat', 'ps_ind_03', 'ps_ind_04_cat',
       'ps_ind_05_cat', 'ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin',
       'ps_ind_09_bin', 'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin',
       'ps_ind_13_bin', 'ps_ind_14', 'ps_ind_15', 'ps_ind_16_bin',
       'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_reg_01', 'ps_reg_02', 'ps_reg_03',
       'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat',
       'ps_car_05_cat', 'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat',
       'ps_car_09_cat', 'ps_car_10_cat', 'ps_car_11_cat', 'ps_car_11',
       'ps_car_12', 'ps_car_13', 'ps_car_14', 'ps_car_15', 'ps_calc_01',
       'ps_calc_02', 'ps_calc_03', 'ps_calc_04', 'ps_calc_05', 'ps_calc_06',
       'ps_calc_07', 'ps_calc_08', 'ps_calc_09', 'ps_calc_10', 'ps_calc_11',
       'ps_calc_12', 'ps_calc_13', 'ps_calc_14', 'ps_calc_15_bin',
       'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin',
       'ps_calc_20_bin'],
      dtype='obj

In [4]:
from sklearn.preprocessing import OneHotEncoder

cat_features = [feature for feature in all_features if 'cat' in feature]

onehot_encoder = OneHotEncoder()

encoded_cat_matrix = onehot_encoder.fit_transform(all_data[cat_features])

encoded_cat_matrix

<1488028x184 sparse matrix of type '<class 'numpy.float64'>'
	with 20832392 stored elements in Compressed Sparse Row format>

In [5]:
drop_features = ['ps_ind_14', 'ps_ind_10_bin', 'ps_ind_11_bin',
                'ps_ind_12_bin','ps_ind_13_bin','ps_car_14']

remaining_features = [feature for feature in all_features
                      if ('cat' not in feature and
                          'calc' not in feature and
                          feature not in drop_features)]



In [9]:
from scipy import sparse
all_data_sprs = sparse.hstack([sparse.csr_matrix(all_data[remaining_features]),
                               encoded_cat_matrix], format='csr')

### 모델링 이전 준비

In [10]:
num_train = len(train)

X = all_data_sprs[:num_train]
X_test = all_data_sprs[num_train:]

y = train['target'].values


In [11]:
import numpy as np

def eval_gini(y_true, y_pred):
    # 실제 값과 예측 값의 크기가 같은지 확인
    assert y_true.shape == y_pred.shape

    n_samples = y_true.shape[0] # 데이터 개수
    L_mid = np.linspace( 1 / n_samples, 1, n_samples) # 대각선

    # 1. 예측값에 대한 지니계수
    pred_order = y_true[y_pred.argsort()]
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    G_pred = np.sum(L_mid - L_pred)

    # 2. 예측이 완벽할 때의 지니계수
    true_order = y_true[y_true.argsort()]
    L_true = np.cumsum(true_order) / np.sum(pred_order)
    G_true = np.sum(L_mid - L_true)

    return G_pred / G_true
    
    

In [12]:
def gini(preds, dtrain):
    labels = dtrain.get_label()
    # RETURN: 평가지표 이름, 평가 점수, 평가 지표가 높을수록 좋은지 여부
    return 'gini', eval_gini(labels, preds), True

### BaseLine: OOF 기반으로 훈련

In [13]:
from sklearn.model_selection import StratifiedKFold

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1991)

In [15]:
params = {'objective': 'binary',
          'learning_rate': 0.01,
          'force_row_wise': True,
          'random_state': 0}

oof_val_preds = np.zeros(X.shape[0])
oof_test_preds = np.zeros(X_test.shape[0])

In [22]:
import lightgbm as lgb

for idx, (train_idx, valid_idx) in enumerate(folds.split(X,y)):
    print("#"*40, f'폴드 {idx + 1} / 폴드 {folds.n_splits}', "#"*40)

    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]

    dtrain = lgb.Dataset(X_train, y_train)
    dvalid = lgb.Dataset(X_valid, y_valid)

    lgb_model = lgb.train(
        params=params,
        train_set=dtrain,
        num_boost_round=1000,
        valid_sets=dvalid,
        feval=gini,
        callbacks=[lgb.early_stopping(stopping_rounds=100)],
    )

    oof_test_preds += lgb_model.predict(X_test) / folds.n_splits
    oof_val_preds[valid_idx] += lgb_model.predict(X_valid)

    gini_score = eval_gini(y_valid, oof_test_preds[valid_idx])
    print(f'폴드 {idx+1}, 지니계수 {gini_score}\n')



######################################## 폴드 1 / 폴드 5 ########################################
[LightGBM] [Info] Number of positive: 17355, number of negative: 458814
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036447 -> initscore=-3.274764
[LightGBM] [Info] Start training from score -3.274764
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[681]	valid_0's binary_logloss: 0.151659	valid_0's gini: 0.289034
폴드 1, 지니계수 0.007445387011907078

######################################## 폴드 2 / 폴드 5 ########################################
[LightGBM] [Info] Number of positive: 17355, number of negative: 458814
[LightGBM] [Info] Total Bins 1093
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036447 -> initscore=-3.274764


In [23]:
submission['target'] = oof_test_preds
submission.to_csv(data_path + 'submission.csv')

### 개선1. 최적화 및 피쳐 엔지니어링

#### 피쳐 엔지니어링

In [24]:
import pandas as pd

data_path = '../data/porto-seguro-safe-driver-prediction/'

train = pd.read_csv(data_path + 'train.csv', index_col = 'id')
test = pd.read_csv(data_path + 'test.csv', index_col = 'id')
submission = pd.read_csv(data_path + 'sample_submission.csv', index_col= 'id')

In [25]:
all_data = pd.concat([train, test], ignore_index=True)
all_data = all_data.drop('target', axis = 1)

In [26]:
from sklearn.preprocessing import OneHotEncoder

cat_features = [feature for feature in all_features if 'cat' in feature]

onehot_encoder = OneHotEncoder()
encoded_cat_matrix = onehot_encoder.fit_transform(all_data[cat_features])

encoded_cat_matrix

<1488028x184 sparse matrix of type '<class 'numpy.float64'>'
	with 20832392 stored elements in Compressed Sparse Row format>

In [28]:
# 피쳐엔지니어링

## IDEA 1. 데이터 하나당 결측값 개수를 파생피쳐로 사용한다면?
all_data['num_missing'] = (all_data == -1).sum(axis=1) # 여기서 -1이 결측값임에 주의
all_data['num_missing']

0          1
1          2
2          3
3          0
4          2
          ..
1488023    1
1488024    1
1488025    2
1488026    1
1488027    0
Name: num_missing, Length: 1488028, dtype: int64

In [29]:
remaining_features = [feature for feature in all_features
                      if ('cat' not in feature and 'calc' not in feature)]

remaining_features.append('num_missing') # num_missing도 새로운 피쳐로 추가

In [30]:
## IDEA 2. mix_ind

ind_features = [feature for feature in all_features if 'ind' in feature]

is_first_feature = True
for ind_feature in ind_features:
    if is_first_feature:
        all_data['mix_ind'] = all_data[ind_feature].astype(str) + '_'
        is_first_feature = False
    else:
        all_data['mix_ind'] += all_data[ind_feature].astype(str) + '_'

In [32]:
all_data['mix_ind']

0          2_2_5_1_0_0_1_0_0_0_0_0_0_0_11_0_1_0_
1           1_1_7_0_0_0_0_1_0_0_0_0_0_0_3_0_0_1_
2          5_4_9_1_0_0_0_1_0_0_0_0_0_0_12_1_0_0_
3           0_1_2_0_0_1_0_0_0_0_0_0_0_0_8_1_0_0_
4           0_2_0_1_0_1_0_0_0_0_0_0_0_0_9_1_0_0_
                           ...                  
1488023     0_1_6_0_0_0_1_0_0_0_0_0_0_0_2_0_0_1_
1488024    5_3_5_1_0_0_0_1_0_0_0_0_0_0_11_1_0_0_
1488025     0_1_5_0_0_1_0_0_0_0_0_0_0_0_5_0_0_1_
1488026    6_1_5_1_0_0_0_0_1_0_0_0_0_0_13_1_0_0_
1488027    7_1_4_1_0_0_0_0_1_0_0_0_0_0_12_1_0_0_
Name: mix_ind, Length: 1488028, dtype: object

In [33]:
all_data['ps_ind_02_cat'].value_counts().to_dict()

{1: 1079327, 2: 309747, 3: 70172, 4: 28259, -1: 523}

In [38]:
cat_count_features = []

for feature in cat_features+['mix_ind']:
    val_counts_dict = all_data[feature].value_counts().to_dict()
    all_data[f'{feature}_count'] = all_data[feature].apply(lambda x: val_counts_dict[x])

    cat_count_features.append(f'{feature}_count')
all_data['ps_ind_02_cat_count']

0           309747
1          1079327
2            28259
3          1079327
4           309747
            ...   
1488023    1079327
1488024      70172
1488025    1079327
1488026    1079327
1488027    1079327
Name: ps_ind_02_cat_count, Length: 1488028, dtype: int64

In [58]:
from scipy import sparse

drop_features = ['ps_ind_14', 'ps_ind_10_bin', 'ps_ind_11_bin',
                 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_car_14']

all_data_remaining = all_data[remaining_features+cat_count_features].drop(drop_features, axis=1)

all_data_sprs = sparse.hstack([sparse.csr_matrix(all_data_remaining),
                               encoded_cat_matrix], format='csr')

#### 하이퍼 파라미터 최적화

In [59]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      test_size=0.2,
                                                      random_state=0)

bayes_dtrain = lgb.Dataset(X_train, y_train)
bayes_dvalid = lgb.Dataset(X_valid, y_valid)

In [60]:
param_bounds = {'num_leaves': (30,40),
                'lambda_l1': (0.7, 0.9),
                'lambda_l2': (0.9, 1),
                'feature_fraction': (0.6, 0.7),
                'bagging_fraction': (0.6, 0.9),
                'min_child_samples': (6, 10),
                'min_child_weight': (10, 40)}

fixed_params = {'objective': 'binary',
                'learning_rate': 0.005,
                'bagging_freq': 1,
                'force_row_wise': True,
                'random_state': 1991}

In [61]:
def eval_function(num_leaves, lambda_l1, lambda_l2, feature_fraction,
                  bagging_fraction, min_child_samples, min_child_weight):
    '''
    최적화하려는 지니계수를 계산하는 함수
    '''
    params = {'num_leaves': int(round(num_leaves)),
                'lambda_l1': lambda_l1,
                'lambda_l2': lambda_l2,
                'feature_fraction': feature_fraction,
                'bagging_fraction': bagging_fraction,
                'min_child_samples': int(round(min_child_samples)),
                'min_child_weight': min_child_weight,
                'feature_pre_filter': False}
    
    params.update(fixed_params)
    print('하이퍼파라미터:', params)

    lgb_model = lgb.train(params=params,
                          train_set=bayes_dtrain,
                          num_boost_round=2500,
                          valid_sets=bayes_dvalid,
                          feval=gini,
                          callbacks=[lgb.early_stopping(stopping_rounds=100)])
    preds = lgb_model.predict(X_valid)

    gini_score = eval_gini(y_valid, preds)

    print(f'지니계수: {gini_score}\n')

    return gini_score

    

In [62]:
!pip install bayesian-optimization



In [63]:
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(f=eval_function,
                                 pbounds = param_bounds,
                                 random_state=0)

In [64]:
optimizer.maximize(init_points=3, n_iter=6)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | min_ch... | min_ch... | num_le... |
-------------------------------------------------------------------------------------------------------------
하이퍼파라미터: {'num_leaves': 34, 'lambda_l1': 0.8205526752143287, 'lambda_l2': 0.9544883182996897, 'feature_fraction': 0.6715189366372419, 'bagging_fraction': 0.7646440511781974, 'min_child_samples': 8, 'min_child_weight': 29.376823391999682, 'feature_pre_filter': False, 'objective': 'binary', 'learning_rate': 0.005, 'bagging_freq': 1, 'force_row_wise': True, 'random_state': 1991}
[LightGBM] [Info] Number of positive: 17383, number of negative: 458786
[LightGBM] [Info] Total Bins 1098
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 201
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036506 -> initscore=-3.273091
[LightGBM] [Info] Start training from score -3.273091
Training until validation scores don't improve for 100 rounds
Earl

In [65]:
max_params = optimizer.max['params']
max_params['num_leaves'] = int(round(max_params['num_leaves']))
max_params['min_child_samples'] = int(round(max_params['min_child_samples']))

In [66]:
max_params.update(fixed_params)
max_params

{'bagging_fraction': 0.6213108174593661,
 'feature_fraction': 0.608712929970154,
 'lambda_l1': 0.7040436794880651,
 'lambda_l2': 0.9832619845547939,
 'min_child_samples': 9,
 'min_child_weight': 36.10036444740457,
 'num_leaves': 40,
 'objective': 'binary',
 'learning_rate': 0.005,
 'bagging_freq': 1,
 'force_row_wise': True,
 'random_state': 1991}

In [67]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1991)

oof_val_preds = np.zeros(X.shape[0])
oof_test_preds = np.zeros(X_test.shape[0])

for idx, (train_idx, valid_idx) in enumerate(folds.split(X,y)):
    print("#"*40, f'폴드 {idx + 1} / 폴드 {folds.n_splits}', "#"*40)

    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]

    dtrain = lgb.Dataset(X_train, y_train)
    dvalid = lgb.Dataset(X_valid, y_valid)

    lgb_model = lgb.train(
        params=max_params,
        train_set=dtrain,
        num_boost_round=1000,
        valid_sets=dvalid,
        feval=gini,
        callbacks=[lgb.early_stopping(stopping_rounds=100)],
    )

    oof_test_preds += lgb_model.predict(X_test) / folds.n_splits
    oof_val_preds[valid_idx] += lgb_model.predict(X_valid)

    gini_score = eval_gini(y_valid, oof_test_preds[valid_idx])
    print(f'폴드 {idx+1}, 지니계수 {gini_score}\n')



######################################## 폴드 1 / 폴드 5 ########################################
[LightGBM] [Info] Number of positive: 17355, number of negative: 458814
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036447 -> initscore=-3.274764
[LightGBM] [Info] Start training from score -3.274764
Training until validation scores don't improve for 100 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's binary_logloss: 0.15148	valid_0's gini: 0.29614
폴드 1, 지니계수 0.00784703807985343

######################################## 폴드 2 / 폴드 5 ########################################
[LightGBM] [Info] Number of positive: 17355, number of negative: 458814
[LightGBM] [Info] Total Bins 1104
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 200
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036447 -> initscore

In [68]:
submission['target'] = oof_test_preds
submission.to_csv(data_path + 'submission.csv')

### 성능 개선 2. XGBoost

In [69]:
def gini(preds, dtrain):
    labels = dtrain.get_label()
    return 'gini', eval_gini(labels, preds)

In [70]:
import xgboost as xgb
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      test_size=0.2,
                                                      random_state=0)

bayes_dtrain = xgb.DMatrix(X_train ,y_train)
bayes_dvalid = xgb.DMatrix(X_valid, y_valid)

In [71]:
param_bounds = {'max_depth': (4, 8),
                'subsample': (0.6, 0.9),
                'colsample_bytree': (0.7, 1.0),
                'min_child_weight': (10, 40),
                'gamma': (8, 11),
                'reg_alpha': (7, 9),
                'reg_lambda': (1.1, 1.5),
                'scale_pos_weight': (1.4, 1.6)
                }

fixed_params = {'objective': 'binary:logistic',
                'learning_rate': 0.02,
                'random_state': 1991}

In [82]:
def eval_function(max_depth, subsample, colsample_bytree, min_child_weight,
                  reg_alpha, gamma, reg_lambda, scale_pos_weight):
    '''
    최적화하려는 지니계수를 계산하는 함수
    '''
    params = {'max_depth': int(round(max_depth)),
              'subsample': subsample,
              'colsample_bytree': colsample_bytree,
              'min_child_weight': min_child_weight,
              'gamma': gamma,
              'reg_alpha': reg_alpha,
              'reg_lambda': reg_lambda,
              'scale_pos_weight': scale_pos_weight
             }
    
    params.update(fixed_params)
    print('하이퍼파라미터:', params)

    xgb_model = xgb.train(params=params,
                          dtrain=bayes_dtrain,
                          num_boost_round=2000,
                          evals=[(bayes_dvalid, 'bayes_dvalid')],
                          maximize=True,
                          feval=gini,
                          early_stopping_rounds=100,
                          verbose_eval=False)
    
    best_iter = xgb_model.best_iteration
    preds = xgb_model.predict(bayes_dvalid, iteration_range=(0, best_iter))
    gini_score = eval_gini(y_valid, preds)

    print(f'지니계수: {gini_score}\n')

    return gini_score

In [83]:
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(f=eval_function,
                                 pbounds = param_bounds,
                                 random_state=0)

In [84]:
optimizer.maximize(init_points=3, n_iter=6)

|   iter    |  target   | colsam... |   gamma   | max_depth | min_ch... | reg_alpha | reg_la... | scale_... | subsample |
-------------------------------------------------------------------------------------------------------------------------
하이퍼파라미터: {'max_depth': 6, 'subsample': 0.8675319002346239, 'colsample_bytree': 0.8646440511781974, 'min_child_weight': 26.346495489906907, 'gamma': 10.14556809911726, 'reg_alpha': 7.84730959867781, 'reg_lambda': 1.3583576452266626, 'scale_pos_weight': 1.4875174422525386, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1991}




지니계수: 0.2750366949976141

| [39m1        [39m | [39m0.275    [39m | [39m0.8646   [39m | [39m10.15    [39m | [39m6.411    [39m | [39m26.35    [39m | [39m7.847    [39m | [39m1.358    [39m | [39m1.488    [39m | [39m0.8675   [39m |
하이퍼파라미터: {'max_depth': 7, 'subsample': 0.6261387899104622, 'colsample_bytree': 0.9890988281503088, 'min_child_weight': 25.866847592587135, 'gamma': 9.150324556477333, 'reg_alpha': 8.136089122187865, 'reg_lambda': 1.4702386553170643, 'scale_pos_weight': 1.4142072116395774, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1991}
지니계수: 0.2778881193365795

| [35m2        [39m | [35m0.2779   [39m | [35m0.9891   [39m | [35m9.15     [39m | [35m7.167    [39m | [35m25.87    [39m | [35m8.136    [39m | [35m1.47     [39m | [35m1.414    [39m | [35m0.6261   [39m |
하이퍼파라미터: {'max_depth': 7, 'subsample': 0.8341587528859367, 'colsample_bytree': 0.7060655192320977, 'min_child_weight': 36.10036444740457, 'gamma': 10.4978



지니계수: 0.27528800833025396

| [39m4        [39m | [39m0.2753   [39m | [39m0.9808   [39m | [39m10.97    [39m | [39m6.231    [39m | [39m27.83    [39m | [39m7.906    [39m | [39m1.34     [39m | [39m1.454    [39m | [39m0.8883   [39m |
하이퍼파라미터: {'max_depth': 6, 'subsample': 0.6777569933758918, 'colsample_bytree': 0.7031729260762098, 'min_child_weight': 23.360582308827137, 'gamma': 9.751364317718942, 'reg_alpha': 7.163192458885216, 'reg_lambda': 1.4257319147018181, 'scale_pos_weight': 1.4319324527229846, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1991}




지니계수: 0.2763529814820517

| [39m5        [39m | [39m0.2764   [39m | [39m0.7032   [39m | [39m9.751    [39m | [39m5.878    [39m | [39m23.36    [39m | [39m7.163    [39m | [39m1.426    [39m | [39m1.432    [39m | [39m0.6778   [39m |
하이퍼파라미터: {'max_depth': 7, 'subsample': 0.787087261792321, 'colsample_bytree': 0.8135648655592653, 'min_child_weight': 12.787024427483665, 'gamma': 9.11855921878361, 'reg_alpha': 8.114990497685058, 'reg_lambda': 1.4561707979191623, 'scale_pos_weight': 1.4876134907518463, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1991}




지니계수: 0.2774784791300434

| [39m6        [39m | [39m0.2775   [39m | [39m0.8136   [39m | [39m9.119    [39m | [39m6.866    [39m | [39m12.79    [39m | [39m8.115    [39m | [39m1.456    [39m | [39m1.488    [39m | [39m0.7871   [39m |
하이퍼파라미터: {'max_depth': 4, 'subsample': 0.7914002574584101, 'colsample_bytree': 0.8199566286759267, 'min_child_weight': 33.18226177889865, 'gamma': 10.954712589154026, 'reg_alpha': 8.383576538871093, 'reg_lambda': 1.2622476362983295, 'scale_pos_weight': 1.4446047674967932, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1991}




지니계수: 0.2739156655196047

| [39m7        [39m | [39m0.2739   [39m | [39m0.82     [39m | [39m10.95    [39m | [39m4.068    [39m | [39m33.18    [39m | [39m8.384    [39m | [39m1.262    [39m | [39m1.445    [39m | [39m0.7914   [39m |
하이퍼파라미터: {'max_depth': 7, 'subsample': 0.6377641552550046, 'colsample_bytree': 0.9998113981912331, 'min_child_weight': 25.878381343628586, 'gamma': 9.161858307518786, 'reg_alpha': 8.147589308643452, 'reg_lambda': 1.4866040460102463, 'scale_pos_weight': 1.425868266770488, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1991}




지니계수: 0.27884404083566094

| [35m8        [39m | [35m0.2788   [39m | [35m0.9998   [39m | [35m9.162    [39m | [35m7.178    [39m | [35m25.88    [39m | [35m8.148    [39m | [35m1.487    [39m | [35m1.426    [39m | [35m0.6378   [39m |
하이퍼파라미터: {'max_depth': 7, 'subsample': 0.6656131905598175, 'colsample_bytree': 0.9744808824787814, 'min_child_weight': 25.90601155804709, 'gamma': 9.189488687088284, 'reg_alpha': 8.175138838733645, 'reg_lambda': 1.4901615422175771, 'scale_pos_weight': 1.4538022059092626, 'objective': 'binary:logistic', 'learning_rate': 0.02, 'random_state': 1991}




지니계수: 0.27857799669465416

| [39m9        [39m | [39m0.2786   [39m | [39m0.9745   [39m | [39m9.189    [39m | [39m7.206    [39m | [39m25.91    [39m | [39m8.175    [39m | [39m1.49     [39m | [39m1.454    [39m | [39m0.6656   [39m |


In [87]:
max_params = optimizer.max['params']
max_params['max_depth'] = int(round(max_params['max_depth']))
max_params.update(fixed_params)
max_params

{'colsample_bytree': 0.9998113981912331,
 'gamma': 9.161858307518786,
 'max_depth': 7,
 'min_child_weight': 25.878381343628586,
 'reg_alpha': 8.147589308643452,
 'reg_lambda': 1.4866040460102463,
 'scale_pos_weight': 1.425868266770488,
 'subsample': 0.6377641552550046,
 'objective': 'binary:logistic',
 'learning_rate': 0.02,
 'random_state': 1991}

In [90]:
from sklearn.model_selection import StratifiedKFold

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1991)

oof_val_preds = np.zeros(X.shape[0])
oof_test_preds = np.zeros(X_test.shape[0])

for idx, (train_idx, valid_idx) in enumerate(folds.split(X,y)):
    print("#"*40, f'폴드 {idx + 1} / 폴드 {folds.n_splits}', "#"*40)

    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]

    dtrain = xgb.DMatrix(X_train, y_train)
    dvalid = xgb.DMatrix(X_valid, y_valid)
    dtest = xgb.DMatrix(X_test)

    xgb_model = xgb.train(
        params=max_params,
        dtrain=dtrain,
        num_boost_round=2000,
        evals=[(dvalid, 'dvalid')],
        maximize=True,
        feval=gini,
        early_stopping_rounds=200,
        verbose_eval=100
    )

    best_iter = xgb_model.best_iteration

    oof_test_preds += xgb_model.predict(dtest, iteration_range=(0, best_iter)) / folds.n_splits
    oof_val_preds[valid_idx] += xgb_model.predict(dvalid, iteration_range=(0, best_iter))

    gini_score = eval_gini(y_valid, oof_test_preds[valid_idx])
    print(f'폴드 {idx+1}, 지니계수 {gini_score}\n')



######################################## 폴드 1 / 폴드 5 ########################################
[0]	dvalid-logloss:0.21747	dvalid-gini:0.20621




[100]	dvalid-logloss:0.16013	dvalid-gini:0.27063
[200]	dvalid-logloss:0.15493	dvalid-gini:0.28532
[300]	dvalid-logloss:0.15421	dvalid-gini:0.29030
[400]	dvalid-logloss:0.15408	dvalid-gini:0.29198
[500]	dvalid-logloss:0.15404	dvalid-gini:0.29270
[600]	dvalid-logloss:0.15401	dvalid-gini:0.29304
[700]	dvalid-logloss:0.15400	dvalid-gini:0.29354
[800]	dvalid-logloss:0.15399	dvalid-gini:0.29379
[900]	dvalid-logloss:0.15401	dvalid-gini:0.29395
[1000]	dvalid-logloss:0.15398	dvalid-gini:0.29405
[1100]	dvalid-logloss:0.15398	dvalid-gini:0.29414
[1200]	dvalid-logloss:0.15398	dvalid-gini:0.29424
[1300]	dvalid-logloss:0.15398	dvalid-gini:0.29438
[1400]	dvalid-logloss:0.15396	dvalid-gini:0.29447
[1500]	dvalid-logloss:0.15396	dvalid-gini:0.29453
[1600]	dvalid-logloss:0.15396	dvalid-gini:0.29466
[1700]	dvalid-logloss:0.15396	dvalid-gini:0.29482
[1800]	dvalid-logloss:0.15395	dvalid-gini:0.29490
[1900]	dvalid-logloss:0.15394	dvalid-gini:0.29489
[1999]	dvalid-logloss:0.15395	dvalid-gini:0.29497
폴드 1, 지니계

In [91]:
submission['target'] = oof_test_preds
submission.to_csv(data_path + 'submission.csv')

### 성능 개선 3. 앙상블

In [None]:
oof_test_preds = oof_test_preds_lgb * 0.5 + oof_test_preds_xgb * 0.5

submission['target'] = oof_test_preds
submission.to_csv(data_path + 'submission.csv')