In [1]:
import pandas as pd

data_path = '/kaggle/input/porto-seguro-safe-driver-prediction/'

train = pd.read_csv(data_path + 'train.csv', index_col='id')
test = pd.read_csv(data_path + 'test.csv', index_col='id')
submission = pd.read_csv(data_path + 'sample_submission.csv', index_col='id')

In [2]:
all_data = pd.concat([train, test], ignore_index=True)
all_data = all_data.drop('target', axis=1) 

all_features = all_data.columns 

In [3]:
from sklearn.preprocessing import OneHotEncoder

cat_features = [feature for feature in all_features if 'cat' in feature] 

onehot_encoder = OneHotEncoder()
encoded_cat_matrix = onehot_encoder.fit_transform(all_data[cat_features]) 

In [4]:
all_data['num_missing'] = (all_data==-1).sum(axis=1)

In [5]:
remaining_features = [feature for feature in all_features 
                      if ('cat' not in feature and 'calc' not in feature)] 

remaining_features.append('num_missing')

In [6]:
ind_features = [feature for feature in all_features if 'ind' in feature]

is_first_feature = True
for ind_feature in ind_features:
    if is_first_feature:
        all_data['mix_ind'] = all_data[ind_feature].astype(str) + '_'
        is_first_feature = False
    else:
        all_data['mix_ind'] += all_data[ind_feature].astype(str) + '_'

In [7]:
cat_count_features = []
for feature in cat_features+['mix_ind']:
    val_counts_dict = all_data[feature].value_counts().to_dict()
    all_data[f'{feature}_count'] = all_data[feature].apply(lambda x: 
                                                           val_counts_dict[x])
    cat_count_features.append(f'{feature}_count')

In [8]:
cat_count_features

['ps_ind_02_cat_count',
 'ps_ind_04_cat_count',
 'ps_ind_05_cat_count',
 'ps_car_01_cat_count',
 'ps_car_02_cat_count',
 'ps_car_03_cat_count',
 'ps_car_04_cat_count',
 'ps_car_05_cat_count',
 'ps_car_06_cat_count',
 'ps_car_07_cat_count',
 'ps_car_08_cat_count',
 'ps_car_09_cat_count',
 'ps_car_10_cat_count',
 'ps_car_11_cat_count',
 'mix_ind_count']

In [9]:
from scipy import sparse

drop_features = ['ps_ind_14', 'ps_ind_10_bin', 'ps_ind_11_bin', 
                 'ps_ind_12_bin', 'ps_ind_13_bin', 'ps_car_14']

all_data_remaining = all_data[remaining_features+cat_count_features].drop(drop_features, axis=1)

all_data_sprs = sparse.hstack([sparse.csr_matrix(all_data_remaining),
                               encoded_cat_matrix],
                              format='csr')

In [10]:
num_train = len(train) 

X = all_data_sprs[:num_train]
X_test = all_data_sprs[num_train:]

y = train['target'].values

In [11]:
import numpy as np

def eval_gini(y_true, y_pred):

    assert y_true.shape == y_pred.shape

    n_samples = y_true.shape[0]                      
    L_mid = np.linspace(1 / n_samples, 1, n_samples) 

    pred_order = y_true[y_pred.argsort()] 
    L_pred = np.cumsum(pred_order) / np.sum(pred_order) 
    G_pred = np.sum(L_mid - L_pred)       

    true_order = y_true[y_true.argsort()] 
    L_true = np.cumsum(true_order) / np.sum(true_order) 
    G_true = np.sum(L_mid - L_true)       

    return G_pred / G_true

In [12]:
def gini_lgb(preds, dtrain):
    labels = dtrain.get_label()
    return 'gini', eval_gini(labels, preds), True

In [13]:
def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    return 'gini', eval_gini(labels, preds)

In [14]:
from sklearn.model_selection import StratifiedKFold

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1991)

In [15]:
max_params_lgb = {
    'bagging_fraction': 0.6213108174593661,
    'feature_fraction': 0.608712929970154,
    'lambda_l1': 0.7040436794880651,
    'lambda_l2': 0.9832619845547939,
    'min_child_samples': 9,
    'min_child_weight': 36.10036444740457,
    'num_leaves': 40,
    'objective': 'binary',
    'learning_rate': 0.005,
    'bagging_freq': 1,
    'force_row_wise': True,
    'random_state': 1991
}

In [16]:
import lightgbm as lgb

oof_val_preds_lgb = np.zeros(X.shape[0]) 

oof_test_preds_lgb = np.zeros(X_test.shape[0]) 

for idx, (train_idx, valid_idx) in enumerate(folds.split(X, y)):

    print('#'*40, f'폴드 {idx+1} / 폴드 {folds.n_splits}', '#'*40)
    
    X_train, y_train = X[train_idx], y[train_idx] 
    X_valid, y_valid = X[valid_idx], y[valid_idx] 

    dtrain = lgb.Dataset(X_train, y_train) 
    dvalid = lgb.Dataset(X_valid, y_valid) 
                          
    lgb_model = lgb.train(params=max_params_lgb,     
                          train_set=dtrain,          
                          num_boost_round=2500,      
                          valid_sets=dvalid,         
                          feval=gini_lgb,            
                          early_stopping_rounds=300, 
                          verbose_eval=100)          
    
    oof_test_preds_lgb += lgb_model.predict(X_test)/folds.n_splits
    
    oof_val_preds_lgb[valid_idx] += lgb_model.predict(X_valid)
    
    gini_score = eval_gini(y_valid, oof_val_preds_lgb[valid_idx])
    print(f'폴드 {idx+1} 지니계수 : {gini_score}\n')

######################################## 폴드 1 / 폴드 5 ########################################
[LightGBM] [Info] Number of positive: 17355, number of negative: 458814
[LightGBM] [Info] Total Bins 1554
[LightGBM] [Info] Number of data points in the train set: 476169, number of used features: 216
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.036447 -> initscore=-3.274764
[LightGBM] [Info] Start training from score -3.274764
Training until validation scores don't improve for 300 rounds
[100]	valid_0's binary_logloss: 0.154239	valid_0's gini: 0.270944
[200]	valid_0's binary_logloss: 0.153176	valid_0's gini: 0.275764
[300]	valid_0's binary_logloss: 0.152584	valid_0's gini: 0.279501
[400]	valid_0's binary_logloss: 0.152222	valid_0's gini: 0.282893
[500]	valid_0's binary_logloss: 0.151986	valid_0's gini: 0.286058
[600]	valid_0's binary_logloss: 0.151824	valid_0's gini: 0.288805
[700]	valid_0's binary_logloss: 0.151712	valid_0's gini: 0.290719
[800]	valid_0's binary_logloss: 0.151622	valid_

In [17]:
max_params_xgb = {
    'colsample_bytree': 0.8843124587484356,
    'gamma': 10.452246227672624,
    'max_depth': 7,
    'min_child_weight': 6.494091293383359,
    'reg_alpha': 8.551838810159788,
    'reg_lambda': 1.3814765995549108,
    'scale_pos_weight': 1.423280772455086,
    'subsample': 0.7001630536555632,
    'objective': 'binary:logistic',
    'learning_rate': 0.02,
    'random_state': 1991
}

In [18]:
import xgboost as xgb

oof_val_preds_xgb = np.zeros(X.shape[0]) 

oof_test_preds_xgb = np.zeros(X_test.shape[0]) 

for idx, (train_idx, valid_idx) in enumerate(folds.split(X, y)):

    print('#'*40, f'폴드 {idx+1} / 폴드 {folds.n_splits}', '#'*40)
    
    X_train, y_train = X[train_idx], y[train_idx]
    X_valid, y_valid = X[valid_idx], y[valid_idx]

    dtrain = xgb.DMatrix(X_train, y_train)
    dvalid = xgb.DMatrix(X_valid, y_valid)
    dtest = xgb.DMatrix(X_test)

    xgb_model = xgb.train(params=max_params_xgb, 
                          dtrain=dtrain,
                          num_boost_round=2000,
                          evals=[(dvalid, 'valid')],
                          maximize=True,
                          feval=gini_xgb,
                          early_stopping_rounds=200,
                          verbose_eval=100)

    best_iter = xgb_model.best_iteration

    oof_test_preds_xgb += xgb_model.predict(dtest,
                                            iteration_range=(0, best_iter))/folds.n_splits
    
    oof_val_preds_xgb[valid_idx] += xgb_model.predict(dvalid, 
                                                      iteration_range=(0, best_iter))
    
    gini_score = eval_gini(y_valid, oof_val_preds_xgb[valid_idx])
    print(f'폴드 {idx+1} 지니계수 : {gini_score}\n')

######################################## 폴드 1 / 폴드 5 ########################################
[0]	valid-logloss:0.67674	valid-gini:0.15993
[100]	valid-logloss:0.19089	valid-gini:0.24958
[200]	valid-logloss:0.15778	valid-gini:0.27832
[300]	valid-logloss:0.15458	valid-gini:0.28767
[400]	valid-logloss:0.15406	valid-gini:0.29234
[500]	valid-logloss:0.15391	valid-gini:0.29475
[600]	valid-logloss:0.15386	valid-gini:0.29597
[700]	valid-logloss:0.15380	valid-gini:0.29749
[800]	valid-logloss:0.15378	valid-gini:0.29842
[900]	valid-logloss:0.15375	valid-gini:0.29890
[1000]	valid-logloss:0.15373	valid-gini:0.29963
[1100]	valid-logloss:0.15371	valid-gini:0.29988
[1200]	valid-logloss:0.15370	valid-gini:0.30000
[1300]	valid-logloss:0.15369	valid-gini:0.30005
[1400]	valid-logloss:0.15370	valid-gini:0.30017
[1500]	valid-logloss:0.15367	valid-gini:0.30076
[1600]	valid-logloss:0.15370	valid-gini:0.30043
[1700]	valid-logloss:0.15370	valid-gini:0.30049
[1705]	valid-logloss:0.15369	valid-gini:0.30050
폴드 1 지

In [19]:
print('LightGBM OOF 검증 데이터 지니계수 :', eval_gini(y, oof_val_preds_lgb))

LightGBM OOF 검증 데이터 지니계수 : 0.2889651000887542


In [20]:
print('XGBoost OOF 검증 데이터 지니계수 :', eval_gini(y, oof_val_preds_xgb))

XGBoost OOF 검증 데이터 지니계수 : 0.2894739428104039


## 8.6.1 앙상블 수행

In [21]:
oof_test_preds = oof_test_preds_lgb * 0.5 + oof_test_preds_xgb * 0.5

## 8.6.2 예측 및 결과 제출

In [22]:
submission['target'] = oof_test_preds
submission.to_csv('submission.csv')