In [1]:
import pandas as pd
import numpy as np
import os, sys

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

plt.rcParams["font.family"] = 'AppleGothic'
plt.rcParams['font.size'] = 15

### 방법론

1. Cross Validation Fold = 10
2. Feature Selection
    1) delete_features = ['ps_calc_15_bin', 'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin', 'ps_calc_20_bin']
    
    
3. Feature Engineering
    1) categorical relabeling: 10

4. 결측치는 모델에게 맡김

5. Model
    1) LightGBM

In [2]:
import lightgbm as lgbm
from sklearn.model_selection import StratifiedKFold
from gini import gini, gini_normalized

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


### 1. Preprocessing

In [4]:
### 1. 데이터 전처리
folder_loc = '/Users/lifesailor/.kaggle/porto-seguro/'

train = pd.read_csv(folder_loc + 'train.csv')
train_label = train['target']
train_id = train['id']

test = pd.read_csv(folder_loc + 'test.csv')
test_id = test['id']
del test['id']
del train['id']

In [5]:
bin_vars = []
cat_vars = []
con_ord_vars = []

for col in train.columns[2:]:
    if 'cat' in col:
        cat_vars.append(col)
    elif 'bin' in col:
        bin_vars.append(col)
    else:
        con_ord_vars.append(col)

print("continuouse, ordinal variables: ", con_ord_vars[:5])
print("binary variables: ", bin_vars[:5])
print("catgorical variables: ", cat_vars[:5])

for col in cat_vars:
    train[col] = train[col].astype('O')

continuouse, ordinal variables:  ['ps_ind_03', 'ps_ind_14', 'ps_ind_15', 'ps_reg_01', 'ps_reg_02']
binary variables:  ['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin', 'ps_ind_10_bin']
catgorical variables:  ['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat', 'ps_car_02_cat']


### 2. Feature Selection

In [6]:
train_fill = train.copy()
test_fill = test.copy()

features = []
features += con_ord_vars
features += bin_vars[:-6]
features = list(features)

### 3. Feature Engineering

In [7]:
train_fill_cat = train[cat_vars].astype('O')
test_fill_cat = test[cat_vars].astype('O')

train_test = pd.concat([train_fill, test_fill], axis=0)

### 2. categorical relabeling
cat_features = {}

for col in train[cat_vars].columns:
    cat_features[col] = len(train_fill_cat[col].unique())

new_category_col = {}
for col in cat_vars:
    if len(train_fill[col].unique()) <= 7:
        new_category_col[col] = train[col].unique()
        continue
    
    target_by_category = train.groupby(col)['target'].mean().sort_values(ascending=False)
    unique_category_length = len(target_by_category) / 7
    
    
    new_category_col[col] = [target_by_category[int(i * unique_category_length):
                                                int((i+1) * unique_category_length)].index.values
                                                for i in range(7)]

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


In [8]:
for key, category in new_category_col.items():
    length = len(category)
    
    for i in range(length):
        new_column = key + '_new_' + str(i)
        train_fill[new_column] = 0
        
        if type(category) is list:
            train_fill.loc[train_fill[key].isin(category[i]), new_column] = 1
            test_fill.loc[test_fill[key].isin(category[i]), new_column] = 1
        else:
            train_fill.loc[train_fill[key] == category[i], new_column] = 1
            test_fill.loc[test_fill[key] == category[i], new_column] = 1

cat_features_lst = [feature for feature in train_fill.columns if 'new' in feature] 
features += cat_features_lst

train_test = pd.concat([train_fill, test_fill], axis=0)
train_fill.shape, test_fill.shape

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




((595212, 124), (892816, 123))

In [9]:
train_fill.drop(columns='target', axis=1, inplace=True)
train_fill.shape, test_fill.shape

train_new = train_fill.astype(float).copy()
test_new = test_fill.astype(float).copy()

### 4. Model

In [10]:
import lightgbm as lgbm
from sklearn.model_selection import StratifiedKFold
from gini import gini, gini_normalized

# LightGBM 모델의 설정값이다.
num_boost_round = 10000
params = {"objective": "binary",
          "boosting_type": "gbdt",
          "learning_rate": 0.02,
          "num_leaves": 15,
          "max_bin": 256,
          "feature_fraction": 0.6,
          "verbosity": 0,
          "drop_rate": 0.1,
          "is_unbalance": False,
          "max_drop": 50,
          "min_child_samples": 10,
          "min_child_weight": 150,
          "min_split_gain": 0,
          "subsample": 0.9,
          "seed": 2018
}

def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'gini', gini_normalized(labels, preds), True

In [11]:
NFOLDS = 10
kfold = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=2018)
kf = kfold.split(train_new, train_label)

cv_train = np.zeros(len(train_label))
cv_pred = np.zeros(len(test_id))    
best_trees = []
fold_scores = []

for i, (train_fold, validate) in enumerate(kf):
    # 훈련/검증 데이터를 분리한다
    X_train, X_validate, label_train, label_validate = train_new.iloc[train_fold, :], train_new.iloc[validate, :], train_label[train_fold], train_label[validate]
    dtrain = lgbm.Dataset(X_train, label_train)
    dvalid = lgbm.Dataset(X_validate, label_validate, reference=dtrain)
    
    # 훈련 데이터를 학습하고, evalerror() 함수를 통해 검증 데이터에 대한 정규화 Gini 계수 점수를 기준으로 최적의 트리 개수를 찾는다.
    bst = lgbm.train(params, dtrain, num_boost_round, valid_sets=dvalid, feval=evalerror, verbose_eval=100, early_stopping_rounds=100)
    best_trees.append(bst.best_iteration)
    
    # 테스트 데이터에 대한 예측값을 cv_pred에 더한다.
    cv_pred += bst.predict(test_new, num_iteration=bst.best_iteration)
    cv_train[validate] += bst.predict(X_validate)

    # 검증 데이터에 대한 평가 점수를 출력한다.
    score = gini_normalized(label_validate, cv_train[validate])
    print(score)
    fold_scores.append(score)

cv_pred /= NFOLDS

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.152996	valid_0's gini: 0.264874
[200]	valid_0's binary_logloss: 0.152459	valid_0's gini: 0.272241
[300]	valid_0's binary_logloss: 0.152285	valid_0's gini: 0.276157
[400]	valid_0's binary_logloss: 0.152246	valid_0's gini: 0.277057
[500]	valid_0's binary_logloss: 0.152217	valid_0's gini: 0.278038
[600]	valid_0's binary_logloss: 0.152194	valid_0's gini: 0.278638
[700]	valid_0's binary_logloss: 0.152186	valid_0's gini: 0.27874
[800]	valid_0's binary_logloss: 0.152187	valid_0's gini: 0.278856
Early stopping, best iteration is:
[752]	valid_0's binary_logloss: 0.152171	valid_0's gini: 0.27931
0.27931040135041235
Training until validation scores don't improve for 100 rounds.
[100]	valid_0's binary_logloss: 0.152842	valid_0's gini: 0.263249
[200]	valid_0's binary_logloss: 0.15218	valid_0's gini: 0.274236
[300]	valid_0's binary_logloss: 0.151936	valid_0's gini: 0.28046
[400]	valid_0's binary_logloss:

### 5. Evaluation

In [12]:
# 시드값별로 교차 검증 점수를 출력한다.
print("cv score:")
print(gini_normalized(train_label, cv_train))
print(fold_scores)
print(best_trees, np.mean(best_trees))

test_submission = pd.DataFrame({'id': test_id, 'target': cv_pred})
test_submission.to_csv('../porto-seguro/final.csv', index=False)

cv score:
0.2870140119919881
[0.27931040135041235, 0.28585780880686357, 0.29017164918334387, 0.28645063904818036, 0.2905718206449686, 0.2970029025256726, 0.29936256721331234, 0.2929794288083118, 0.2924351794554219, 0.2577364565981507]
[752, 702, 933, 763, 508, 591, 437, 910, 615, 642] 685.3
