# Kaggleで磨く 機械学習の実践力
# 第7章 2値分類のコンペ (Home Credit Default Risk)

# 7.3: ベースライン作成

## 7.3.2 データ前処理
#### スクリプト7-1: ライブラリの読み込み

In [19]:
import numpy as np
import pandas as pd
import re
import pickle
import gc

# scikit-learn
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# LightGBM
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

#### スクリプト7-2: ファイルの読み込み・データ確認

In [20]:
# Load the training applications table, then show its shape and first rows.
application_train = pd.read_csv("../input/home-credit-default-risk/application_train.csv")
print(application_train.shape)
application_train.head()

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


#### スクリプト7-3: メモリ削減のための関数

In [21]:
def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` in place to the smallest fitting dtype.

    Signed-integer columns are narrowed to int8/int16/int32/int64 and float
    columns to float16/float32 (falling back to float64), based only on each
    column's observed minimum and maximum. Columns of any other dtype
    (object, bool, unsigned, category, datetime, extension dtypes, ...) are
    left untouched.

    Note: the float downcast only checks the value range, so narrowing to
    float16/float32 can round the stored values.

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        The same DataFrame object after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy signed-int / float columns are downcast; anything
        # else (bool, strings, categories, ...) has no meaningful numeric range.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ("i", "f"):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == "i":
            # Inclusive bounds: a value equal to the dtype limit still fits.
            # An empty column yields NaN min/max, fails every comparison and
            # is therefore left as-is.
            for candidate in int_candidates:
                info = np.iinfo(candidate)
                if info.min <= c_min and c_max <= info.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                info = np.finfo(candidate)
                if info.min <= c_min and c_max <= info.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or all-NaN / infinite extremes.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame so the percentage is always defined.
    decreased = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decreased))

    return df

#### スクリプト7-4: メモリ削減の実行

In [22]:
# Downcast the numeric columns of the training table to save memory.
application_train = reduce_mem_usage(application_train)

Memory usage of dataframe is 286.23 MB
Memory usage after optimization is: 92.38 MB
Decreased by 67.7%


## 7.3.3 データセット作成
#### スクリプト7-5: データセットの作成

In [23]:
# Split the training table into ID column, target, and feature matrix.
id_train = application_train[["SK_ID_CURR"]]
y_train = application_train["TARGET"]
x_train = application_train.drop(columns=["TARGET", "SK_ID_CURR"])

#### スクリプト7-6: カテゴリ変数をcategory型に変換 

In [24]:
# Cast every object-typed feature column to the pandas "category" dtype.
object_cols = [name for name in x_train.columns if x_train[name].dtype == "O"]
for name in object_cols:
    x_train[name] = x_train[name].astype("category")

## 7.3.4 バリデーション設計
#### スクリプト7-7: 1の割合とそれぞれの件数を確認

In [25]:
# Share of positive (TARGET == 1) rows, followed by the per-class counts.
print("mean: {:.4f}".format(y_train.mean()))
y_train.value_counts()

mean: 0.0807


0    282686
1     24825
Name: TARGET, dtype: int64

#### スクリプト7-8: バリデーションのindexリスト作成

In [26]:
# Build the list of (train_index, valid_index) pairs from a stratified 5-fold split.
splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=123)
cv = list(splitter.split(x_train, y_train))

# Inspect the indices of fold 0: train part first, then validation part.
fold0_train_idx, fold0_valid_idx = cv[0][0], cv[0][1]
print("index(train):", fold0_train_idx)
print("index(valid):", fold0_valid_idx)

index(train): [     0      1      3 ... 307508 307509 307510]
index(valid): [     2     11     22 ... 307488 307495 307497]


## 7.3.5 モデル学習
#### スクリプト7-9: 学習データと検証データに分離

In [27]:
# Build the per-fold (train_index, valid_index) pairs.
cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x_train, y_train))

# Pick the index arrays of fold 0.
nfold = 0
idx_tr, idx_va = cv[nfold][0], cv[nfold][1]

# Split into training and validation parts.
# StratifiedKFold.split yields positional indices, so select rows with .iloc
# (label-based .loc only matches when the index is the default RangeIndex).
x_tr, y_tr, id_tr = x_train.iloc[idx_tr, :], y_train.iloc[idx_tr], id_train.iloc[idx_tr, :]
x_va, y_va, id_va = x_train.iloc[idx_va, :], y_train.iloc[idx_va], id_train.iloc[idx_va, :]
print(x_tr.shape, y_tr.shape, id_tr.shape)
print(x_va.shape, y_va.shape, id_va.shape)

(246008, 120) (246008,) (246008, 1)
(61503, 120) (61503,) (61503, 1)


#### スクリプト7-10: モデル学習

In [28]:
# Hyperparameters of the baseline LightGBM classifier.
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 32,
    'n_estimators': 100000,
    "random_state": 123,
    "importance_type": "gain",
}

# Train the model.
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose` keyword arguments of fit() were
# deprecated in LightGBM 3.3 and removed in 4.0.
model = lgb.LGBMClassifier(**params)
model.fit(x_tr,
          y_tr,
          eval_set=[(x_tr, y_tr), (x_va, y_va)],
          callbacks=[
              lgb.early_stopping(stopping_rounds=100),
              lgb.log_evaluation(period=100),
          ],
         )

# Save the fitted model.
with open("../models/notebooks/model_lgb_fold0.pickle", "wb") as f:
    pickle.dump(model, f, protocol=4)

Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.782506	valid_1's auc: 0.755903
[200]	training's auc: 0.808961	valid_1's auc: 0.758356
[300]	training's auc: 0.829245	valid_1's auc: 0.757774
Early stopping, best iteration is:
[217]	training's auc: 0.812578	valid_1's auc: 0.758595


#### スクリプト7-11: モデル評価

In [29]:
# Predicted probability of the positive class on both splits.
y_tr_pred = model.predict_proba(x_tr)[:, 1]
y_va_pred = model.predict_proba(x_va)[:, 1]

# ROC-AUC on the training and validation splits.
metric_tr = roc_auc_score(y_tr, y_tr_pred)
metric_va = roc_auc_score(y_va, y_va_pred)

# Start the per-fold score list (done once, at the first fold) with this fold's row.
metrics = [[nfold, metric_tr, metric_va]]

# Show the result.
print("[auc] tr:{:.4f}, va:{:.4f}".format(metric_tr, metric_va))

[auc] tr:0.8126, va:0.7586


#### スクリプト7-12: OOFデータの推論値取得

In [30]:
# Array that will hold the out-of-fold prediction for every training row.
train_oof = np.zeros(len(x_train))

# Store this fold's validation predictions at the validation row positions.
# Rows outside idx_va keep their initial value 0.
train_oof[idx_va] = y_va_pred

#### スクリプト7-13: 説明変数の重要度取得

In [31]:
# Feature importances of this fold's model, tagged with the fold number.
imp_fold = pd.DataFrame({
    "col": x_train.columns,
    "imp": model.feature_importances_,
    "nfold": nfold,
})

# Show the 10 most important features.
top10 = imp_fold.sort_values("imp", ascending=False)[:10]
display(top10)

# Start the frame that accumulates importances over all folds and append this fold.
imp = pd.concat([pd.DataFrame(), imp_fold])

Unnamed: 0,col,imp,nfold
41,EXT_SOURCE_3,66225.020483,0
40,EXT_SOURCE_2,52568.833805,0
38,ORGANIZATION_TYPE,20218.523523,0
39,EXT_SOURCE_1,19776.252288,0
6,AMT_CREDIT,8111.321247,0
8,AMT_GOODS_PRICE,7120.960365,0
15,DAYS_BIRTH,7042.223005,0
7,AMT_ANNUITY,6992.551795,0
16,DAYS_EMPLOYED,5236.51412,0
26,OCCUPATION_TYPE,4376.651746,0


#### スクリプト7-14: モデル評価（全foldのサマリ）

In [32]:
# Turn the list of [nfold, train_auc, valid_auc] rows into an array.
metrics = np.array(metrics)
print(metrics)

# Mean and standard deviation of the train / validation scores across folds.
tr_scores = metrics[:, 1]
va_scores = metrics[:, 2]
cv_summary = "[cv] tr:{:.4f}+-{:.4f}, va:{:.4f}+-{:.4f}".format(
    tr_scores.mean(), tr_scores.std(),
    va_scores.mean(), va_scores.std(),
)
print(cv_summary)

# AUC of the out-of-fold predictions over all training rows.
oof_auc = roc_auc_score(y_train, train_oof)
print("[oof] {:.4f}".format(oof_auc))

[[0.         0.81257796 0.75859528]]
[cv] tr:0.8126+-0.0000, va:0.7586+-0.0000
[oof] 0.5103


#### スクリプト7-15: OOFデータの推論値取得（全foldのサマリ）

In [33]:
# Combine the IDs with the true label and the out-of-fold prediction, side by side.
train_oof = pd.concat([
    id_train,
    pd.DataFrame({"true": y_train, "pred": train_oof}),
], axis=1)
train_oof.head()

Unnamed: 0,SK_ID_CURR,true,pred
0,100002,1,0.0
1,100003,0,0.0
2,100004,0,0.031866
3,100006,0,0.0
4,100007,0,0.0


#### スクリプト7-16: 説明変数の重要度取得（全foldのサマリ）

In [34]:
# Per-feature mean and standard deviation of the importances across folds.
imp = imp.groupby("col")["imp"].agg(["mean", "std"]).reset_index(drop=False)
imp.columns = ["col", "imp", "imp_std"]
imp.head()

Unnamed: 0,col,imp,imp_std
0,AMT_ANNUITY,6992.551795,
1,AMT_CREDIT,8111.321247,
2,AMT_GOODS_PRICE,7120.960365,
3,AMT_INCOME_TOTAL,1595.740609,
4,AMT_REQ_CREDIT_BUREAU_DAY,128.842901,


#### スクリプト7-17: 学習関数の定義

In [38]:
def train_lgb(input_x,
              input_y,
              input_id,
              params,
              list_nfold=(0, 1, 2, 3, 4),
              n_splits=5,
             ):
    """Train LightGBM classifiers with stratified K-fold CV.

    For every fold number in ``list_nfold`` an ``lgb.LGBMClassifier`` is
    fitted with early stopping (100 rounds) on that fold's validation split,
    pickled to ``../models/notebooks/model_lgb_fold{nfold}.pickle`` and scored
    with ROC-AUC on both splits.

    Args:
        input_x: Feature DataFrame.
        input_y: Target values, row-aligned with ``input_x``.
        input_id: DataFrame with the ID column(s), row-aligned with ``input_x``.
        params: Keyword arguments for ``lgb.LGBMClassifier``.
        list_nfold: Fold numbers (0-based, smaller than ``n_splits``) to train.
        n_splits: Number of stratified folds.

    Returns:
        A tuple ``(train_oof, imp, metrics)``:
        ``train_oof`` is ``input_id`` plus a ``pred`` column of out-of-fold
        predictions (rows whose fold was not trained keep 0);
        ``imp`` has columns ``col``, ``imp``, ``imp_std`` with the mean / std
        importance of each feature over the trained folds;
        ``metrics`` is an array with one ``[nfold, train_auc, valid_auc]`` row
        per trained fold.
    """
    train_oof = np.zeros(len(input_x))
    # Tracks which rows actually received an out-of-fold prediction.
    has_oof = np.zeros(len(input_x), dtype=bool)
    metrics = []
    imp = pd.DataFrame()

    # StratifiedKFold.split yields positional indices, so rows are selected
    # positionally (.iloc) rather than by label.
    y_rows = input_y.iloc if hasattr(input_y, "iloc") else input_y

    # cross-validation
    cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123).split(input_x, input_y))
    for nfold in list_nfold:
        print("-"*20, nfold, "-"*20)

        # make dataset
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        x_tr, y_tr = input_x.iloc[idx_tr, :], y_rows[idx_tr]
        x_va, y_va = input_x.iloc[idx_va, :], y_rows[idx_va]
        print(x_tr.shape, x_va.shape)

        # train
        # Early stopping / logging go through callbacks: the
        # `early_stopping_rounds` / `verbose` fit() arguments were deprecated
        # in LightGBM 3.3 and removed in 4.0.
        model = lgb.LGBMClassifier(**params)
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr, y_tr), (x_va, y_va)],
                  callbacks=[
                      lgb.early_stopping(stopping_rounds=100),
                      lgb.log_evaluation(period=100),
                  ],
                 )
        fname_lgb = "../models/notebooks/model_lgb_fold{}.pickle".format(nfold)
        with open(fname_lgb, "wb") as f:
            pickle.dump(model, f, protocol=4)

        # evaluate
        y_tr_pred = model.predict_proba(x_tr)[:,1]
        y_va_pred = model.predict_proba(x_va)[:,1]
        metric_tr = roc_auc_score(y_tr, y_tr_pred)
        metric_va = roc_auc_score(y_va, y_va_pred)
        metrics.append([nfold, metric_tr, metric_va])
        print("[auc] tr:{:.4f}, va:{:.4f}".format(metric_tr, metric_va))

        # oof
        train_oof[idx_va] = y_va_pred
        has_oof[idx_va] = True

        # imp
        _imp = pd.DataFrame({"col":input_x.columns, "imp":model.feature_importances_, "nfold":nfold})
        imp = pd.concat([imp, _imp])

    print("-"*20, "result", "-"*20)
    # metric
    metrics = np.array(metrics)
    print(metrics)
    print("[cv] tr:{:.4f}+-{:.4f}, va:{:.4f}+-{:.4f}".format(
        metrics[:,1].mean(), metrics[:,1].std(),
        metrics[:,2].mean(), metrics[:,2].std(),
    ))
    # Score only rows that were predicted; when just a subset of folds is
    # trained, the remaining zero placeholders would otherwise distort the AUC.
    print("[oof] {:.4f}".format(
        roc_auc_score(np.asarray(input_y)[has_oof], train_oof[has_oof])
    ))

    # oof
    # Reuse input_id's index so the columns line up even when it is not 0..n-1.
    train_oof = pd.concat([
        input_id,
        pd.DataFrame({"pred":train_oof}, index=input_id.index)
    ], axis=1)

    # importance
    imp = imp.groupby("col")["imp"].agg(["mean", "std"]).reset_index(drop=False)
    imp.columns = ["col", "imp", "imp_std"]

    return train_oof, imp, metrics

#### スクリプト7-18: 学習処理の実行

In [39]:
# Hyperparameter settings
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary', 
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 32,
    'n_estimators': 100000,
    "random_state": 123,
    "importance_type": "gain",
}

# Run training on all 5 folds
train_oof, imp, metrics = train_lgb(x_train,
                                    y_train,
                                    id_train,
                                    params,
                                    list_nfold=[0,1,2,3,4],
                                    n_splits=5,
                                   )

-------------------- 0 --------------------
(246008, 120) (61503, 120)
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.782506	valid_1's auc: 0.755903
[200]	training's auc: 0.808961	valid_1's auc: 0.758356
[300]	training's auc: 0.829245	valid_1's auc: 0.757774
Early stopping, best iteration is:
[217]	training's auc: 0.812578	valid_1's auc: 0.758595
[auc] tr:0.8126, va:0.7586
-------------------- 1 --------------------
(246009, 120) (61502, 120)
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.782531	valid_1's auc: 0.756239
[200]	training's auc: 0.808862	valid_1's auc: 0.758924
[300]	training's auc: 0.829564	valid_1's auc: 0.758779
Early stopping, best iteration is:
[236]	training's auc: 0.816952	valid_1's auc: 0.759033
[auc] tr:0.8170, va:0.7590
-------------------- 2 --------------------
(246009, 120) (61502, 120)
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.782101	valid_

#### スクリプト7-19: 説明変数の重要度の確認

In [40]:
# Top 10 features by mean importance across folds.
imp.sort_values("imp", ascending=False)[:10]

Unnamed: 0,col,imp,imp_std
38,EXT_SOURCE_3,65353.907478,1558.201212
37,EXT_SOURCE_2,54545.388309,1251.798934
102,ORGANIZATION_TYPE,21441.917474,1450.24619
36,EXT_SOURCE_1,20051.934248,685.852224
1,AMT_CREDIT,8263.228728,410.384434
22,DAYS_BIRTH,7645.58911,689.458833
2,AMT_GOODS_PRICE,7263.054566,405.837031
0,AMT_ANNUITY,6762.95364,479.302045
23,DAYS_EMPLOYED,5810.288375,552.93773
101,OCCUPATION_TYPE,5502.675859,831.872392


## 7.3.6 モデル推論
#### スクリプト7-20: 推論用データセットの作成

In [41]:
# Load the test applications table and downcast its numeric columns.
application_test = reduce_mem_usage(
    pd.read_csv("../input/home-credit-default-risk/application_test.csv")
)

# Split into ID column and feature matrix.
id_test = application_test[["SK_ID_CURR"]]
x_test = application_test.drop(columns=["SK_ID_CURR"])

# Cast every object-typed feature column to the "category" dtype.
object_cols = [name for name in x_test.columns if x_test[name].dtype == "O"]
for name in object_cols:
    x_test[name] = x_test[name].astype("category")

Memory usage of dataframe is 45.00 MB
Memory usage after optimization is: 14.60 MB
Decreased by 67.6%


#### スクリプト7-21: 学習済モデルの読み込み

In [42]:
# Load the fold-0 model saved during training.
with open("../models/notebooks/model_lgb_fold0.pickle", "rb") as f:
    model = pickle.load(f)

#### スクリプト7-22: モデルを用いた推論

In [43]:
# Inference: predicted probability of the positive class
test_pred_fold = model.predict_proba(x_test)[:,1]

# Array that holds the predictions, one column per fold (5 columns)
test_pred = np.zeros((len(x_test), 5))

# Store the predictions of the first fold (column 0)
test_pred[:, 0] = test_pred_fold

#### スクリプト7-23: 推論用データセットの推論値算出

In [44]:
# Average the predictions across the fold columns.
# NOTE(review): test_pred is created with 5 zero-filled columns and only
# column 0 has been filled at this point, so this mean equals the fold-0
# prediction divided by 5.
test_pred_mean = test_pred.mean(axis=1)

# Build the prediction DataFrame (ID + averaged prediction)
df_test_pred = pd.concat([
        id_test,
        pd.DataFrame({"pred": test_pred_mean}),
    ], axis=1)
df_test_pred.head()

Unnamed: 0,SK_ID_CURR,pred
0,100001,0.006572
1,100005,0.023874
2,100013,0.004233
3,100028,0.008966
4,100038,0.030794


#### スクリプト7-24: 推論関数の定義

In [45]:
def predict_lgb(input_x,
                input_id,
                list_nfold=(0, 1, 2, 3, 4),
               ):
    """Average the positive-class probabilities of the saved fold models.

    For every fold number in ``list_nfold`` the model pickled at
    ``../models/notebooks/model_lgb_fold{nfold}.pickle`` is loaded and applied
    to ``input_x``; the per-fold probabilities are then averaged row-wise.

    Args:
        input_x: Features to predict on.
        input_id: DataFrame with the ID column(s), row-aligned with ``input_x``.
        list_nfold: Fold numbers whose saved models are used.

    Returns:
        ``input_id`` with an added ``pred`` column holding the mean prediction.
    """
    pred = np.zeros((len(input_x), len(list_nfold)))
    # Fill columns by position in list_nfold, not by fold number, so that a
    # subset such as [1] or [2, 3] does not index past the array width.
    for i, nfold in enumerate(list_nfold):
        print("-"*20, nfold, "-"*20)
        fname_lgb = "../models/notebooks/model_lgb_fold{}.pickle".format(nfold)
        # NOTE: pickle.load executes arbitrary code; only load model files
        # produced by this project.
        with open(fname_lgb, "rb") as f:
            model = pickle.load(f)
        pred[:, i] = model.predict_proba(input_x)[:,1]

    # Reuse input_id's index so the columns line up even when it is not 0..n-1.
    pred = pd.concat([
        input_id,
        pd.DataFrame({"pred": pred.mean(axis=1)}, index=input_id.index),
    ], axis=1)

    print("Done.")

    return pred

#### スクリプト7-25: 推論処理の実行

In [46]:
# Predict on the test set with the models of all 5 folds.
test_pred = predict_lgb(x_test,
                        id_test,
                        list_nfold=[0,1,2,3,4],
                       )

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.


#### スクリプト7-26: 提出ファイルの作成

In [47]:
# Rename the prediction column to TARGET for the submission file.
df_submit = test_pred.rename(columns={"pred":"TARGET"})
print(df_submit.shape)
display(df_submit.head())

# Write the file (without the index column)
df_submit.to_csv("../submit/notebooks/submission_baseline.csv", index=None)

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.04181
1,100005,0.1264
2,100013,0.022495
3,100028,0.03968
4,100038,0.156628


# 7.4 特徴量エンジニアリング
## 7.4.1 特徴量エンジニアリング: application_train.csv

#### スクリプト7-27: データの確認

In [48]:
# Value distribution of DAYS_EMPLOYED, plus the share and count of positive values.
display(application_train["DAYS_EMPLOYED"].value_counts())
print("正の値の割合: {:.4f}".format((application_train["DAYS_EMPLOYED"]>0).mean()))
print("正の値の個数: {}".format((application_train["DAYS_EMPLOYED"]>0).sum()))
# -> 18% of the values are positive, and all of them are the single value 365243.
#    Days since starting employment are recorded as negative numbers, so this
#    value is treated as a marker for missing data.

 365243    55374
-200         156
-224         152
-230         151
-199         151
           ...  
-13961         1
-11827         1
-10176         1
-9459          1
-8694          1
Name: DAYS_EMPLOYED, Length: 12574, dtype: int64

正の値の割合: 0.1801
正の値の個数: 55374


#### スクリプト7-28: 欠損値の対処（nullに変換）

In [49]:
# Replace the value 365243 in DAYS_EMPLOYED with NaN.
application_train["DAYS_EMPLOYED"] = application_train["DAYS_EMPLOYED"].replace(365243, np.nan)

#### スクリプト7-29: 仮説に基づく特徴量生成

In [50]:
# Source columns reused by several of the features below.
income = application_train["AMT_INCOME_TOTAL"]
annuity = application_train["AMT_ANNUITY"]
employed = application_train["DAYS_EMPLOYED"]
ext = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]]

# Feature 1: total income divided by the number of family members
application_train["INCOME_div_PERSON"] = income / application_train["CNT_FAM_MEMBERS"]

# Feature 2: total income divided by the employment period
application_train["INCOME_div_EMPLOYED"] = income / employed

# Feature 3: row-wise statistics of the three external scores
application_train["EXT_SOURCE_mean"] = ext.mean(axis=1)
application_train["EXT_SOURCE_max"] = ext.max(axis=1)
application_train["EXT_SOURCE_min"] = ext.min(axis=1)
application_train["EXT_SOURCE_std"] = ext.std(axis=1)
application_train["EXT_SOURCE_count"] = ext.notnull().sum(axis=1)

# Feature 4: employment period divided by DAYS_BIRTH (share of life spent employed)
application_train["DAYS_EMPLOYED_div_BIRTH"] = employed / application_train["DAYS_BIRTH"]

# Feature 5: annuity divided by total income
application_train["ANNUITY_div_INCOME"] = annuity / income

# Feature 6: annuity divided by the credit amount
application_train["ANNUITY_div_CREDIT"] = annuity / application_train["AMT_CREDIT"]

#### スクリプト7-30: データセットの作成

In [51]:
# Split the feature-engineered training table into ID column, target, and features.
id_train = application_train[["SK_ID_CURR"]]
y_train = application_train["TARGET"]
x_train = application_train.drop(columns=["TARGET", "SK_ID_CURR"])

# Cast every object-typed feature column to the "category" dtype.
object_cols = [name for name in x_train.columns if x_train[name].dtype == "O"]
for name in object_cols:
    x_train[name] = x_train[name].astype("category")

#### スクリプト7-31: モデル学習

In [52]:
# Retrain on all 5 folds with the added application features.
train_oof, imp, metrics = train_lgb(x_train,
                                    y_train,
                                    id_train,
                                    params,
                                    list_nfold=[0,1,2,3,4],
                                    n_splits=5,
                                   )

-------------------- 0 --------------------
(246008, 130) (61503, 130)
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.787668	valid_1's auc: 0.760522
[200]	training's auc: 0.816663	valid_1's auc: 0.764903
Early stopping, best iteration is:
[199]	training's auc: 0.816433	valid_1's auc: 0.764953
[auc] tr:0.8164, va:0.7650
-------------------- 1 --------------------
(246009, 130) (61502, 130)
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.787992	valid_1's auc: 0.762948
[200]	training's auc: 0.817175	valid_1's auc: 0.767282
[300]	training's auc: 0.838754	valid_1's auc: 0.767874
[400]	training's auc: 0.856266	valid_1's auc: 0.76818
[500]	training's auc: 0.871141	valid_1's auc: 0.767794
Early stopping, best iteration is:
[401]	training's auc: 0.856446	valid_1's auc: 0.76821
[auc] tr:0.8564, va:0.7682
-------------------- 2 --------------------
(246009, 130) (61502, 130)
Training until validation scores don't improve

#### スクリプト7-32: 説明変数の重要度の確認

In [53]:
# Top 10 features by mean importance across folds.
imp.sort_values("imp", ascending=False)[:10]

Unnamed: 0,col,imp,imp_std
44,EXT_SOURCE_mean,113357.410376,1818.497597
10,ANNUITY_div_CREDIT,23009.76087,1262.803683
112,ORGANIZATION_TYPE,21049.869444,2607.626225
41,EXT_SOURCE_3,11125.102983,1519.486904
24,DAYS_BIRTH,7440.306312,1062.141927
45,EXT_SOURCE_min,7153.097732,591.580969
39,EXT_SOURCE_1,6662.390595,1201.178176
0,AMT_ANNUITY,5721.869464,968.783761
2,AMT_GOODS_PRICE,5720.871402,450.62517
27,DAYS_ID_PUBLISH,5071.78566,1268.443378


#### スクリプト7-33: 推論用データのデータセット作成

In [54]:
# Replace the value 365243 in DAYS_EMPLOYED with NaN.
application_test["DAYS_EMPLOYED"] = application_test["DAYS_EMPLOYED"].replace(365243, np.nan)

# Source columns reused by several of the features below.
income = application_test["AMT_INCOME_TOTAL"]
annuity = application_test["AMT_ANNUITY"]
employed = application_test["DAYS_EMPLOYED"]
ext = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]]

# Generate the same features as for the training table, in the same order.
application_test["INCOME_div_PERSON"] = income / application_test["CNT_FAM_MEMBERS"]
application_test["INCOME_div_EMPLOYED"] = income / employed
application_test["EXT_SOURCE_mean"] = ext.mean(axis=1)
application_test["EXT_SOURCE_max"] = ext.max(axis=1)
application_test["EXT_SOURCE_min"] = ext.min(axis=1)
application_test["EXT_SOURCE_std"] = ext.std(axis=1)
application_test["EXT_SOURCE_count"] = ext.notnull().sum(axis=1)
application_test["DAYS_EMPLOYED_div_BIRTH"] = employed / application_test["DAYS_BIRTH"]
application_test["ANNUITY_div_INCOME"] = annuity / income
application_test["ANNUITY_div_CREDIT"] = annuity / application_test["AMT_CREDIT"]

# Split into ID column and feature matrix.
id_test = application_test[["SK_ID_CURR"]]
x_test = application_test.drop(columns=["SK_ID_CURR"])

# Cast every object-typed feature column to the "category" dtype.
object_cols = [name for name in x_test.columns if x_test[name].dtype == "O"]
for name in object_cols:
    x_test[name] = x_test[name].astype("category")


#### スクリプト7-34: 推論処理

In [55]:
# Predict on the test set with the models of all 5 folds.
test_pred = predict_lgb(x_test,
                        id_test,
                        list_nfold=[0,1,2,3,4],
                       )

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.


#### スクリプト7-35: 提出ファイルの作成

In [56]:
# Rename the prediction column to TARGET and write the submission file (no index column).
df_submit = test_pred.rename(columns={"pred":"TARGET"})
print(df_submit.shape)
display(df_submit.head())
df_submit.to_csv("../submit/notebooks/submission_FeatureEngineering1.csv", index=None)

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.027909
1,100005,0.107449
2,100013,0.019986
3,100028,0.043011
4,100038,0.17485


## 7.4.2 特徴量エンジニアリング: POS_CASH_balance.csv

#### スクリプト7-36: ファイル読み込み

In [57]:
# Load POS_CASH_balance, downcast its numeric columns, then show shape and first rows.
pos = pd.read_csv("../input/home-credit-default-risk/POS_CASH_balance.csv")
pos = reduce_mem_usage(pos)
print(pos.shape)
pos.head()

Memory usage of dataframe is 610.43 MB
Memory usage after optimization is: 238.45 MB
Decreased by 60.9%
(10001358, 8)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


#### スクリプト7-37: ①カテゴリ変数をone-hot-encodingで数値に変換

In [58]:
# One-hot encode NAME_CONTRACT_STATUS, including an indicator column for NaN.
pos_ohe = pd.get_dummies(pos, columns=["NAME_CONTRACT_STATUS"], dummy_na=True)

# Names of the columns created by the encoding, in sorted order.
new_columns = set(pos_ohe.columns).difference(pos.columns)
col_ohe = sorted(new_columns)
print(len(col_ohe))
col_ohe

10


['NAME_CONTRACT_STATUS_Active',
 'NAME_CONTRACT_STATUS_Amortized debt',
 'NAME_CONTRACT_STATUS_Approved',
 'NAME_CONTRACT_STATUS_Canceled',
 'NAME_CONTRACT_STATUS_Completed',
 'NAME_CONTRACT_STATUS_Demand',
 'NAME_CONTRACT_STATUS_Returned to the store',
 'NAME_CONTRACT_STATUS_Signed',
 'NAME_CONTRACT_STATUS_XNA',
 'NAME_CONTRACT_STATUS_nan']

#### スクリプト7-38: ②SK_ID_CURRをキーに集約処理

In [59]:
# Aggregation spec, built in the order the output columns should appear.
agg_spec = {}

# Numeric columns: mean / std / min / max per SK_ID_CURR.
for num_col in [
    "MONTHS_BALANCE",
    "CNT_INSTALMENT",
    "CNT_INSTALMENT_FUTURE",
    "SK_DPD",
    "SK_DPD_DEF",
]:
    agg_spec[num_col] = ["mean", "std", "min", "max"]

# One-hot encoded status columns: mean per SK_ID_CURR.
for status_col in [
    "NAME_CONTRACT_STATUS_Active",
    "NAME_CONTRACT_STATUS_Amortized debt",
    "NAME_CONTRACT_STATUS_Approved",
    "NAME_CONTRACT_STATUS_Canceled",
    "NAME_CONTRACT_STATUS_Completed",
    "NAME_CONTRACT_STATUS_Demand",
    "NAME_CONTRACT_STATUS_Returned to the store",
    "NAME_CONTRACT_STATUS_Signed",
    "NAME_CONTRACT_STATUS_XNA",
    "NAME_CONTRACT_STATUS_nan",
]:
    agg_spec[status_col] = ["mean"]

# Previous-loan ID: number of records and number of distinct IDs.
agg_spec["SK_ID_PREV"] = ["count", "nunique"]

pos_ohe_agg = pos_ohe.groupby("SK_ID_CURR").agg(agg_spec)

# Flatten the (column, statistic) pairs into "<column>_<statistic>" names.
pos_ohe_agg.columns = [
    "{}_{}".format(source, stat) for source, stat in pos_ohe_agg.columns
]
pos_ohe_agg = pos_ohe_agg.reset_index(drop=False)

print(pos_ohe_agg.shape)
pos_ohe_agg.head()

(337252, 33)


Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE_mean,MONTHS_BALANCE_std,MONTHS_BALANCE_min,MONTHS_BALANCE_max,CNT_INSTALMENT_mean,CNT_INSTALMENT_std,CNT_INSTALMENT_min,CNT_INSTALMENT_max,CNT_INSTALMENT_FUTURE_mean,...,NAME_CONTRACT_STATUS_Approved_mean,NAME_CONTRACT_STATUS_Canceled_mean,NAME_CONTRACT_STATUS_Completed_mean,NAME_CONTRACT_STATUS_Demand_mean,NAME_CONTRACT_STATUS_Returned to the store_mean,NAME_CONTRACT_STATUS_Signed_mean,NAME_CONTRACT_STATUS_XNA_mean,NAME_CONTRACT_STATUS_nan_mean,SK_ID_PREV_count,SK_ID_PREV_nunique
0,100001,-72.555556,20.863312,-96,-53,4.0,0.0,4.0,4.0,1.444336,...,0.0,0.0,0.222222,0.0,0.0,0.0,0.0,0.0,9,2
1,100002,-10.0,5.627314,-19,-1,24.0,0.0,24.0,24.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19,1
2,100003,-43.785714,24.640162,-77,-18,10.109375,2.806597,6.0,12.0,5.785156,...,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,28,3
3,100004,-25.5,1.290994,-27,-24,3.75,0.5,3.0,4.0,2.25,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,4,1
4,100005,-20.0,3.316625,-25,-15,11.703125,0.948683,9.0,12.0,7.199219,...,0.0,0.0,0.090909,0.0,0.0,0.090909,0.0,0.0,11,1


#### スクリプト7-39: ③SK_ID_CURRをキーにして結合

In [60]:
# Left-join the aggregated POS_CASH features onto the training table by SK_ID_CURR.
df_train = pd.merge(application_train, pos_ohe_agg, on="SK_ID_CURR", how="left")
print(df_train.shape)
df_train.head()

(307511, 164)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,NAME_CONTRACT_STATUS_Approved_mean,NAME_CONTRACT_STATUS_Canceled_mean,NAME_CONTRACT_STATUS_Completed_mean,NAME_CONTRACT_STATUS_Demand_mean,NAME_CONTRACT_STATUS_Returned to the store_mean,NAME_CONTRACT_STATUS_Signed_mean,NAME_CONTRACT_STATUS_XNA_mean,NAME_CONTRACT_STATUS_nan_mean,SK_ID_PREV_count,SK_ID_PREV_nunique
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,28.0,3.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,4.0,1.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0.0,0.0,0.095238,0.0,0.047619,0.0,0.0,0.0,21.0,3.0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0.0,0.0,0.045455,0.0,0.0,0.015152,0.0,0.0,66.0,5.0


#### スクリプト7-40: データセット作成

In [61]:
# Split the merged training table into ID column, target, and features.
id_train = df_train[["SK_ID_CURR"]]
y_train = df_train["TARGET"]
x_train = df_train.drop(columns=["TARGET", "SK_ID_CURR"])

# Cast every object-typed feature column to the "category" dtype.
object_cols = [name for name in x_train.columns if x_train[name].dtype == "O"]
for name in object_cols:
    x_train[name] = x_train[name].astype("category")

#### スクリプト7-41: モデル学習

In [62]:
# Retrain on all 5 folds with the POS_CASH aggregate features added.
train_oof, imp, metrics = train_lgb(x_train,
                                    y_train,
                                    id_train,
                                    params,
                                    list_nfold=[0,1,2,3,4],
                                    n_splits=5,
                                   )

-------------------- 0 --------------------
(246008, 162) (61503, 162)
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.795088	valid_1's auc: 0.765593
[200]	training's auc: 0.825361	valid_1's auc: 0.770839
[300]	training's auc: 0.848039	valid_1's auc: 0.771873
[400]	training's auc: 0.866707	valid_1's auc: 0.771772
Early stopping, best iteration is:
[351]	training's auc: 0.858291	valid_1's auc: 0.772055
[auc] tr:0.8583, va:0.7721
-------------------- 1 --------------------
(246009, 162) (61502, 162)
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.794927	valid_1's auc: 0.770144
[200]	training's auc: 0.82505	valid_1's auc: 0.775788
[300]	training's auc: 0.846934	valid_1's auc: 0.77623
Early stopping, best iteration is:
[283]	training's auc: 0.843236	valid_1's auc: 0.776499
[auc] tr:0.8432, va:0.7765
-------------------- 2 --------------------
(246009, 162) (61502, 162)
Training until validation scores don't improve

#### スクリプト7-42: 説明変数の重要度の確認

In [63]:
# Top 10 features by mean importance across folds.
imp.sort_values("imp", ascending=False)[:10]

Unnamed: 0,col,imp,imp_std
52,EXT_SOURCE_mean,112696.230138,1466.943131
134,ORGANIZATION_TYPE,21907.995104,3164.458317
10,ANNUITY_div_CREDIT,18585.842348,1070.211982
49,EXT_SOURCE_3,10649.690041,1772.668033
53,EXT_SOURCE_min,7366.997531,720.317856
32,DAYS_BIRTH,6943.280226,1240.753351
47,EXT_SOURCE_1,6385.091305,945.13895
21,CNT_INSTALMENT_FUTURE_mean,6384.904319,1198.580535
108,MONTHS_BALANCE_std,5684.483723,758.402778
0,AMT_ANNUITY,5544.795909,1023.177548


#### スクリプト7-43: 推論用のデータセット作成

In [64]:
# Left-join the aggregated POS_CASH features onto the test table by SK_ID_CURR.
df_test = pd.merge(application_test, pos_ohe_agg, on="SK_ID_CURR", how="left")

# Split into ID column and feature matrix.
id_test = df_test[["SK_ID_CURR"]]
x_test = df_test.drop(columns=["SK_ID_CURR"])

# Cast every object-typed feature column to the "category" dtype.
object_cols = [name for name in x_test.columns if x_test[name].dtype == "O"]
for name in object_cols:
    x_test[name] = x_test[name].astype("category")

#### スクリプト7-44: 推論用データセットを用いた推論処理

In [65]:
# Predict on the test set with the models of all 5 folds.
test_pred = predict_lgb(x_test,
                        id_test,
                        list_nfold=[0,1,2,3,4],
                       )

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.


#### スクリプト7-45: 提出ファイルの作成

In [66]:
# Rename the prediction column to TARGET and write the submission file (no index column).
df_submit = test_pred.rename(columns={"pred":"TARGET"})
print(df_submit.shape)
display(df_submit.head())
df_submit.to_csv("../submit/notebooks/submission_FeatureEngineering2.csv", index=None)

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.035337
1,100005,0.111115
2,100013,0.027314
3,100028,0.043957
4,100038,0.209936


# 7.5 モデルチューニング
- LightGBMのハイパーパラメータのチューニング

#### スクリプト7-46: 重要度を用いて絞り込んだ特徴量リストの作成（以降では利用しない）

In [67]:
# Names of the 100 features with the highest mean importance, sorted alphabetically.
top_features = imp.sort_values("imp", ascending=False)[:100]["col"]
col_filter = sorted(top_features)
# col_filter

## 7.5.1 optunaによる自動チューニングの実行

#### スクリプト7-47: optunaライブラリのインポート

In [68]:
import optuna

#### スクリプト7-48: 学習用のデータセット作成

In [69]:
# Split the merged training table into ID column, target, and features.
id_train = df_train[["SK_ID_CURR"]]
y_train = df_train["TARGET"]
x_train = df_train.drop(columns=["TARGET", "SK_ID_CURR"])

# Cast every object-typed feature column to the "category" dtype.
object_cols = [name for name in x_train.columns if x_train[name].dtype == "O"]
for name in object_cols:
    x_train[name] = x_train[name].astype("category")

#### スクリプト7-49: 目的関数の定義

In [70]:
# Hyperparameters that are held fixed (not searched)
params_base = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "verbosity": -1,
    "learning_rate": 0.05,
    "n_estimators": 100000,
    "bagging_freq": 1,
    "random_state": 123,
}

# Objective function for the Optuna study
def objective(trial):
    """Return the validation AUC of a LightGBM model for one Optuna trial.

    Hyperparameters are sampled from ``trial``, merged with the module-level
    ``params_base`` (which takes precedence on shared keys), and a model is
    trained on the module-level ``x_train`` / ``y_train`` using the first fold
    of a stratified 5-fold split.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean validation ROC-AUC over the evaluated folds.
    """
    # Hyperparameters to search
    params_tuning = {
        "num_leaves": trial.suggest_int("num_leaves", 8, 256),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 200),
        "min_sum_hessian_in_leaf": trial.suggest_float("min_sum_hessian_in_leaf", 1e-5, 1e-2, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-2, 1e+2, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-2, 1e+2, log=True),
    }
    params_tuning.update(params_base)

    # Train and evaluate the model
    list_metrics = []
    cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x_train, y_train))
    list_fold = [0]  # Only the first fold, to keep each trial fast.
    for nfold in list_fold:
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        # StratifiedKFold.split yields positional indices, so select with .iloc.
        x_tr, y_tr = x_train.iloc[idx_tr, :], y_train.iloc[idx_tr]
        x_va, y_va = x_train.iloc[idx_va, :], y_train.iloc[idx_va]
        model = lgb.LGBMClassifier(**params_tuning)
        # Silent early stopping via callback: the `early_stopping_rounds` /
        # `verbose` fit() arguments were deprecated in LightGBM 3.3 and
        # removed in 4.0.
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr,y_tr), (x_va,y_va)],
                  callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)],
                 )
        y_va_pred = model.predict_proba(x_va)[:,1]
        metric_va = roc_auc_score(y_va, y_va_pred)  # AUC is the evaluation metric
        list_metrics.append(metric_va)

    # Aggregate the per-fold scores
    metrics = np.mean(list_metrics)

    return metrics

#### スクリプト7-50: 最適化処理（探索の実行）
- 注意点
    - optunaでは，並列実行（n_jobs>1）するとシードを固定しても探索結果が再現されません。このため，これ以降の結果は書籍と異なります。
    - 再現性を求める場合は，並列化なし（n_jobs=1）としてください。

In [71]:
# Seeded TPE sampler; the study maximizes the value returned by `objective`
sampler = optuna.samplers.TPESampler(seed=123)
study = optuna.create_study(direction="maximize", sampler=sampler)
# Run 50 trials with n_jobs=5
study.optimize(
    objective,
    n_jobs=5,
    n_trials=50,
)

[32m[I 2022-08-21 21:34:54,268][0m A new study created in memory with name: no-name-380d55aa-9ff4-4984-b4cf-befc2febcb2f[0m




[32m[I 2022-08-21 21:38:34,137][0m Trial 2 finished with value: 0.7723815514715714 and parameters: {'num_leaves': 42, 'min_child_samples': 48, 'min_sum_hessian_in_leaf': 5.48452014988092e-05, 'feature_fraction': 0.9043259491835924, 'bagging_fraction': 0.906359643839176, 'lambda_l1': 15.716753441011617, 'lambda_l2': 21.463911049577813}. Best is trial 2 with value: 0.7723815514715714.[0m




[32m[I 2022-08-21 21:38:42,195][0m Trial 4 finished with value: 0.770205927323804 and parameters: {'num_leaves': 95, 'min_child_samples': 140, 'min_sum_hessian_in_leaf': 1.680280298742381e-05, 'feature_fraction': 0.9239728168223107, 'bagging_fraction': 0.8639423753629069, 'lambda_l1': 0.04050998057439114, 'lambda_l2': 0.2805297379820472}. Best is trial 2 with value: 0.7723815514715714.[0m




[32m[I 2022-08-21 21:41:34,208][0m Trial 0 finished with value: 0.7683445407605262 and parameters: {'num_leaves': 223, 'min_child_samples': 61, 'min_sum_hessian_in_leaf': 8.188012497463723e-05, 'feature_fraction': 0.5397040294456684, 'bagging_fraction': 0.9679591669887634, 'lambda_l1': 0.49244525250715315, 'lambda_l2': 0.047432327623995234}. Best is trial 2 with value: 0.7723815514715714.[0m




[32m[I 2022-08-21 21:42:21,280][0m Trial 3 finished with value: 0.7707029435273274 and parameters: {'num_leaves': 80, 'min_child_samples': 153, 'min_sum_hessian_in_leaf': 8.347220513308408e-05, 'feature_fraction': 0.5870319296708764, 'bagging_fraction': 0.5537508294943094, 'lambda_l1': 91.22214347058602, 'lambda_l2': 0.6980850615572954}. Best is trial 2 with value: 0.7723815514715714.[0m




[32m[I 2022-08-21 21:42:54,177][0m Trial 1 finished with value: 0.7696970590803353 and parameters: {'num_leaves': 256, 'min_child_samples': 193, 'min_sum_hessian_in_leaf': 0.003791879258201651, 'feature_fraction': 0.7493588391159747, 'bagging_fraction': 0.9170175332611694, 'lambda_l1': 0.06495342460248797, 'lambda_l2': 1.0144298733024533}. Best is trial 2 with value: 0.7723815514715714.[0m




[32m[I 2022-08-21 21:44:49,350][0m Trial 6 finished with value: 0.7716175170371739 and parameters: {'num_leaves': 184, 'min_child_samples': 155, 'min_sum_hessian_in_leaf': 1.913416784830372e-05, 'feature_fraction': 0.8620390787205185, 'bagging_fraction': 0.8649616391001265, 'lambda_l1': 5.184428046558282, 'lambda_l2': 48.165441146964}. Best is trial 2 with value: 0.7723815514715714.[0m




[32m[I 2022-08-21 21:45:34,984][0m Trial 5 finished with value: 0.7709259627965641 and parameters: {'num_leaves': 165, 'min_child_samples': 57, 'min_sum_hessian_in_leaf': 3.297325495339889e-05, 'feature_fraction': 0.7965720493309374, 'bagging_fraction': 0.962498848817301, 'lambda_l1': 0.01859106961325507, 'lambda_l2': 13.828628749394621}. Best is trial 2 with value: 0.7723815514715714.[0m




[32m[I 2022-08-21 21:46:26,283][0m Trial 7 finished with value: 0.7714110663996734 and parameters: {'num_leaves': 150, 'min_child_samples': 110, 'min_sum_hessian_in_leaf': 4.1480225645145224e-05, 'feature_fraction': 0.9862171045107081, 'bagging_fraction': 0.9718332909528424, 'lambda_l1': 10.644070295927321, 'lambda_l2': 0.016482547045625122}. Best is trial 2 with value: 0.7723815514715714.[0m




[32m[I 2022-08-21 21:48:37,376][0m Trial 10 finished with value: 0.7727876236631409 and parameters: {'num_leaves': 38, 'min_child_samples': 98, 'min_sum_hessian_in_leaf': 0.002850601245173647, 'feature_fraction': 0.9853969801852647, 'bagging_fraction': 0.9007943724981138, 'lambda_l1': 1.4374313799719816, 'lambda_l2': 61.87523451399158}. Best is trial 10 with value: 0.7727876236631409.[0m




[32m[I 2022-08-21 21:48:46,489][0m Trial 9 finished with value: 0.7692262940587651 and parameters: {'num_leaves': 206, 'min_child_samples': 156, 'min_sum_hessian_in_leaf': 0.0020923782772411517, 'feature_fraction': 0.7478516013363387, 'bagging_fraction': 0.7165852232540617, 'lambda_l1': 3.103731814449755, 'lambda_l2': 7.571708344736487}. Best is trial 10 with value: 0.7727876236631409.[0m




[32m[I 2022-08-21 21:49:04,859][0m Trial 8 finished with value: 0.7698051915782332 and parameters: {'num_leaves': 197, 'min_child_samples': 187, 'min_sum_hessian_in_leaf': 0.003819760032630982, 'feature_fraction': 0.9583625483675053, 'bagging_fraction': 0.6935068486513007, 'lambda_l1': 27.368285231713507, 'lambda_l2': 1.00806822133648}. Best is trial 10 with value: 0.7727876236631409.[0m




[32m[I 2022-08-21 21:51:56,174][0m Trial 14 finished with value: 0.7735956463720343 and parameters: {'num_leaves': 8, 'min_child_samples': 22, 'min_sum_hessian_in_leaf': 0.0008537367695460064, 'feature_fraction': 0.6618652337303669, 'bagging_fraction': 0.7316326062463285, 'lambda_l1': 0.4757958376528302, 'lambda_l2': 3.9882014023017227}. Best is trial 14 with value: 0.7735956463720343.[0m




[32m[I 2022-08-21 21:52:16,503][0m Trial 11 finished with value: 0.7689225263105846 and parameters: {'num_leaves': 161, 'min_child_samples': 139, 'min_sum_hessian_in_leaf': 0.002759576180776497, 'feature_fraction': 0.7121905945353033, 'bagging_fraction': 0.8249893233238543, 'lambda_l1': 0.028989117784750305, 'lambda_l2': 0.12631235500930701}. Best is trial 14 with value: 0.7735956463720343.[0m




[32m[I 2022-08-21 21:53:21,194][0m Trial 15 finished with value: 0.7743778489470154 and parameters: {'num_leaves': 11, 'min_child_samples': 25, 'min_sum_hessian_in_leaf': 0.0005377484945380236, 'feature_fraction': 0.8700519646182532, 'bagging_fraction': 0.8045252953849011, 'lambda_l1': 0.6119046221190197, 'lambda_l2': 97.2197733485338}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 21:55:10,895][0m Trial 12 finished with value: 0.7707402380888513 and parameters: {'num_leaves': 166, 'min_child_samples': 106, 'min_sum_hessian_in_leaf': 0.0010572567404040878, 'feature_fraction': 0.5274996492216553, 'bagging_fraction': 0.7728409310636082, 'lambda_l1': 12.052854078897502, 'lambda_l2': 6.8779315156364795}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 21:56:53,093][0m Trial 16 finished with value: 0.7742569275030987 and parameters: {'num_leaves': 14, 'min_child_samples': 7, 'min_sum_hessian_in_leaf': 0.0006370656145601116, 'feature_fraction': 0.6568605137864092, 'bagging_fraction': 0.6213237656233003, 'lambda_l1': 0.46294047928162146, 'lambda_l2': 96.86920831027837}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 21:57:09,461][0m Trial 18 finished with value: 0.7735479496594311 and parameters: {'num_leaves': 17, 'min_child_samples': 13, 'min_sum_hessian_in_leaf': 0.0005374585600936909, 'feature_fraction': 0.6375425703752879, 'bagging_fraction': 0.6167245335046145, 'lambda_l1': 0.2687709640587736, 'lambda_l2': 4.09992535314581}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 21:57:14,971][0m Trial 17 finished with value: 0.773915181216337 and parameters: {'num_leaves': 18, 'min_child_samples': 8, 'min_sum_hessian_in_leaf': 0.0007083324760446812, 'feature_fraction': 0.6476982553618769, 'bagging_fraction': 0.6366533872241327, 'lambda_l1': 0.451463794135174, 'lambda_l2': 95.6242124403054}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 21:58:56,454][0m Trial 19 finished with value: 0.7733959785070184 and parameters: {'num_leaves': 10, 'min_child_samples': 15, 'min_sum_hessian_in_leaf': 0.0002913867240366815, 'feature_fraction': 0.6598927001875795, 'bagging_fraction': 0.6334560805088645, 'lambda_l1': 0.20097462144205303, 'lambda_l2': 4.308715813475768}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 21:59:14,569][0m Trial 13 finished with value: 0.768148919759766 and parameters: {'num_leaves': 236, 'min_child_samples': 150, 'min_sum_hessian_in_leaf': 0.0021607205429487325, 'feature_fraction': 0.8466180549901794, 'bagging_fraction': 0.9330100185353658, 'lambda_l1': 0.013761581861878694, 'lambda_l2': 2.5873069322178197}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 22:01:44,991][0m Trial 22 finished with value: 0.769621650609771 and parameters: {'num_leaves': 91, 'min_child_samples': 34, 'min_sum_hessian_in_leaf': 0.009656101907847128, 'feature_fraction': 0.8269598949103292, 'bagging_fraction': 0.5292011024479542, 'lambda_l1': 0.13145339432513747, 'lambda_l2': 25.96489595798558}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 22:02:10,620][0m Trial 20 finished with value: 0.7737752615971784 and parameters: {'num_leaves': 8, 'min_child_samples': 7, 'min_sum_hessian_in_leaf': 0.0002616829846359229, 'feature_fraction': 0.6384579130093758, 'bagging_fraction': 0.6209132714820212, 'lambda_l1': 0.15212155658205656, 'lambda_l2': 65.50106228300132}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 22:02:32,380][0m Trial 21 finished with value: 0.7703975976445826 and parameters: {'num_leaves': 101, 'min_child_samples': 36, 'min_sum_hessian_in_leaf': 0.00022286335046331693, 'feature_fraction': 0.837722806918509, 'bagging_fraction': 0.6485046104509826, 'lambda_l1': 0.13577554664706304, 'lambda_l2': 68.17221152510142}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 22:04:12,613][0m Trial 24 finished with value: 0.7698946714517986 and parameters: {'num_leaves': 92, 'min_child_samples': 35, 'min_sum_hessian_in_leaf': 0.0002263021076537235, 'feature_fraction': 0.7883119056201539, 'bagging_fraction': 0.5008753654809175, 'lambda_l1': 0.09946189132071201, 'lambda_l2': 25.795535011686788}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 22:04:25,623][0m Trial 23 finished with value: 0.7679614815470293 and parameters: {'num_leaves': 97, 'min_child_samples': 36, 'min_sum_hessian_in_leaf': 0.009591191054901827, 'feature_fraction': 0.8263722048810497, 'bagging_fraction': 0.506500111952988, 'lambda_l1': 0.08876904032962252, 'lambda_l2': 27.456180865036266}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 22:06:06,701][0m Trial 26 finished with value: 0.7723215004233712 and parameters: {'num_leaves': 55, 'min_child_samples': 76, 'min_sum_hessian_in_leaf': 0.0001621890040525774, 'feature_fraction': 0.709648124306598, 'bagging_fraction': 0.7837580614950457, 'lambda_l1': 0.8362953493097847, 'lambda_l2': 73.48235991742617}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 22:06:51,334][0m Trial 25 finished with value: 0.7725751775392479 and parameters: {'num_leaves': 57, 'min_child_samples': 6, 'min_sum_hessian_in_leaf': 0.00022628702977767887, 'feature_fraction': 0.5963809969244953, 'bagging_fraction': 0.6298445712145154, 'lambda_l1': 1.1485240167796371, 'lambda_l2': 71.45012425188267}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 22:07:31,447][0m Trial 27 finished with value: 0.7725169860536721 and parameters: {'num_leaves': 56, 'min_child_samples': 75, 'min_sum_hessian_in_leaf': 0.0005382052780821857, 'feature_fraction': 0.5889218291922617, 'bagging_fraction': 0.7946029314001878, 'lambda_l1': 1.0125480729560796, 'lambda_l2': 96.70883077409039}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 22:09:02,381][0m Trial 28 finished with value: 0.7727382098831337 and parameters: {'num_leaves': 48, 'min_child_samples': 74, 'min_sum_hessian_in_leaf': 0.0005396332277942413, 'feature_fraction': 0.6955844817303791, 'bagging_fraction': 0.77320309738644, 'lambda_l1': 0.9421152160273982, 'lambda_l2': 99.85221188382968}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 22:09:19,867][0m Trial 29 finished with value: 0.7715146390505231 and parameters: {'num_leaves': 55, 'min_child_samples': 78, 'min_sum_hessian_in_leaf': 0.0005220392266875909, 'feature_fraction': 0.5885879784580903, 'bagging_fraction': 0.5811970231350583, 'lambda_l1': 0.9208759962726251, 'lambda_l2': 75.88913346000611}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 22:10:18,609][0m Trial 31 finished with value: 0.7720608552912233 and parameters: {'num_leaves': 34, 'min_child_samples': 78, 'min_sum_hessian_in_leaf': 0.0006280765446565473, 'feature_fraction': 0.6998672667670904, 'bagging_fraction': 0.5818869874266809, 'lambda_l1': 2.601715296568575, 'lambda_l2': 13.158035397868995}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 22:35:15,490][0m Trial 30 finished with value: 0.7712849225059338 and parameters: {'num_leaves': 66, 'min_child_samples': 76, 'min_sum_hessian_in_leaf': 0.0005772052474626425, 'feature_fraction': 0.5909683742127303, 'bagging_fraction': 0.5801630596092617, 'lambda_l1': 2.4726434246074382, 'lambda_l2': 91.07248537769459}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 22:37:06,610][0m Trial 32 finished with value: 0.769152855584621 and parameters: {'num_leaves': 121, 'min_child_samples': 23, 'min_sum_hessian_in_leaf': 0.0011694591436236309, 'feature_fraction': 0.696155685443899, 'bagging_fraction': 0.5786623013489685, 'lambda_l1': 2.585091633053706, 'lambda_l2': 12.21106402728719}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 22:37:24,514][0m Trial 34 finished with value: 0.7729377673143538 and parameters: {'num_leaves': 29, 'min_child_samples': 24, 'min_sum_hessian_in_leaf': 0.001189586086519933, 'feature_fraction': 0.5583392496388081, 'bagging_fraction': 0.6859971415305488, 'lambda_l1': 0.35410464652622836, 'lambda_l2': 9.242984865681791}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 22:38:19,704][0m Trial 35 finished with value: 0.7738965214672434 and parameters: {'num_leaves': 24, 'min_child_samples': 27, 'min_sum_hessian_in_leaf': 0.0010854048215552905, 'feature_fraction': 0.6313249468259439, 'bagging_fraction': 0.679286485899536, 'lambda_l1': 0.3112176052743112, 'lambda_l2': 34.24149165193612}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 22:38:33,327][0m Trial 33 finished with value: 0.7701037511261131 and parameters: {'num_leaves': 124, 'min_child_samples': 24, 'min_sum_hessian_in_leaf': 0.0010714299685246624, 'feature_fraction': 0.5728186836546812, 'bagging_fraction': 0.6839786197906818, 'lambda_l1': 2.4347841662054392, 'lambda_l2': 10.185296917613021}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 22:39:44,150][0m Trial 36 finished with value: 0.7729688918328401 and parameters: {'num_leaves': 24, 'min_child_samples': 23, 'min_sum_hessian_in_leaf': 0.00012205367773072142, 'feature_fraction': 0.6299131067618229, 'bagging_fraction': 0.6769058599490055, 'lambda_l1': 0.33717136037550877, 'lambda_l2': 42.5427504666616}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 22:41:29,655][0m Trial 37 finished with value: 0.7733490655181268 and parameters: {'num_leaves': 26, 'min_child_samples': 7, 'min_sum_hessian_in_leaf': 0.00012425892083694424, 'feature_fraction': 0.6354557011290742, 'bagging_fraction': 0.6855657760434649, 'lambda_l1': 0.44557725629656164, 'lambda_l2': 38.760723520938335}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 22:42:52,627][0m Trial 38 finished with value: 0.7728548422209206 and parameters: {'num_leaves': 23, 'min_child_samples': 6, 'min_sum_hessian_in_leaf': 9.11884557943438e-05, 'feature_fraction': 0.6389002825903655, 'bagging_fraction': 0.6651164523515503, 'lambda_l1': 0.5556614027922642, 'lambda_l2': 37.72787263039407}. Best is trial 15 with value: 0.7743778489470154.[0m




[32m[I 2022-08-21 22:43:09,478][0m Trial 40 finished with value: 0.7744227741275845 and parameters: {'num_leaves': 25, 'min_child_samples': 48, 'min_sum_hessian_in_leaf': 0.00011576211875463494, 'feature_fraction': 0.5052669693699762, 'bagging_fraction': 0.6515519634968636, 'lambda_l1': 0.5969701115584823, 'lambda_l2': 40.42049721652675}. Best is trial 40 with value: 0.7744227741275845.[0m




[32m[I 2022-08-21 22:43:45,721][0m Trial 39 finished with value: 0.7744757965990453 and parameters: {'num_leaves': 26, 'min_child_samples': 46, 'min_sum_hessian_in_leaf': 0.00013889526566401947, 'feature_fraction': 0.5105983538090473, 'bagging_fraction': 0.6676157659858872, 'lambda_l1': 0.38362766323687636, 'lambda_l2': 39.72397156052672}. Best is trial 39 with value: 0.7744757965990453.[0m




[32m[I 2022-08-21 22:45:07,101][0m Trial 41 finished with value: 0.7726356988216749 and parameters: {'num_leaves': 72, 'min_child_samples': 52, 'min_sum_hessian_in_leaf': 0.0003756593404049351, 'feature_fraction': 0.5018564168469755, 'bagging_fraction': 0.7331042365408436, 'lambda_l1': 0.0460083311275667, 'lambda_l2': 42.968253699991706}. Best is trial 39 with value: 0.7744757965990453.[0m




[32m[I 2022-08-21 23:16:15,997][0m Trial 45 finished with value: 0.7740011663946255 and parameters: {'num_leaves': 42, 'min_child_samples': 58, 'min_sum_hessian_in_leaf': 5.6419929130068696e-05, 'feature_fraction': 0.5151309845848788, 'bagging_fraction': 0.7139232256788839, 'lambda_l1': 0.05846319482193868, 'lambda_l2': 20.327663533529226}. Best is trial 39 with value: 0.7744757965990453.[0m




[32m[I 2022-08-21 23:16:48,916][0m Trial 44 finished with value: 0.7722637898591638 and parameters: {'num_leaves': 69, 'min_child_samples': 51, 'min_sum_hessian_in_leaf': 1.0811726426435334e-05, 'feature_fraction': 0.8957080658568564, 'bagging_fraction': 0.8311059959309851, 'lambda_l1': 0.07008737795111225, 'lambda_l2': 18.614918849733407}. Best is trial 39 with value: 0.7744757965990453.[0m




[32m[I 2022-08-21 23:16:57,895][0m Trial 42 finished with value: 0.7721495122548917 and parameters: {'num_leaves': 78, 'min_child_samples': 51, 'min_sum_hessian_in_leaf': 0.0016839622609232396, 'feature_fraction': 0.7415269109179271, 'bagging_fraction': 0.7439692935261962, 'lambda_l1': 0.06521264419856194, 'lambda_l2': 21.35980731909481}. Best is trial 39 with value: 0.7744757965990453.[0m




[32m[I 2022-08-21 23:17:54,002][0m Trial 43 finished with value: 0.7719666944496723 and parameters: {'num_leaves': 74, 'min_child_samples': 52, 'min_sum_hessian_in_leaf': 0.000399940301694095, 'feature_fraction': 0.9095119535993936, 'bagging_fraction': 0.7332540411938898, 'lambda_l1': 0.050727657901384136, 'lambda_l2': 20.34289028817362}. Best is trial 39 with value: 0.7744757965990453.[0m
[32m[I 2022-08-21 23:18:20,069][0m Trial 46 finished with value: 0.7729061226883134 and parameters: {'num_leaves': 41, 'min_child_samples': 48, 'min_sum_hessian_in_leaf': 4.768461476953447e-05, 'feature_fraction': 0.5229989238889968, 'bagging_fraction': 0.603379945749202, 'lambda_l1': 0.6506276764727431, 'lambda_l2': 18.41821758774759}. Best is trial 39 with value: 0.7744757965990453.[0m
[32m[I 2022-08-21 23:20:18,921][0m Trial 48 finished with value: 0.7731864179113358 and parameters: {'num_leaves': 38, 'min_child_samples': 63, 'min_sum_hessian_in_leaf': 6.13838143662946e-05, 'feature_fracti

#### スクリプト7-51: 探索結果の確認

In [72]:
# Report the best trial. The objective returns ROC-AUC (the study maximizes
# it), so the value is labelled "auc" rather than "acc".
trial = study.best_trial
print("auc(best)={:.4f}".format(trial.value))
display(trial.params)

acc(best)=0.7745


{'num_leaves': 26,
 'min_child_samples': 46,
 'min_sum_hessian_in_leaf': 0.00013889526566401947,
 'feature_fraction': 0.5105983538090473,
 'bagging_fraction': 0.6676157659858872,
 'lambda_l1': 0.38362766323687636,
 'lambda_l2': 39.72397156052672}

#### スクリプト7-52: ベストなハイパーパラメータの取得

In [73]:
# Combine the best searched hyperparameters with the fixed ones.
# A new dict is built so that trial.params itself is not mutated; key order
# (searched parameters first, then fixed ones) is the same as with update().
params_best = {**trial.params, **params_base}
display(params_best)

{'num_leaves': 26,
 'min_child_samples': 46,
 'min_sum_hessian_in_leaf': 0.00013889526566401947,
 'feature_fraction': 0.5105983538090473,
 'bagging_fraction': 0.6676157659858872,
 'lambda_l1': 0.38362766323687636,
 'lambda_l2': 39.72397156052672,
 'boosting_type': 'gbdt',
 'objective': 'binary',
 'metric': 'auc',
 'verbosity': -1,
 'learning_rate': 0.05,
 'n_estimators': 100000,
 'bagging_freq': 1,
 'random_state': 123}

#### スクリプト7-53: ベストなハイパーパラメータを用いたモデル学習

In [74]:
# Train on folds 0-4 of a 5-way split using the tuned hyperparameters
train_oof, imp, metrics = train_lgb(
    x_train,
    y_train,
    id_train,
    list_nfold=list(range(5)),
    n_splits=5,
    params=params_best,
)

-------------------- 0 --------------------
(246008, 162) (61503, 162)
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.780189	valid_1's auc: 0.763435
[200]	training's auc: 0.797356	valid_1's auc: 0.770639
[300]	training's auc: 0.809694	valid_1's auc: 0.772752
[400]	training's auc: 0.819639	valid_1's auc: 0.773702
[500]	training's auc: 0.828387	valid_1's auc: 0.774074
[600]	training's auc: 0.836468	valid_1's auc: 0.774455
[700]	training's auc: 0.843895	valid_1's auc: 0.774326
Early stopping, best iteration is:
[605]	training's auc: 0.836923	valid_1's auc: 0.774476
[auc] tr:0.8369, va:0.7745
-------------------- 1 --------------------
(246009, 162) (61502, 162)
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.779923	valid_1's auc: 0.766506
[200]	training's auc: 0.79712	valid_1's auc: 0.774007
[300]	training's auc: 0.8095	valid_1's auc: 0.776878
[400]	training's auc: 0.819701	valid_1's auc: 0.777985
[500]	training'

#### スクリプト7-54: 推論データ作成とモデル推論

In [75]:
# Build the inference dataset: the ID column on its own, and every other column as features
id_test = df_test[["SK_ID_CURR"]]
x_test = df_test.drop("SK_ID_CURR", axis=1)

# Convert object-dtype columns to the pandas category dtype
for col in x_test.columns:
    if x_test[col].dtype != "O":
        continue
    x_test[col] = x_test[col].astype("category")

# Run inference on the test features for folds 0-4
test_pred = predict_lgb(
    x_test,
    id_test,
    list_nfold=list(range(5)),
)

# Build the submission file: rename the "pred" column to "TARGET"
submit_path = "../submit/notebooks/submission_HyperParameterTuning.csv"
df_submit = test_pred.rename(columns={"pred": "TARGET"})
print(df_submit.shape)
display(df_submit.head())
# Write the CSV without the row index
df_submit.to_csv(submit_path, index=False)

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.
(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.043273
1,100005,0.128469
2,100013,0.029332
3,100028,0.048093
4,100038,0.217571
