In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/home-credit-default-risk/sample_submission.csv
/kaggle/input/home-credit-default-risk/bureau_balance.csv
/kaggle/input/home-credit-default-risk/POS_CASH_balance.csv
/kaggle/input/home-credit-default-risk/application_train.csv
/kaggle/input/home-credit-default-risk/HomeCredit_columns_description.csv
/kaggle/input/home-credit-default-risk/application_test.csv
/kaggle/input/home-credit-default-risk/previous_application.csv
/kaggle/input/home-credit-default-risk/credit_card_balance.csv
/kaggle/input/home-credit-default-risk/installments_payments.csv
/kaggle/input/home-credit-default-risk/bureau.csv


In [2]:
#ライブラリの読み込み
import numpy as np
import pandas as pd
import re
import pickle
import gc

# scikit-learn
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# LightGBM
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

In [3]:
# ファイルの読み込み・データ確認
application_train = pd.read_csv("../input/home-credit-default-risk/application_train.csv")
print(application_train.shape)
application_train.head()

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# メモリ削減のための関数
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
                    
        else:
            pass
        
    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [5]:
#メモリ削減の実行
application_train = reduce_mem_usage(application_train)

Memory usage of dataframe is 286.23 MB
Memory usage after optimization is: 92.38 MB
Decreased by 67.7%


In [6]:
#データセットの作成
x_train = application_train.drop(columns = ["TARGET", "SK_ID_CURR"])
y_train = application_train["TARGET"]
id_train = application_train[["SK_ID_CURR"]]

In [7]:
for col in x_train.columns:
    if x_train[col].dtype=="O":
        x_train[col] = x_train[col].astype("category")

In [8]:
# １の割合とそれぞれの件数を確認
print("mean: {:.4f}".format(y_train.mean()))
y_train.value_counts()

mean: 0.0807


0    282686
1     24825
Name: TARGET, dtype: int64

In [9]:
# foldごとのindexのリスト作成
cv = list(StratifiedKFold(n_splits = 5, shuffle = True, random_state = 123).split(x_train, y_train))

# 0fold目のindexのリスト取得
nfold = 0
idx_tr, idx_va = cv[nfold][0], cv[nfold][1]

# 学習データと検証データに分離
x_tr, y_tr, id_tr = x_train.loc[idx_tr, :], y_train[idx_tr], id_train.loc[idx_tr, :]
x_va, y_va, id_va = x_train.loc[idx_va, :], y_train[idx_va], id_train.loc[idx_va, :]
print(x_tr.shape, y_tr.shape, id_tr.shape)
print(x_va.shape, y_va.shape, id_va.shape)


(246008, 120) (246008,) (246008, 1)
(61503, 120) (61503,) (61503, 1)


In [10]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary', 
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 32,
    'n_estimators': 100000,
    "random_state": 123,
    "importance_type": "gain",
}

# モデルの学習
model = lgb.LGBMClassifier(**params)
model.fit(x_tr,
          y_tr,
          eval_set=[(x_tr, y_tr), (x_va, y_va)],
          early_stopping_rounds=100,
          verbose=100
         )

# モデルの保存
with open("model_lgb_fold0.pickle", "wb") as f:
    pickle.dump(model, f, protocol=4)

[100]	training's auc: 0.782506	valid_1's auc: 0.755903
[200]	training's auc: 0.808961	valid_1's auc: 0.758356
[300]	training's auc: 0.829245	valid_1's auc: 0.757774


In [11]:
# 学習データの推論値取得とROC計算
y_tr_pred = model.predict_proba(x_tr)[:, 1]
metric_tr = roc_auc_score(y_tr, y_tr_pred)

# 検証データの推論値取得とROC計算
y_va_pred = model.predict_proba(x_va)[:, 1]
metric_va = roc_auc_score(y_va, y_va_pred)

# 評価値を入れる変数の作成
metrics = []

# 評価値を格納
metrics.append([nfold, metric_tr, metric_va])

# 結果の表示
print("[auc] tr:{:.4f}, va:{:.4f}".format(metric_tr, metric_va))

[auc] tr:0.8126, va:0.7586


In [12]:
# OOFの推論値を入れる変数の作成
train_oof = np.zeros(len(x_train))

# 検証データのindexに推論値を格納
train_oof[idx_va] = y_va_pred

In [13]:
# 重要度の取得
imp_fold = pd.DataFrame({"col": x_train.columns, "imp": model.feature_importances_,
                        "nfold": nfold})

# 確認
display(imp_fold.sort_values("imp", ascending = False)[:10])

# 重要度を格納する5-fold用データフレームの作成
imp = pd.DataFrame()
# imp_foldを5fold用データフレームに結合
imp = pd.concat([imp, imp_fold])

Unnamed: 0,col,imp,nfold
41,EXT_SOURCE_3,66225.020483,0
40,EXT_SOURCE_2,52568.833805,0
38,ORGANIZATION_TYPE,20218.523523,0
39,EXT_SOURCE_1,19776.252288,0
6,AMT_CREDIT,8111.321247,0
8,AMT_GOODS_PRICE,7120.960365,0
15,DAYS_BIRTH,7042.223005,0
7,AMT_ANNUITY,6992.551795,0
16,DAYS_EMPLOYED,5236.51412,0
26,OCCUPATION_TYPE,4376.651746,0


In [14]:
# モデル評価
# リスト型をarray型に変換
metrics = np.array(metrics)
print(metrics)

# 学習/検証データの評価値の平均値と標準偏差を算出
print("[cv] tr:{:.4f} +- {:.4f}, va:{:.4f} +- {:.4f}".format(
metrics[:, 1].mean(), metrics[:, 1].std(),
metrics[:, 2].mean(), metrics[:, 2].std(),
))

# oofの評価値を算出
print("[oof] {:.4f}".format(
roc_auc_score(y_train, train_oof)
))

[[0.         0.81257796 0.75859528]]
[cv] tr:0.8126 +- 0.0000, va:0.7586 +- 0.0000
[oof] 0.5103


In [15]:
# oofの推論値取得
train_oof = pd.concat([
    id_train,
    pd.DataFrame({"true": y_train, "pred": train_oof}),
], axis = 1)
train_oof.head()

Unnamed: 0,SK_ID_CURR,true,pred
0,100002,1,0.0
1,100003,0,0.0
2,100004,0,0.031866
3,100006,0,0.0
4,100007,0,0.0


In [16]:
# 説明変数の重要度取得
imp = imp.groupby("col")["imp"].agg(["mean", "std"]).reset_index(drop = False)
imp.columns = ["col", "imp", "imp_std"]
imp.head()

Unnamed: 0,col,imp,imp_std
0,AMT_ANNUITY,6992.551795,
1,AMT_CREDIT,8111.321247,
2,AMT_GOODS_PRICE,7120.960365,
3,AMT_INCOME_TOTAL,1595.740609,
4,AMT_REQ_CREDIT_BUREAU_DAY,128.842901,


In [17]:
# 学習関数の定義
def train_lgb(input_x,
              input_y,
              input_id,
              params,
              list_nfold=[0,1,2,3,4],
              n_splits=5,
             ):
    train_oof = np.zeros(len(input_x))
    metrics = []
    imp = pd.DataFrame()

    # cross-validation
    cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123).split(input_x, input_y))
    for nfold in list_nfold:
        print("-"*20, nfold, "-"*20)
        
        # make dataset
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        x_tr, y_tr, id_tr = input_x.loc[idx_tr, :], input_y[idx_tr], input_id.loc[idx_tr, :]
        x_va, y_va, id_va = input_x.loc[idx_va, :], input_y[idx_va], input_id.loc[idx_va, :]
        print(x_tr.shape, x_va.shape)
        
        # train
        model = lgb.LGBMClassifier(**params)
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr, y_tr), (x_va, y_va)],
                  early_stopping_rounds=100,
                  verbose=100
                 )
        fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
        with open(fname_lgb, "wb") as f:
            pickle.dump(model, f, protocol=4)
        
        # evaluate
        y_tr_pred = model.predict_proba(x_tr)[:,1]
        y_va_pred = model.predict_proba(x_va)[:,1]
        metric_tr = roc_auc_score(y_tr, y_tr_pred)
        metric_va = roc_auc_score(y_va, y_va_pred)
        metrics.append([nfold, metric_tr, metric_va])
        print("[auc] tr:{:.4f}, va:{:.4f}".format(metric_tr, metric_va))
        
        # oof
        train_oof[idx_va] = y_va_pred
        
        # imp
        _imp = pd.DataFrame({"col":input_x.columns, "imp":model.feature_importances_, "nfold":nfold})
        imp = pd.concat([imp, _imp])
      
    print("-"*20, "result", "-"*20)
    # metric
    metrics = np.array(metrics)
    print(metrics)
    print("[cv] tr:{:.4f}+-{:.4f}, va:{:.4f}+-{:.4f}".format(
        metrics[:,1].mean(), metrics[:,1].std(),
        metrics[:,2].mean(), metrics[:,2].std(),
    ))
    print("[oof] {:.4f}".format(
        roc_auc_score(input_y, train_oof)
    ))
    
    # oof
    train_oof = pd.concat([
        input_id,
        pd.DataFrame({"pred":train_oof})
    ], axis=1)
    
    # importance
    imp = imp.groupby("col")["imp"].agg(["mean", "std"]).reset_index(drop=False)
    imp.columns = ["col", "imp", "imp_std"]
    
    return train_oof, imp, metrics

In [18]:
# ハイパーパラメータの設定
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary', 
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 32,
    'n_estimators': 100000,
    "random_state": 123,
    "importance_type": "gain",
}

# 学習の実行
train_oof, imp, metrics = train_lgb(x_train,
                                    y_train,
                                    id_train,
                                    params,
                                    list_nfold=[0,1,2,3,4],
                                    n_splits=5,
                                   )

-------------------- 0 --------------------
(246008, 120) (61503, 120)
[100]	training's auc: 0.782506	valid_1's auc: 0.755903
[200]	training's auc: 0.808961	valid_1's auc: 0.758356
[300]	training's auc: 0.829245	valid_1's auc: 0.757774
[auc] tr:0.8126, va:0.7586
-------------------- 1 --------------------
(246009, 120) (61502, 120)
[100]	training's auc: 0.782531	valid_1's auc: 0.756239
[200]	training's auc: 0.808862	valid_1's auc: 0.758924
[300]	training's auc: 0.829564	valid_1's auc: 0.758779
[auc] tr:0.8170, va:0.7590
-------------------- 2 --------------------
(246009, 120) (61502, 120)
[100]	training's auc: 0.782101	valid_1's auc: 0.758221
[200]	training's auc: 0.809587	valid_1's auc: 0.760104
[300]	training's auc: 0.830474	valid_1's auc: 0.760275
[400]	training's auc: 0.847388	valid_1's auc: 0.759875
[auc] tr:0.8362, va:0.7604
-------------------- 3 --------------------
(246009, 120) (61502, 120)
[100]	training's auc: 0.783853	valid_1's auc: 0.754567
[200]	training's auc: 0.811501

In [19]:
imp.sort_values("imp", ascending = False)[:10]

Unnamed: 0,col,imp,imp_std
38,EXT_SOURCE_3,65353.907478,1558.201212
37,EXT_SOURCE_2,54545.388309,1251.798934
102,ORGANIZATION_TYPE,21441.917474,1450.24619
36,EXT_SOURCE_1,20051.934248,685.852224
1,AMT_CREDIT,8263.228728,410.384434
22,DAYS_BIRTH,7645.58911,689.458833
2,AMT_GOODS_PRICE,7263.054566,405.837031
0,AMT_ANNUITY,6762.95364,479.302045
23,DAYS_EMPLOYED,5810.288375,552.93773
101,OCCUPATION_TYPE,5502.675859,831.872392


In [20]:
# ファイルの読み込み
application_test = pd.read_csv("../input/home-credit-default-risk/application_test.csv")
application_test = reduce_mem_usage(application_test)

# データセットの作成
x_test = application_test.drop(columns=["SK_ID_CURR" ])
id_test = application_test[["SK_ID_CURR"]]

# カテゴリ変数をcategory型に変換
for col in x_test.columns:
    if x_test[col].dtype=="O":
        x_test[col] = x_test[col].astype("category")

Memory usage of dataframe is 45.00 MB
Memory usage after optimization is: 14.60 MB
Decreased by 67.6%


In [21]:
# 学習済みモデルの読み込み
with open("model_lgb_fold0.pickle", "rb") as f:
    model = pickle.load(f)

In [22]:
# 推論
test_pred_fold = model.predict_proba(x_test)[:, 1]

# 推論値を格納する変数を作成
test_pred = np.zeros((len(x_test), 5))

# 1-fold目の推論値を格納
test_pred[:, 0] = test_pred_fold

In [23]:
# 各foldの推論値の平均を算出
test_pred_mean = test_pred.mean(axis = 1)

# 推論値のデータフレームを作成
df_test_pred = pd.concat([
    id_test,
    pd.DataFrame({"pred": test_pred_mean}),
], axis = 1)
df_test_pred.head()

Unnamed: 0,SK_ID_CURR,pred
0,100001,0.006572
1,100005,0.023874
2,100013,0.004233
3,100028,0.008966
4,100038,0.030794


In [24]:
def predict_lgb(input_x,
                input_id,
                list_nfold=[0,1,2,3,4],
               ):
    pred = np.zeros((len(input_x), len(list_nfold)))
    for nfold in list_nfold:
        print("-"*20, nfold, "-"*20)
        fname_lgb = "model_lgb_fold{}.pickle".format(nfold)
        with open(fname_lgb, "rb") as f:
            model = pickle.load(f)
        pred[:, nfold] = model.predict_proba(input_x)[:,1]
    
    pred = pd.concat([
        input_id,
        pd.DataFrame({"pred": pred.mean(axis=1)}),
    ], axis=1)
    
    print("Done.")
    
    return pred

In [25]:
# 推論処理の実行
test_pred = predict_lgb(x_test,
                        id_test,
                        list_nfold = [0,  1, 2, 3, 4],
                       )

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.


In [26]:
# 提出ファイルの作成
df_submit = test_pred.rename(columns = {"pred": "TARGET"})
print(df_submit.shape)
display(df_submit.head())

# ファイル出力
df_submit.to_csv("submission_baseline.csv", index = None)

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.04181
1,100005,0.1264
2,100013,0.022495
3,100028,0.03968
4,100038,0.156628


In [27]:
# データの確認
display(application_train["DAYS_EMPLOYED"].value_counts())
print("正の値の割合：{:.4f}".format((application_train["DAYS_EMPLOYED"] > 0).mean()))
print("正の値の個数：{}".format((application_train["DAYS_EMPLOYED"] > 0).sum()))

 365243    55374
-200         156
-224         152
-230         151
-199         151
           ...  
-13961         1
-11827         1
-10176         1
-9459          1
-8694          1
Name: DAYS_EMPLOYED, Length: 12574, dtype: int64

正の値の割合：0.1801
正の値の個数：55374


In [28]:
application_train["DAYS_EMPLOYED"] = application_train["DAYS_EMPLOYED"].replace(365243, np.nan)

In [29]:
# 特徴量1: 総所得金額を世帯人数で割った値
application_train['INCOME_div_PERSON'] = application_train['AMT_INCOME_TOTAL'] / application_train['CNT_FAM_MEMBERS']

# 特徴量2: 総所得金額を就労期間で割った値
application_train['INCOME_div_EMPLOYED'] = application_train['AMT_INCOME_TOTAL'] / application_train['DAYS_EMPLOYED']

# 特徴量3: 外部スコアの平均値など
application_train["EXT_SOURCE_mean"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis=1)
application_train["EXT_SOURCE_max"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].max(axis=1)
application_train["EXT_SOURCE_min"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].min(axis=1)
application_train["EXT_SOURCE_std"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].std(axis=1)
application_train["EXT_SOURCE_count"] = application_train[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].notnull().sum(axis=1)

# 特徴量4: 就労期間を年齢で割った値 (年齢に占める就労期間の割合)
application_train['DAYS_EMPLOYED_div_BIRTH'] = application_train['DAYS_EMPLOYED'] / application_train['DAYS_BIRTH']

# 特徴量5: 年金支払額を所得金額で割った値
application_train['ANNUITY_div_INCOME'] = application_train['AMT_ANNUITY'] / application_train['AMT_INCOME_TOTAL']

# 特徴量6: 年金支払額を借入金で割った値
application_train['ANNUITY_div_CREDIT'] = application_train['AMT_ANNUITY'] / application_train['AMT_CREDIT']

In [30]:
x_train = application_train.drop(columns = ["TARGET", "SK_ID_CURR"])
y_train = application_train["TARGET"]
id_train = application_train[["SK_ID_CURR"]]
for col in x_train.columns:
    if x_train[col].dtype == "O":
        x_train[col] = x_train[col].astype("category")

In [31]:
train_oof, imp, metrics = train_lgb(x_train,
                                    y_train,
                                    id_train,
                                    params,
                                    list_nfold = [0, 1, 2, 3, 4],
                                    n_splits = 5,
                                   )

-------------------- 0 --------------------
(246008, 130) (61503, 130)
[100]	training's auc: 0.787817	valid_1's auc: 0.760032
[200]	training's auc: 0.816788	valid_1's auc: 0.763696
[300]	training's auc: 0.838351	valid_1's auc: 0.764008
[400]	training's auc: 0.856611	valid_1's auc: 0.764045
[500]	training's auc: 0.871304	valid_1's auc: 0.764075
[auc] tr:0.8585, va:0.7641
-------------------- 1 --------------------
(246009, 130) (61502, 130)
[100]	training's auc: 0.788378	valid_1's auc: 0.763077
[200]	training's auc: 0.816816	valid_1's auc: 0.766784
[300]	training's auc: 0.838169	valid_1's auc: 0.767287
[400]	training's auc: 0.856163	valid_1's auc: 0.767434
[auc] tr:0.8471, va:0.7675
-------------------- 2 --------------------
(246009, 130) (61502, 130)
[100]	training's auc: 0.787655	valid_1's auc: 0.764182
[200]	training's auc: 0.817121	valid_1's auc: 0.767566
[300]	training's auc: 0.837872	valid_1's auc: 0.767677
[400]	training's auc: 0.855451	valid_1's auc: 0.76783
[auc] tr:0.8519, va

In [32]:
imp.sort_values("imp", ascending = False)[:10]

Unnamed: 0,col,imp,imp_std
44,EXT_SOURCE_mean,114005.214702,1381.645644
10,ANNUITY_div_CREDIT,23720.30155,805.397477
112,ORGANIZATION_TYPE,22660.210567,1372.230448
41,EXT_SOURCE_3,12046.854638,886.653726
24,DAYS_BIRTH,8108.684084,578.972393
45,EXT_SOURCE_min,7727.391587,314.203161
39,EXT_SOURCE_1,7155.619219,472.422492
2,AMT_GOODS_PRICE,6148.167858,364.159044
0,AMT_ANNUITY,6091.80521,581.9879
46,EXT_SOURCE_std,5830.39069,679.963947


In [33]:
# nullに置き換え
application_test["DAYS_EMPLOYED"] = application_test["DAYS_EMPLOYED"].replace(365243, np.nan)

# 特徴量の生成
application_test['INCOME_div_PERSON'] = application_test['AMT_INCOME_TOTAL'] / application_test['CNT_FAM_MEMBERS']
application_test['INCOME_div_EMPLOYED'] = application_test['AMT_INCOME_TOTAL'] / application_test['DAYS_EMPLOYED']
application_test["EXT_SOURCE_mean"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].mean(axis=1)
application_test["EXT_SOURCE_max"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].max(axis=1)
application_test["EXT_SOURCE_min"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].min(axis=1)
application_test["EXT_SOURCE_std"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].std(axis=1)
application_test["EXT_SOURCE_count"] = application_test[["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"]].notnull().sum(axis=1)
application_test['DAYS_EMPLOYED_div_BIRTH'] = application_test['DAYS_EMPLOYED'] / application_test['DAYS_BIRTH']
application_test['ANNUITY_div_INCOME'] = application_test['AMT_ANNUITY'] / application_test['AMT_INCOME_TOTAL']
application_test['ANNUITY_div_CREDIT'] = application_test['AMT_ANNUITY'] / application_test['AMT_CREDIT']

# データセット作成
x_test = application_test.drop(columns=["SK_ID_CURR"])
id_test = application_test[["SK_ID_CURR"]]

# カテゴリ変数をcategory型へ変換
for col in x_test.columns:
    if x_test[col].dtype=="O":
        x_test[col] = x_test[col].astype("category")


In [34]:
test_pred = predict_lgb(x_test,
                        id_test,
                        list_nfold=[0,1,2,3,4],
                       )

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.


In [35]:
df_submit = test_pred.rename(columns={"pred":"TARGET"})
print(df_submit.shape)
display(df_submit.head())
df_submit.to_csv("submission_FeatureEngineering1.csv", index=None)

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.029002
1,100005,0.121782
2,100013,0.022668
3,100028,0.044435
4,100038,0.18194


In [36]:
pos = pd.read_csv("../input/home-credit-default-risk/POS_CASH_balance.csv")
pos = reduce_mem_usage(pos)
print(pos.shape)
pos.head()

Memory usage of dataframe is 610.43 MB
Memory usage after optimization is: 238.45 MB
Decreased by 60.9%
(10001358, 8)


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,MONTHS_BALANCE,CNT_INSTALMENT,CNT_INSTALMENT_FUTURE,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF
0,1803195,182943,-31,48.0,45.0,Active,0,0
1,1715348,367990,-33,36.0,35.0,Active,0,0
2,1784872,397406,-32,12.0,9.0,Active,0,0
3,1903291,269225,-35,48.0,42.0,Active,0,0
4,2341044,334279,-35,36.0,35.0,Active,0,0


In [37]:
pos_ohe = pd.get_dummies(pos, columns = ["NAME_CONTRACT_STATUS"], dummy_na = True)
col_ohe = sorted(list(set(pos_ohe.columns) - set(pos.columns)))
print(len(col_ohe))
col_ohe

10


['NAME_CONTRACT_STATUS_Active',
 'NAME_CONTRACT_STATUS_Amortized debt',
 'NAME_CONTRACT_STATUS_Approved',
 'NAME_CONTRACT_STATUS_Canceled',
 'NAME_CONTRACT_STATUS_Completed',
 'NAME_CONTRACT_STATUS_Demand',
 'NAME_CONTRACT_STATUS_Returned to the store',
 'NAME_CONTRACT_STATUS_Signed',
 'NAME_CONTRACT_STATUS_XNA',
 'NAME_CONTRACT_STATUS_nan']

In [38]:
pos_ohe_agg = pos_ohe.groupby("SK_ID_CURR").agg(
    {
        # 数値の集約
        "MONTHS_BALANCE": ["mean", "std", "min", "max"],
        "CNT_INSTALMENT": ["mean", "std", "min", "max"],
        "CNT_INSTALMENT_FUTURE": ["mean", "std", "min", "max"],
        "SK_DPD": ["mean", "std", "min", "max"],
        "SK_DPD_DEF": ["mean", "std", "min", "max"],
        # カテゴリ変数をone-hot-encodingした値の集約
        "NAME_CONTRACT_STATUS_Active": ["mean"],
        "NAME_CONTRACT_STATUS_Amortized debt": ["mean"],
        "NAME_CONTRACT_STATUS_Approved": ["mean"],
        "NAME_CONTRACT_STATUS_Canceled": ["mean"],
        "NAME_CONTRACT_STATUS_Completed": ["mean"],
        "NAME_CONTRACT_STATUS_Demand": ["mean"],
        "NAME_CONTRACT_STATUS_Returned to the store": ["mean"],
        "NAME_CONTRACT_STATUS_Signed": ["mean"],
        "NAME_CONTRACT_STATUS_XNA": ["mean"],
        "NAME_CONTRACT_STATUS_nan": ["mean"],
        # IDのユニーク数をカウント (ついでにレコード数もカウント)
        "SK_ID_PREV":["count", "nunique"],
    }
)

# カラム名の付与
pos_ohe_agg.columns = [i + "_" + j for i,j in pos_ohe_agg.columns]
pos_ohe_agg = pos_ohe_agg.reset_index(drop=False)

print(pos_ohe_agg.shape)
pos_ohe_agg.head()

(337252, 33)


Unnamed: 0,SK_ID_CURR,MONTHS_BALANCE_mean,MONTHS_BALANCE_std,MONTHS_BALANCE_min,MONTHS_BALANCE_max,CNT_INSTALMENT_mean,CNT_INSTALMENT_std,CNT_INSTALMENT_min,CNT_INSTALMENT_max,CNT_INSTALMENT_FUTURE_mean,...,NAME_CONTRACT_STATUS_Approved_mean,NAME_CONTRACT_STATUS_Canceled_mean,NAME_CONTRACT_STATUS_Completed_mean,NAME_CONTRACT_STATUS_Demand_mean,NAME_CONTRACT_STATUS_Returned to the store_mean,NAME_CONTRACT_STATUS_Signed_mean,NAME_CONTRACT_STATUS_XNA_mean,NAME_CONTRACT_STATUS_nan_mean,SK_ID_PREV_count,SK_ID_PREV_nunique
0,100001,-72.555556,20.863312,-96,-53,4.0,0.0,4.0,4.0,1.444336,...,0.0,0.0,0.222222,0.0,0.0,0.0,0.0,0.0,9,2
1,100002,-10.0,5.627314,-19,-1,24.0,0.0,24.0,24.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19,1
2,100003,-43.785714,24.640162,-77,-18,10.109375,2.806597,6.0,12.0,5.785156,...,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,28,3
3,100004,-25.5,1.290994,-27,-24,3.75,0.5,3.0,4.0,2.25,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,4,1
4,100005,-20.0,3.316625,-25,-15,11.703125,0.948683,9.0,12.0,7.199219,...,0.0,0.0,0.090909,0.0,0.0,0.090909,0.0,0.0,11,1


In [39]:
df_train = pd.merge(application_train, pos_ohe_agg, on = "SK_ID_CURR", how = "left")
print(df_train.shape)
df_train.head()

(307511, 164)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,NAME_CONTRACT_STATUS_Approved_mean,NAME_CONTRACT_STATUS_Canceled_mean,NAME_CONTRACT_STATUS_Completed_mean,NAME_CONTRACT_STATUS_Demand_mean,NAME_CONTRACT_STATUS_Returned to the store_mean,NAME_CONTRACT_STATUS_Signed_mean,NAME_CONTRACT_STATUS_XNA_mean,NAME_CONTRACT_STATUS_nan_mean,SK_ID_PREV_count,SK_ID_PREV_nunique
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,28.0,3.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,4.0,1.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0.0,0.0,0.095238,0.0,0.047619,0.0,0.0,0.0,21.0,3.0
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0.0,0.0,0.045455,0.0,0.0,0.015152,0.0,0.0,66.0,5.0


In [40]:
x_train = df_train.drop(columns = ["TARGET", "SK_ID_CURR"])
y_train = df_train["TARGET"]
id_train = df_train[["SK_ID_CURR"]]

for col in x_train.columns:
    if x_train[col].dtype == "O":
        x_train[col] = x_train[col].astype("category")

In [41]:
train_oof, imp, metrics = train_lgb(x_train,
                                    y_train, 
                                    id_train,
                                    params, 
                                    list_nfold = [0, 1, 2, 3, 4],
                                    n_splits = 5,
                                   )

-------------------- 0 --------------------
(246008, 162) (61503, 162)
[100]	training's auc: 0.794548	valid_1's auc: 0.76534
[200]	training's auc: 0.825571	valid_1's auc: 0.77062
[300]	training's auc: 0.848468	valid_1's auc: 0.771422
[400]	training's auc: 0.866087	valid_1's auc: 0.771575
[auc] tr:0.8578, va:0.7719
-------------------- 1 --------------------
(246009, 162) (61502, 162)
[100]	training's auc: 0.794349	valid_1's auc: 0.769227
[200]	training's auc: 0.824921	valid_1's auc: 0.774694
[300]	training's auc: 0.847296	valid_1's auc: 0.775643
[400]	training's auc: 0.864781	valid_1's auc: 0.775882
[500]	training's auc: 0.880069	valid_1's auc: 0.775861
[auc] tr:0.8744, va:0.7761
-------------------- 2 --------------------
(246009, 162) (61502, 162)
[100]	training's auc: 0.794977	valid_1's auc: 0.768857
[200]	training's auc: 0.82562	valid_1's auc: 0.773055
[300]	training's auc: 0.847297	valid_1's auc: 0.773516
[400]	training's auc: 0.865995	valid_1's auc: 0.774559
[500]	training's auc:

In [42]:
imp.sort_values("imp", ascending=False)[:10]

Unnamed: 0,col,imp,imp_std
52,EXT_SOURCE_mean,112438.907936,1217.139287
134,ORGANIZATION_TYPE,21573.968751,1044.080966
10,ANNUITY_div_CREDIT,18349.279658,1039.471604
49,EXT_SOURCE_3,10710.855987,490.719084
53,EXT_SOURCE_min,7021.835349,444.955386
32,DAYS_BIRTH,6666.389282,814.801948
47,EXT_SOURCE_1,6605.474412,601.782028
21,CNT_INSTALMENT_FUTURE_mean,6289.278576,365.694448
0,AMT_ANNUITY,5563.190447,368.625974
108,MONTHS_BALANCE_std,5340.370365,466.201881


In [43]:
# 推論用のデータセットの作成
# テーブル結合
df_test = pd.merge(application_test, pos_ohe_agg, on="SK_ID_CURR", how="left")

# データセット作成
x_test = df_test.drop(columns=["SK_ID_CURR"])
id_test = df_test[["SK_ID_CURR"]]

# カテゴリ変数をcategory型へ変換
for col in x_test.columns:
    if x_test[col].dtype=="O":
        x_test[col] = x_test[col].astype("category")

In [44]:
test_pred = predict_lgb(x_test,
                        id_test,
                        list_nfold=[0,1,2,3,4],
                       )

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done.


In [45]:
df_submit = test_pred.rename(columns={"pred":"TARGET"})
print(df_submit.shape)
display(df_submit.head())
df_submit.to_csv("submission_FeatureEngineering2.csv", index=None)

(48744, 2)


Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.032163
1,100005,0.1044
2,100013,0.025425
3,100028,0.047522
4,100038,0.210907


In [46]:
import optuna

In [47]:
# 学習用のデータセットの作成
x_train = df_train.drop(columns=["TARGET", "SK_ID_CURR"])
y_train = df_train["TARGET"]
id_train = df_train[["SK_ID_CURR"]]

for col in x_train.columns:
    if x_train[col].dtype=="O":
        x_train[col] = x_train[col].astype("category")

In [48]:
# 目的関数の定義
# 探索しないハイパーパラメータ
params_base = {
    "boosting_type": "gbdt",
    "objective": "binary",
    "metric": "auc",
    "verbosity": -1,
    "learning_rate": 0.05,
    "n_estimators": 100000,
    "bagging_freq": 1,
    "random_state": 123,
}

# 目的関数の定義
def objective(trial):
    # 探索するハイパーパラメータ
    params_tuning = {
        "num_leaves": trial.suggest_int("num_leaves", 8, 256),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 200),
        "min_sum_hessian_in_leaf": trial.suggest_float("min_sum_hessian_in_leaf", 1e-5, 1e-2, log=True),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-2, 1e+2, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-2, 1e+2, log=True),
    }
    params_tuning.update(params_base)
    
    # モデル学習・評価
    list_metrics = []
    cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x_train, y_train))
    list_fold = [0]  # 処理高速化のために1つめのfoldのみとする。
    for nfold in list_fold:
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        x_tr, y_tr = x_train.loc[idx_tr, :], y_train[idx_tr]
        x_va, y_va = x_train.loc[idx_va, :], y_train[idx_va]
        model = lgb.LGBMClassifier(**params_tuning)
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr,y_tr), (x_va,y_va)],
                  early_stopping_rounds=100,
                  verbose=0,
                 )
        y_va_pred = model.predict_proba(x_va)[:,1]
        metric_va = roc_auc_score(y_va, y_va_pred) # 評価指標をAUCにする
        list_metrics.append(metric_va)
    
    # 評価指標の算出
    metrics = np.mean(list_metrics)
    
    return metrics

In [49]:
# 最適化処理（探索の実行）
sampler = optuna.samplers.TPESampler(seed = 123)
study = optuna.create_study(sampler = sampler, direction = "maximize")
study.optimize(objective, n_trials = 50, n_jobs = 5)

[32m[I 2022-11-01 02:23:35,250][0m A new study created in memory with name: no-name-9b56e447-9bf9-4803-8edc-d3e7de6cf687[0m




[32m[I 2022-11-01 02:25:47,161][0m Trial 3 finished with value: 0.7653039207524232 and parameters: {'num_leaves': 118, 'min_child_samples': 18, 'min_sum_hessian_in_leaf': 0.003721616950344956, 'feature_fraction': 0.6022357972638153, 'bagging_fraction': 0.5441165649416986, 'lambda_l1': 0.436571347462061, 'lambda_l2': 0.016737983263480426}. Best is trial 3 with value: 0.7653039207524232.[0m




[32m[I 2022-11-01 02:26:44,981][0m Trial 2 finished with value: 0.7690069191047866 and parameters: {'num_leaves': 140, 'min_child_samples': 126, 'min_sum_hessian_in_leaf': 0.0001041577484564094, 'feature_fraction': 0.782135915688872, 'bagging_fraction': 0.6955640487618923, 'lambda_l1': 1.261398044551576, 'lambda_l2': 0.11516306724740524}. Best is trial 2 with value: 0.7690069191047866.[0m




[32m[I 2022-11-01 02:27:37,713][0m Trial 4 finished with value: 0.7744257344657857 and parameters: {'num_leaves': 46, 'min_child_samples': 62, 'min_sum_hessian_in_leaf': 1.1101357225698176e-05, 'feature_fraction': 0.5163872942874758, 'bagging_fraction': 0.95593532135131, 'lambda_l1': 5.965343508859252, 'lambda_l2': 0.20706220120553695}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:28:06,388][0m Trial 1 finished with value: 0.7716445804418828 and parameters: {'num_leaves': 192, 'min_child_samples': 145, 'min_sum_hessian_in_leaf': 7.53918706663546e-05, 'feature_fraction': 0.7173862404580059, 'bagging_fraction': 0.9742570330953388, 'lambda_l1': 0.03523667998385843, 'lambda_l2': 5.551308877230651}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:29:00,956][0m Trial 0 finished with value: 0.7699728906405827 and parameters: {'num_leaves': 102, 'min_child_samples': 32, 'min_sum_hessian_in_leaf': 1.0146346600714198e-05, 'feature_fraction': 0.9623064898719655, 'bagging_fraction': 0.8068907818287293, 'lambda_l1': 73.29970063323677, 'lambda_l2': 1.5909144973999934}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:29:48,066][0m Trial 5 finished with value: 0.7733974355206457 and parameters: {'num_leaves': 41, 'min_child_samples': 144, 'min_sum_hessian_in_leaf': 1.3764937137933532e-05, 'feature_fraction': 0.6551170723439054, 'bagging_fraction': 0.8182758485571343, 'lambda_l1': 13.392532564142021, 'lambda_l2': 0.28636817994508307}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:31:22,610][0m Trial 8 finished with value: 0.7643234289536822 and parameters: {'num_leaves': 252, 'min_child_samples': 48, 'min_sum_hessian_in_leaf': 3.609926703599386e-05, 'feature_fraction': 0.6387513990502529, 'bagging_fraction': 0.5744521236619495, 'lambda_l1': 3.556671334685309, 'lambda_l2': 0.07031787355279097}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:31:56,915][0m Trial 6 finished with value: 0.7733252901906256 and parameters: {'num_leaves': 27, 'min_child_samples': 45, 'min_sum_hessian_in_leaf': 5.013597139267152e-05, 'feature_fraction': 0.9692690651608784, 'bagging_fraction': 0.8116578984464313, 'lambda_l1': 21.015870621167018, 'lambda_l2': 0.029733512668800675}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:32:27,777][0m Trial 7 finished with value: 0.7709093977272083 and parameters: {'num_leaves': 199, 'min_child_samples': 162, 'min_sum_hessian_in_leaf': 0.0032844269466265367, 'feature_fraction': 0.7033024709360329, 'bagging_fraction': 0.9161166025210515, 'lambda_l1': 3.0364569725133013, 'lambda_l2': 0.04558324776312107}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:33:32,576][0m Trial 9 finished with value: 0.7688488099707611 and parameters: {'num_leaves': 152, 'min_child_samples': 124, 'min_sum_hessian_in_leaf': 7.284419045611347e-05, 'feature_fraction': 0.9261072011783531, 'bagging_fraction': 0.6096925147026322, 'lambda_l1': 39.12105324937943, 'lambda_l2': 14.326578942189075}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:35:57,049][0m Trial 10 finished with value: 0.7717665812870932 and parameters: {'num_leaves': 51, 'min_child_samples': 130, 'min_sum_hessian_in_leaf': 0.0015496748097451488, 'feature_fraction': 0.9088850364751726, 'bagging_fraction': 0.84464610571984, 'lambda_l1': 27.13442164447572, 'lambda_l2': 0.12115385706878644}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:36:34,010][0m Trial 13 finished with value: 0.7667412807263779 and parameters: {'num_leaves': 188, 'min_child_samples': 95, 'min_sum_hessian_in_leaf': 3.5582912810946214e-05, 'feature_fraction': 0.9088781166290808, 'bagging_fraction': 0.5442891489455854, 'lambda_l1': 7.257252275410165, 'lambda_l2': 0.4841181450169909}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:36:43,902][0m Trial 12 finished with value: 0.7701697264130958 and parameters: {'num_leaves': 170, 'min_child_samples': 59, 'min_sum_hessian_in_leaf': 1.9304712871383336e-05, 'feature_fraction': 0.5816124555014457, 'bagging_fraction': 0.9895401026995844, 'lambda_l1': 0.12216205029262311, 'lambda_l2': 0.6818182283104645}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:38:00,426][0m Trial 11 finished with value: 0.7725530979048677 and parameters: {'num_leaves': 17, 'min_child_samples': 65, 'min_sum_hessian_in_leaf': 0.0002517679925368491, 'feature_fraction': 0.8354257942897757, 'bagging_fraction': 0.9791269047963713, 'lambda_l1': 0.3131993899765362, 'lambda_l2': 0.02044587747250195}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:38:47,078][0m Trial 14 finished with value: 0.7729564733743939 and parameters: {'num_leaves': 76, 'min_child_samples': 79, 'min_sum_hessian_in_leaf': 0.0005488996163725538, 'feature_fraction': 0.5079332318605906, 'bagging_fraction': 0.9960853291945045, 'lambda_l1': 0.0901997052467977, 'lambda_l2': 81.71523044647883}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:42:02,627][0m Trial 15 finished with value: 0.7742720996816764 and parameters: {'num_leaves': 69, 'min_child_samples': 194, 'min_sum_hessian_in_leaf': 1.0717619069100371e-05, 'feature_fraction': 0.5039996299599608, 'bagging_fraction': 0.9655995317776533, 'lambda_l1': 7.583237282950942, 'lambda_l2': 0.41706008511122317}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:42:52,267][0m Trial 17 finished with value: 0.7726405543463055 and parameters: {'num_leaves': 74, 'min_child_samples': 195, 'min_sum_hessian_in_leaf': 0.0003904709399950931, 'feature_fraction': 0.501764553242751, 'bagging_fraction': 0.8992133668426198, 'lambda_l1': 9.56838338894332, 'lambda_l2': 0.31315937024068213}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:43:22,706][0m Trial 16 finished with value: 0.7737483656243535 and parameters: {'num_leaves': 74, 'min_child_samples': 191, 'min_sum_hessian_in_leaf': 1.0981299989475014e-05, 'feature_fraction': 0.5200055252134665, 'bagging_fraction': 0.9101278618704485, 'lambda_l1': 0.268712781840344, 'lambda_l2': 88.87703187854248}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:44:54,270][0m Trial 19 finished with value: 0.7728866186550397 and parameters: {'num_leaves': 72, 'min_child_samples': 102, 'min_sum_hessian_in_leaf': 1.0566025205328285e-05, 'feature_fraction': 0.5179261224519915, 'bagging_fraction': 0.7139064024477928, 'lambda_l1': 9.145060132607137, 'lambda_l2': 0.3108094007221659}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:45:49,848][0m Trial 18 finished with value: 0.7731133606119058 and parameters: {'num_leaves': 77, 'min_child_samples': 197, 'min_sum_hessian_in_leaf': 1.1013538503754288e-05, 'feature_fraction': 0.5214767626718245, 'bagging_fraction': 0.717904898898126, 'lambda_l1': 8.965019771409157, 'lambda_l2': 75.34408246575613}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:46:27,370][0m Trial 21 finished with value: 0.7713664760828719 and parameters: {'num_leaves': 77, 'min_child_samples': 190, 'min_sum_hessian_in_leaf': 0.00016134399310473283, 'feature_fraction': 0.5658092588722657, 'bagging_fraction': 0.7143665777651858, 'lambda_l1': 0.010626566892689999, 'lambda_l2': 1.996020337058608}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:46:31,984][0m Trial 20 finished with value: 0.7730789551409729 and parameters: {'num_leaves': 74, 'min_child_samples': 195, 'min_sum_hessian_in_leaf': 0.00020376023895565476, 'feature_fraction': 0.5329484787348077, 'bagging_fraction': 0.9157840042746674, 'lambda_l1': 1.4150452402873372, 'lambda_l2': 2.008532489225577}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:49:05,893][0m Trial 23 finished with value: 0.7721478450608146 and parameters: {'num_leaves': 103, 'min_child_samples': 171, 'min_sum_hessian_in_leaf': 0.009355430663992777, 'feature_fraction': 0.5676428194128588, 'bagging_fraction': 0.883120472988597, 'lambda_l1': 2.416826846565396, 'lambda_l2': 1.860483293316247}. Best is trial 4 with value: 0.7744257344657857.[0m




[32m[I 2022-11-01 02:49:59,636][0m Trial 22 finished with value: 0.774731906820808 and parameters: {'num_leaves': 8, 'min_child_samples': 100, 'min_sum_hessian_in_leaf': 0.0001903300486102632, 'feature_fraction': 0.5601270975914935, 'bagging_fraction': 0.7178023170261237, 'lambda_l1': 1.4599197790043443, 'lambda_l2': 1.8703690617268085}. Best is trial 22 with value: 0.774731906820808.[0m
[32m[I 2022-11-01 02:50:01,427][0m Trial 24 finished with value: 0.7710904414669355 and parameters: {'num_leaves': 107, 'min_child_samples': 6, 'min_sum_hessian_in_leaf': 0.0008104063906650389, 'feature_fraction': 0.5738102069117577, 'bagging_fraction': 0.8660845318810947, 'lambda_l1': 1.3260547367602078, 'lambda_l2': 1.736160819633697}. Best is trial 22 with value: 0.774731906820808.[0m




[32m[I 2022-11-01 02:50:40,497][0m Trial 25 finished with value: 0.771981635073517 and parameters: {'num_leaves': 106, 'min_child_samples': 172, 'min_sum_hessian_in_leaf': 2.5373712008489578e-05, 'feature_fraction': 0.558608341545467, 'bagging_fraction': 0.9017227951596727, 'lambda_l1': 1.1239880880949127, 'lambda_l2': 4.520166764995526}. Best is trial 22 with value: 0.774731906820808.[0m




[32m[I 2022-11-01 02:50:48,232][0m Trial 26 finished with value: 0.7725569773372394 and parameters: {'num_leaves': 111, 'min_child_samples': 166, 'min_sum_hessian_in_leaf': 2.306167467252196e-05, 'feature_fraction': 0.6436941796162339, 'bagging_fraction': 0.8724954872913451, 'lambda_l1': 0.6130611642196279, 'lambda_l2': 14.420898031646393}. Best is trial 22 with value: 0.774731906820808.[0m




[32m[I 2022-11-01 02:53:08,605][0m Trial 27 finished with value: 0.7741694033764313 and parameters: {'num_leaves': 49, 'min_child_samples': 172, 'min_sum_hessian_in_leaf': 2.3686173653554142e-05, 'feature_fraction': 0.624682962644563, 'bagging_fraction': 0.9403244348712648, 'lambda_l1': 0.5575294288419524, 'lambda_l2': 19.35623295245607}. Best is trial 22 with value: 0.774731906820808.[0m




[32m[I 2022-11-01 02:53:42,970][0m Trial 28 finished with value: 0.7741687443360376 and parameters: {'num_leaves': 14, 'min_child_samples': 83, 'min_sum_hessian_in_leaf': 2.7739343263081984e-05, 'feature_fraction': 0.6182506753001209, 'bagging_fraction': 0.6553974061472854, 'lambda_l1': 0.5224664282334281, 'lambda_l2': 5.253349153928458}. Best is trial 22 with value: 0.774731906820808.[0m




[32m[I 2022-11-01 02:54:31,783][0m Trial 29 finished with value: 0.7747016443984043 and parameters: {'num_leaves': 15, 'min_child_samples': 92, 'min_sum_hessian_in_leaf': 2.4793856741635692e-05, 'feature_fraction': 0.6308334931021669, 'bagging_fraction': 0.6592174267732481, 'lambda_l1': 0.6096078053214884, 'lambda_l2': 5.371612542474097}. Best is trial 22 with value: 0.774731906820808.[0m




[32m[I 2022-11-01 02:54:44,663][0m Trial 30 finished with value: 0.7734817250058129 and parameters: {'num_leaves': 15, 'min_child_samples': 85, 'min_sum_hessian_in_leaf': 2.2879664807188902e-05, 'feature_fraction': 0.6201211250451174, 'bagging_fraction': 0.6390072470640829, 'lambda_l1': 4.924759380727232, 'lambda_l2': 7.792040227982967}. Best is trial 22 with value: 0.774731906820808.[0m




[32m[I 2022-11-01 02:56:31,570][0m Trial 31 finished with value: 0.7749170864843036 and parameters: {'num_leaves': 8, 'min_child_samples': 82, 'min_sum_hessian_in_leaf': 0.0001412956616562311, 'feature_fraction': 0.6159117428658247, 'bagging_fraction': 0.7647941559256276, 'lambda_l1': 3.7724856303957695, 'lambda_l2': 0.12663152692648993}. Best is trial 31 with value: 0.7749170864843036.[0m




[32m[I 2022-11-01 02:57:10,462][0m Trial 32 finished with value: 0.7740732084156109 and parameters: {'num_leaves': 8, 'min_child_samples': 84, 'min_sum_hessian_in_leaf': 0.00011740553124594452, 'feature_fraction': 0.6007701517082358, 'bagging_fraction': 0.6450797220440333, 'lambda_l1': 4.424066456930606, 'lambda_l2': 0.9175096565905501}. Best is trial 31 with value: 0.7749170864843036.[0m




[32m[I 2022-11-01 02:58:52,979][0m Trial 34 finished with value: 0.7738814312234172 and parameters: {'num_leaves': 9, 'min_child_samples': 85, 'min_sum_hessian_in_leaf': 0.00010484842522728884, 'feature_fraction': 0.6774647025498868, 'bagging_fraction': 0.7586551418377274, 'lambda_l1': 0.15877987511295905, 'lambda_l2': 0.9274029110757303}. Best is trial 31 with value: 0.7749170864843036.[0m




[32m[I 2022-11-01 03:00:13,556][0m Trial 35 finished with value: 0.772595365549579 and parameters: {'num_leaves': 34, 'min_child_samples': 112, 'min_sum_hessian_in_leaf': 1.5481056596449657e-05, 'feature_fraction': 0.6889622583884853, 'bagging_fraction': 0.7662043447148346, 'lambda_l1': 54.0006990448841, 'lambda_l2': 1.0267697441653028}. Best is trial 31 with value: 0.7749170864843036.[0m




[32m[I 2022-11-01 03:01:31,542][0m Trial 33 finished with value: 0.7719098032329814 and parameters: {'num_leaves': 30, 'min_child_samples': 85, 'min_sum_hessian_in_leaf': 0.000143880495014633, 'feature_fraction': 0.7105121097564762, 'bagging_fraction': 0.7593061045758498, 'lambda_l1': 71.75256614982968, 'lambda_l2': 0.158772289545369}. Best is trial 31 with value: 0.7749170864843036.[0m




[32m[I 2022-11-01 03:02:38,455][0m Trial 36 finished with value: 0.7729381841128731 and parameters: {'num_leaves': 34, 'min_child_samples': 111, 'min_sum_hessian_in_leaf': 0.0001396672064575004, 'feature_fraction': 0.7015125491269648, 'bagging_fraction': 0.7576142755899409, 'lambda_l1': 74.50732039620759, 'lambda_l2': 0.8659292644316416}. Best is trial 31 with value: 0.7749170864843036.[0m




[32m[I 2022-11-01 03:03:09,315][0m Trial 38 finished with value: 0.7738139953604268 and parameters: {'num_leaves': 35, 'min_child_samples': 115, 'min_sum_hessian_in_leaf': 6.346249235824247e-05, 'feature_fraction': 0.6880863213717948, 'bagging_fraction': 0.7548980106356918, 'lambda_l1': 2.086831709380685, 'lambda_l2': 3.348474063636923}. Best is trial 31 with value: 0.7749170864843036.[0m




[32m[I 2022-11-01 03:03:55,331][0m Trial 37 finished with value: 0.7719060734205909 and parameters: {'num_leaves': 38, 'min_child_samples': 114, 'min_sum_hessian_in_leaf': 5.962760263635491e-05, 'feature_fraction': 0.68484026753346, 'bagging_fraction': 0.7777832806939879, 'lambda_l1': 86.46128324374324, 'lambda_l2': 0.15621014296446828}. Best is trial 31 with value: 0.7749170864843036.[0m




[32m[I 2022-11-01 03:04:31,095][0m Trial 40 finished with value: 0.7710822016808236 and parameters: {'num_leaves': 55, 'min_child_samples': 69, 'min_sum_hessian_in_leaf': 6.188453888469134e-05, 'feature_fraction': 0.7661433715591288, 'bagging_fraction': 0.6667137106590266, 'lambda_l1': 1.7759221458923427, 'lambda_l2': 3.241845098544399}. Best is trial 31 with value: 0.7749170864843036.[0m




[32m[I 2022-11-01 03:04:48,121][0m Trial 39 finished with value: 0.7723736002382805 and parameters: {'num_leaves': 54, 'min_child_samples': 70, 'min_sum_hessian_in_leaf': 5.448871934812632e-05, 'feature_fraction': 0.7541028056349516, 'bagging_fraction': 0.7827767753886063, 'lambda_l1': 2.0790940373886784, 'lambda_l2': 0.14522752513011886}. Best is trial 31 with value: 0.7749170864843036.[0m




[32m[I 2022-11-01 03:05:41,768][0m Trial 41 finished with value: 0.7710868399002434 and parameters: {'num_leaves': 50, 'min_child_samples': 68, 'min_sum_hessian_in_leaf': 6.479451121875727e-05, 'feature_fraction': 0.7423475607814158, 'bagging_fraction': 0.681397560581009, 'lambda_l1': 1.9443608464468995, 'lambda_l2': 0.2057418457641208}. Best is trial 31 with value: 0.7749170864843036.[0m




[32m[I 2022-11-01 03:06:13,991][0m Trial 42 finished with value: 0.7714118608105264 and parameters: {'num_leaves': 51, 'min_child_samples': 63, 'min_sum_hessian_in_leaf': 4.997511336526356e-05, 'feature_fraction': 0.7333215226304909, 'bagging_fraction': 0.6909107030524742, 'lambda_l1': 0.8635654058010898, 'lambda_l2': 0.2034280707295092}. Best is trial 31 with value: 0.7749170864843036.[0m




[32m[I 2022-11-01 03:07:16,912][0m Trial 43 finished with value: 0.7711264998824237 and parameters: {'num_leaves': 57, 'min_child_samples': 37, 'min_sum_hessian_in_leaf': 4.2665943608409e-05, 'feature_fraction': 0.5465538078219976, 'bagging_fraction': 0.6791561818365981, 'lambda_l1': 21.1683761938986, 'lambda_l2': 0.061439073878648075}. Best is trial 31 with value: 0.7749170864843036.[0m




[32m[I 2022-11-01 03:08:41,216][0m Trial 44 finished with value: 0.7700339819038907 and parameters: {'num_leaves': 131, 'min_child_samples': 32, 'min_sum_hessian_in_leaf': 0.00037170847621819704, 'feature_fraction': 0.5445954326977858, 'bagging_fraction': 0.5876332847148488, 'lambda_l1': 17.306389798856976, 'lambda_l2': 0.0756046356073124}. Best is trial 31 with value: 0.7749170864843036.[0m




[32m[I 2022-11-01 03:10:40,850][0m Trial 46 finished with value: 0.7743574685681371 and parameters: {'num_leaves': 24, 'min_child_samples': 36, 'min_sum_hessian_in_leaf': 1.5121804150349985e-05, 'feature_fraction': 0.5429079012757987, 'bagging_fraction': 0.950063069861047, 'lambda_l1': 17.16225232624328, 'lambda_l2': 0.06981445370555651}. Best is trial 31 with value: 0.7749170864843036.[0m
[32m[I 2022-11-01 03:11:20,276][0m Trial 45 finished with value: 0.774779200984414 and parameters: {'num_leaves': 23, 'min_child_samples': 34, 'min_sum_hessian_in_leaf': 1.6293095635961432e-05, 'feature_fraction': 0.5455116530346666, 'bagging_fraction': 0.9591246725535754, 'lambda_l1': 15.682016082466644, 'lambda_l2': 0.04351695269673103}. Best is trial 31 with value: 0.7749170864843036.[0m
[32m[I 2022-11-01 03:12:17,860][0m Trial 47 finished with value: 0.7745125603658735 and parameters: {'num_leaves': 25, 'min_child_samples': 39, 'min_sum_hessian_in_leaf': 1.6095634185747283e-05, 'feature_f