In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pprint import pprint
import random

import lightgbm as lgb

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import mlflow
import mlflow.lightgbm

import warnings
warnings.filterwarnings("ignore")

import sys

sys.path.append('../scripts')
#sys.path.append('../scripts/utils')

from utils import reduce_mem_usage
from utils import le_lgb
from utils import fetch_logged_data

In [2]:
# ランダムシードの設定
np.random.seed(1234)
random.seed(1234)

In [3]:
# データの読み込み
train = pd.read_csv('../input/home-credit-default-risk/application_train.csv')
test = pd.read_csv('../input/home-credit-default-risk/application_test.csv')

In [4]:
# メモリの削減
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Memory usage of dataframe is 286.23 MB
Memory usage after optimization is: 92.38 MB
Decreased by 67.7%
Memory usage of dataframe is 45.00 MB
Memory usage after optimization is: 14.60 MB
Decreased by 67.6%


In [5]:
# 学習データとテストデータの連結
df = pd.concat([train, test], sort=False).reset_index(drop=True)

In [6]:
# ラベルエンコーディング
df = le_lgb(df)

NAME_CONTRACT_TYPE
CODE_GENDER
FLAG_OWN_CAR
FLAG_OWN_REALTY
NAME_TYPE_SUITE
NAME_INCOME_TYPE
NAME_EDUCATION_TYPE
NAME_FAMILY_STATUS
NAME_HOUSING_TYPE
OCCUPATION_TYPE
WEEKDAY_APPR_PROCESS_START
ORGANIZATION_TYPE
FONDKAPREMONT_MODE
HOUSETYPE_MODE
WALLSMATERIAL_MODE
EMERGENCYSTATE_MODE


In [7]:
# trainとtestに再分割
train = df[~df['TARGET'].isnull()]
test = df[df['TARGET'].isnull()]

In [8]:
# 目的変数と説明変数
X_train = train.drop(columns=['TARGET', 'SK_ID_CURR'])
Y_train = train['TARGET']

In [9]:
# 5分割する
folds = 5
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=1234)

In [10]:
models = []
aucs = []
imp = pd.DataFrame()

params = {
    'boosting_type': 'gbdt',
    'objective': 'binary', 
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 32,
    'n_estimators': 100000,
    "random_state": 123,
    "importance_type": "gain",
}

# auto logging
mlflow.lightgbm.autolog()

for nfold, (train_index, val_index) in enumerate(skf.split(X_train, Y_train)):
    x_train = X_train.iloc[train_index]
    x_valid = X_train.iloc[val_index]
    y_train = Y_train.iloc[train_index]
    y_valid = Y_train.iloc[val_index]
    
    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_eval = lgb.Dataset(x_valid, y_valid, reference=lgb_train)

    model = lgb.train(
        params,
        lgb_train, 
        valid_sets=lgb_eval, 
        num_boost_round=1000, # 学習回数の実行回数
        early_stopping_rounds=100, # early_stoppingの判定基準
        verbose_eval=10
    )

    y_pred = model.predict(x_valid, num_iteration=model.best_iteration)
    auc = roc_auc_score(y_valid, y_pred)
    print(auc)
    aucs.append(auc)

    models.append(model)

    run_id = mlflow.last_active_run().info.run_id
    print("Logged data and model in run {}".format(run_id))

    # show logged data
    for key, data in fetch_logged_data(run_id).items():
        print("\n---------- logged {} ----------".format(key))
        pprint(data)

    _imp = pd.DataFrame(
            {'col': x_train.columns, 'imp': model.feature_importance(), "nfold": nfold+1})
    imp = pd.concat([imp, _imp], axis=0, ignore_index=True)

aucs = np.array(aucs)
print(f'aucs: {np.mean(aucs):.2f} ± {np.std(aucs):.2f}')

imp = imp.groupby("col")["imp"].agg(["mean", "std"])
imp.columns = ["imp", "imp_std"]
imp = imp.reset_index(drop=False).sort_values('imp', ascending=False)
imp.to_csv('../output/dataframe/feature_importance_baseline.csv', index=False)

2022/08/25 11:22:22 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '1cdb6c531b2d443bb1e75185a0b3aa60', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow


[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11296
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
Training until validation scores don't improve for 100 rounds
[10]	valid_0's auc: 0.724592
[20]	valid_0's auc: 0.730172
[30]	valid_0's auc: 0.735282
[40]	valid_0's auc: 0.740212
[50]	valid_0's auc: 0.744082
[60]	valid_0's auc: 0.746509
[70]	valid_0's auc: 0.74842
[80]	valid_0's auc: 0.749668
[90]	valid_0's auc: 0.751235
[100]	valid_0's auc: 0.752072
[110]	valid_0's auc: 0.752737
[120]	valid_0's auc: 0.753077
[130]	valid_0's auc: 0.753518
[140]	valid_0's auc: 0.7539
[150]	valid_0's auc: 0.754228
[160]	valid_0's auc: 0.754392
[170]	valid_



0.7545545480074769
Logged data and model in run 1cdb6c531b2d443bb1e75185a0b3aa60

---------- logged params ----------
{'boosting_type': 'gbdt',
 'categorical_feature': 'auto',
 'early_stopping_rounds': '100',
 'feature_name': 'auto',
 'importance_type': 'gain',
 'keep_training_booster': 'False',
 'learning_rate': '0.05',
 'metric': 'auc',
 'n_estimators': '100000',
 'num_boost_round': '1000',
 'num_leaves': '32',
 'objective': 'binary',
 'random_state': '123',
 'verbose_eval': '10'}

---------- logged metrics ----------
{'best_iteration': 202.0,
 'stopped_iteration': 302.0,
 'valid_0-auc': 0.7545545480074769}

---------- logged tags ----------
{}

---------- logged artifacts ----------
['feature_importance_gain.json',
 'feature_importance_gain.png',
 'feature_importance_split.json',
 'feature_importance_split.png',
 'model/MLmodel',
 'model/conda.yaml',
 'model/model.lgb',
 'model/python_env.yaml',
 'model/requirements.txt']


2022/08/25 11:22:58 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '23950d3bfa564f669696e3740fab2f7e', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow


[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11311
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 115
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
Training until validation scores don't improve for 100 rounds
[10]	valid_0's auc: 0.725327
[20]	valid_0's auc: 0.732656
[30]	valid_0's auc: 0.738489
[40]	valid_0's auc: 0.74413
[50]	valid_0's auc: 0.747921
[60]	valid_0's auc: 0.75094
[70]	valid_0's auc: 0.753024
[80]	valid_0's auc: 0.754394
[90]	valid_0's auc: 0.755794
[100]	valid_0's auc: 0.756547
[110]	valid_0's auc: 0.757254
[120]	valid_0's auc: 0.757627
[130]	valid_0's auc: 0.757797
[140]	valid_0's auc: 0.758046
[150]	valid_0's auc: 0.758033
[160]	valid_0's auc: 0.758267
[170]	valid



0.758903530472367
Logged data and model in run 23950d3bfa564f669696e3740fab2f7e

---------- logged params ----------
{'boosting_type': 'gbdt',
 'categorical_feature': 'auto',
 'early_stopping_rounds': '100',
 'feature_name': 'auto',
 'importance_type': 'gain',
 'keep_training_booster': 'False',
 'learning_rate': '0.05',
 'metric': 'auc',
 'n_estimators': '100000',
 'num_boost_round': '1000',
 'num_leaves': '32',
 'objective': 'binary',
 'random_state': '123',
 'verbose_eval': '10'}

---------- logged metrics ----------
{'best_iteration': 230.0,
 'stopped_iteration': 330.0,
 'valid_0-auc': 0.758903530472367}

---------- logged tags ----------
{}

---------- logged artifacts ----------
['feature_importance_gain.json',
 'feature_importance_gain.png',
 'feature_importance_split.json',
 'feature_importance_split.png',
 'model/MLmodel',
 'model/conda.yaml',
 'model/model.lgb',
 'model/python_env.yaml',
 'model/requirements.txt']


2022/08/25 11:23:38 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '9f7574188fb0478297c48bb0ee151a87', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow


[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11294
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 115
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
Training until validation scores don't improve for 100 rounds
[10]	valid_0's auc: 0.720039
[20]	valid_0's auc: 0.726994
[30]	valid_0's auc: 0.732763
[40]	valid_0's auc: 0.737766
[50]	valid_0's auc: 0.741657
[60]	valid_0's auc: 0.744213
[70]	valid_0's auc: 0.746091
[80]	valid_0's auc: 0.747401
[90]	valid_0's auc: 0.748398
[100]	valid_0's auc: 0.749494
[110]	valid_0's auc: 0.749916
[120]	valid_0's auc: 0.750364
[130]	valid_0's auc: 0.750824
[140]	valid_0's auc: 0.751076
[150]	valid_0's auc: 0.751226
[160]	valid_0's auc: 0.751439
[170]	val



0.7522186159012766
Logged data and model in run 9f7574188fb0478297c48bb0ee151a87

---------- logged params ----------
{'boosting_type': 'gbdt',
 'categorical_feature': 'auto',
 'early_stopping_rounds': '100',
 'feature_name': 'auto',
 'importance_type': 'gain',
 'keep_training_booster': 'False',
 'learning_rate': '0.05',
 'metric': 'auc',
 'n_estimators': '100000',
 'num_boost_round': '1000',
 'num_leaves': '32',
 'objective': 'binary',
 'random_state': '123',
 'verbose_eval': '10'}

---------- logged metrics ----------
{'best_iteration': 352.0,
 'stopped_iteration': 452.0,
 'valid_0-auc': 0.7522186159012766}

---------- logged tags ----------
{}

---------- logged artifacts ----------
['feature_importance_gain.json',
 'feature_importance_gain.png',
 'feature_importance_split.json',
 'feature_importance_split.png',
 'model/MLmodel',
 'model/conda.yaml',
 'model/model.lgb',
 'model/python_env.yaml',
 'model/requirements.txt']


2022/08/25 11:24:28 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '732303c25a5241f6a63acd634f47b562', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow


[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11315
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
Training until validation scores don't improve for 100 rounds
[10]	valid_0's auc: 0.725116
[20]	valid_0's auc: 0.731376
[30]	valid_0's auc: 0.736955
[40]	valid_0's auc: 0.742046
[50]	valid_0's auc: 0.74548
[60]	valid_0's auc: 0.748439
[70]	valid_0's auc: 0.750086
[80]	valid_0's auc: 0.751529
[90]	valid_0's auc: 0.752884
[100]	valid_0's auc: 0.753928
[110]	valid_0's auc: 0.754788
[120]	valid_0's auc: 0.755261
[130]	valid_0's auc: 0.755561
[140]	valid_0's auc: 0.755737
[150]	valid_0's auc: 0.755839
[160]	valid_0's auc: 0.756023
[170]	vali



0.756817595108024
Logged data and model in run 732303c25a5241f6a63acd634f47b562

---------- logged params ----------
{'boosting_type': 'gbdt',
 'categorical_feature': 'auto',
 'early_stopping_rounds': '100',
 'feature_name': 'auto',
 'importance_type': 'gain',
 'keep_training_booster': 'False',
 'learning_rate': '0.05',
 'metric': 'auc',
 'n_estimators': '100000',
 'num_boost_round': '1000',
 'num_leaves': '32',
 'objective': 'binary',
 'random_state': '123',
 'verbose_eval': '10'}

---------- logged metrics ----------
{'best_iteration': 291.0,
 'stopped_iteration': 391.0,
 'valid_0-auc': 0.756817595108024}

---------- logged tags ----------
{}

---------- logged artifacts ----------
['feature_importance_gain.json',
 'feature_importance_gain.png',
 'feature_importance_split.json',
 'feature_importance_split.png',
 'model/MLmodel',
 'model/conda.yaml',
 'model/model.lgb',
 'model/python_env.yaml',
 'model/requirements.txt']


2022/08/25 11:25:15 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'ff24af2604e64c67ba23a862f8057031', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current lightgbm workflow


[LightGBM] [Info] Number of positive: 19860, number of negative: 226149
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11305
[LightGBM] [Info] Number of data points in the train set: 246009, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432486
[LightGBM] [Info] Start training from score -2.432486
Training until validation scores don't improve for 100 rounds
[10]	valid_0's auc: 0.731556
[20]	valid_0's auc: 0.737397
[30]	valid_0's auc: 0.743109
[40]	valid_0's auc: 0.748886
[50]	valid_0's auc: 0.752531
[60]	valid_0's auc: 0.75566
[70]	valid_0's auc: 0.757728
[80]	valid_0's auc: 0.759252
[90]	valid_0's auc: 0.760051
[100]	valid_0's auc: 0.760889
[110]	valid_0's auc: 0.761321
[120]	valid_0's auc: 0.76173
[130]	valid_0's auc: 0.761895
[140]	valid_0's auc: 0.762121
[150]	valid_0's auc: 0.762208
[160]	valid_0's auc: 0.762142
[170]	valid



0.7623866989331426
Logged data and model in run ff24af2604e64c67ba23a862f8057031

---------- logged params ----------
{'boosting_type': 'gbdt',
 'categorical_feature': 'auto',
 'early_stopping_rounds': '100',
 'feature_name': 'auto',
 'importance_type': 'gain',
 'keep_training_booster': 'False',
 'learning_rate': '0.05',
 'metric': 'auc',
 'n_estimators': '100000',
 'num_boost_round': '1000',
 'num_leaves': '32',
 'objective': 'binary',
 'random_state': '123',
 'verbose_eval': '10'}

---------- logged metrics ----------
{'best_iteration': 183.0,
 'stopped_iteration': 283.0,
 'valid_0-auc': 0.7623866989331426}

---------- logged tags ----------
{}

---------- logged artifacts ----------
['feature_importance_gain.json',
 'feature_importance_gain.png',
 'feature_importance_split.json',
 'feature_importance_split.png',
 'model/MLmodel',
 'model/conda.yaml',
 'model/model.lgb',
 'model/python_env.yaml',
 'model/requirements.txt']
aucs: 0.76 ± 0.00


In [11]:
# 説明変数と目的変数を指定
X_test = test.drop(['TARGET', 'SK_ID_CURR'], axis=1)

In [12]:
# テストデータにおける予測
preds = []

for model in models:
    pred = model.predict(X_test)
    preds.append(pred)

# predsの平均を計算
preds_array = np.array(preds)
preds_mean = np.mean(preds_array, axis = 0)

In [13]:
# 提出用サンプルの読み込み
sub = pd.read_csv('../input/home-credit-default-risk/sample_submission.csv')

# 目的変数カラムの置き換え
sub['TARGET'] = preds_mean

# ファイルのエクスポート
sub.to_csv('../submit/submission_baseline.csv', index=False)