## 健診データから喫煙者か否かを予測する
## Binary Prediction of Smoker Status using Bio-Signals

##### 前回に引き続き二値分類の課題ですので、Objectiveとしてbinary、Metricとしてbinary_loglossを使います。与えられたデータには欠損値もありませんし、整数または小数値なので、一旦はそのまま処理していきます（0/1および1/2で分類されているものもありますので、後でここに手を加えるかもしれません）。トレーニングデータに一部外れ値らしきものがありますので、これだけは除外しようと思います。
##### Continuing from last time, this is a binary classification problem, so we use `binary` as the objective and `binary_logloss` as the metric. The given data has no missing values and every feature is an integer or a decimal number, so for now we process it as is (some features are encoded as 0/1 or 1/2, so we may revisit these later). The training data contains a few apparent outliers, so only those rows are excluded.

이상치가 있으므로 이상치를 제거!

# 1. ライブラリと設定
## Libraries and settings

In [None]:
import numpy as np
import pandas as pd

import sys
import datetime
import time
import math

import lightgbm as lgb
import optuna.integration.lightgbm as lgbo

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler, PowerTransformer, QuantileTransformer
le = preprocessing.LabelEncoder()
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error # 平均絶対誤差
from sklearn.metrics import mean_squared_error # 平均二乗誤差
from sklearn.metrics import mean_squared_log_error # 対数平均二乗誤差
from sklearn.metrics import r2_score # 決定係数
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import auc
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import missingno as msno
import plotly.express as px

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Wall-clock start of the run; subtracted from the end time in the last cell
# to report the total elapsed time.
start_time = time.time()

In [None]:
# Widen pandas' display limits so large frames can be inspected in full
pd.options.display.max_rows = 150
pd.options.display.max_columns = 600
pd.options.display.max_colwidth = None  # never truncate cell contents
pd.options.display.float_format = lambda value: f'{value:.5f}'  # fixed 5 decimals

In [None]:
# 2. データのインポート
## Import data

In [None]:
# Load the sample submission, train, test and origin CSVs from the current directory
sample_submission = pd.read_csv('./sample_submission.csv')
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
origin = pd.read_csv('./origin_train_dataset.csv')

In [None]:
# Display the sample submission frame as the cell output
sample_submission

In [None]:
# Display the train frame as the cell output
train

In [None]:
# Print the column summary of train
train.info()

In [None]:
# Display the test frame as the cell output
test

In [None]:
# Print the column summary of test
test.info()

In [None]:
# Display the origin frame as the cell output
origin

In [None]:
# Print the column summary of origin
origin.info()

# 3. 前処理
## Preprocessing

In [None]:
# Drop the 'id' column and list the remaining columns of train
# (this list still includes the 'smoking' target)

feat_train = train.columns.drop('id').tolist()
feat_train

In [None]:
# Drop the 'id' column and list the remaining columns of test

feat_test = test.columns.drop('id').tolist()
feat_test

### smoking train에 존재

1. age: 나이
2. height(cm): 키(cm)
3. weight(kg): 체중(kg)
4. waist(cm): 허리둘레(cm)
5. eyesight(left): 시력(왼쪽)
6. eyesight(right): 시력(오른쪽)
7. hearing(left): 청력(왼쪽)
8. hearing(right): 청력(오른쪽)
9. systolic: 수축기 혈압
10. relaxation: 이완기 혈압
11. fasting blood sugar: 공복 혈당
12. Cholesterol: 콜레스테롤
13. triglyceride: 중성지방
14. HDL: HDL 콜레스테롤
15. LDL: LDL 콜레스테롤
16. hemoglobin: 헤모글로빈
17. Urine protein: 요단백
18. serum creatinine: 혈청 크레아티닌
19. AST: 아스파타미노트랜스페라아제 (간 기능)
20. ALT: 알라닌 아미노트랜스페라아제 (간 기능)
21. Gtp: 감마GTP
22. dental caries: 치아 충치
23. smoking: 흡연

In [None]:
# Stack train and origin on a fresh 0..n-1 index, then drop rows that are
# exact duplicates. drop_duplicates() keeps the surviving index labels, so
# the index of data_t_o has gaps afterwards.
data_t_o = pd.concat([train, origin], ignore_index=True).drop_duplicates()

data_t_o

train : 159256rows<br>
origin:  38984rows<br>
total : 198240rows<br>
198240 - 192723 = 5517<br>

5517の重複データがあります<br>
5517 duplications

In [None]:
# Append test below the de-duplicated train + origin rows.
# ignore_index=True renumbers the combined frame 0..n-1, so the train + origin
# rows come first and the test rows follow.

data_all = pd.concat([data_t_o, test], ignore_index=True)

data_all

# 4. 探索的データ解析
## EDA

In [None]:
# Visualize whether any values are missing
# Search for missing data

msno.matrix(df=data_all, figsize=(10,6), color=(0,.3,.3))


##### 欠損値はありません（idの欠損はoriginの分）
##### No missing values (the missing `id` entries belong to the `origin` rows)

In [None]:
# Per-feature summary statistics, styled: bar on the mean column and colour
# gradients on the std and median (50%) columns.
(
    data_all[feat_train]
    .describe()
    .T
    .style
    .bar(subset=['mean'], color=px.colors.qualitative.G10[0])
    .background_gradient(subset=['std'], cmap='Greens')
    .background_gradient(subset=['50%'], cmap='BuGn')
)

In [None]:
# One-hot encode the listed features with get_dummies
# Divide features with get_dummies method

list_feat = ['hearing(left)', 'hearing(right)', 'dental caries']
# Each listed column is replaced by uint8 indicator columns
data_all = pd.get_dummies(data_all, columns=list_feat, dtype='uint8')
data_all

In [None]:
# Rebuild the model feature list after one-hot encoding:
# every column of data_all except the id and the 'smoking' target.
#feat_train = data_all.columns.drop(['id']).tolist()
feat_test = data_all.columns.drop(['id', 'smoking']).tolist()

In [None]:
# Split the combined frame back into the labelled part (train + origin) and test.
#
# data_all was built with ignore_index=True from [data_t_o, test], so its first
# len(data_t_o) rows are the labelled rows and the remaining rows are the test
# rows. Splitting by position avoids the hard-coded duplicate count (5517) and
# the dependence on data_t_o's gapped index, which would give a wrong boundary
# if the number of dropped duplicates changed or the last row was dropped.
n_labeled = len(data_t_o)

train = data_all.iloc[:n_labeled]
test = data_all.iloc[n_labeled:]

In [None]:
# Display the train part after the split
train

In [None]:
# Display the test part after the split
test

In [None]:
# Side-by-side histograms of every feature: train (left) and test (right).
# The right panel shares its x axis with the left one so the ranges line up.

for feat in feat_test:
    fig = plt.figure(figsize=(12,3))

    ax1 = fig.add_subplot(1, 2, 1)
    train[feat].plot(kind='hist', bins=50, color='blue', ax=ax1)
    ax1.set_title(feat + ' / train')

    ax2 = fig.add_subplot(1, 2, 2, sharex=ax1)
    test[feat].plot(kind='hist', bins=50, color='green', ax=ax2)
    ax2.set_title(feat + ' / test')

    plt.show()


In [None]:
# Visualize the distribution of the 'smoking' target in train
# Visualize distribution of "smoking" in train

plt.figure(figsize=(12,3))
train['smoking'].plot(kind='hist', bins=50, color='green')
plt.title('smoking / train')
plt.show()

In [None]:
# Box plots of every feature: train grouped by the 'smoking' target (left)
# and test as a single box (right).

for feat in feat_test:
    fig = plt.figure(figsize=(12,4))

    ax_train = fig.add_subplot(1, 2, 1)
    sns.boxplot(data=train, x='smoking', y=feat, ax=ax_train)
    ax_train.set_title('smoking vs ' + feat + ' / train')

    ax_test = fig.add_subplot(1, 2, 2)
    sns.boxplot(data=test, y=feat, ax=ax_test)
    ax_test.set_title(feat + ' / test')

    plt.show()


In [None]:
# Remove outlier rows from train: triglyceride above 700 or HDL above 350.
# (An LDL > 1500 rule was tried earlier and is intentionally not applied.)

is_outlier = (train['triglyceride'] > 700) | (train['HDL'] > 350)
train = train[~is_outlier]

train

In [None]:
# Heatmap(train): pairwise correlations of all train columns except 'id',
# rounded to one decimal place

corr = train.drop(columns=['id']).corr().round(1)
plt.figure(figsize=(20,10))
sns.heatmap(corr, vmin=-1, vmax=1, center=0, square=False, annot=True, cmap='coolwarm')
plt.show()


In [None]:
# Heatmap(test): pairwise correlations of the test features
# ('id' and 'smoking' are dropped first), rounded to one decimal place

corr = test.drop(columns=['id', 'smoking']).corr().round(1)
plt.figure(figsize=(20,10))
sns.heatmap(corr, vmin=-1, vmax=1, center=0, square=False, annot=True, cmap='coolwarm')
plt.show()


# 5. モデルの作成
## Modeling

### <font color='#bb0000'>処理を指定<br>Specify processing</font>

In [None]:
# ----------------------------------------------------
# Base settings
# ----------------------------------------------------
test_size = 0.2              # passed to train_test_split as the hold-out share
random_state = 45            # seed for the train/validation split
objective = 'binary'
metric = 'binary_logloss'    # alternatives: binary_error, auc

# ----------------------------------------------------
# Optuna
# ----------------------------------------------------
optuna_switch = 'off'        # 'on' runs the tuning rounds, anything else skips them
opt_count = 5                # number of tuning rounds
num_choose = 5               # number of best parameter sets carried into training

# Never choose more parameter sets than tuning rounds were run
num_choose = min(num_choose, opt_count)

# ----------------------------------------------------
# LightGBM
# ----------------------------------------------------
learning_rate = 0.005        # previously tried: 0.0001
num_iterations = 300000      # previously tried: 100
max_depth = -1


In [None]:
# Helper that builds the training and validation datasets

def make_lgb_data(test_size, random_state, metric, X, value):
    """Split the data and wrap both parts as LightGBM datasets.

    Args:
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Forwarded to ``train_test_split`` as ``random_state``.
        metric: Not used by this function; kept in the signature for callers.
        X: Feature table to split.
        value: Target values aligned with ``X``.

    Returns:
        dict with the four split parts (``X_train``, ``X_test``, ``t_train``,
        ``t_test``) and the two ``lgb.Dataset`` objects (``lgb_train``,
        ``lgb_eval``); ``lgb_eval`` is created with ``reference=lgb_train``.
    """
    split = train_test_split(
        X, value, test_size=test_size, random_state=random_state
    )
    X_train, X_test, t_train, t_test = split

    lgb_train = lgb.Dataset(X_train, t_train)
    lgb_eval = lgb.Dataset(X_test, t_test, reference=lgb_train)

    return {
        'X_train' : X_train,
        'X_test' : X_test,
        't_train' : t_train,
        't_test' : t_test,
        'lgb_train' : lgb_train,
        'lgb_eval' : lgb_eval,
    }

In [None]:
# optuna

def tuneParams(test_size, random_state, objective, metric, X, value):
    """Tune LightGBM parameters through ``optuna.integration.lightgbm.train``.

    The data is split with ``make_lgb_data`` and the tuner is run for at most
    10 boosting rounds per trial, evaluated on the validation dataset.

    Early stopping and log silencing are passed as LightGBM callbacks, the
    same mechanism ``make_model`` uses. The former ``early_stopping_rounds``
    and ``verbose_eval`` keyword arguments were removed from
    ``lightgbm.train`` in LightGBM 4, so they are no longer passed.

    Args:
        test_size: Hold-out share forwarded to ``make_lgb_data``.
        random_state: Split seed forwarded to ``make_lgb_data``.
        objective: LightGBM objective name.
        metric: LightGBM metric name.
        X: Feature table.
        value: Target values aligned with ``X``.

    Returns:
        The object returned by ``lgbo.train``; the caller reads its
        ``best_score`` and ``params`` attributes.
    """
    opt_params = {
        'force_row_wise' : True,
        'force_col_wise' : False,
        'objective' : objective,
        'metric' : metric
    }

    dic = make_lgb_data(test_size, random_state, metric, X, value)
    lgb_train = dic['lgb_train']
    lgb_eval = dic['lgb_eval']

    opt = lgbo.train(
        opt_params,
        lgb_train,
        valid_sets = lgb_eval,
        num_boost_round = 10,
        callbacks = [
            lgb.early_stopping(stopping_rounds=10, verbose=False),
            lgb.log_evaluation(period=0),  # period 0 disables per-round logging
        ]
    )

    return opt

In [None]:
# Training (model building) function

def make_model(X, value, test_size, random_state, objective, metric, learning_rate, num_iterations, max_depth, paramObj):
    """Train one LightGBM model and return it with its training history.

    Args:
        X: Feature table.
        value: Target values aligned with ``X``.
        test_size: Hold-out share forwarded to ``make_lgb_data``.
        random_state: Split seed forwarded to ``make_lgb_data``.
        objective: LightGBM objective name.
        metric: LightGBM metric name.
        learning_rate: LightGBM ``learning_rate``.
        num_iterations: LightGBM ``num_iterations``.
        max_depth: LightGBM ``max_depth``.
        paramObj: Mapping that must contain the tuned keys copied below;
            a missing key raises ``KeyError``. Extra keys are ignored.

    Returns:
        dict with ``paramObj``, ``evaluation_results`` (filled by the
        ``record_evaluation`` callback), ``model``, and the validation split
        ``X_test`` / ``t_test``.
    """
    data = make_lgb_data(test_size, random_state, metric, X, value)
    lgb_train, lgb_eval = data['lgb_train'], data['lgb_eval']

    # Fixed settings first, then the tuned values taken from paramObj
    params = {
        'task': 'train',
        'objective': objective,
        'metric': metric,
        'boosting_type': 'gbdt',
        'learning_rate': learning_rate,
        'num_iterations': num_iterations,
        'max_depth': max_depth,
    }
    tuned_keys = (
        'force_row_wise',
        'force_col_wise',
        'feature_pre_filter',
        'lambda_l1',
        'lambda_l2',
        'num_leaves',
        'feature_fraction',
        'bagging_fraction',
        'bagging_freq',
        'min_child_samples',
    )
    for key in tuned_keys:
        params[key] = paramObj[key]
    params['verbosity'] = -1

    evaluation_results = {}  # per-round metric history, filled during training
    callbacks = [
        lgb.early_stopping(100),
        lgb.record_evaluation(evaluation_results),
        lgb.log_evaluation(100),
    ]
    model = lgb.train(
        params,
        train_set=lgb_train,
        valid_sets=[lgb_train, lgb_eval],   # datasets evaluated each round
        valid_names=['train', 'valid'],     # names used in logs and results
        callbacks=callbacks,
    )

    return {
        'paramObj' : paramObj,
        'evaluation_results' : evaluation_results,
        'model' : model,
        'X_test' : data['X_test'],  # validation features
        't_test' : data['t_test'],  # validation targets
    }

In [None]:
# Select the model inputs from train: feature columns (X) and target (value).
# The train/validation split itself happens later inside make_lgb_data.

X = train[feat_test]
value = train['smoking']

In [None]:
# optuna: run the tuner opt_count times and rank the resulting parameter sets

if optuna_switch == 'on':
    param_ary = []
    for i in range(opt_count):
        print('=' * 80)
        print(f'【Round : {i + 1}】')
        print('=' * 80)
        opt = tuneParams(test_size, random_state, objective, metric, X, value)
        score = opt.best_score['valid_0'][metric]
        dic = {'score' : score, 'params' : opt.params}
        param_ary.append(dic)

    # Best first. 'auc' is the only listed metric where larger is better;
    # binary_logloss and binary_error are sorted ascending. Previously
    # binary_error results were left unsorted, which made the later
    # "top num_choose" selection arbitrary.
    param_ary = sorted(param_ary, key=lambda x: x['score'], reverse=(metric == 'auc'))
    

In [None]:
# Print every tuned parameter set with its score, in ranked order
if optuna_switch == 'on':
    banner = '=' * 100
    count = 0
    for count, dic in enumerate(param_ary, start=1):
        score = dic['score']
        params = dic['params']
        print('')
        print(banner)
        print(f'#{count}----->')
        print(f'opt_score : {score}')
        print(f'params : {params}')
        print(banner)
        print('')

### MEMO
===========================================================<br>
##### そのまま処理
opt score : 0.5040630631568116<br>
params : {'force_row_wise': True, 'force_col_wise': False, 'objective': 'binary', 'metric': 'binary_logloss', 'feature_pre_filter': False, 'lambda_l1': 0.005237888351029057, 'lambda_l2': 0.3242533955071764, 'num_leaves': 253, 'feature_fraction': 1.0, 'bagging_fraction': 0.4442385446335148, 'bagging_freq': 1, 'min_child_samples': 20, 'num_iterations': 10, 'early_stopping_round': None}<br>
[2802]	train's binary_logloss: 0.295543	valid's binary_logloss: 0.436024<br>
auc : 0.9470267965782442<br>
Public score : 0.87064(ver1)<br>
<br>
opt score : 0.5049468249268089<br>
params : {'force_row_wise': True, 'force_col_wise': False, 'objective': 'binary', 'metric': 'binary_logloss', 'feature_pre_filter': False, 'lambda_l1': 0.0, 'lambda_l2': 0.0, 'num_leaves': 253, 'feature_fraction': 1.0, 'bagging_fraction': 0.929906079778351, 'bagging_freq': 1, 'min_child_samples': 100, 'num_iterations': 10, 'early_stopping_round': None}<br>
[2984]	train's binary_logloss: 0.298777	valid's binary_logloss: 0.435815<br>
auc : 0.9451385135464268<br>
Public score : 0.87108(ver2)<br>
<br>
opt score : 0.5049887487864972<br>
params : {'force_row_wise': True, 'force_col_wise': False, 'objective': 'binary', 'metric': 'binary_logloss', 'feature_pre_filter': False, 'lambda_l1': 0.0, 'lambda_l2': 0.0, 'num_leaves': 256, 'feature_fraction': 1.0, 'bagging_fraction': 0.7879076169748924, 'bagging_freq': 3, 'min_child_samples': 20, 'num_iterations': 10, 'early_stopping_round': None}<br>
[2475]	train's binary_logloss: 0.303928	valid's binary_logloss: 0.437543<br>
auc : 0.9435765314271563<br>
Public score : 0.87015(ver3)<br>
<br>
opt score : 0.504366240260844<br>
params : {'force_row_wise': True, 'force_col_wise': False, 'objective': 'binary', 'metric': 'binary_logloss', 'feature_pre_filter': False, 'lambda_l1': 0.0, 'lambda_l2': 0.0, 'num_leaves': 253, 'feature_fraction': 1.0, 'bagging_fraction': 0.7015814956634723, 'bagging_freq': 1, 'min_child_samples': 20, 'num_iterations': 10, 'early_stopping_round': None}<br>
[3219]	train's binary_logloss: 0.275103	valid's binary_logloss: 0.436324<br>
auc : 0.9558782470416705<br>
Public score : 0.87082(ver4)<br>
<br>
opt score : 0.5045660034598182<br>
params : {'force_row_wise': True, 'force_col_wise': False, 'objective': 'binary', 'metric': 'binary_logloss', 'feature_pre_filter': False, 'lambda_l1': 0.0, 'lambda_l2': 0.0, 'num_leaves': 256, 'feature_fraction': 0.9520000000000001, 'bagging_fraction': 0.8000982346621339, 'bagging_freq': 2, 'min_child_samples': 50, 'num_iterations': 10, 'early_stopping_round': None}<br>
[2836]	train's binary_logloss: 0.296497	valid's binary_logloss: 0.435991<br>
auc : 0.9467362233470888<br>
Public score : 0.87113(ver5)<br>
<br>
===========================================================<br>
##### 0/1および1/2で分類されているものをget_dummiesで分割
opt score : 0.5041088576389983<br>
params : {'force_row_wise': True, 'force_col_wise': False, 'objective': 'binary', 'metric': 'binary_logloss', 'feature_pre_filter': False, 'lambda_l1': 0.08349324800598144, 'lambda_l2': 0.0002547437878290678, 'num_leaves': 253, 'feature_fraction': 1.0, 'bagging_fraction': 0.6010621550744889, 'bagging_freq': 1, 'min_child_samples': 50, 'num_iterations': 10, 'early_stopping_round': None}<br>
[2399]	train's binary_logloss: 0.314423	valid's binary_logloss: 0.436021<br>
auc : 0.9384999714460561<br>
Public score : 0.87089(ver6)<br>
<br>
opt score : 0.504113204524721<br>
params : {'force_row_wise': True, 'force_col_wise': False, 'objective': 'binary', 'metric': 'binary_logloss', 'feature_pre_filter': False, 'lambda_l1': 0.06895642700812259, 'lambda_l2': 0.20930335338134826, 'num_leaves': 253, 'feature_fraction': 1.0, 'bagging_fraction': 0.40138468521841814, 'bagging_freq': 1, 'min_child_samples': 50, 'num_iterations': 10, 'early_stopping_round': None}<br>
[2458]	train's binary_logloss: 0.316906	valid's binary_logloss: 0.435809<br>
auc : 0.936922167044993<br>
Public score : 0.87084(ver7)<br>
<br>
opt score : 0.504154931439968<br>
params : {'force_row_wise': True, 'force_col_wise': False, 'objective': 'binary', 'metric': 'binary_logloss', 'feature_pre_filter': False, 'lambda_l1': 0.0, 'lambda_l2': 0.0, 'num_leaves': 256, 'feature_fraction': 1.0, 'bagging_fraction': 0.40041264987744657, 'bagging_freq': 1, 'min_child_samples': 50, 'num_iterations': 10, 'early_stopping_round': None}<br>
[2559]	train's binary_logloss: 0.310193	valid's binary_logloss: 0.436043<br>
auc : 0.9398432564750474<br>
Public score : 0.87082(ver8)<br>
<br>
opt score : 0.5045048354161731<br>
params : {'force_row_wise': True, 'force_col_wise': False, 'objective': 'binary', 'metric': 'binary_logloss', 'feature_pre_filter': False, 'lambda_l1': 0.013074788480004389, 'lambda_l2': 1.4633019310098181e-06, 'num_leaves': 254, 'feature_fraction': 1.0, 'bagging_fraction': 0.8719391793357675, 'bagging_freq': 1, 'min_child_samples': 10, 'num_iterations': 10, 'early_stopping_round': None}<br>
[3134]	train's binary_logloss: 0.276515	valid's binary_logloss: 0.436928<br>
auc : 0.9556551378703616<br>
Public score : 0.86985(ver9)<br>
<br>
opt score : 0.5048692211610079<br>
params : {'force_row_wise': True, 'force_col_wise': False, 'objective': 'binary', 'metric': 'binary_logloss', 'feature_pre_filter': False, 'lambda_l1': 0.0, 'lambda_l2': 0.0, 'num_leaves': 255, 'feature_fraction': 1.0, 'bagging_fraction': 0.5987872588749034, 'bagging_freq': 3, 'min_child_samples': 20, 'num_iterations': 10, 'early_stopping_round': None}<br>
[2386]	train's binary_logloss: 0.308038	valid's binary_logloss: 0.436756<br>
auc : 0.9415711466212641<br>
Public score : 0.87005(ver10)<br>
<br>
===========================================================<br>
##### originalデータを追加
opt score : 0.5058638546348095<br>
params : {'force_row_wise': True, 'force_col_wise': False, 'objective': 'binary', 'metric': 'binary_logloss', 'feature_pre_filter': False, 'lambda_l1': 0.06531133091916129, 'lambda_l2': 0.0016935724215372185, 'num_leaves': 254, 'feature_fraction': 1.0, 'bagging_fraction': 0.40054229472860436, 'bagging_freq': 1, 'min_child_samples': 50, 'num_iterations': 10, 'early_stopping_round': None}<br>
[3733]	train's binary_logloss: 0.29066	valid's binary_logloss: 0.434586<br>
auc : 0.9485050992304989<br>
Public score : 0.87402(ver11)<br>
<br>
opt score : 0.5058902378672291<br>
params : {'force_row_wise': True, 'force_col_wise': False, 'objective': 'binary', 'metric': 'binary_logloss', 'feature_pre_filter': False, 'lambda_l1': 0.0, 'lambda_l2': 0.0, 'num_leaves': 254, 'feature_fraction': 0.9520000000000001, 'bagging_fraction': 0.40186575137769553, 'bagging_freq': 1, 'min_child_samples': 50, 'num_iterations': 10, 'early_stopping_round': None}<br>
[3771]	train's binary_logloss: 0.290268	valid's binary_logloss: 0.434193<br>
auc : 0.9487470881904436<br>
Public score : 0.87428(ver12)<br>
<br>
opt score : 0.5065616831631313<br>
params : {'force_row_wise': True, 'force_col_wise': False, 'objective': 'binary', 'metric': 'binary_logloss', 'feature_pre_filter': False, 'lambda_l1': 0.0, 'lambda_l2': 0.0, 'num_leaves': 253, 'feature_fraction': 0.9520000000000001, 'bagging_fraction': 0.8502785052939205, 'bagging_freq': 2, 'min_child_samples': 100, 'num_iterations': 10, 'early_stopping_round': None}<br>
<font color='#ff0000'>[4212]	train's binary_logloss: 0.282832	valid's binary_logloss: 0.43429<br>
auc : 0.95221340654535<br>
Public score : 0.87464(ver13)</font><br>
<br>
opt score : 0.5065674065678668<br>
params : {'force_row_wise': True, 'force_col_wise': False, 'objective': 'binary', 'metric': 'binary_logloss', 'feature_pre_filter': False, 'lambda_l1': 0.0, 'lambda_l2': 0.0, 'num_leaves': 255, 'feature_fraction': 0.9520000000000001, 'bagging_fraction': 0.6039958569612263, 'bagging_freq': 2, 'min_child_samples': 20, 'num_iterations': 10, 'early_stopping_round': None}<br>
[3996]	train's binary_logloss: 0.273236	valid's binary_logloss: 0.435504<br>
auc : 0.9566086490435851<br>
Public score : 0.8737(ver14)<br>
<br>
opt score : 0.5068112432954752<br>
params : {'force_row_wise': True, 'force_col_wise': False, 'objective': 'binary', 'metric': 'binary_logloss', 'feature_pre_filter': False, 'lambda_l1': 1.0787883028785273e-08, 'lambda_l2': 0.08889804258123069, 'num_leaves': 253, 'feature_fraction': 1.0, 'bagging_fraction': 0.9470614021461423, 'bagging_freq': 1, 'min_child_samples': 25, 'num_iterations': 10, 'early_stopping_round': None}<br>
<font color='#0000ff'>[4634]	train's binary_logloss: 0.261319	valid's binary_logloss: 0.43641<br>
auc : 0.961287814554566<br>
Public score : 0.87325(ver15)</font><br>
<br>
===========================================================<br>
##### ver13のパラメーターセットを使いrandom_stateを変更
random_state=1<br>
[3452]	train's binary_logloss: 0.302086	valid's binary_logloss: 0.442282<br>
auc : 0.94293273644475<br>
Public score : 0.87395(ver16)<br>
<br>
random_state=2<br>
[4040]	train's binary_logloss: 0.286478	valid's binary_logloss: 0.438882<br>
auc : 0.9501506193765014<br>
Public score : 0.8742(ver17)<br>
<br>
random_state=3<br>
[3528]	train's binary_logloss: 0.299667	valid's binary_logloss: 0.444664<br>
auc : 0.943688633413089<br>
Public score : 0.878401(ver18)<br>
<br>
random_state=4<br>
[4175]	train's binary_logloss: 0.282961	valid's binary_logloss: 0.439568<br>
auc : 0.9515577190407067<br>
Public score : 0.87411(ver19)<br>
<br>
random_state=5<br>
[3858]	train's binary_logloss: 0.292413	valid's binary_logloss: 0.436085<br>
auc : 0.9480502022756125<br>
Public score : 0.87414(ver20)<br>
<br>
random_state=6<br>
[3766]	train's binary_logloss: 0.293456	valid's binary_logloss: 0.441555<br>
auc : 0.9467988084161657<br>
Public score : 0.87384(ver21)<br>
<br>
random_state=7<br>
[3387]	train's binary_logloss: 0.30503	valid's binary_logloss: 0.437331<br>
auc : 0.942486051760207<br>
Public score : 0.87446(ver22)<br>
<br>
random_state=8<br>
[3668]	train's binary_logloss: 0.295916	valid's binary_logloss: 0.441873<br>
auc : 0.9457732574999436<br>
Public score : 0.87458(ver23)<br>
<br>
random_state=9<br>
[4328]	train's binary_logloss: 0.279519	valid's binary_logloss: 0.435955<br>
auc : 0.9531459971129521<br>
Public score : 0.87458(ver24)<br>
<br>
random_state=10<br>
[3653]	train's binary_logloss: 0.297825	valid's binary_logloss: 0.436764<br>
auc : 0.9457190688333114<br>
Public score : 0.874(ver25)<br>
<br>
random_state=11<br>
[4175]	train's binary_logloss: 0.283224	valid's binary_logloss: 0.436392<br>
auc : 0.9516833922936344<br>
Public score : 0.87418(ver26)<br>
<br>
random_state=12<br>
[4362]	train's binary_logloss: 0.277996	valid's binary_logloss: 0.439815<br>
auc : 0.9532379299637425<br>
<font color='#ff0000'>Public score : 0.87513(ver27)</font><br>
<br>
random_state=13<br>
[4088]	train's binary_logloss: 0.285521	valid's binary_logloss: 0.435692<br>
auc : 0.9510060343463467<br>
Public score : 0.87504(ver28)<br>
<br>
random_state=14<br>
[3550]	train's binary_logloss: 0.299278	valid's binary_logloss: 0.442666<br>
auc : 0.944096628716832<br>
Public score : 0.87467(ver29)<br>
<br>
random_state=15<br>
[3889]	train's binary_logloss: 0.291653	valid's binary_logloss: 0.43343<br>
auc : 0.9488926980522882<br>
Public score : 0.87399(ver30)<br>
<br>
random_state=16<br>
[4352]	train's binary_logloss: 0.279429	valid's binary_logloss: 0.433857<br>
auc : 0.9536355129001919<br>
Public score : 0.87422(ver32)<br>
<br>
random_state=17<br>
[3508]	train's binary_logloss: 0.300507	valid's binary_logloss: 0.441769<br>
auc : 0.9435071462929625<br>
Public score : 0.8736(ver33)<br>
<br>
random_state=18<br>
[3720]	train's binary_logloss: 0.294955	valid's binary_logloss: 0.441252<br>
auc : 0.946106826307993<br>
Public score : 0.87396(ver34)<br>
<br>
random_state=19<br>
[4024]	train's binary_logloss: 0.286817	valid's binary_logloss: 0.439192<br>
auc : 0.9500282031352443<br>
Public score : 0.87454(ver35)<br>
<br>
random_state=20<br>
[3821]	train's binary_logloss: 0.29161	valid's binary_logloss: 0.441436<br>
auc : 0.9475424494485658<br>
Public score : 0.87419(ver36)<br>
<br>
random_state=21<br>
[3480]	train's binary_logloss: 0.302125	valid's binary_logloss: 0.439475<br>
auc : 0.9435922954104948<br>
Public score : 0.87375(ver37)<br>
<br>
random_state=22<br>
[4338]	train's binary_logloss: 0.279182	valid's binary_logloss: 0.435957<br>
auc : 0.9536687777843542<br>
Public score : 0.87431(ver38)<br>
<br>
random_state=23<br>
[3280]	train's binary_logloss: 0.307257	valid's binary_logloss: 0.442347<br>
auc : 0.9406026086855058<br>
Public score : 0.87418(ver39)<br>
<br>
random_state=24<br>
[3811]	train's binary_logloss: 0.292786	valid's binary_logloss: 0.438483<br>
auc : 0.9476582777386412<br>
Public score : 0.87403(ver40)<br>
<br>
random_state=25<br>
[3767]	train's binary_logloss: 0.293101	valid's binary_logloss: 0.443812<br>
auc : 0.946475218764873<br>
Public score : 0.87414(ver41)<br>
<br>
random_state=26<br>
[4287]	train's binary_logloss: 0.280351	valid's binary_logloss: 0.436617<br>
auc : 0.9527397587798169<br>
Public score : 0.87433(ver42)<br>
<br>
random_state=27<br>
[4100]	train's binary_logloss: 0.285207	valid's binary_logloss: 0.437525<br>
auc : 0.9510133712713913<br>
Public score : 0.87477(ver43)<br>
<br>
random_state=28<br>
[3908]	train's binary_logloss: 0.289786	valid's binary_logloss: 0.439277<br>
auc : 0.9484440006795241<br>
Public score : 0.87439(ver44)<br>
<br>
random_state=29<br>
[3240]	train's binary_logloss: 0.309026	valid's binary_logloss: 0.440846<br>
auc : 0.9399203016493798<br>
Public score : 0.87412(ver45)<br>
<br>
random_state=30<br>
[3745]	train's binary_logloss: 0.295392	valid's binary_logloss: 0.436838<br>
auc : 0.946872102637015<br>
Public score : 0.87394(ver46)<br>
<br>
random_state=31<br>
[4059]	train's binary_logloss: 0.286286	valid's binary_logloss: 0.436752<br>
auc : 0.9502921308154677<br>
Public score : 0.87424(ver47)<br>
<br>
random_state=32<br>
[3055]	train's binary_logloss: 0.314233	valid's binary_logloss: 0.442497<br>
auc : 0.937290434175597<br>
Public score : 0.87357(ver48)<br>
<br>
random_state=33<br>
[3515]	train's binary_logloss: 0.302082	valid's binary_logloss: 0.43448<br>
auc : 0.9442086270820144<br>
Public score : 0.87387(ver49)<br>
<br>
random_state=34<br>
[3782]	train's binary_logloss: 0.293592	valid's binary_logloss: 0.437717<br>
auc : 0.9475141356364027<br>
Public score : 0.87512(ver50)<br>
<br>
random_state=35<br>
[3940]	train's binary_logloss: 0.288501	valid's binary_logloss: 0.443715<br>
auc : 0.9485836833133784<br>
Public score : 0.87459(ver51)<br>
<br>
random_state=36<br>
[3826]	train's binary_logloss: 0.292955	valid's binary_logloss: 0.43624<br>
auc : 0.9481317944834375<br>
Public score : 0.8743(ver52)<br>
<br>
random_state=37<br>
[3361]	train's binary_logloss: 0.306133	valid's binary_logloss: 0.437755<br>
auc : 0.9417649440659054<br>
Public score : 0.8746(ver53)<br>
<br>
random_state=38<br>
[4032]	train's binary_logloss: 0.286989	valid's binary_logloss: 0.437809<br>
auc : 0.9499984386406961<br>
Public score : 0.87441(ver54)<br>
<br>
random_state=39<br>
[3673]	train's binary_logloss: 0.297258	valid's binary_logloss: 0.436774<br>
auc : 0.9459823533082667<br>
Public score : 0.87412(ver55)<br>
<br>
random_state=40<br>
[3772]	train's binary_logloss: 0.294246	valid's binary_logloss: 0.436595<br>
auc : 0.9471880757375403<br>
Public score : 0.87397(ver56)<br>
<br>


In [None]:
use_optuna = optuna_switch == 'on'

if use_optuna:
    # Keep only the best num_choose tuned parameter sets
    param_ary = param_ary[:num_choose]
else:
    # Without optuna, set the parameters to train with here
    param_ary = [
        {
            'force_row_wise': True,
            'force_col_wise': False,
            'objective': 'binary',
            'metric': 'binary_logloss',
            'feature_pre_filter': False,
            'lambda_l1': 0.0,
            'lambda_l2': 0.0,
            'num_leaves': 253,
            'feature_fraction': 0.9520000000000001,
            'bagging_fraction': 0.8502785052939205,
            'bagging_freq': 2,
            'min_child_samples': 100,
            'num_iterations': 10,
            'early_stopping_round': None,
        }
    ]

# Train one model per parameter set and collect the results
result_ary = []
for i, candidate in enumerate(param_ary):
    print()
    print('='*80)
    print(f'【Round : {i + 1}】')

    if use_optuna:
        # Tuned entries are {'score': ..., 'params': ...}
        score = candidate['score']
        print(f'opt score : {score}')
        paramObj = candidate['params']
    else:
        # Manual entries are the parameter dict itself
        paramObj = candidate

    print(f'params : {paramObj}')
    print('-'*80)
    resultObj = make_model(X, value, test_size, random_state, objective, metric, learning_rate, num_iterations, max_depth, paramObj)
    result_ary.append(resultObj)

# 6. 学習結果
## Learning result

In [None]:
# Visualize the learning curves: one train/valid plot per trained model

count = 0
for count, resultObj in enumerate(result_ary, start=1):
    print()
    print(f'【round {count}】')

    evaluation_results = resultObj['evaluation_results']

    for split_name in ('train', 'valid'):
        plt.plot(evaluation_results[split_name][metric], label=split_name)
    plt.title('Training performance')
    plt.xlabel('Boosting round')
    plt.ylabel(metric)
    plt.legend()
    plt.show()

In [None]:
# First round's feature importance
# (only the first trained model in result_ary is plotted)

model_0 = result_ary[0]['model']
fig, ax = plt.subplots(figsize=(10, 10))
lgb.plot_importance(model_0, ax=ax)

In [None]:
# ROC curve

def show_roc_curve(X, model, true=None):
    """Plot the ROC curve of ``model`` on ``X`` and return the curve data.

    Args:
        X: Features passed to ``model.predict``.
        model: Trained model whose ``predict`` output is used as the score.
        true: Ground-truth labels aligned row by row with ``X``. When
            omitted, the notebook-level ``train['smoking']`` is used, which
            is the original behaviour and only correct when ``X`` holds
            exactly the rows of ``train``. Pass the matching labels
            explicitly to score any other set, e.g. a validation split.

    Returns:
        dict with the predictions (``for_verifi``) and the ``fpr``, ``tpr``
        and ``thresholds`` arrays returned by ``roc_curve``.
    """
    if true is None:
        true = train['smoking']

    for_verifi = model.predict(X)
    fpr, tpr, thresholds = roc_curve(true, for_verifi)

    plt.plot(fpr, tpr, marker='.')
    plt.xlabel('FPR: False positive rate')
    plt.ylabel('TPR: True positive rate')
    plt.title('ROC Curve')
    plt.show()

    return {'for_verifi' : for_verifi,
            'fpr' : fpr,
            'tpr' : tpr,
            'thresholds' : thresholds}

In [None]:
# Confusion matrix

def show_cm(for_verifi, true=None):
    """Draw the confusion matrix of rounded predictions as a heatmap.

    Args:
        for_verifi: Predicted scores; rounded with ``np.round`` to obtain
            the predicted class.
        true: Ground-truth labels aligned with ``for_verifi``. When omitted,
            the notebook-level ``train['smoking']`` is used, which is the
            original behaviour and only correct when the predictions were
            made for exactly the rows of ``train``.
    """
    if true is None:
        true = train['smoking']

    cm = confusion_matrix(true, np.round(for_verifi))
    sns.heatmap(cm, annot=True, cmap='Blues', fmt=',')
    plt.xlabel('Prediction')
    plt.ylabel('Result')
    plt.show()


In [None]:
%%time

# Report validation score, ROC curve, AUC and confusion matrix for every model.
# NOTE(review): X is the full feature table that make_model splits internally,
# so the ROC/AUC/confusion matrix below include rows the model was fitted on;
# the AUC printed here is therefore likely optimistic.
separator = '-'*80
for i, entry in enumerate(result_ary):
    print(separator)
    print(f'Round {i+1} / {len(result_ary)}')
    model = entry['model']
    val_score = model.best_score['valid'][metric]
    print(f'Valid Score : {val_score}')
    print(separator)

    # ROC curve
    obj = show_roc_curve(X, model)

    # AUC, stored on the result entry so get_pred can filter on it
    fpr, tpr = obj['fpr'], obj['tpr']
    acc = auc(fpr, tpr)
    entry['auc'] = acc
    print(f'auc : {acc}')
    print()

    # Confusion matrix
    print('Confusion Matrix')
    for_verify = obj['for_verifi']
    show_cm(for_verify)

# 7. 予測
## Prediction

In [None]:
# Predict with the trained models

def get_pred(test, feat_test):
    """Predict ``test`` with every model in ``result_ary`` whose AUC exceeds 0.5.

    Reads the notebook-level ``result_ary``; each entry must already carry
    the ``paramObj``, ``model`` and ``auc`` keys.

    Args:
        test: Frame holding the rows to predict.
        feat_test: Names of the feature columns passed to the model.

    Returns:
        list with one prediction result per accepted model (empty when no
        model passes the AUC filter).
    """
    fold = []
    print()
    for entry in result_ary:
        paramObj = entry['paramObj']
        model = entry['model']
        auc_value = entry['auc']
        if not (auc_value > 0.5):
            continue  # skip models that do not beat the 0.5 AUC threshold
        print(f'params : {paramObj}')
        print(f'auc : {auc_value}')
        print()
        fold.append(model.predict(test[feat_test]))
    print('-'*80)
    print(f'Result : {fold}')
    print('-'*80)
    return fold


In [None]:
# One prediction vector per accepted model
fold = get_pred(test, feat_test)

# Ensemble: average the per-model predictions row by row.
# state is 1 when at least one prediction is available, otherwise 0.
state = 1 if len(fold) > 0 else 0
if state == 1:
    df_result = pd.DataFrame(fold).T.mean(axis=1)


# 8. 投稿用ファイルの作成
## Make submission file

In [None]:
# Overwrite the 'smoking' column only when an ensembled prediction exists.
# NOTE(review): when state is not 1 the frame keeps the values loaded from
# sample_submission.csv and is still written out in the next cell.
if state == 1:
    sample_submission['smoking'] = df_result
sample_submission

In [None]:
# Write the submission file without the index column
sample_submission.to_csv('lgbm_submission.csv', index=False)

In [None]:
def changeHMS(s):
    """Format a duration in seconds as a compact ``<h>h<m>m<s>s`` string.

    The hour and minute parts are omitted when they are not positive; the
    seconds part is always present and is floored to a whole number.

    Args:
        s: Duration in seconds (int or float).

    Returns:
        str such as ``'1h1m1s'``, ``'2m5s'`` or ``'59s'``.
    """
    hours = math.floor(s / 3600)
    remainder = s - hours * 3600 if hours > 0 else s
    minutes = math.floor(remainder / 60)
    seconds = math.floor(remainder % 60)

    parts = []
    if hours > 0:
        parts.append(f'{hours}h')
    if minutes > 0:
        parts.append(f'{minutes}m')
    parts.append(f'{seconds}s')
    return ''.join(parts)

In [None]:
# Total run time: seconds elapsed since start_time, formatted by changeHMS
end_time = time.time()
lap = end_time - start_time
indi_time = changeHMS(lap)
indi_time