In [None]:
import pandas as pd
import numpy as np

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
import time
from lightgbm import LGBMClassifier
import lightgbm as lgb

import warnings
warnings.simplefilter('ignore', UserWarning)
from bayes_opt import BayesianOptimization
import gc
gc.enable()

In [None]:
!pip install bayesian-optimization matplotlib tqdm

In [None]:
# Directory prefix prepended to every feature file name ("" = working directory).
feature_path = ""

print("Load numerical features")
# The frames are joined column-wise; pd.concat(axis=1) aligns rows on the index.
# NOTE(review): presumably every feature file stores the same rows in the same
# order — confirm before adding new files.
df_num = pd.concat(
    [
        pd.read_feather(feature_path + "feature_num_features.ftr"),
        # Disabled feature sets, kept for reference:
        #pd.read_feather(feature_path+"feature_arithmetic+_combi2.ftr"),
        #pd.read_feather(feature_path+"feature_arithmetic-_combi2.ftr"),
        pd.read_feather(feature_path + "feature_arithmetic*_combi2.ftr"),
        pd.read_feather(feature_path + "feature_arithmeticdiv_combi2.ftr"),
        pd.read_feather(feature_path + "train_test_7th_df.ftr"),
        # The "+" was missing on this line, which made the whole cell a SyntaxError.
        pd.read_feather(feature_path + "train_test_oliverFE_df.ftr"),
    ],
    axis=1,
)
# Rows without a TARGET value are dropped here (presumably the test rows — confirm).
# A later cell re-reads y_train from train.csv and replaces this value.
y_train = df_num["TARGET"].dropna()
# Remove the label from the feature matrix; reassigning instead of inplace=True
# keeps the cell safe to re-run on the freshly loaded frame.
df_num = df_num.drop(columns=["TARGET"])

print("Load categorical features")
df = pd.concat(
    [
        pd.read_feather(feature_path + "feature_1way_label_encoding_with_te.ftr"),
        pd.read_feather(feature_path + "feature_2way_label_encoding_with_te.ftr"),
        # Disabled feature set, kept for reference:
        #pd.read_feather(feature_path+"feature_3way_including_CODE_GENDER_label_encoding_with_te.ftr"),
        pd.read_feather(feature_path + "feature_round_num_label_encoding.ftr"),
    ],
    axis=1,
)
# Final feature frame: categorical encodings followed by the numerical features.
df = pd.concat([df, df_num], axis=1)

In [None]:
def _smallest_fitting_dtype(c_min, c_max, candidates, limits_of):
    """Return the first dtype in ``candidates`` whose range contains [c_min, c_max].

    ``limits_of`` is ``np.iinfo`` or ``np.finfo``. Returns ``None`` when no
    candidate fits, which includes NaN bounds (empty or all-NaN column) because
    every comparison with NaN is false.
    """
    for candidate in candidates:
        limits = limits_of(candidate)
        if limits.min <= c_min and c_max <= limits.max:
            return candidate
    return None


def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to the smallest dtype that holds their values.

    The frame is modified column by column and also returned.

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned integer columns are cast to the narrowest signed /
      unsigned integer type whose range (bounds included) covers min..max.
    * Float columns are cast to float16 / float32 when min..max fits.
    * Every other dtype (bool, datetime, category, pandas extension dtypes) is
      left untouched; previously such columns were either pushed through the
      float branch or failed on the numeric comparisons.
    * A column is never cast to a wider type than it already has.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.

    Returns
    -------
    pandas.DataFrame
        The same frame with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float columns have a meaningful numeric range.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if col_type.kind == 'i':
            target = _smallest_fitting_dtype(c_min, c_max, signed_types, np.iinfo)
        elif col_type.kind == 'u':
            target = _smallest_fitting_dtype(c_min, c_max, unsigned_types, np.iinfo)
        else:
            # NOTE(review): the range check says nothing about precision; float16
            # keeps roughly three significant decimal digits.
            target = _smallest_fitting_dtype(c_min, c_max, float_types, np.finfo)

        # Cast only when it actually saves memory.
        if target is not None and np.dtype(target).itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame so the report cannot divide by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [None]:
# The columns of df_num were already concatenated into df by the loading cell,
# so this separate reference is no longer needed.
del df_num

In [None]:
# Directory prefix of the raw competition files ("" = working directory).
input_path = ""
# Only the TARGET column is needed, so skip parsing the other columns of train.csv.
# NOTE(review): this replaces the y_train derived from the feature files in the
# loading cell; its length defines the train/test split used below.
y_train = pd.read_csv(input_path + "train.csv", usecols=["TARGET"])["TARGET"]

In [None]:
# Shrink the combined feature frame. reduce_mem_usage reassigns the columns of
# the frame it is given and returns that same frame, so this cell prints the
# before/after memory report as a side effect.
df=reduce_mem_usage(df)

In [None]:
# Positional split: the first len(y_train) rows are the labelled training rows,
# everything after them is the test set.
X_train = df.iloc[:len(y_train)]
X_test = df.iloc[len(y_train):]

In [None]:
# Importance table used by the feature-selection cell, which reads its
# "feature" and "importance" columns.
# NOTE(review): presumably importances from a model fitted on the real target — confirm.
actual_imp_df=pd.read_csv(feature_path+"actual_imp_df.csv")

In [None]:
# Reference importance table used by the feature-selection cell, which reads
# its "feature" and "importance" columns and compares them with actual_imp_df.
# NOTE(review): presumably importances from runs on shuffled targets (null importance) — confirm.
null_imp_df=pd.read_csv(feature_path+"null_imp_df.csv")

In [None]:
# Minimum share (in percent) of null importances that the actual importance
# must exceed for a feature to be kept.
THRESHOLD = 97


def select_important_features(actual_imp, null_imp, threshold):
    """Return the features whose actual importance beats the null distribution.

    For every row of ``actual_imp`` the percentage of that feature's
    ``null_imp`` importances that are strictly smaller than the actual
    importance is computed; the feature is kept when the percentage is at
    least ``threshold``.

    Both frames need a "feature" and an "importance" column. Features are
    returned in the order they appear in ``actual_imp``. Features without any
    row in ``null_imp`` are skipped (previously this produced a 0/0 division).
    """
    # Group the null importances once instead of re-scanning the frame per
    # feature; this also avoids building query strings from feature names,
    # which breaks on names that contain quotes.
    null_by_feature = {
        name: values.to_numpy()
        for name, values in null_imp.groupby("feature", sort=False)["importance"]
    }

    selected = []
    for feature, actual_value in zip(actual_imp["feature"], actual_imp["importance"]):
        null_values = null_by_feature.get(feature)
        if null_values is None or null_values.size == 0:
            continue
        percentage = (null_values < actual_value).sum() / null_values.size * 100
        if percentage >= threshold:
            selected.append(feature)
    return selected


# Collect the features that exceed the threshold.
imp_features = select_important_features(actual_imp_df, null_imp_df, THRESHOLD)
len(imp_features)


In [None]:
# Extra feature frame covering train and test rows stacked in that order.
# NOTE(review): the ".fhr" extension differs from the ".ftr" files read earlier —
# confirm the file name on disk.
add_feature2 = pd.read_feather(feature_path + "github_feature2.fhr")

# Keep only the selected features and append the extra ones column-wise,
# cutting add_feature2 at the same row position as the train/test split.
train_df = pd.concat(
    [X_train[imp_features], add_feature2.iloc[:len(y_train)]],
    axis=1,
)
test_df = pd.concat(
    [X_test[imp_features], add_feature2.iloc[len(y_train):]],
    axis=1,
)

In [None]:
# X_train/X_test were sliced from df and train_df/test_df were built from those,
# so the name df itself is not used again.
del df

In [None]:
# Run a garbage-collection pass after dropping the large frames; the return
# value (number of unreachable objects found) is shown as the cell output.
gc.collect()

Bayesian optimization (ベイズ最適化): tune the LightGBM hyper-parameters on cross-validated AUC.

In [None]:
def bayes_parameter_opt_lgb(X, y, init_round=25, opt_round=35, n_folds=5, random_seed=1001, n_estimators=10000, learning_rate=0.005, output_process=False):
    """Search LightGBM hyper-parameters with Bayesian optimization on CV AUC.

    Parameters
    ----------
    X, y
        Training features and binary labels, wrapped in one ``lgb.Dataset``.
    init_round : int
        Number of random exploration points evaluated before optimizing.
    opt_round : int
        Number of Bayesian optimization steps after the exploration phase.
    n_folds : int
        Number of stratified folds used by ``lgb.cv`` for every evaluation.
    random_seed : int
        Seed passed to ``lgb.cv``.
    n_estimators : int
        Upper bound on boosting rounds per evaluation (``num_iterations``).
        Previously accepted but ignored in favour of a hard-coded 10000.
    learning_rate : float
        Learning rate used for every evaluation. Previously accepted but
        ignored in favour of a hard-coded 0.005134.
    output_process : bool
        When true, write all evaluated points to
        ``feature_path + "bayes_opt_result.csv"``.

    Returns
    -------
    dict
        Best parameter set as sampled by the optimizer. ``num_leaves`` and
        ``max_depth`` are raw floats; ``lgb_eval`` truncates them with
        ``int()`` before use, so callers should do the same.
    """
    # prepare data
    train_data = lgb.Dataset(data=X, label=y, free_raw_data=False)

    def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
        """Objective for the optimizer: best mean CV AUC for one parameter set."""
        params = {
            'application': 'binary',
            'num_iterations': n_estimators,
            'learning_rate': learning_rate,
            'early_stopping_round': 200,
            'metric': 'auc',
        }
        # The optimizer samples floats; integer-valued parameters are truncated
        # and fractions / penalties are clamped to their valid range.
        params["num_leaves"] = int(num_leaves)
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        # NOTE(review): LightGBM only applies bagging_fraction when bagging_freq
        # is non-zero; no bagging_freq is set here — confirm whether intended.
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(max_depth)
        params['lambda_l1'] = max(lambda_l1, 0)
        params['lambda_l2'] = max(lambda_l2, 0)
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight
        # NOTE(review): verbose_eval and the 'auc-mean' result key match the
        # LightGBM 3.x API; LightGBM 4 appears to have removed the argument and
        # prefixed the key with "valid " — confirm the installed version.
        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=True, verbose_eval=200, metrics=['auc'])
        return max(cv_result['auc-mean'])

    # search range of every tuned parameter
    lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (24, 45),
                                            'feature_fraction': (0.1, 0.9),
                                            'bagging_fraction': (0.8, 1),
                                            'max_depth': (5, 8.99),
                                            'lambda_l1': (0, 5),
                                            'lambda_l2': (0, 3),
                                            'min_split_gain': (0.001, 0.1),
                                            'min_child_weight': (5, 50)}, random_state=0)

    # optimize (run once; the duplicated second call doubled the search cost)
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)

    # output optimization process
    if output_process:
        result_path = feature_path + "bayes_opt_result.csv"
        if hasattr(lgbBO, "points_to_csv"):
            # bayes_opt < 1.0
            lgbBO.points_to_csv(result_path)
        else:
            # bayes_opt >= 1.0: res is a list of {"target": ..., "params": {...}}
            pd.DataFrame(
                [{"target": step["target"], **step["params"]} for step in lgbBO.res]
            ).to_csv(result_path, index=False)

    # return best parameters
    if isinstance(lgbBO.res, dict):
        # bayes_opt < 1.0 stored the optimum inside the res dictionary
        return lgbBO.res['max']['max_params']
    return lgbBO.max['params']

# Run the hyper-parameter search on the selected training features.
opt_params = bayes_parameter_opt_lgb(
    train_df,
    y_train,
    init_round=25,
    opt_round=35,
    n_folds=5,
    random_seed=1001,
    n_estimators=10000,
    learning_rate=0.005,
)

In [None]:
# Captured optimizer log header (output text pasted into a cell, not code):
# iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | max_depth | min_ch... | min_sp... | num_le... |

In [None]:
# Show the best hyper-parameter set returned by the search.
print(opt_params)